xref: /titanic_50/usr/src/uts/common/fs/nfs/nfs_subr.c (revision e0cf54a5673ecf2b9054101898211427b97772f7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/param.h>
27 #include <sys/types.h>
28 #include <sys/systm.h>
29 #include <sys/cred_impl.h>
30 #include <sys/proc.h>
31 #include <sys/user.h>
32 #include <sys/time.h>
33 #include <sys/buf.h>
34 #include <sys/vfs.h>
35 #include <sys/vnode.h>
36 #include <sys/socket.h>
37 #include <sys/uio.h>
38 #include <sys/tiuser.h>
39 #include <sys/swap.h>
40 #include <sys/errno.h>
41 #include <sys/debug.h>
42 #include <sys/kmem.h>
43 #include <sys/kstat.h>
44 #include <sys/cmn_err.h>
45 #include <sys/vtrace.h>
46 #include <sys/session.h>
47 #include <sys/dnlc.h>
48 #include <sys/bitmap.h>
49 #include <sys/acl.h>
50 #include <sys/ddi.h>
51 #include <sys/pathname.h>
52 #include <sys/flock.h>
53 #include <sys/dirent.h>
54 #include <sys/flock.h>
55 #include <sys/callb.h>
56 #include <sys/atomic.h>
57 #include <sys/list.h>
58 #include <sys/tsol/tnet.h>
59 #include <sys/priv.h>
60 #include <sys/sdt.h>
61 #include <sys/attr.h>
62 
63 #include <inet/ip6.h>
64 
65 #include <rpc/types.h>
66 #include <rpc/xdr.h>
67 #include <rpc/auth.h>
68 #include <rpc/clnt.h>
69 
70 #include <nfs/nfs.h>
71 #include <nfs/nfs4.h>
72 #include <nfs/nfs_clnt.h>
73 #include <nfs/rnode.h>
74 #include <nfs/nfs_acl.h>
75 
76 #include <sys/tsol/label.h>
77 
78 /*
79  * The hash queues for the access to active and cached rnodes
80  * are organized as doubly linked lists.  A reader/writer lock
81  * for each hash bucket is used to control access and to synchronize
82  * lookups, additions, and deletions from the hash queue.
83  *
84  * The rnode freelist is organized as a doubly linked list with
85  * a head pointer.  Additions and deletions are synchronized via
86  * a single mutex.
87  *
88  * In order to add an rnode to the free list, it must be hashed into
89  * a hash queue and the exclusive lock to the hash queue be held.
90  * If an rnode is not hashed into a hash queue, then it is destroyed
91  * because it represents no valuable information that can be reused
92  * about the file.  The exclusive lock to the hash queue must be
93  * held in order to prevent a lookup in the hash queue from finding
94  * the rnode and using it and assuming that the rnode is not on the
95  * freelist.  The lookup in the hash queue will have the hash queue
96  * locked, either exclusive or shared.
97  *
98  * The vnode reference count for each rnode is not allowed to drop
99  * below 1.  This prevents external entities, such as the VM
100  * subsystem, from acquiring references to vnodes already on the
101  * freelist and then trying to place them back on the freelist
102  * when their reference is released.  This means that when an
103  * rnode is looked up in the hash queues, then either the rnode
104  * is removed from the freelist and that reference is transferred to
105  * the new reference or the vnode reference count must be incremented
106  * accordingly.  The mutex for the freelist must be held in order to
107  * accurately test to see if the rnode is on the freelist or not.
108  * The hash queue lock might be held shared and it is possible that
109  * two different threads may race to remove the rnode from the
110  * freelist.  This race can be resolved by holding the mutex for the
111  * freelist.  Please note that the mutex for the freelist does not
112  * need to be held if the rnode is not on the freelist.  It can not be
113  * placed on the freelist due to the requirement that the thread
114  * putting the rnode on the freelist must hold the exclusive lock
115  * to the hash queue and the thread doing the lookup in the hash
116  * queue is holding either a shared or exclusive lock to the hash
117  * queue.
118  *
119  * The lock ordering is:
120  *
121  *	hash bucket lock -> vnode lock
122  *	hash bucket lock -> freelist lock
123  */
/* Table of rnode hash queues described in the locking notes above. */
static rhashq_t *rtable;

/* Mutex protecting the rnode freelist, and the freelist head pointer. */
static kmutex_t rpfreelist_lock;
static rnode_t *rpfreelist = NULL;
/*
 * NOTE(review): rnew and nrnode look like rnode counters; their use is
 * not visible in this part of the file -- confirm before relying on it.
 */
static long rnew = 0;
long nrnode = 0;

/*
 * NOTE(review): presumably the number of entries in rtable and the
 * matching index mask -- confirm against the initialization code.
 */
static int rtablesize;
static int rtablemask;

/* NOTE(review): the meaning of hashlen is not visible in this chunk. */
static int hashlen = 4;

/* kmem cache; by its name, the one rnodes come from -- TODO confirm. */
static struct kmem_cache *rnode_cache;
137 
/*
 * Mutex to protect the following variables:
 *	nfs_major
 *	nfs_minor
 */
kmutex_t nfs_minor_lock;
int nfs_major;
int nfs_minor;

/*
 * Do we allow pre-epoch (negative) time values over the wire?
 * Default: FALSE, i.e. pre-epoch times are not allowed.
 */
bool_t nfs_allow_preepoch_time = FALSE;
149 
/*
 * Access cache
 */
static acache_hash_t *acache;	/* access cache hash queues */
static long nacache;	/* used strictly to size the number of hash queues */

/*
 * NOTE(review): presumably the number of access cache hash queues and
 * the matching index mask -- confirm against the initialization code.
 */
static int acachesize;
static int acachemask;
/* kmem cache; by its name, for access cache entries -- TODO confirm. */
static struct kmem_cache *acache_cache;
159 
160 /*
161  * Client side utilities
162  */
163 
/*
 * client side statistics
 *
 * Each entry pairs a kstat name with its data type; every counter is a
 * 64-bit unsigned value.  DEBUG kernels carry four additional counters.
 * NOTE(review): the "_tmpl" suffix suggests this is copied into each
 * zone's nfscl_stat -- confirm against the zone initialization code.
 */
static const struct clstat clstat_tmpl = {
	{ "calls",	KSTAT_DATA_UINT64 },
	{ "badcalls",	KSTAT_DATA_UINT64 },
	{ "clgets",	KSTAT_DATA_UINT64 },
	{ "cltoomany",	KSTAT_DATA_UINT64 },
#ifdef DEBUG
	{ "clalloc",	KSTAT_DATA_UINT64 },
	{ "noresponse",	KSTAT_DATA_UINT64 },
	{ "failover",	KSTAT_DATA_UINT64 },
	{ "remap",	KSTAT_DATA_UINT64 },
#endif
};
179 
180 /*
181  * The following are statistics that describe behavior of the system as a whole
182  * and don't correspond to any one particular zone.
183  */
#ifdef DEBUG
/*
 * DEBUG-only counters.  The initializer below supplies the kstat name
 * and type for each member, in declaration order; note that the member
 * "rpath" is published under the kstat name "r_path".
 */
static struct clstat_debug {
	kstat_named_t	nrnode;			/* number of allocated rnodes */
	kstat_named_t	access;			/* size of access cache */
	kstat_named_t	dirent;			/* size of readdir cache */
	kstat_named_t	dirents;		/* size of readdir buf cache */
	kstat_named_t	reclaim;		/* number of reclaims */
	kstat_named_t	clreclaim;		/* number of cl reclaims */
	kstat_named_t	f_reclaim;		/* number of free reclaims */
	kstat_named_t	a_reclaim;		/* number of active reclaims */
	kstat_named_t	r_reclaim;		/* number of rnode reclaims */
	kstat_named_t	rpath;			/* bytes used to store rpaths */
} clstat_debug = {
	{ "nrnode",	KSTAT_DATA_UINT64 },
	{ "access",	KSTAT_DATA_UINT64 },
	{ "dirent",	KSTAT_DATA_UINT64 },
	{ "dirents",	KSTAT_DATA_UINT64 },
	{ "reclaim",	KSTAT_DATA_UINT64 },
	{ "clreclaim",	KSTAT_DATA_UINT64 },
	{ "f_reclaim",	KSTAT_DATA_UINT64 },
	{ "a_reclaim",	KSTAT_DATA_UINT64 },
	{ "r_reclaim",	KSTAT_DATA_UINT64 },
	{ "r_path",	KSTAT_DATA_UINT64 },
};
#endif	/* DEBUG */
209 
/*
 * We keep a global list of per-zone client data, so we can clean up all zones
 * if we get low on memory.
 *
 * nfs_clnt_list_lock is held while the list is walked (see clreclaim()).
 * nfsclnt_zone_key is the zone-specific-data key used with
 * zone_getspecific() to find the calling zone's struct nfs_clnt.
 */
static list_t nfs_clnt_list;
static kmutex_t nfs_clnt_list_lock;
static zone_key_t nfsclnt_zone_key;

/* kmem cache from which struct chtab (cached client handle) entries come */
static struct kmem_cache *chtab_cache;
219 
/*
 * Some servers do not properly update the attributes of the
 * directory when changes are made.  To allow interoperability
 * with these broken servers, the nfs_disable_rddir_cache
 * parameter must be set in /etc/system.  It defaults to 0.
 */
int nfs_disable_rddir_cache = 0;
227 
/*
 * Forward declarations.  clget() and clfree() have external linkage;
 * everything else declared here is local to this file.
 */
int		clget(clinfo_t *, servinfo_t *, cred_t *, CLIENT **,
		    struct chtab **);
void		clfree(CLIENT *, struct chtab *);
static int	acl_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
		    struct chtab **, struct nfs_clnt *);
static int	nfs_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
		    struct chtab **, struct nfs_clnt *);
static void	clreclaim(void *);
static int	nfs_feedback(int, int, mntinfo_t *);
static int	rfscall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
		    caddr_t, cred_t *, int *, enum clnt_stat *, int,
		    failinfo_t *);
static int	aclcall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
		    caddr_t, cred_t *, int *, int, failinfo_t *);
static void	rinactive(rnode_t *, cred_t *);
static int	rtablehash(nfs_fhandle *);
static vnode_t	*make_rnode(nfs_fhandle *, rhashq_t *, struct vfs *,
		    struct vnodeops *,
		    int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
			cred_t *),
		    int (*)(const void *, const void *), int *, cred_t *,
		    char *, char *);
static void	rp_rmfree(rnode_t *);
static void	rp_addhash(rnode_t *);
static void	rp_rmhash_locked(rnode_t *);
static rnode_t	*rfind(rhashq_t *, nfs_fhandle *, struct vfs *);
static void	destroy_rnode(rnode_t *);
static void	rddir_cache_free(rddir_cache *);
static int	nfs_free_data_reclaim(rnode_t *);
static int	nfs_active_data_reclaim(rnode_t *);
static int	nfs_free_reclaim(void);
static int	nfs_active_reclaim(void);
static int	nfs_rnode_reclaim(void);
static void	nfs_reclaim(void *);
static int	failover_safe(failinfo_t *);
static void	failover_newserver(mntinfo_t *mi);
static void	failover_thread(mntinfo_t *mi);
static int	failover_wait(mntinfo_t *);
static int	failover_remap(failinfo_t *);
static int	failover_lookup(char *, vnode_t *,
		    int (*)(vnode_t *, char *, vnode_t **,
			struct pathname *, int, vnode_t *, cred_t *, int),
		    int (*)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
		    vnode_t **);
static void	nfs_free_r_path(rnode_t *);
static void	nfs_set_vroot(vnode_t *);
static char	*nfs_getsrvnames(mntinfo_t *, size_t *);
275 
/*
 * from rpcsec module (common/rpcsec)
 *
 * sec_clnt_geth() obtains an auth handle for a client handle and
 * sec_clnt_freeh() releases one; both are used by the client handle
 * cache code in this file.
 */
extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **);
extern void sec_clnt_freeh(AUTH *);
extern void sec_clnt_freeinfo(struct sec_data *);

/*
 * used in mount policy
 */
extern ts_label_t *getflabel_cipso(vfs_t *);
287 
/*
 * EIO or EINTR are not recoverable errors.
 *
 * Evaluates to non-zero when `error' is anything other than EINTR or
 * EIO.  Both the argument and the whole expansion are parenthesized so
 * that the macro behaves correctly when handed an arbitrary expression
 * or when it is used as an operand of a larger expression.  Note that
 * `error' is evaluated twice, so it must not have side effects.
 */
#define	IS_RECOVERABLE_ERROR(error)	\
	(!(((error) == EINTR) || ((error) == EIO)))
292 
/*
 * Message formats used when a server is slow or unresponsive.  The
 * DEBUG variants take the NFS version (%d) ahead of the server name
 * (%s); the non-DEBUG variants take only the server name.
 */
#ifdef DEBUG
#define	SRV_QFULL_MSG	"send queue to NFS%d server %s is full; still trying\n"
#define	SRV_NOTRESP_MSG	"NFS%d server %s not responding still trying\n"
#else
#define	SRV_QFULL_MSG	"send queue to NFS server %s is full still trying\n"
#define	SRV_NOTRESP_MSG	"NFS server %s not responding still trying\n"
#endif
300 /*
301  * Common handle get program for NFS, NFS ACL, and NFS AUTH client.
302  */
303 static int
304 clget_impl(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
305     struct chtab **chp, struct nfs_clnt *nfscl)
306 {
307 	struct chhead *ch, *newch;
308 	struct chhead **plistp;
309 	struct chtab *cp;
310 	int error;
311 	k_sigset_t smask;
312 
313 	if (newcl == NULL || chp == NULL || ci == NULL)
314 		return (EINVAL);
315 
316 	*newcl = NULL;
317 	*chp = NULL;
318 
319 	/*
320 	 * Find an unused handle or create one
321 	 */
322 	newch = NULL;
323 	nfscl->nfscl_stat.clgets.value.ui64++;
324 top:
325 	/*
326 	 * Find the correct entry in the cache to check for free
327 	 * client handles.  The search is based on the RPC program
328 	 * number, program version number, dev_t for the transport
329 	 * device, and the protocol family.
330 	 */
331 	mutex_enter(&nfscl->nfscl_chtable_lock);
332 	plistp = &nfscl->nfscl_chtable;
333 	for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
334 		if (ch->ch_prog == ci->cl_prog &&
335 		    ch->ch_vers == ci->cl_vers &&
336 		    ch->ch_dev == svp->sv_knconf->knc_rdev &&
337 		    (strcmp(ch->ch_protofmly,
338 		    svp->sv_knconf->knc_protofmly) == 0))
339 			break;
340 		plistp = &ch->ch_next;
341 	}
342 
343 	/*
344 	 * If we didn't find a cache entry for this quadruple, then
345 	 * create one.  If we don't have one already preallocated,
346 	 * then drop the cache lock, create one, and then start over.
347 	 * If we did have a preallocated entry, then just add it to
348 	 * the front of the list.
349 	 */
350 	if (ch == NULL) {
351 		if (newch == NULL) {
352 			mutex_exit(&nfscl->nfscl_chtable_lock);
353 			newch = kmem_alloc(sizeof (*newch), KM_SLEEP);
354 			newch->ch_timesused = 0;
355 			newch->ch_prog = ci->cl_prog;
356 			newch->ch_vers = ci->cl_vers;
357 			newch->ch_dev = svp->sv_knconf->knc_rdev;
358 			newch->ch_protofmly = kmem_alloc(
359 			    strlen(svp->sv_knconf->knc_protofmly) + 1,
360 			    KM_SLEEP);
361 			(void) strcpy(newch->ch_protofmly,
362 			    svp->sv_knconf->knc_protofmly);
363 			newch->ch_list = NULL;
364 			goto top;
365 		}
366 		ch = newch;
367 		newch = NULL;
368 		ch->ch_next = nfscl->nfscl_chtable;
369 		nfscl->nfscl_chtable = ch;
370 	/*
371 	 * We found a cache entry, but if it isn't on the front of the
372 	 * list, then move it to the front of the list to try to take
373 	 * advantage of locality of operations.
374 	 */
375 	} else if (ch != nfscl->nfscl_chtable) {
376 		*plistp = ch->ch_next;
377 		ch->ch_next = nfscl->nfscl_chtable;
378 		nfscl->nfscl_chtable = ch;
379 	}
380 
381 	/*
382 	 * If there was a free client handle cached, then remove it
383 	 * from the list, init it, and use it.
384 	 */
385 	if (ch->ch_list != NULL) {
386 		cp = ch->ch_list;
387 		ch->ch_list = cp->ch_list;
388 		mutex_exit(&nfscl->nfscl_chtable_lock);
389 		if (newch != NULL) {
390 			kmem_free(newch->ch_protofmly,
391 			    strlen(newch->ch_protofmly) + 1);
392 			kmem_free(newch, sizeof (*newch));
393 		}
394 		(void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf,
395 		    &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr);
396 		error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
397 		    &cp->ch_client->cl_auth);
398 		if (error || cp->ch_client->cl_auth == NULL) {
399 			CLNT_DESTROY(cp->ch_client);
400 			kmem_cache_free(chtab_cache, cp);
401 			return ((error != 0) ? error : EINTR);
402 		}
403 		ch->ch_timesused++;
404 		*newcl = cp->ch_client;
405 		*chp = cp;
406 		return (0);
407 	}
408 
409 	/*
410 	 * There weren't any free client handles which fit, so allocate
411 	 * a new one and use that.
412 	 */
413 #ifdef DEBUG
414 	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, 1);
415 #endif
416 	mutex_exit(&nfscl->nfscl_chtable_lock);
417 
418 	nfscl->nfscl_stat.cltoomany.value.ui64++;
419 	if (newch != NULL) {
420 		kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1);
421 		kmem_free(newch, sizeof (*newch));
422 	}
423 
424 	cp = kmem_cache_alloc(chtab_cache, KM_SLEEP);
425 	cp->ch_head = ch;
426 
427 	sigintr(&smask, (int)ci->cl_flags & MI_INT);
428 	error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog,
429 	    ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client);
430 	sigunintr(&smask);
431 
432 	if (error != 0) {
433 		kmem_cache_free(chtab_cache, cp);
434 #ifdef DEBUG
435 		atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
436 #endif
437 		/*
438 		 * Warning is unnecessary if error is EINTR.
439 		 */
440 		if (error != EINTR) {
441 			nfs_cmn_err(error, CE_WARN,
442 			    "clget: couldn't create handle: %m\n");
443 		}
444 		return (error);
445 	}
446 	(void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL);
447 	auth_destroy(cp->ch_client->cl_auth);
448 	error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
449 	    &cp->ch_client->cl_auth);
450 	if (error || cp->ch_client->cl_auth == NULL) {
451 		CLNT_DESTROY(cp->ch_client);
452 		kmem_cache_free(chtab_cache, cp);
453 #ifdef DEBUG
454 		atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
455 #endif
456 		return ((error != 0) ? error : EINTR);
457 	}
458 	ch->ch_timesused++;
459 	*newcl = cp->ch_client;
460 	ASSERT(cp->ch_client->cl_nosignal == FALSE);
461 	*chp = cp;
462 	return (0);
463 }
464 
465 int
466 clget(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
467     struct chtab **chp)
468 {
469 	struct nfs_clnt *nfscl;
470 
471 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
472 	ASSERT(nfscl != NULL);
473 
474 	return (clget_impl(ci, svp, cr, newcl, chp, nfscl));
475 }
476 
477 static int
478 acl_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
479     struct chtab **chp, struct nfs_clnt *nfscl)
480 {
481 	clinfo_t ci;
482 	int error;
483 
484 	/*
485 	 * Set read buffer size to rsize
486 	 * and add room for RPC headers.
487 	 */
488 	ci.cl_readsize = mi->mi_tsize;
489 	if (ci.cl_readsize != 0)
490 		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
491 
492 	/*
493 	 * If soft mount and server is down just try once.
494 	 * meaning: do not retransmit.
495 	 */
496 	if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
497 		ci.cl_retrans = 0;
498 	else
499 		ci.cl_retrans = mi->mi_retrans;
500 
501 	ci.cl_prog = NFS_ACL_PROGRAM;
502 	ci.cl_vers = mi->mi_vers;
503 	ci.cl_flags = mi->mi_flags;
504 
505 	/*
506 	 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
507 	 * security flavor, the client tries to establish a security context
508 	 * by contacting the server. If the connection is timed out or reset,
509 	 * e.g. server reboot, we will try again.
510 	 */
511 	do {
512 		error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
513 
514 		if (error == 0)
515 			break;
516 
517 		/*
518 		 * For forced unmount or zone shutdown, bail out, no retry.
519 		 */
520 		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
521 			error = EIO;
522 			break;
523 		}
524 
525 		/* do not retry for softmount */
526 		if (!(mi->mi_flags & MI_HARD))
527 			break;
528 
529 		/* let the caller deal with the failover case */
530 		if (FAILOVER_MOUNT(mi))
531 			break;
532 
533 	} while (error == ETIMEDOUT || error == ECONNRESET);
534 
535 	return (error);
536 }
537 
538 static int
539 nfs_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
540     struct chtab **chp, struct nfs_clnt *nfscl)
541 {
542 	clinfo_t ci;
543 	int error;
544 
545 	/*
546 	 * Set read buffer size to rsize
547 	 * and add room for RPC headers.
548 	 */
549 	ci.cl_readsize = mi->mi_tsize;
550 	if (ci.cl_readsize != 0)
551 		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
552 
553 	/*
554 	 * If soft mount and server is down just try once.
555 	 * meaning: do not retransmit.
556 	 */
557 	if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
558 		ci.cl_retrans = 0;
559 	else
560 		ci.cl_retrans = mi->mi_retrans;
561 
562 	ci.cl_prog = mi->mi_prog;
563 	ci.cl_vers = mi->mi_vers;
564 	ci.cl_flags = mi->mi_flags;
565 
566 	/*
567 	 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
568 	 * security flavor, the client tries to establish a security context
569 	 * by contacting the server. If the connection is timed out or reset,
570 	 * e.g. server reboot, we will try again.
571 	 */
572 	do {
573 		error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
574 
575 		if (error == 0)
576 			break;
577 
578 		/*
579 		 * For forced unmount or zone shutdown, bail out, no retry.
580 		 */
581 		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
582 			error = EIO;
583 			break;
584 		}
585 
586 		/* do not retry for softmount */
587 		if (!(mi->mi_flags & MI_HARD))
588 			break;
589 
590 		/* let the caller deal with the failover case */
591 		if (FAILOVER_MOUNT(mi))
592 			break;
593 
594 	} while (error == ETIMEDOUT || error == ECONNRESET);
595 
596 	return (error);
597 }
598 
599 static void
600 clfree_impl(CLIENT *cl, struct chtab *cp, struct nfs_clnt *nfscl)
601 {
602 	if (cl->cl_auth != NULL) {
603 		sec_clnt_freeh(cl->cl_auth);
604 		cl->cl_auth = NULL;
605 	}
606 
607 	/*
608 	 * Timestamp this cache entry so that we know when it was last
609 	 * used.
610 	 */
611 	cp->ch_freed = gethrestime_sec();
612 
613 	/*
614 	 * Add the free client handle to the front of the list.
615 	 * This way, the list will be sorted in youngest to oldest
616 	 * order.
617 	 */
618 	mutex_enter(&nfscl->nfscl_chtable_lock);
619 	cp->ch_list = cp->ch_head->ch_list;
620 	cp->ch_head->ch_list = cp;
621 	mutex_exit(&nfscl->nfscl_chtable_lock);
622 }
623 
624 void
625 clfree(CLIENT *cl, struct chtab *cp)
626 {
627 	struct nfs_clnt *nfscl;
628 
629 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
630 	ASSERT(nfscl != NULL);
631 
632 	clfree_impl(cl, cp, nfscl);
633 }
634 
635 #define	CL_HOLDTIME	60	/* time to hold client handles */
636 
637 static void
638 clreclaim_zone(struct nfs_clnt *nfscl, uint_t cl_holdtime)
639 {
640 	struct chhead *ch;
641 	struct chtab *cp;	/* list of objects that can be reclaimed */
642 	struct chtab *cpe;
643 	struct chtab *cpl;
644 	struct chtab **cpp;
645 #ifdef DEBUG
646 	int n = 0;
647 #endif
648 
649 	/*
650 	 * Need to reclaim some memory, so step through the cache
651 	 * looking through the lists for entries which can be freed.
652 	 */
653 	cp = NULL;
654 
655 	mutex_enter(&nfscl->nfscl_chtable_lock);
656 
657 	/*
658 	 * Here we step through each non-NULL quadruple and start to
659 	 * construct the reclaim list pointed to by cp.  Note that
660 	 * cp will contain all eligible chtab entries.  When this traversal
661 	 * completes, chtab entries from the last quadruple will be at the
662 	 * front of cp and entries from previously inspected quadruples have
663 	 * been appended to the rear of cp.
664 	 */
665 	for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
666 		if (ch->ch_list == NULL)
667 			continue;
668 		/*
669 		 * Search each list for entries older then
670 		 * cl_holdtime seconds.  The lists are maintained
671 		 * in youngest to oldest order so that when the
672 		 * first entry is found which is old enough, then
673 		 * all of the rest of the entries on the list will
674 		 * be old enough as well.
675 		 */
676 		cpl = ch->ch_list;
677 		cpp = &ch->ch_list;
678 		while (cpl != NULL &&
679 		    cpl->ch_freed + cl_holdtime > gethrestime_sec()) {
680 			cpp = &cpl->ch_list;
681 			cpl = cpl->ch_list;
682 		}
683 		if (cpl != NULL) {
684 			*cpp = NULL;
685 			if (cp != NULL) {
686 				cpe = cpl;
687 				while (cpe->ch_list != NULL)
688 					cpe = cpe->ch_list;
689 				cpe->ch_list = cp;
690 			}
691 			cp = cpl;
692 		}
693 	}
694 
695 	mutex_exit(&nfscl->nfscl_chtable_lock);
696 
697 	/*
698 	 * If cp is empty, then there is nothing to reclaim here.
699 	 */
700 	if (cp == NULL)
701 		return;
702 
703 	/*
704 	 * Step through the list of entries to free, destroying each client
705 	 * handle and kmem_free'ing the memory for each entry.
706 	 */
707 	while (cp != NULL) {
708 #ifdef DEBUG
709 		n++;
710 #endif
711 		CLNT_DESTROY(cp->ch_client);
712 		cpl = cp->ch_list;
713 		kmem_cache_free(chtab_cache, cp);
714 		cp = cpl;
715 	}
716 
717 #ifdef DEBUG
718 	/*
719 	 * Update clalloc so that nfsstat shows the current number
720 	 * of allocated client handles.
721 	 */
722 	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n);
723 #endif
724 }
725 
726 /* ARGSUSED */
727 static void
728 clreclaim(void *all)
729 {
730 	struct nfs_clnt *nfscl;
731 
732 #ifdef DEBUG
733 	clstat_debug.clreclaim.value.ui64++;
734 #endif
735 	/*
736 	 * The system is low on memory; go through and try to reclaim some from
737 	 * every zone on the system.
738 	 */
739 	mutex_enter(&nfs_clnt_list_lock);
740 	nfscl = list_head(&nfs_clnt_list);
741 	for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl))
742 		clreclaim_zone(nfscl, CL_HOLDTIME);
743 	mutex_exit(&nfs_clnt_list_lock);
744 }
745 
/*
 * Minimum time-out values indexed by call type.
 * These units are in "eighths" of a second to avoid multiplies.
 */
static unsigned int minimum_timeo[] = {
	6, 7, 10
};

/*
 * Back off for retransmission timeout, MAXTIMO is in hz of a sec
 *
 * backoff(tim) doubles tim and clamps the result at MAXTIMO; a value
 * that is already at or above MAXTIMO is returned unchanged.  Both
 * macros evaluate their argument more than once, so it must not have
 * side effects.
 */
#define	MAXTIMO	(20*hz)
#define	backoff(tim)	(((tim) < MAXTIMO) ? dobackoff(tim) : (tim))
#define	dobackoff(tim)	((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))

/* Tunables used by nfs_feedback() when adjusting the transfer sizes. */
#define	MIN_NFS_TSIZE 512	/* minimum "chunk" of NFS IO */
#define	REDUCE_NFS_TIME (hz/2)	/* rtxcur we try to keep under */
#define	INCREASE_NFS_TIME (hz/3*8) /* srtt we try to keep under (scaled*8) */
764 
765 /*
766  * Function called when rfscall notices that we have been
767  * re-transmitting, or when we get a response without retransmissions.
768  * Return 1 if the transfer size was adjusted down - 0 if no change.
769  */
770 static int
771 nfs_feedback(int flag, int which, mntinfo_t *mi)
772 {
773 	int kind;
774 	int r = 0;
775 
776 	mutex_enter(&mi->mi_lock);
777 	if (flag == FEEDBACK_REXMIT1) {
778 		if (mi->mi_timers[NFS_CALLTYPES].rt_rtxcur != 0 &&
779 		    mi->mi_timers[NFS_CALLTYPES].rt_rtxcur < REDUCE_NFS_TIME)
780 			goto done;
781 		if (mi->mi_curread > MIN_NFS_TSIZE) {
782 			mi->mi_curread /= 2;
783 			if (mi->mi_curread < MIN_NFS_TSIZE)
784 				mi->mi_curread = MIN_NFS_TSIZE;
785 			r = 1;
786 		}
787 
788 		if (mi->mi_curwrite > MIN_NFS_TSIZE) {
789 			mi->mi_curwrite /= 2;
790 			if (mi->mi_curwrite < MIN_NFS_TSIZE)
791 				mi->mi_curwrite = MIN_NFS_TSIZE;
792 			r = 1;
793 		}
794 	} else if (flag == FEEDBACK_OK) {
795 		kind = mi->mi_timer_type[which];
796 		if (kind == 0 ||
797 		    mi->mi_timers[kind].rt_srtt >= INCREASE_NFS_TIME)
798 			goto done;
799 		if (kind == 1) {
800 			if (mi->mi_curread >= mi->mi_tsize)
801 				goto done;
802 			mi->mi_curread +=  MIN_NFS_TSIZE;
803 			if (mi->mi_curread > mi->mi_tsize/2)
804 				mi->mi_curread = mi->mi_tsize;
805 		} else if (kind == 2) {
806 			if (mi->mi_curwrite >= mi->mi_stsize)
807 				goto done;
808 			mi->mi_curwrite += MIN_NFS_TSIZE;
809 			if (mi->mi_curwrite > mi->mi_stsize/2)
810 				mi->mi_curwrite = mi->mi_stsize;
811 		}
812 	}
813 done:
814 	mutex_exit(&mi->mi_lock);
815 	return (r);
816 }
817 
818 #ifdef DEBUG
819 static int rfs2call_hits = 0;
820 static int rfs2call_misses = 0;
821 #endif
822 
823 int
824 rfs2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
825     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
826     enum nfsstat *statusp, int flags, failinfo_t *fi)
827 {
828 	int rpcerror;
829 	enum clnt_stat rpc_status;
830 
831 	ASSERT(statusp != NULL);
832 
833 	rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
834 	    cr, douprintf, &rpc_status, flags, fi);
835 	if (!rpcerror) {
836 		/*
837 		 * See crnetadjust() for comments.
838 		 */
839 		if (*statusp == NFSERR_ACCES &&
840 		    (cr = crnetadjust(cr)) != NULL) {
841 #ifdef DEBUG
842 			rfs2call_hits++;
843 #endif
844 			rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres,
845 			    resp, cr, douprintf, NULL, flags, fi);
846 			crfree(cr);
847 #ifdef DEBUG
848 			if (*statusp == NFSERR_ACCES)
849 				rfs2call_misses++;
850 #endif
851 		}
852 	} else if (rpc_status == RPC_PROCUNAVAIL) {
853 		*statusp = NFSERR_OPNOTSUPP;
854 		rpcerror = 0;
855 	}
856 
857 	return (rpcerror);
858 }
859 
/*
 * Delay applied by rfs3call() between retries of a call that was
 * answered with NFS3ERR_JUKEBOX.  The expansion is parenthesized so the
 * macro can be used safely as an operand of a larger expression.
 * nfs3_jukebox_delay is the value actually passed to delay(); it starts
 * out as 0 here.
 */
#define	NFS3_JUKEBOX_DELAY	(10 * hz)

static clock_t nfs3_jukebox_delay = 0;
863 
864 #ifdef DEBUG
865 static int rfs3call_hits = 0;
866 static int rfs3call_misses = 0;
867 #endif
868 
869 int
870 rfs3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
871     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
872     nfsstat3 *statusp, int flags, failinfo_t *fi)
873 {
874 	int rpcerror;
875 	int user_informed;
876 
877 	user_informed = 0;
878 	do {
879 		rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
880 		    cr, douprintf, NULL, flags, fi);
881 		if (!rpcerror) {
882 			cred_t *crr;
883 			if (*statusp == NFS3ERR_JUKEBOX) {
884 				if (ttoproc(curthread) == &p0) {
885 					rpcerror = EAGAIN;
886 					break;
887 				}
888 				if (!user_informed) {
889 					user_informed = 1;
890 					uprintf(
891 		"file temporarily unavailable on the server, retrying...\n");
892 				}
893 				delay(nfs3_jukebox_delay);
894 			}
895 			/*
896 			 * See crnetadjust() for comments.
897 			 */
898 			else if (*statusp == NFS3ERR_ACCES &&
899 			    (crr = crnetadjust(cr)) != NULL) {
900 #ifdef DEBUG
901 				rfs3call_hits++;
902 #endif
903 				rpcerror = rfscall(mi, which, xdrargs, argsp,
904 				    xdrres, resp, crr, douprintf,
905 				    NULL, flags, fi);
906 
907 				crfree(crr);
908 #ifdef DEBUG
909 				if (*statusp == NFS3ERR_ACCES)
910 					rfs3call_misses++;
911 #endif
912 			}
913 		}
914 	} while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
915 
916 	return (rpcerror);
917 }
918 
/*
 * Failover helpers used by rfscall().
 *
 * VALID_FH(fi) is true when the server recorded in the rnode of fi->vp
 * is the mount's current server.
 *
 * INC_READERS/DEC_READERS adjust mi_readers; DEC_READERS wakes every
 * waiter on mi_failover_cv when the count drops to zero.  rfscall()
 * invokes both with mi->mi_lock held.
 *
 * The macro arguments are parenthesized so that any pointer expression
 * may be passed.  The brace-block form is kept as it was, so these must
 * still be used as complete statements.
 */
#define	VALID_FH(fi)	\
	(VTOR((fi)->vp)->r_server == VTOMI((fi)->vp)->mi_curr_serv)
#define	INC_READERS(mi)		{ \
	(mi)->mi_readers++; \
}
#define	DEC_READERS(mi)		{ \
	(mi)->mi_readers--; \
	if ((mi)->mi_readers == 0) \
		cv_broadcast(&(mi)->mi_failover_cv); \
}
928 
929 static int
930 rfscall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
931     xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
932     enum clnt_stat *rpc_status, int flags, failinfo_t *fi)
933 {
934 	CLIENT *client;
935 	struct chtab *ch;
936 	cred_t *cr = icr;
937 	enum clnt_stat status;
938 	struct rpc_err rpcerr, rpcerr_tmp;
939 	struct timeval wait;
940 	int timeo;		/* in units of hz */
941 	int my_rsize, my_wsize;
942 	bool_t tryagain;
943 	bool_t cred_cloned = FALSE;
944 	k_sigset_t smask;
945 	servinfo_t *svp;
946 	struct nfs_clnt *nfscl;
947 	zoneid_t zoneid = getzoneid();
948 	char *msg;
949 #ifdef DEBUG
950 	char *bufp;
951 #endif
952 
953 
954 	TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
955 	    "rfscall_start:which %d mi %p", which, mi);
956 
957 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
958 	ASSERT(nfscl != NULL);
959 
960 	nfscl->nfscl_stat.calls.value.ui64++;
961 	mi->mi_reqs[which].value.ui64++;
962 
963 	rpcerr.re_status = RPC_SUCCESS;
964 
965 	/*
966 	 * In case of forced unmount or zone shutdown, return EIO.
967 	 */
968 
969 	if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
970 		rpcerr.re_status = RPC_FAILED;
971 		rpcerr.re_errno = EIO;
972 		return (rpcerr.re_errno);
973 	}
974 
975 	/*
976 	 * Remember the transfer sizes in case
977 	 * nfs_feedback changes them underneath us.
978 	 */
979 	my_rsize = mi->mi_curread;
980 	my_wsize = mi->mi_curwrite;
981 
982 	/*
983 	 * NFS client failover support
984 	 *
985 	 * If this rnode is not in sync with the current server (VALID_FH),
986 	 * we'd like to do a remap to get in sync.  We can be interrupted
987 	 * in failover_remap(), and if so we'll bail.  Otherwise, we'll
988 	 * use the best info we have to try the RPC.  Part of that is
989 	 * unconditionally updating the filehandle copy kept for V3.
990 	 *
991 	 * Locking: INC_READERS/DEC_READERS is a poor man's interrruptible
992 	 * rw_enter(); we're trying to keep the current server from being
993 	 * changed on us until we're done with the remapping and have a
994 	 * matching client handle.  We don't want to sending a filehandle
995 	 * to the wrong host.
996 	 */
997 failoverretry:
998 	if (FAILOVER_MOUNT(mi)) {
999 		mutex_enter(&mi->mi_lock);
1000 		if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1001 			if (failover_wait(mi)) {
1002 				mutex_exit(&mi->mi_lock);
1003 				return (EINTR);
1004 			}
1005 		}
1006 		INC_READERS(mi);
1007 		mutex_exit(&mi->mi_lock);
1008 		if (fi) {
1009 			if (!VALID_FH(fi) &&
1010 			    !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1011 				int remaperr;
1012 
1013 				svp = mi->mi_curr_serv;
1014 				remaperr = failover_remap(fi);
1015 				if (remaperr != 0) {
1016 #ifdef DEBUG
1017 					if (remaperr != EINTR)
1018 						nfs_cmn_err(remaperr, CE_WARN,
1019 					    "rfscall couldn't failover: %m");
1020 #endif
1021 					mutex_enter(&mi->mi_lock);
1022 					DEC_READERS(mi);
1023 					mutex_exit(&mi->mi_lock);
1024 					/*
1025 					 * If failover_remap returns ETIMEDOUT
1026 					 * and the filesystem is hard mounted
1027 					 * we have to retry the call with a new
1028 					 * server.
1029 					 */
1030 					if ((mi->mi_flags & MI_HARD) &&
1031 					    IS_RECOVERABLE_ERROR(remaperr)) {
1032 						if (svp == mi->mi_curr_serv)
1033 							failover_newserver(mi);
1034 						rpcerr.re_status = RPC_SUCCESS;
1035 						goto failoverretry;
1036 					}
1037 					rpcerr.re_errno = remaperr;
1038 					return (remaperr);
1039 				}
1040 			}
1041 			if (fi->fhp && fi->copyproc)
1042 				(*fi->copyproc)(fi->fhp, fi->vp);
1043 		}
1044 	}
1045 
1046 	/* For TSOL, use a new cred which has net_mac_aware flag */
1047 	if (!cred_cloned && is_system_labeled()) {
1048 		cred_cloned = TRUE;
1049 		cr = crdup(icr);
1050 		(void) setpflags(NET_MAC_AWARE, 1, cr);
1051 	}
1052 
1053 	/*
1054 	 * clget() calls clnt_tli_kinit() which clears the xid, so we
1055 	 * are guaranteed to reprocess the retry as a new request.
1056 	 */
1057 	svp = mi->mi_curr_serv;
1058 	rpcerr.re_errno = nfs_clget(mi, svp, cr, &client, &ch, nfscl);
1059 
1060 	if (FAILOVER_MOUNT(mi)) {
1061 		mutex_enter(&mi->mi_lock);
1062 		DEC_READERS(mi);
1063 		mutex_exit(&mi->mi_lock);
1064 
1065 		if ((rpcerr.re_errno == ETIMEDOUT ||
1066 		    rpcerr.re_errno == ECONNRESET) &&
1067 		    failover_safe(fi)) {
1068 			if (svp == mi->mi_curr_serv)
1069 				failover_newserver(mi);
1070 			goto failoverretry;
1071 		}
1072 	}
1073 	if (rpcerr.re_errno != 0)
1074 		return (rpcerr.re_errno);
1075 
1076 	if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1077 	    svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1078 		timeo = (mi->mi_timeo * hz) / 10;
1079 	} else {
1080 		mutex_enter(&mi->mi_lock);
1081 		timeo = CLNT_SETTIMERS(client,
1082 		    &(mi->mi_timers[mi->mi_timer_type[which]]),
1083 		    &(mi->mi_timers[NFS_CALLTYPES]),
1084 		    (minimum_timeo[mi->mi_call_type[which]]*hz)>>3,
1085 		    (void (*)())NULL, (caddr_t)mi, 0);
1086 		mutex_exit(&mi->mi_lock);
1087 	}
1088 
1089 	/*
1090 	 * If hard mounted fs, retry call forever unless hard error occurs.
1091 	 */
1092 	do {
1093 		tryagain = FALSE;
1094 
1095 		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1096 			status = RPC_FAILED;
1097 			rpcerr.re_status = RPC_FAILED;
1098 			rpcerr.re_errno = EIO;
1099 			break;
1100 		}
1101 
1102 		TICK_TO_TIMEVAL(timeo, &wait);
1103 
1104 		/*
1105 		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1106 		 * and SIGTERM. (Preserving the existing masks).
1107 		 * Mask out SIGINT if mount option nointr is specified.
1108 		 */
1109 		sigintr(&smask, (int)mi->mi_flags & MI_INT);
1110 		if (!(mi->mi_flags & MI_INT))
1111 			client->cl_nosignal = TRUE;
1112 
1113 		/*
1114 		 * If there is a current signal, then don't bother
1115 		 * even trying to send out the request because we
1116 		 * won't be able to block waiting for the response.
1117 		 * Simply assume RPC_INTR and get on with it.
1118 		 */
1119 		if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1120 			status = RPC_INTR;
1121 		else {
1122 			status = CLNT_CALL(client, which, xdrargs, argsp,
1123 			    xdrres, resp, wait);
1124 		}
1125 
1126 		if (!(mi->mi_flags & MI_INT))
1127 			client->cl_nosignal = FALSE;
1128 		/*
1129 		 * restore original signal mask
1130 		 */
1131 		sigunintr(&smask);
1132 
1133 		switch (status) {
1134 		case RPC_SUCCESS:
1135 			if ((mi->mi_flags & MI_DYNAMIC) &&
1136 			    mi->mi_timer_type[which] != 0 &&
1137 			    (mi->mi_curread != my_rsize ||
1138 			    mi->mi_curwrite != my_wsize))
1139 				(void) nfs_feedback(FEEDBACK_OK, which, mi);
1140 			break;
1141 
1142 		case RPC_INTR:
1143 			/*
1144 			 * There is no way to recover from this error,
1145 			 * even if mount option nointr is specified.
1146 			 * SIGKILL, for example, cannot be blocked.
1147 			 */
1148 			rpcerr.re_status = RPC_INTR;
1149 			rpcerr.re_errno = EINTR;
1150 			break;
1151 
1152 		case RPC_UDERROR:
1153 			/*
1154 			 * If the NFS server is local (vold) and
1155 			 * it goes away then we get RPC_UDERROR.
1156 			 * This is a retryable error, so we would
1157 			 * loop, so check to see if the specific
1158 			 * error was ECONNRESET, indicating that
1159 			 * target did not exist at all.  If so,
1160 			 * return with RPC_PROGUNAVAIL and
1161 			 * ECONNRESET to indicate why.
1162 			 */
1163 			CLNT_GETERR(client, &rpcerr);
1164 			if (rpcerr.re_errno == ECONNRESET) {
1165 				rpcerr.re_status = RPC_PROGUNAVAIL;
1166 				rpcerr.re_errno = ECONNRESET;
1167 				break;
1168 			}
1169 			/*FALLTHROUGH*/
1170 
1171 		default:		/* probably RPC_TIMEDOUT */
1172 			if (IS_UNRECOVERABLE_RPC(status))
1173 				break;
1174 
1175 			/*
1176 			 * increment server not responding count
1177 			 */
1178 			mutex_enter(&mi->mi_lock);
1179 			mi->mi_noresponse++;
1180 			mutex_exit(&mi->mi_lock);
1181 #ifdef DEBUG
1182 			nfscl->nfscl_stat.noresponse.value.ui64++;
1183 #endif
1184 
1185 			if (!(mi->mi_flags & MI_HARD)) {
1186 				if (!(mi->mi_flags & MI_SEMISOFT) ||
1187 				    (mi->mi_ss_call_type[which] == 0))
1188 					break;
1189 			}
1190 
1191 			/*
1192 			 * The call is in progress (over COTS).
1193 			 * Try the CLNT_CALL again, but don't
1194 			 * print a noisy error message.
1195 			 */
1196 			if (status == RPC_INPROGRESS) {
1197 				tryagain = TRUE;
1198 				break;
1199 			}
1200 
1201 			if (flags & RFSCALL_SOFT)
1202 				break;
1203 
1204 			/*
1205 			 * On zone shutdown, just move on.
1206 			 */
1207 			if (zone_status_get(curproc->p_zone) >=
1208 			    ZONE_IS_SHUTTING_DOWN) {
1209 				rpcerr.re_status = RPC_FAILED;
1210 				rpcerr.re_errno = EIO;
1211 				break;
1212 			}
1213 
1214 			/*
1215 			 * NFS client failover support
1216 			 *
1217 			 * If the current server just failed us, we'll
1218 			 * start the process of finding a new server.
1219 			 * After that, we can just retry.
1220 			 */
1221 			if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1222 				if (svp == mi->mi_curr_serv)
1223 					failover_newserver(mi);
1224 				clfree_impl(client, ch, nfscl);
1225 				goto failoverretry;
1226 			}
1227 
1228 			tryagain = TRUE;
1229 			timeo = backoff(timeo);
1230 
1231 			CLNT_GETERR(client, &rpcerr_tmp);
1232 			if ((status == RPC_CANTSEND) &&
1233 			    (rpcerr_tmp.re_errno == ENOBUFS))
1234 				msg = SRV_QFULL_MSG;
1235 			else
1236 				msg = SRV_NOTRESP_MSG;
1237 
1238 			mutex_enter(&mi->mi_lock);
1239 			if (!(mi->mi_flags & MI_PRINTED)) {
1240 				mi->mi_flags |= MI_PRINTED;
1241 				mutex_exit(&mi->mi_lock);
1242 #ifdef DEBUG
1243 				zprintf(zoneid, msg, mi->mi_vers,
1244 				    svp->sv_hostname);
1245 #else
1246 				zprintf(zoneid, msg, svp->sv_hostname);
1247 #endif
1248 			} else
1249 				mutex_exit(&mi->mi_lock);
1250 			if (*douprintf && nfs_has_ctty()) {
1251 				*douprintf = 0;
1252 				if (!(mi->mi_flags & MI_NOPRINT))
1253 #ifdef DEBUG
1254 					uprintf(msg, mi->mi_vers,
1255 					    svp->sv_hostname);
1256 #else
1257 					uprintf(msg, svp->sv_hostname);
1258 #endif
1259 			}
1260 
1261 			/*
1262 			 * If doing dynamic adjustment of transfer
1263 			 * size and if it's a read or write call
1264 			 * and if the transfer size changed while
1265 			 * retransmitting or if the feedback routine
1266 			 * changed the transfer size,
1267 			 * then exit rfscall so that the transfer
1268 			 * size can be adjusted at the vnops level.
1269 			 */
1270 			if ((mi->mi_flags & MI_DYNAMIC) &&
1271 			    mi->mi_timer_type[which] != 0 &&
1272 			    (mi->mi_curread != my_rsize ||
1273 			    mi->mi_curwrite != my_wsize ||
1274 			    nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1275 				/*
1276 				 * On read or write calls, return
1277 				 * back to the vnode ops level if
1278 				 * the transfer size changed.
1279 				 */
1280 				clfree_impl(client, ch, nfscl);
1281 				if (cred_cloned)
1282 					crfree(cr);
1283 				return (ENFS_TRYAGAIN);
1284 			}
1285 		}
1286 	} while (tryagain);
1287 
1288 	if (status != RPC_SUCCESS) {
1289 		/*
1290 		 * Let soft mounts use the timed out message.
1291 		 */
1292 		if (status == RPC_INPROGRESS)
1293 			status = RPC_TIMEDOUT;
1294 		nfscl->nfscl_stat.badcalls.value.ui64++;
1295 		if (status != RPC_INTR) {
1296 			mutex_enter(&mi->mi_lock);
1297 			mi->mi_flags |= MI_DOWN;
1298 			mutex_exit(&mi->mi_lock);
1299 			CLNT_GETERR(client, &rpcerr);
1300 #ifdef DEBUG
1301 			bufp = clnt_sperror(client, svp->sv_hostname);
1302 			zprintf(zoneid, "NFS%d %s failed for %s\n",
1303 			    mi->mi_vers, mi->mi_rfsnames[which], bufp);
1304 			if (nfs_has_ctty()) {
1305 				if (!(mi->mi_flags & MI_NOPRINT)) {
1306 					uprintf("NFS%d %s failed for %s\n",
1307 					    mi->mi_vers, mi->mi_rfsnames[which],
1308 					    bufp);
1309 				}
1310 			}
1311 			kmem_free(bufp, MAXPATHLEN);
1312 #else
1313 			zprintf(zoneid,
1314 			    "NFS %s failed for server %s: error %d (%s)\n",
1315 			    mi->mi_rfsnames[which], svp->sv_hostname,
1316 			    status, clnt_sperrno(status));
1317 			if (nfs_has_ctty()) {
1318 				if (!(mi->mi_flags & MI_NOPRINT)) {
1319 					uprintf(
1320 				"NFS %s failed for server %s: error %d (%s)\n",
1321 					    mi->mi_rfsnames[which],
1322 					    svp->sv_hostname, status,
1323 					    clnt_sperrno(status));
1324 				}
1325 			}
1326 #endif
1327 			/*
1328 			 * when CLNT_CALL() fails with RPC_AUTHERROR,
1329 			 * re_errno is set appropriately depending on
1330 			 * the authentication error
1331 			 */
1332 			if (status == RPC_VERSMISMATCH ||
1333 			    status == RPC_PROGVERSMISMATCH)
1334 				rpcerr.re_errno = EIO;
1335 		}
1336 	} else {
1337 		/*
1338 		 * Test the value of mi_down and mi_printed without
1339 		 * holding the mi_lock mutex.  If they are both zero,
1340 		 * then it is okay to skip the down and printed
1341 		 * processing.  This saves on a mutex_enter and
1342 		 * mutex_exit pair for a normal, successful RPC.
1343 		 * This was just complete overhead.
1344 		 */
1345 		if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1346 			mutex_enter(&mi->mi_lock);
1347 			mi->mi_flags &= ~MI_DOWN;
1348 			if (mi->mi_flags & MI_PRINTED) {
1349 				mi->mi_flags &= ~MI_PRINTED;
1350 				mutex_exit(&mi->mi_lock);
1351 #ifdef DEBUG
1352 			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1353 				zprintf(zoneid, "NFS%d server %s ok\n",
1354 				    mi->mi_vers, svp->sv_hostname);
1355 #else
1356 			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1357 				zprintf(zoneid, "NFS server %s ok\n",
1358 				    svp->sv_hostname);
1359 #endif
1360 			} else
1361 				mutex_exit(&mi->mi_lock);
1362 		}
1363 
1364 		if (*douprintf == 0) {
1365 			if (!(mi->mi_flags & MI_NOPRINT))
1366 #ifdef DEBUG
1367 				if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1368 					uprintf("NFS%d server %s ok\n",
1369 					    mi->mi_vers, svp->sv_hostname);
1370 #else
1371 			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1372 				uprintf("NFS server %s ok\n", svp->sv_hostname);
1373 #endif
1374 			*douprintf = 1;
1375 		}
1376 	}
1377 
1378 	clfree_impl(client, ch, nfscl);
1379 	if (cred_cloned)
1380 		crfree(cr);
1381 
1382 	ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1383 
1384 	if (rpc_status != NULL)
1385 		*rpc_status = rpcerr.re_status;
1386 
1387 	TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1388 	    rpcerr.re_errno);
1389 
1390 	return (rpcerr.re_errno);
1391 }
1392 
1393 #ifdef DEBUG
1394 static int acl2call_hits = 0;
1395 static int acl2call_misses = 0;
1396 #endif
1397 
1398 int
1399 acl2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1400     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1401     enum nfsstat *statusp, int flags, failinfo_t *fi)
1402 {
1403 	int rpcerror;
1404 
1405 	rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1406 	    cr, douprintf, flags, fi);
1407 	if (!rpcerror) {
1408 		/*
1409 		 * See comments with crnetadjust().
1410 		 */
1411 		if (*statusp == NFSERR_ACCES &&
1412 		    (cr = crnetadjust(cr)) != NULL) {
1413 #ifdef DEBUG
1414 			acl2call_hits++;
1415 #endif
1416 			rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres,
1417 			    resp, cr, douprintf, flags, fi);
1418 			crfree(cr);
1419 #ifdef DEBUG
1420 			if (*statusp == NFSERR_ACCES)
1421 				acl2call_misses++;
1422 #endif
1423 		}
1424 	}
1425 
1426 	return (rpcerror);
1427 }
1428 
1429 #ifdef DEBUG
1430 static int acl3call_hits = 0;
1431 static int acl3call_misses = 0;
1432 #endif
1433 
1434 int
1435 acl3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1436     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1437     nfsstat3 *statusp, int flags, failinfo_t *fi)
1438 {
1439 	int rpcerror;
1440 	int user_informed;
1441 
1442 	user_informed = 0;
1443 
1444 	do {
1445 		rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1446 		    cr, douprintf, flags, fi);
1447 		if (!rpcerror) {
1448 			cred_t *crr;
1449 			if (*statusp == NFS3ERR_JUKEBOX) {
1450 				if (!user_informed) {
1451 					user_informed = 1;
1452 					uprintf(
1453 		"file temporarily unavailable on the server, retrying...\n");
1454 				}
1455 				delay(nfs3_jukebox_delay);
1456 			}
1457 			/*
1458 			 * See crnetadjust() for comments.
1459 			 */
1460 			else if (*statusp == NFS3ERR_ACCES &&
1461 			    (crr = crnetadjust(cr)) != NULL) {
1462 #ifdef DEBUG
1463 				acl3call_hits++;
1464 #endif
1465 				rpcerror = aclcall(mi, which, xdrargs, argsp,
1466 				    xdrres, resp, crr, douprintf, flags, fi);
1467 
1468 				crfree(crr);
1469 #ifdef DEBUG
1470 				if (*statusp == NFS3ERR_ACCES)
1471 					acl3call_misses++;
1472 #endif
1473 			}
1474 		}
1475 	} while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
1476 
1477 	return (rpcerror);
1478 }
1479 
1480 static int
1481 aclcall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1482     xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
1483     int flags, failinfo_t *fi)
1484 {
1485 	CLIENT *client;
1486 	struct chtab *ch;
1487 	cred_t *cr = icr;
1488 	bool_t cred_cloned = FALSE;
1489 	enum clnt_stat status;
1490 	struct rpc_err rpcerr;
1491 	struct timeval wait;
1492 	int timeo;		/* in units of hz */
1493 #if 0 /* notyet */
1494 	int my_rsize, my_wsize;
1495 #endif
1496 	bool_t tryagain;
1497 	k_sigset_t smask;
1498 	servinfo_t *svp;
1499 	struct nfs_clnt *nfscl;
1500 	zoneid_t zoneid = getzoneid();
1501 #ifdef DEBUG
1502 	char *bufp;
1503 #endif
1504 
1505 #if 0 /* notyet */
1506 	TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
1507 	    "rfscall_start:which %d mi %p", which, mi);
1508 #endif
1509 
1510 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
1511 	ASSERT(nfscl != NULL);
1512 
1513 	nfscl->nfscl_stat.calls.value.ui64++;
1514 	mi->mi_aclreqs[which].value.ui64++;
1515 
1516 	rpcerr.re_status = RPC_SUCCESS;
1517 
1518 	if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1519 		rpcerr.re_status = RPC_FAILED;
1520 		rpcerr.re_errno = EIO;
1521 		return (rpcerr.re_errno);
1522 	}
1523 
1524 #if 0 /* notyet */
1525 	/*
1526 	 * Remember the transfer sizes in case
1527 	 * nfs_feedback changes them underneath us.
1528 	 */
1529 	my_rsize = mi->mi_curread;
1530 	my_wsize = mi->mi_curwrite;
1531 #endif
1532 
1533 	/*
1534 	 * NFS client failover support
1535 	 *
1536 	 * If this rnode is not in sync with the current server (VALID_FH),
1537 	 * we'd like to do a remap to get in sync.  We can be interrupted
1538 	 * in failover_remap(), and if so we'll bail.  Otherwise, we'll
1539 	 * use the best info we have to try the RPC.  Part of that is
1540 	 * unconditionally updating the filehandle copy kept for V3.
1541 	 *
1542 	 * Locking: INC_READERS/DEC_READERS is a poor man's interrruptible
1543 	 * rw_enter(); we're trying to keep the current server from being
1544 	 * changed on us until we're done with the remapping and have a
1545 	 * matching client handle.  We don't want to sending a filehandle
1546 	 * to the wrong host.
1547 	 */
1548 failoverretry:
1549 	if (FAILOVER_MOUNT(mi)) {
1550 		mutex_enter(&mi->mi_lock);
1551 		if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1552 			if (failover_wait(mi)) {
1553 				mutex_exit(&mi->mi_lock);
1554 				return (EINTR);
1555 			}
1556 		}
1557 		INC_READERS(mi);
1558 		mutex_exit(&mi->mi_lock);
1559 		if (fi) {
1560 			if (!VALID_FH(fi) &&
1561 			    !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1562 				int remaperr;
1563 
1564 				svp = mi->mi_curr_serv;
1565 				remaperr = failover_remap(fi);
1566 				if (remaperr != 0) {
1567 #ifdef DEBUG
1568 					if (remaperr != EINTR)
1569 						nfs_cmn_err(remaperr, CE_WARN,
1570 					    "aclcall couldn't failover: %m");
1571 #endif
1572 					mutex_enter(&mi->mi_lock);
1573 					DEC_READERS(mi);
1574 					mutex_exit(&mi->mi_lock);
1575 
1576 					/*
1577 					 * If failover_remap returns ETIMEDOUT
1578 					 * and the filesystem is hard mounted
1579 					 * we have to retry the call with a new
1580 					 * server.
1581 					 */
1582 					if ((mi->mi_flags & MI_HARD) &&
1583 					    IS_RECOVERABLE_ERROR(remaperr)) {
1584 						if (svp == mi->mi_curr_serv)
1585 							failover_newserver(mi);
1586 						rpcerr.re_status = RPC_SUCCESS;
1587 						goto failoverretry;
1588 					}
1589 					return (remaperr);
1590 				}
1591 			}
1592 			if (fi->fhp && fi->copyproc)
1593 				(*fi->copyproc)(fi->fhp, fi->vp);
1594 		}
1595 	}
1596 
1597 	/* For TSOL, use a new cred which has net_mac_aware flag */
1598 	if (!cred_cloned && is_system_labeled()) {
1599 		cred_cloned = TRUE;
1600 		cr = crdup(icr);
1601 		(void) setpflags(NET_MAC_AWARE, 1, cr);
1602 	}
1603 
1604 	/*
1605 	 * acl_clget() calls clnt_tli_kinit() which clears the xid, so we
1606 	 * are guaranteed to reprocess the retry as a new request.
1607 	 */
1608 	svp = mi->mi_curr_serv;
1609 	rpcerr.re_errno = acl_clget(mi, svp, cr, &client, &ch, nfscl);
1610 	if (FAILOVER_MOUNT(mi)) {
1611 		mutex_enter(&mi->mi_lock);
1612 		DEC_READERS(mi);
1613 		mutex_exit(&mi->mi_lock);
1614 
1615 		if ((rpcerr.re_errno == ETIMEDOUT ||
1616 		    rpcerr.re_errno == ECONNRESET) &&
1617 		    failover_safe(fi)) {
1618 			if (svp == mi->mi_curr_serv)
1619 				failover_newserver(mi);
1620 			goto failoverretry;
1621 		}
1622 	}
1623 	if (rpcerr.re_errno != 0) {
1624 		if (cred_cloned)
1625 			crfree(cr);
1626 		return (rpcerr.re_errno);
1627 	}
1628 
1629 	if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1630 	    svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1631 		timeo = (mi->mi_timeo * hz) / 10;
1632 	} else {
1633 		mutex_enter(&mi->mi_lock);
1634 		timeo = CLNT_SETTIMERS(client,
1635 		    &(mi->mi_timers[mi->mi_acl_timer_type[which]]),
1636 		    &(mi->mi_timers[NFS_CALLTYPES]),
1637 		    (minimum_timeo[mi->mi_acl_call_type[which]]*hz)>>3,
1638 		    (void (*)()) 0, (caddr_t)mi, 0);
1639 		mutex_exit(&mi->mi_lock);
1640 	}
1641 
1642 	/*
1643 	 * If hard mounted fs, retry call forever unless hard error occurs.
1644 	 */
1645 	do {
1646 		tryagain = FALSE;
1647 
1648 		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1649 			status = RPC_FAILED;
1650 			rpcerr.re_status = RPC_FAILED;
1651 			rpcerr.re_errno = EIO;
1652 			break;
1653 		}
1654 
1655 		TICK_TO_TIMEVAL(timeo, &wait);
1656 
1657 		/*
1658 		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1659 		 * and SIGTERM. (Preserving the existing masks).
1660 		 * Mask out SIGINT if mount option nointr is specified.
1661 		 */
1662 		sigintr(&smask, (int)mi->mi_flags & MI_INT);
1663 		if (!(mi->mi_flags & MI_INT))
1664 			client->cl_nosignal = TRUE;
1665 
1666 		/*
1667 		 * If there is a current signal, then don't bother
1668 		 * even trying to send out the request because we
1669 		 * won't be able to block waiting for the response.
1670 		 * Simply assume RPC_INTR and get on with it.
1671 		 */
1672 		if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1673 			status = RPC_INTR;
1674 		else {
1675 			status = CLNT_CALL(client, which, xdrargs, argsp,
1676 			    xdrres, resp, wait);
1677 		}
1678 
1679 		if (!(mi->mi_flags & MI_INT))
1680 			client->cl_nosignal = FALSE;
1681 		/*
1682 		 * restore original signal mask
1683 		 */
1684 		sigunintr(&smask);
1685 
1686 		switch (status) {
1687 		case RPC_SUCCESS:
1688 #if 0 /* notyet */
1689 			if ((mi->mi_flags & MI_DYNAMIC) &&
1690 			    mi->mi_timer_type[which] != 0 &&
1691 			    (mi->mi_curread != my_rsize ||
1692 			    mi->mi_curwrite != my_wsize))
1693 				(void) nfs_feedback(FEEDBACK_OK, which, mi);
1694 #endif
1695 			break;
1696 
1697 		/*
1698 		 * Unfortunately, there are servers in the world which
1699 		 * are not coded correctly.  They are not prepared to
1700 		 * handle RPC requests to the NFS port which are not
1701 		 * NFS requests.  Thus, they may try to process the
1702 		 * NFS_ACL request as if it were an NFS request.  This
1703 		 * does not work.  Generally, an error will be generated
1704 		 * on the client because it will not be able to decode
1705 		 * the response from the server.  However, it seems
1706 		 * possible that the server may not be able to decode
1707 		 * the arguments.  Thus, the criteria for deciding
1708 		 * whether the server supports NFS_ACL or not is whether
1709 		 * the following RPC errors are returned from CLNT_CALL.
1710 		 */
1711 		case RPC_CANTDECODERES:
1712 		case RPC_PROGUNAVAIL:
1713 		case RPC_CANTDECODEARGS:
1714 		case RPC_PROGVERSMISMATCH:
1715 			mutex_enter(&mi->mi_lock);
1716 			mi->mi_flags &= ~(MI_ACL | MI_EXTATTR);
1717 			mutex_exit(&mi->mi_lock);
1718 			break;
1719 
1720 		/*
1721 		 * If the server supports NFS_ACL but not the new ops
1722 		 * for extended attributes, make sure we don't retry.
1723 		 */
1724 		case RPC_PROCUNAVAIL:
1725 			mutex_enter(&mi->mi_lock);
1726 			mi->mi_flags &= ~MI_EXTATTR;
1727 			mutex_exit(&mi->mi_lock);
1728 			break;
1729 
1730 		case RPC_INTR:
1731 			/*
1732 			 * There is no way to recover from this error,
1733 			 * even if mount option nointr is specified.
1734 			 * SIGKILL, for example, cannot be blocked.
1735 			 */
1736 			rpcerr.re_status = RPC_INTR;
1737 			rpcerr.re_errno = EINTR;
1738 			break;
1739 
1740 		case RPC_UDERROR:
1741 			/*
1742 			 * If the NFS server is local (vold) and
1743 			 * it goes away then we get RPC_UDERROR.
1744 			 * This is a retryable error, so we would
1745 			 * loop, so check to see if the specific
1746 			 * error was ECONNRESET, indicating that
1747 			 * target did not exist at all.  If so,
1748 			 * return with RPC_PROGUNAVAIL and
1749 			 * ECONNRESET to indicate why.
1750 			 */
1751 			CLNT_GETERR(client, &rpcerr);
1752 			if (rpcerr.re_errno == ECONNRESET) {
1753 				rpcerr.re_status = RPC_PROGUNAVAIL;
1754 				rpcerr.re_errno = ECONNRESET;
1755 				break;
1756 			}
1757 			/*FALLTHROUGH*/
1758 
1759 		default:		/* probably RPC_TIMEDOUT */
1760 			if (IS_UNRECOVERABLE_RPC(status))
1761 				break;
1762 
1763 			/*
1764 			 * increment server not responding count
1765 			 */
1766 			mutex_enter(&mi->mi_lock);
1767 			mi->mi_noresponse++;
1768 			mutex_exit(&mi->mi_lock);
1769 #ifdef DEBUG
1770 			nfscl->nfscl_stat.noresponse.value.ui64++;
1771 #endif
1772 
1773 			if (!(mi->mi_flags & MI_HARD)) {
1774 				if (!(mi->mi_flags & MI_SEMISOFT) ||
1775 				    (mi->mi_acl_ss_call_type[which] == 0))
1776 					break;
1777 			}
1778 
1779 			/*
1780 			 * The call is in progress (over COTS).
1781 			 * Try the CLNT_CALL again, but don't
1782 			 * print a noisy error message.
1783 			 */
1784 			if (status == RPC_INPROGRESS) {
1785 				tryagain = TRUE;
1786 				break;
1787 			}
1788 
1789 			if (flags & RFSCALL_SOFT)
1790 				break;
1791 
1792 			/*
1793 			 * On zone shutdown, just move on.
1794 			 */
1795 			if (zone_status_get(curproc->p_zone) >=
1796 			    ZONE_IS_SHUTTING_DOWN) {
1797 				rpcerr.re_status = RPC_FAILED;
1798 				rpcerr.re_errno = EIO;
1799 				break;
1800 			}
1801 
1802 			/*
1803 			 * NFS client failover support
1804 			 *
1805 			 * If the current server just failed us, we'll
1806 			 * start the process of finding a new server.
1807 			 * After that, we can just retry.
1808 			 */
1809 			if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1810 				if (svp == mi->mi_curr_serv)
1811 					failover_newserver(mi);
1812 				clfree_impl(client, ch, nfscl);
1813 				goto failoverretry;
1814 			}
1815 
1816 			tryagain = TRUE;
1817 			timeo = backoff(timeo);
1818 			mutex_enter(&mi->mi_lock);
1819 			if (!(mi->mi_flags & MI_PRINTED)) {
1820 				mi->mi_flags |= MI_PRINTED;
1821 				mutex_exit(&mi->mi_lock);
1822 #ifdef DEBUG
1823 				zprintf(zoneid,
1824 			"NFS_ACL%d server %s not responding still trying\n",
1825 				    mi->mi_vers, svp->sv_hostname);
1826 #else
1827 				zprintf(zoneid,
1828 			    "NFS server %s not responding still trying\n",
1829 				    svp->sv_hostname);
1830 #endif
1831 			} else
1832 				mutex_exit(&mi->mi_lock);
1833 			if (*douprintf && nfs_has_ctty()) {
1834 				*douprintf = 0;
1835 				if (!(mi->mi_flags & MI_NOPRINT))
1836 #ifdef DEBUG
1837 					uprintf(
1838 			"NFS_ACL%d server %s not responding still trying\n",
1839 					    mi->mi_vers, svp->sv_hostname);
1840 #else
1841 					uprintf(
1842 			    "NFS server %s not responding still trying\n",
1843 					    svp->sv_hostname);
1844 #endif
1845 			}
1846 
1847 #if 0 /* notyet */
1848 			/*
1849 			 * If doing dynamic adjustment of transfer
1850 			 * size and if it's a read or write call
1851 			 * and if the transfer size changed while
1852 			 * retransmitting or if the feedback routine
1853 			 * changed the transfer size,
1854 			 * then exit rfscall so that the transfer
1855 			 * size can be adjusted at the vnops level.
1856 			 */
1857 			if ((mi->mi_flags & MI_DYNAMIC) &&
1858 			    mi->mi_acl_timer_type[which] != 0 &&
1859 			    (mi->mi_curread != my_rsize ||
1860 			    mi->mi_curwrite != my_wsize ||
1861 			    nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1862 				/*
1863 				 * On read or write calls, return
1864 				 * back to the vnode ops level if
1865 				 * the transfer size changed.
1866 				 */
1867 				clfree_impl(client, ch, nfscl);
1868 				if (cred_cloned)
1869 					crfree(cr);
1870 				return (ENFS_TRYAGAIN);
1871 			}
1872 #endif
1873 		}
1874 	} while (tryagain);
1875 
1876 	if (status != RPC_SUCCESS) {
1877 		/*
1878 		 * Let soft mounts use the timed out message.
1879 		 */
1880 		if (status == RPC_INPROGRESS)
1881 			status = RPC_TIMEDOUT;
1882 		nfscl->nfscl_stat.badcalls.value.ui64++;
1883 		if (status == RPC_CANTDECODERES ||
1884 		    status == RPC_PROGUNAVAIL ||
1885 		    status == RPC_PROCUNAVAIL ||
1886 		    status == RPC_CANTDECODEARGS ||
1887 		    status == RPC_PROGVERSMISMATCH)
1888 			CLNT_GETERR(client, &rpcerr);
1889 		else if (status != RPC_INTR) {
1890 			mutex_enter(&mi->mi_lock);
1891 			mi->mi_flags |= MI_DOWN;
1892 			mutex_exit(&mi->mi_lock);
1893 			CLNT_GETERR(client, &rpcerr);
1894 #ifdef DEBUG
1895 			bufp = clnt_sperror(client, svp->sv_hostname);
1896 			zprintf(zoneid, "NFS_ACL%d %s failed for %s\n",
1897 			    mi->mi_vers, mi->mi_aclnames[which], bufp);
1898 			if (nfs_has_ctty()) {
1899 				if (!(mi->mi_flags & MI_NOPRINT)) {
1900 					uprintf("NFS_ACL%d %s failed for %s\n",
1901 					    mi->mi_vers, mi->mi_aclnames[which],
1902 					    bufp);
1903 				}
1904 			}
1905 			kmem_free(bufp, MAXPATHLEN);
1906 #else
1907 			zprintf(zoneid,
1908 			    "NFS %s failed for server %s: error %d (%s)\n",
1909 			    mi->mi_aclnames[which], svp->sv_hostname,
1910 			    status, clnt_sperrno(status));
1911 			if (nfs_has_ctty()) {
1912 				if (!(mi->mi_flags & MI_NOPRINT))
1913 					uprintf(
1914 				"NFS %s failed for server %s: error %d (%s)\n",
1915 					    mi->mi_aclnames[which],
1916 					    svp->sv_hostname, status,
1917 					    clnt_sperrno(status));
1918 			}
1919 #endif
1920 			/*
1921 			 * when CLNT_CALL() fails with RPC_AUTHERROR,
1922 			 * re_errno is set appropriately depending on
1923 			 * the authentication error
1924 			 */
1925 			if (status == RPC_VERSMISMATCH ||
1926 			    status == RPC_PROGVERSMISMATCH)
1927 				rpcerr.re_errno = EIO;
1928 		}
1929 	} else {
1930 		/*
1931 		 * Test the value of mi_down and mi_printed without
1932 		 * holding the mi_lock mutex.  If they are both zero,
1933 		 * then it is okay to skip the down and printed
1934 		 * processing.  This saves on a mutex_enter and
1935 		 * mutex_exit pair for a normal, successful RPC.
1936 		 * This was just complete overhead.
1937 		 */
1938 		if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1939 			mutex_enter(&mi->mi_lock);
1940 			mi->mi_flags &= ~MI_DOWN;
1941 			if (mi->mi_flags & MI_PRINTED) {
1942 				mi->mi_flags &= ~MI_PRINTED;
1943 				mutex_exit(&mi->mi_lock);
1944 #ifdef DEBUG
1945 				zprintf(zoneid, "NFS_ACL%d server %s ok\n",
1946 				    mi->mi_vers, svp->sv_hostname);
1947 #else
1948 				zprintf(zoneid, "NFS server %s ok\n",
1949 				    svp->sv_hostname);
1950 #endif
1951 			} else
1952 				mutex_exit(&mi->mi_lock);
1953 		}
1954 
1955 		if (*douprintf == 0) {
1956 			if (!(mi->mi_flags & MI_NOPRINT))
1957 #ifdef DEBUG
1958 				uprintf("NFS_ACL%d server %s ok\n",
1959 				    mi->mi_vers, svp->sv_hostname);
1960 #else
1961 				uprintf("NFS server %s ok\n", svp->sv_hostname);
1962 #endif
1963 			*douprintf = 1;
1964 		}
1965 	}
1966 
1967 	clfree_impl(client, ch, nfscl);
1968 	if (cred_cloned)
1969 		crfree(cr);
1970 
1971 	ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1972 
1973 #if 0 /* notyet */
1974 	TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1975 	    rpcerr.re_errno);
1976 #endif
1977 
1978 	return (rpcerr.re_errno);
1979 }
1980 
1981 int
1982 vattr_to_sattr(struct vattr *vap, struct nfssattr *sa)
1983 {
1984 	uint_t mask = vap->va_mask;
1985 
1986 	if (!(mask & AT_MODE))
1987 		sa->sa_mode = (uint32_t)-1;
1988 	else
1989 		sa->sa_mode = vap->va_mode;
1990 	if (!(mask & AT_UID))
1991 		sa->sa_uid = (uint32_t)-1;
1992 	else
1993 		sa->sa_uid = (uint32_t)vap->va_uid;
1994 	if (!(mask & AT_GID))
1995 		sa->sa_gid = (uint32_t)-1;
1996 	else
1997 		sa->sa_gid = (uint32_t)vap->va_gid;
1998 	if (!(mask & AT_SIZE))
1999 		sa->sa_size = (uint32_t)-1;
2000 	else
2001 		sa->sa_size = (uint32_t)vap->va_size;
2002 	if (!(mask & AT_ATIME))
2003 		sa->sa_atime.tv_sec = sa->sa_atime.tv_usec = (int32_t)-1;
2004 	else {
2005 		/* check time validity */
2006 		if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
2007 			return (EOVERFLOW);
2008 		}
2009 		sa->sa_atime.tv_sec = vap->va_atime.tv_sec;
2010 		sa->sa_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2011 	}
2012 	if (!(mask & AT_MTIME))
2013 		sa->sa_mtime.tv_sec = sa->sa_mtime.tv_usec = (int32_t)-1;
2014 	else {
2015 		/* check time validity */
2016 		if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2017 			return (EOVERFLOW);
2018 		}
2019 		sa->sa_mtime.tv_sec = vap->va_mtime.tv_sec;
2020 		sa->sa_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2021 	}
2022 	return (0);
2023 }
2024 
2025 int
2026 vattr_to_sattr3(struct vattr *vap, sattr3 *sa)
2027 {
2028 	uint_t mask = vap->va_mask;
2029 
2030 	if (!(mask & AT_MODE))
2031 		sa->mode.set_it = FALSE;
2032 	else {
2033 		sa->mode.set_it = TRUE;
2034 		sa->mode.mode = (mode3)vap->va_mode;
2035 	}
2036 	if (!(mask & AT_UID))
2037 		sa->uid.set_it = FALSE;
2038 	else {
2039 		sa->uid.set_it = TRUE;
2040 		sa->uid.uid = (uid3)vap->va_uid;
2041 	}
2042 	if (!(mask & AT_GID))
2043 		sa->gid.set_it = FALSE;
2044 	else {
2045 		sa->gid.set_it = TRUE;
2046 		sa->gid.gid = (gid3)vap->va_gid;
2047 	}
2048 	if (!(mask & AT_SIZE))
2049 		sa->size.set_it = FALSE;
2050 	else {
2051 		sa->size.set_it = TRUE;
2052 		sa->size.size = (size3)vap->va_size;
2053 	}
2054 	if (!(mask & AT_ATIME))
2055 		sa->atime.set_it = DONT_CHANGE;
2056 	else {
2057 		/* check time validity */
2058 		if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
2059 			return (EOVERFLOW);
2060 		}
2061 		sa->atime.set_it = SET_TO_CLIENT_TIME;
2062 		sa->atime.atime.seconds = (uint32)vap->va_atime.tv_sec;
2063 		sa->atime.atime.nseconds = (uint32)vap->va_atime.tv_nsec;
2064 	}
2065 	if (!(mask & AT_MTIME))
2066 		sa->mtime.set_it = DONT_CHANGE;
2067 	else {
2068 		/* check time validity */
2069 		if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2070 			return (EOVERFLOW);
2071 		}
2072 		sa->mtime.set_it = SET_TO_CLIENT_TIME;
2073 		sa->mtime.mtime.seconds = (uint32)vap->va_mtime.tv_sec;
2074 		sa->mtime.mtime.nseconds = (uint32)vap->va_mtime.tv_nsec;
2075 	}
2076 	return (0);
2077 }
2078 
2079 void
2080 setdiropargs(struct nfsdiropargs *da, char *nm, vnode_t *dvp)
2081 {
2082 
2083 	da->da_fhandle = VTOFH(dvp);
2084 	da->da_name = nm;
2085 	da->da_flags = 0;
2086 }
2087 
2088 void
2089 setdiropargs3(diropargs3 *da, char *nm, vnode_t *dvp)
2090 {
2091 
2092 	da->dirp = VTOFH3(dvp);
2093 	da->name = nm;
2094 }
2095 
2096 int
2097 setdirgid(vnode_t *dvp, gid_t *gidp, cred_t *cr)
2098 {
2099 	int error;
2100 	rnode_t *rp;
2101 	struct vattr va;
2102 
2103 	va.va_mask = AT_MODE | AT_GID;
2104 	error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2105 	if (error)
2106 		return (error);
2107 
2108 	/*
2109 	 * To determine the expected group-id of the created file:
2110 	 *  1)	If the filesystem was not mounted with the Old-BSD-compatible
2111 	 *	GRPID option, and the directory's set-gid bit is clear,
2112 	 *	then use the process's gid.
2113 	 *  2)	Otherwise, set the group-id to the gid of the parent directory.
2114 	 */
2115 	rp = VTOR(dvp);
2116 	mutex_enter(&rp->r_statelock);
2117 	if (!(VTOMI(dvp)->mi_flags & MI_GRPID) && !(va.va_mode & VSGID))
2118 		*gidp = crgetgid(cr);
2119 	else
2120 		*gidp = va.va_gid;
2121 	mutex_exit(&rp->r_statelock);
2122 	return (0);
2123 }
2124 
2125 int
2126 setdirmode(vnode_t *dvp, mode_t *omp, cred_t *cr)
2127 {
2128 	int error;
2129 	struct vattr va;
2130 
2131 	va.va_mask = AT_MODE;
2132 	error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2133 	if (error)
2134 		return (error);
2135 
2136 	/*
2137 	 * Modify the expected mode (om) so that the set-gid bit matches
2138 	 * that of the parent directory (dvp).
2139 	 */
2140 	if (va.va_mode & VSGID)
2141 		*omp |= VSGID;
2142 	else
2143 		*omp &= ~VSGID;
2144 	return (0);
2145 }
2146 
2147 void
2148 nfs_setswaplike(vnode_t *vp, vattr_t *vap)
2149 {
2150 
2151 	if (vp->v_type == VREG && (vap->va_mode & (VEXEC | VSVTX)) == VSVTX) {
2152 		if (!(vp->v_flag & VSWAPLIKE)) {
2153 			mutex_enter(&vp->v_lock);
2154 			vp->v_flag |= VSWAPLIKE;
2155 			mutex_exit(&vp->v_lock);
2156 		}
2157 	} else {
2158 		if (vp->v_flag & VSWAPLIKE) {
2159 			mutex_enter(&vp->v_lock);
2160 			vp->v_flag &= ~VSWAPLIKE;
2161 			mutex_exit(&vp->v_lock);
2162 		}
2163 	}
2164 }
2165 
2166 /*
2167  * Free the resources associated with an rnode.
2168  */
2169 static void
2170 rinactive(rnode_t *rp, cred_t *cr)
2171 {
2172 	vnode_t *vp;
2173 	cred_t *cred;
2174 	char *contents;
2175 	int size;
2176 	vsecattr_t *vsp;
2177 	int error;
2178 	nfs3_pathconf_info *info;
2179 
2180 	/*
2181 	 * Before freeing anything, wait until all asynchronous
2182 	 * activity is done on this rnode.  This will allow all
2183 	 * asynchronous read ahead and write behind i/o's to
2184 	 * finish.
2185 	 */
2186 	mutex_enter(&rp->r_statelock);
2187 	while (rp->r_count > 0)
2188 		cv_wait(&rp->r_cv, &rp->r_statelock);
2189 	mutex_exit(&rp->r_statelock);
2190 
2191 	/*
2192 	 * Flush and invalidate all pages associated with the vnode.
2193 	 */
2194 	vp = RTOV(rp);
2195 	if (vn_has_cached_data(vp)) {
2196 		ASSERT(vp->v_type != VCHR);
2197 		if ((rp->r_flags & RDIRTY) && !rp->r_error) {
2198 			error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL);
2199 			if (error && (error == ENOSPC || error == EDQUOT)) {
2200 				mutex_enter(&rp->r_statelock);
2201 				if (!rp->r_error)
2202 					rp->r_error = error;
2203 				mutex_exit(&rp->r_statelock);
2204 			}
2205 		}
2206 		nfs_invalidate_pages(vp, (u_offset_t)0, cr);
2207 	}
2208 
2209 	/*
2210 	 * Free any held credentials and caches which may be associated
2211 	 * with this rnode.
2212 	 */
2213 	mutex_enter(&rp->r_statelock);
2214 	cred = rp->r_cred;
2215 	rp->r_cred = NULL;
2216 	contents = rp->r_symlink.contents;
2217 	size = rp->r_symlink.size;
2218 	rp->r_symlink.contents = NULL;
2219 	vsp = rp->r_secattr;
2220 	rp->r_secattr = NULL;
2221 	info = rp->r_pathconf;
2222 	rp->r_pathconf = NULL;
2223 	mutex_exit(&rp->r_statelock);
2224 
2225 	/*
2226 	 * Free the held credential.
2227 	 */
2228 	if (cred != NULL)
2229 		crfree(cred);
2230 
2231 	/*
2232 	 * Free the access cache entries.
2233 	 */
2234 	(void) nfs_access_purge_rp(rp);
2235 
2236 	/*
2237 	 * Free the readdir cache entries.
2238 	 */
2239 	if (HAVE_RDDIR_CACHE(rp))
2240 		nfs_purge_rddir_cache(vp);
2241 
2242 	/*
2243 	 * Free the symbolic link cache.
2244 	 */
2245 	if (contents != NULL) {
2246 
2247 		kmem_free((void *)contents, size);
2248 	}
2249 
2250 	/*
2251 	 * Free any cached ACL.
2252 	 */
2253 	if (vsp != NULL)
2254 		nfs_acl_free(vsp);
2255 
2256 	/*
2257 	 * Free any cached pathconf information.
2258 	 */
2259 	if (info != NULL)
2260 		kmem_free(info, sizeof (*info));
2261 }
2262 
2263 /*
2264  * Return a vnode for the given NFS Version 2 file handle.
2265  * If no rnode exists for this fhandle, create one and put it
2266  * into the hash queues.  If the rnode for this fhandle
2267  * already exists, return it.
2268  *
2269  * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2270  */
2271 vnode_t *
2272 makenfsnode(fhandle_t *fh, struct nfsfattr *attr, struct vfs *vfsp,
2273     hrtime_t t, cred_t *cr, char *dnm, char *nm)
2274 {
2275 	int newnode;
2276 	int index;
2277 	vnode_t *vp;
2278 	nfs_fhandle nfh;
2279 	vattr_t va;
2280 
2281 	nfh.fh_len = NFS_FHSIZE;
2282 	bcopy(fh, nfh.fh_buf, NFS_FHSIZE);
2283 
2284 	index = rtablehash(&nfh);
2285 	rw_enter(&rtable[index].r_lock, RW_READER);
2286 
2287 	vp = make_rnode(&nfh, &rtable[index], vfsp, nfs_vnodeops,
2288 	    nfs_putapage, nfs_rddir_compar, &newnode, cr, dnm, nm);
2289 
2290 	if (attr != NULL) {
2291 		if (!newnode) {
2292 			rw_exit(&rtable[index].r_lock);
2293 			(void) nfs_cache_fattr(vp, attr, &va, t, cr);
2294 		} else {
2295 			if (attr->na_type < NFNON || attr->na_type > NFSOC)
2296 				vp->v_type = VBAD;
2297 			else
2298 				vp->v_type = n2v_type(attr);
2299 			/*
2300 			 * A translation here seems to be necessary
2301 			 * because this function can be called
2302 			 * with `attr' that has come from the wire,
2303 			 * and been operated on by vattr_to_nattr().
2304 			 * See nfsrootvp()->VOP_GETTATTR()->nfsgetattr()
2305 			 * ->nfs_getattr_otw()->rfscall()->vattr_to_nattr()
2306 			 * ->makenfsnode().
2307 			 */
2308 			if ((attr->na_rdev & 0xffff0000) == 0)
2309 				vp->v_rdev = nfsv2_expdev(attr->na_rdev);
2310 			else
2311 				vp->v_rdev = expldev(n2v_rdev(attr));
2312 			nfs_attrcache(vp, attr, t);
2313 			rw_exit(&rtable[index].r_lock);
2314 		}
2315 	} else {
2316 		if (newnode) {
2317 			PURGE_ATTRCACHE(vp);
2318 		}
2319 		rw_exit(&rtable[index].r_lock);
2320 	}
2321 
2322 	return (vp);
2323 }
2324 
2325 /*
2326  * Return a vnode for the given NFS Version 3 file handle.
2327  * If no rnode exists for this fhandle, create one and put it
2328  * into the hash queues.  If the rnode for this fhandle
2329  * already exists, return it.
2330  *
2331  * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2332  */
2333 vnode_t *
2334 makenfs3node_va(nfs_fh3 *fh, vattr_t *vap, struct vfs *vfsp, hrtime_t t,
2335     cred_t *cr, char *dnm, char *nm)
2336 {
2337 	int newnode;
2338 	int index;
2339 	vnode_t *vp;
2340 
2341 	index = rtablehash((nfs_fhandle *)fh);
2342 	rw_enter(&rtable[index].r_lock, RW_READER);
2343 
2344 	vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2345 	    nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2346 	    dnm, nm);
2347 
2348 	if (vap == NULL) {
2349 		if (newnode) {
2350 			PURGE_ATTRCACHE(vp);
2351 		}
2352 		rw_exit(&rtable[index].r_lock);
2353 		return (vp);
2354 	}
2355 
2356 	if (!newnode) {
2357 		rw_exit(&rtable[index].r_lock);
2358 		nfs_attr_cache(vp, vap, t, cr);
2359 	} else {
2360 		rnode_t *rp = VTOR(vp);
2361 
2362 		vp->v_type = vap->va_type;
2363 		vp->v_rdev = vap->va_rdev;
2364 
2365 		mutex_enter(&rp->r_statelock);
2366 		if (rp->r_mtime <= t)
2367 			nfs_attrcache_va(vp, vap);
2368 		mutex_exit(&rp->r_statelock);
2369 		rw_exit(&rtable[index].r_lock);
2370 	}
2371 
2372 	return (vp);
2373 }
2374 
2375 vnode_t *
2376 makenfs3node(nfs_fh3 *fh, fattr3 *attr, struct vfs *vfsp, hrtime_t t,
2377     cred_t *cr, char *dnm, char *nm)
2378 {
2379 	int newnode;
2380 	int index;
2381 	vnode_t *vp;
2382 	vattr_t va;
2383 
2384 	index = rtablehash((nfs_fhandle *)fh);
2385 	rw_enter(&rtable[index].r_lock, RW_READER);
2386 
2387 	vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2388 	    nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2389 	    dnm, nm);
2390 
2391 	if (attr == NULL) {
2392 		if (newnode) {
2393 			PURGE_ATTRCACHE(vp);
2394 		}
2395 		rw_exit(&rtable[index].r_lock);
2396 		return (vp);
2397 	}
2398 
2399 	if (!newnode) {
2400 		rw_exit(&rtable[index].r_lock);
2401 		(void) nfs3_cache_fattr3(vp, attr, &va, t, cr);
2402 	} else {
2403 		if (attr->type < NF3REG || attr->type > NF3FIFO)
2404 			vp->v_type = VBAD;
2405 		else
2406 			vp->v_type = nf3_to_vt[attr->type];
2407 		vp->v_rdev = makedevice(attr->rdev.specdata1,
2408 		    attr->rdev.specdata2);
2409 		nfs3_attrcache(vp, attr, t);
2410 		rw_exit(&rtable[index].r_lock);
2411 	}
2412 
2413 	return (vp);
2414 }
2415 
2416 /*
2417  * Read this comment before making changes to rtablehash()!
2418  * This is a hash function in which seemingly obvious and harmless
2419  * changes can cause escalations costing million dollars!
2420  * Know what you are doing.
2421  *
2422  * rtablehash() implements Jenkins' one-at-a-time hash algorithm.  The
2423  * algorithm is currently detailed here:
2424  *
2425  *   http://burtleburtle.net/bob/hash/doobs.html
2426  *
2427  * Of course, the above link may not be valid by the time you are reading
2428  * this, but suffice it to say that the one-at-a-time algorithm works well in
2429  * almost all cases.  If you are changing the algorithm be sure to verify that
2430  * the hash algorithm still provides even distribution in all cases and with
2431  * any server returning filehandles in whatever order (sequential or random).
2432  */
2433 static int
2434 rtablehash(nfs_fhandle *fh)
2435 {
2436 	ulong_t hash, len, i;
2437 	char *key;
2438 
2439 	key = fh->fh_buf;
2440 	len = (ulong_t)fh->fh_len;
2441 	for (hash = 0, i = 0; i < len; i++) {
2442 		hash += key[i];
2443 		hash += (hash << 10);
2444 		hash ^= (hash >> 6);
2445 	}
2446 	hash += (hash << 3);
2447 	hash ^= (hash >> 11);
2448 	hash += (hash << 15);
2449 	return (hash & rtablemask);
2450 }
2451 
/*
 * Find or create the rnode for file handle fh in hash bucket rhtp and
 * return its vnode.
 *
 * The caller must hold rhtp->r_lock as reader.  On return the lock is
 * held again: as reader when an existing rnode is returned
 * (*newnode == 0), and as writer when a new rnode has been entered
 * into the hash queue (*newnode == 1).  The lock is dropped and
 * re-acquired internally whenever an rnode has to be recycled or
 * allocated.
 *
 * vops, putapage and compar supply the vnode operations, the putapage
 * routine and the readdir cache comparison routine installed on a new
 * rnode.  dnm and nm are only used on failover mounts, where
 * "dnm/nm" (or "." when either is NULL) is stashed in r_path.
 */
static vnode_t *
make_rnode(nfs_fhandle *fh, rhashq_t *rhtp, struct vfs *vfsp,
    struct vnodeops *vops,
    int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *),
    int (*compar)(const void *, const void *),
    int *newnode, cred_t *cr, char *dnm, char *nm)
{
	rnode_t *rp;
	rnode_t *trp;
	vnode_t *vp;
	mntinfo_t *mi;

	ASSERT(RW_READ_HELD(&rhtp->r_lock));

	mi = VFTOMI(vfsp);
start:
	/* fast path: the rnode is already in the hash queue */
	if ((rp = rfind(rhtp, fh, vfsp)) != NULL) {
		vp = RTOV(rp);
		nfs_set_vroot(vp);
		*newnode = 0;
		return (vp);
	}
	rw_exit(&rhtp->r_lock);

	/*
	 * Recycle the rnode at the head of the freelist once rnew has
	 * reached nrnode; otherwise allocate a fresh rnode and vnode.
	 */
	mutex_enter(&rpfreelist_lock);
	if (rpfreelist != NULL && rnew >= nrnode) {
		rp = rpfreelist;
		rp_rmfree(rp);
		mutex_exit(&rpfreelist_lock);

		vp = RTOV(rp);

		if (rp->r_flags & RHASHED) {
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			/*
			 * Another reference exists on the candidate:
			 * give up ours, re-take the caller's bucket
			 * lock and start over.
			 */
			if (vp->v_count > 1) {
				vp->v_count--;
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				rw_enter(&rhtp->r_lock, RW_READER);
				goto start;
			}
			mutex_exit(&vp->v_lock);
			rp_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}

		rinactive(rp, cr);

		/*
		 * Recheck the reference count after rinactive(); if a
		 * reference has appeared, give up ours and start over.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			vp->v_count--;
			mutex_exit(&vp->v_lock);
			rw_enter(&rhtp->r_lock, RW_READER);
			goto start;
		}
		mutex_exit(&vp->v_lock);
		vn_invalid(vp);
		/*
		 * destroy old locks before bzero'ing and
		 * recreating the locks below.
		 */
		nfs_rw_destroy(&rp->r_rwlock);
		nfs_rw_destroy(&rp->r_lkserlock);
		mutex_destroy(&rp->r_statelock);
		cv_destroy(&rp->r_cv);
		cv_destroy(&rp->r_commit.c_cv);
		nfs_free_r_path(rp);
		avl_destroy(&rp->r_dir);
		/*
		 * Make sure that if rnode is recycled then
		 * VFS count is decremented properly before
		 * reuse.
		 */
		VFS_RELE(vp->v_vfsp);
		vn_reinit(vp);
	} else {
		vnode_t *new_vp;

		mutex_exit(&rpfreelist_lock);

		rp = kmem_cache_alloc(rnode_cache, KM_SLEEP);
		new_vp = vn_alloc(KM_SLEEP);

		/* account for the newly allocated rnode */
		atomic_add_long((ulong_t *)&rnew, 1);
#ifdef DEBUG
		clstat_debug.nrnode.value.ui64++;
#endif
		vp = new_vp;
	}

	/* (re)initialize the rnode from scratch */
	bzero(rp, sizeof (*rp));
	rp->r_vnode = vp;
	nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
	nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
	mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
	rp->r_fh.fh_len = fh->fh_len;
	bcopy(fh->fh_buf, rp->r_fh.fh_buf, fh->fh_len);
	rp->r_server = mi->mi_curr_serv;
	if (FAILOVER_MOUNT(mi)) {
		/*
		 * If replicated servers, stash pathnames
		 */
		if (dnm != NULL && nm != NULL) {
			char *s, *p;
			uint_t len;

			/* room for "dnm" + '/' + "nm" + terminating NUL */
			len = (uint_t)(strlen(dnm) + strlen(nm) + 2);
			rp->r_path = kmem_alloc(len, KM_SLEEP);
#ifdef DEBUG
			clstat_debug.rpath.value.ui64 += len;
#endif
			s = rp->r_path;
			for (p = dnm; *p; p++)
				*s++ = *p;
			*s++ = '/';
			for (p = nm; *p; p++)
				*s++ = *p;
			*s = '\0';
		} else {
			/* special case for root */
			rp->r_path = kmem_alloc(2, KM_SLEEP);
#ifdef DEBUG
			clstat_debug.rpath.value.ui64 += 2;
#endif
			*rp->r_path = '.';
			*(rp->r_path + 1) = '\0';
		}
	}
	VFS_HOLD(vfsp);
	rp->r_putapage = putapage;
	rp->r_hashq = rhtp;
	rp->r_flags = RREADDIRPLUS;
	avl_create(&rp->r_dir, compar, sizeof (rddir_cache),
	    offsetof(rddir_cache, tree));
	vn_setops(vp, vops);
	vp->v_data = (caddr_t)rp;
	vp->v_vfsp = vfsp;
	vp->v_type = VNON;
	nfs_set_vroot(vp);

	/*
	 * There is a race condition if someone else
	 * alloc's the rnode while no locks are held, so we
	 * check again and recover if found.
	 */
	rw_enter(&rhtp->r_lock, RW_WRITER);
	if ((trp = rfind(rhtp, fh, vfsp)) != NULL) {
		/*
		 * Lost the race: return the rnode found in the hash
		 * queue, hand the one built above to rp_addfree() with
		 * the bucket lock dropped, and return with the bucket
		 * lock held as reader.
		 */
		vp = RTOV(trp);
		nfs_set_vroot(vp);
		*newnode = 0;
		rw_exit(&rhtp->r_lock);
		rp_addfree(rp, cr);
		rw_enter(&rhtp->r_lock, RW_READER);
		return (vp);
	}
	/* enter the new rnode; the bucket lock stays held as writer */
	rp_addhash(rp);
	*newnode = 1;
	return (vp);
}
2614 
2615 static void
2616 nfs_set_vroot(vnode_t *vp)
2617 {
2618 	rnode_t *rp;
2619 	nfs_fhandle *rootfh;
2620 
2621 	rp = VTOR(vp);
2622 	rootfh = &rp->r_server->sv_fhandle;
2623 	if (rootfh->fh_len == rp->r_fh.fh_len &&
2624 	    bcmp(rootfh->fh_buf, rp->r_fh.fh_buf, rp->r_fh.fh_len) == 0) {
2625 		if (!(vp->v_flag & VROOT)) {
2626 			mutex_enter(&vp->v_lock);
2627 			vp->v_flag |= VROOT;
2628 			mutex_exit(&vp->v_lock);
2629 		}
2630 	}
2631 }
2632 
2633 static void
2634 nfs_free_r_path(rnode_t *rp)
2635 {
2636 	char *path;
2637 	size_t len;
2638 
2639 	path = rp->r_path;
2640 	if (path) {
2641 		rp->r_path = NULL;
2642 		len = strlen(path) + 1;
2643 		kmem_free(path, len);
2644 #ifdef DEBUG
2645 		clstat_debug.rpath.value.ui64 -= len;
2646 #endif
2647 	}
2648 }
2649 
/*
 * Put an rnode on the free list.
 *
 * Rnodes which were allocated above and beyond the normal limit
 * are immediately freed.
 *
 * The caller must hold a reference on the vnode and the rnode must not
 * be on the freelist (both are asserted).  The caller's reference is
 * given up here: it is dropped if other references exist, it stays
 * with the rnode while the rnode sits on the freelist, or it goes away
 * with the rnode when the rnode is destroyed.
 *
 * cr is passed to rinactive() when the rnode is being destroyed.
 */
void
rp_addfree(rnode_t *rp, cred_t *cr)
{
	vnode_t *vp;
	struct vfs *vfsp;

	vp = RTOV(rp);
	ASSERT(vp->v_count >= 1);
	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);

	/*
	 * If we have too many rnodes allocated and there are no
	 * references to this rnode, or if the rnode is no longer
	 * accessible because it does not reside in the hash queues,
	 * or if an i/o error occurred while writing to the file,
	 * then just free it instead of putting it on the rnode
	 * freelist.
	 */
	vfsp = vp->v_vfsp;
	if (((rnew > nrnode || !(rp->r_flags & RHASHED) || rp->r_error ||
	    (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
		if (rp->r_flags & RHASHED) {
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			/*
			 * Someone else holds a reference; drop ours
			 * and leave the rnode to them.
			 */
			if (vp->v_count > 1) {
				vp->v_count--;
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				return;
			}
			mutex_exit(&vp->v_lock);
			rp_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}

		rinactive(rp, cr);

		/*
		 * Recheck the vnode reference count.  We need to
		 * make sure that another reference has not been
		 * acquired while we were not holding v_lock.  The
		 * rnode is not in the rnode hash queues, so the
		 * only way for a reference to have been acquired
		 * is for a VOP_PUTPAGE because the rnode was marked
		 * with RDIRTY or for a modified page.  This
		 * reference may have been acquired before our call
		 * to rinactive.  The i/o may have been completed,
		 * thus allowing rinactive to complete, but the
		 * reference to the vnode may not have been released
		 * yet.  In any case, the rnode can not be destroyed
		 * until the other references to this vnode have been
		 * released.  The other references will take care of
		 * either destroying the rnode or placing it on the
		 * rnode freelist.  If there are no other references,
		 * then the rnode may be safely destroyed.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			vp->v_count--;
			mutex_exit(&vp->v_lock);
			return;
		}
		mutex_exit(&vp->v_lock);

		destroy_rnode(rp);
		return;
	}

	/*
	 * Lock the hash queue and then recheck the reference count
	 * to ensure that no other threads have acquired a reference
	 * to indicate that the rnode should not be placed on the
	 * freelist.  If another reference has been acquired, then
	 * just release this one and let the other thread complete
	 * the processing of adding this rnode to the freelist.
	 */
	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);

	mutex_enter(&vp->v_lock);
	if (vp->v_count > 1) {
		vp->v_count--;
		mutex_exit(&vp->v_lock);
		rw_exit(&rp->r_hashq->r_lock);
		return;
	}
	mutex_exit(&vp->v_lock);

	/*
	 * If there is no cached data or metadata for this file, then
	 * put the rnode on the front of the freelist so that it will
	 * be reused before other rnodes which may have cached data or
	 * metadata associated with them.
	 */
	mutex_enter(&rpfreelist_lock);
	if (rpfreelist == NULL) {
		/* empty list: rp becomes a one element circular list */
		rp->r_freef = rp;
		rp->r_freeb = rp;
		rpfreelist = rp;
	} else {
		/*
		 * Link rp in between the current last element and the
		 * head, i.e. at the tail of the circular list.
		 */
		rp->r_freef = rpfreelist;
		rp->r_freeb = rpfreelist->r_freeb;
		rpfreelist->r_freeb->r_freef = rp;
		rpfreelist->r_freeb = rp;
		/* nothing cached: make rp the new head instead */
		if (!vn_has_cached_data(vp) &&
		    !HAVE_RDDIR_CACHE(rp) &&
		    rp->r_symlink.contents == NULL &&
		    rp->r_secattr == NULL &&
		    rp->r_pathconf == NULL)
			rpfreelist = rp;
	}
	mutex_exit(&rpfreelist_lock);

	rw_exit(&rp->r_hashq->r_lock);
}
2770 
2771 /*
2772  * Remove an rnode from the free list.
2773  *
2774  * The caller must be holding rpfreelist_lock and the rnode
2775  * must be on the freelist.
2776  */
2777 static void
2778 rp_rmfree(rnode_t *rp)
2779 {
2780 
2781 	ASSERT(MUTEX_HELD(&rpfreelist_lock));
2782 	ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);
2783 
2784 	if (rp == rpfreelist) {
2785 		rpfreelist = rp->r_freef;
2786 		if (rp == rpfreelist)
2787 			rpfreelist = NULL;
2788 	}
2789 
2790 	rp->r_freeb->r_freef = rp->r_freef;
2791 	rp->r_freef->r_freeb = rp->r_freeb;
2792 
2793 	rp->r_freef = rp->r_freeb = NULL;
2794 }
2795 
2796 /*
2797  * Put a rnode in the hash table.
2798  *
2799  * The caller must be holding the exclusive hash queue lock.
2800  */
2801 static void
2802 rp_addhash(rnode_t *rp)
2803 {
2804 
2805 	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2806 	ASSERT(!(rp->r_flags & RHASHED));
2807 
2808 	rp->r_hashf = rp->r_hashq->r_hashf;
2809 	rp->r_hashq->r_hashf = rp;
2810 	rp->r_hashb = (rnode_t *)rp->r_hashq;
2811 	rp->r_hashf->r_hashb = rp;
2812 
2813 	mutex_enter(&rp->r_statelock);
2814 	rp->r_flags |= RHASHED;
2815 	mutex_exit(&rp->r_statelock);
2816 }
2817 
2818 /*
2819  * Remove a rnode from the hash table.
2820  *
2821  * The caller must be holding the hash queue lock.
2822  */
2823 static void
2824 rp_rmhash_locked(rnode_t *rp)
2825 {
2826 
2827 	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2828 	ASSERT(rp->r_flags & RHASHED);
2829 
2830 	rp->r_hashb->r_hashf = rp->r_hashf;
2831 	rp->r_hashf->r_hashb = rp->r_hashb;
2832 
2833 	mutex_enter(&rp->r_statelock);
2834 	rp->r_flags &= ~RHASHED;
2835 	mutex_exit(&rp->r_statelock);
2836 }
2837 
2838 /*
2839  * Remove a rnode from the hash table.
2840  *
2841  * The caller must not be holding the hash queue lock.
2842  */
2843 void
2844 rp_rmhash(rnode_t *rp)
2845 {
2846 
2847 	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2848 	rp_rmhash_locked(rp);
2849 	rw_exit(&rp->r_hashq->r_lock);
2850 }
2851 
2852 /*
2853  * Lookup a rnode by fhandle.
2854  *
2855  * The caller must be holding the hash queue lock, either shared or exclusive.
2856  */
2857 static rnode_t *
2858 rfind(rhashq_t *rhtp, nfs_fhandle *fh, struct vfs *vfsp)
2859 {
2860 	rnode_t *rp;
2861 	vnode_t *vp;
2862 
2863 	ASSERT(RW_LOCK_HELD(&rhtp->r_lock));
2864 
2865 	for (rp = rhtp->r_hashf; rp != (rnode_t *)rhtp; rp = rp->r_hashf) {
2866 		vp = RTOV(rp);
2867 		if (vp->v_vfsp == vfsp &&
2868 		    rp->r_fh.fh_len == fh->fh_len &&
2869 		    bcmp(rp->r_fh.fh_buf, fh->fh_buf, fh->fh_len) == 0) {
2870 			/*
2871 			 * remove rnode from free list, if necessary.
2872 			 */
2873 			if (rp->r_freef != NULL) {
2874 				mutex_enter(&rpfreelist_lock);
2875 				/*
2876 				 * If the rnode is on the freelist,
2877 				 * then remove it and use that reference
2878 				 * as the new reference.  Otherwise,
2879 				 * need to increment the reference count.
2880 				 */
2881 				if (rp->r_freef != NULL) {
2882 					rp_rmfree(rp);
2883 					mutex_exit(&rpfreelist_lock);
2884 				} else {
2885 					mutex_exit(&rpfreelist_lock);
2886 					VN_HOLD(vp);
2887 				}
2888 			} else
2889 				VN_HOLD(vp);
2890 			return (rp);
2891 		}
2892 	}
2893 	return (NULL);
2894 }
2895 
2896 /*
2897  * Return 1 if there is a active vnode belonging to this vfs in the
2898  * rtable cache.
2899  *
2900  * Several of these checks are done without holding the usual
2901  * locks.  This is safe because destroy_rtable(), rp_addfree(),
2902  * etc. will redo the necessary checks before actually destroying
2903  * any rnodes.
2904  */
2905 int
2906 check_rtable(struct vfs *vfsp)
2907 {
2908 	int index;
2909 	rnode_t *rp;
2910 	vnode_t *vp;
2911 
2912 	for (index = 0; index < rtablesize; index++) {
2913 		rw_enter(&rtable[index].r_lock, RW_READER);
2914 		for (rp = rtable[index].r_hashf;
2915 		    rp != (rnode_t *)(&rtable[index]);
2916 		    rp = rp->r_hashf) {
2917 			vp = RTOV(rp);
2918 			if (vp->v_vfsp == vfsp) {
2919 				if (rp->r_freef == NULL ||
2920 				    (vn_has_cached_data(vp) &&
2921 				    (rp->r_flags & RDIRTY)) ||
2922 				    rp->r_count > 0) {
2923 					rw_exit(&rtable[index].r_lock);
2924 					return (1);
2925 				}
2926 			}
2927 		}
2928 		rw_exit(&rtable[index].r_lock);
2929 	}
2930 	return (0);
2931 }
2932 
2933 /*
2934  * Destroy inactive vnodes from the hash queues which belong to this
2935  * vfs.  It is essential that we destroy all inactive vnodes during a
2936  * forced unmount as well as during a normal unmount.
2937  */
2938 void
2939 destroy_rtable(struct vfs *vfsp, cred_t *cr)
2940 {
2941 	int index;
2942 	rnode_t *rp;
2943 	rnode_t *rlist;
2944 	rnode_t *r_hashf;
2945 	vnode_t *vp;
2946 
2947 	rlist = NULL;
2948 
2949 	for (index = 0; index < rtablesize; index++) {
2950 		rw_enter(&rtable[index].r_lock, RW_WRITER);
2951 		for (rp = rtable[index].r_hashf;
2952 		    rp != (rnode_t *)(&rtable[index]);
2953 		    rp = r_hashf) {
2954 			/* save the hash pointer before destroying */
2955 			r_hashf = rp->r_hashf;
2956 			vp = RTOV(rp);
2957 			if (vp->v_vfsp == vfsp) {
2958 				mutex_enter(&rpfreelist_lock);
2959 				if (rp->r_freef != NULL) {
2960 					rp_rmfree(rp);
2961 					mutex_exit(&rpfreelist_lock);
2962 					rp_rmhash_locked(rp);
2963 					rp->r_hashf = rlist;
2964 					rlist = rp;
2965 				} else
2966 					mutex_exit(&rpfreelist_lock);
2967 			}
2968 		}
2969 		rw_exit(&rtable[index].r_lock);
2970 	}
2971 
2972 	for (rp = rlist; rp != NULL; rp = rlist) {
2973 		rlist = rp->r_hashf;
2974 		/*
2975 		 * This call to rp_addfree will end up destroying the
2976 		 * rnode, but in a safe way with the appropriate set
2977 		 * of checks done.
2978 		 */
2979 		rp_addfree(rp, cr);
2980 	}
2981 
2982 }
2983 
/*
 * This routine destroys all the resources associated with the rnode
 * and then the rnode itself.
 *
 * The asserts below spell out what is expected of the rnode: exactly
 * one vnode reference left, no activity in progress (r_count == 0),
 * r_lmpl and r_mapcnt clear, and the rnode on neither the hash queues
 * nor the freelist.  The hold on the vfs is released last, after the
 * vnode and rnode have been freed.
 */
static void
destroy_rnode(rnode_t *rp)
{
	vnode_t *vp;
	vfs_t *vfsp;

	vp = RTOV(rp);
	/* remember the vfs now; vp is freed before VFS_RELE() below */
	vfsp = vp->v_vfsp;

	ASSERT(vp->v_count == 1);
	ASSERT(rp->r_count == 0);
	ASSERT(rp->r_lmpl == NULL);
	ASSERT(rp->r_mapcnt == 0);
	ASSERT(!(rp->r_flags & RHASHED));
	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
	/* one rnode less is allocated */
	atomic_add_long((ulong_t *)&rnew, -1);
#ifdef DEBUG
	clstat_debug.nrnode.value.ui64--;
#endif
	/* tear down the synchronization objects of the rnode */
	nfs_rw_destroy(&rp->r_rwlock);
	nfs_rw_destroy(&rp->r_lkserlock);
	mutex_destroy(&rp->r_statelock);
	cv_destroy(&rp->r_cv);
	cv_destroy(&rp->r_commit.c_cv);
	/* r_indelmap is only torn down when RDELMAPLIST is set */
	if (rp->r_flags & RDELMAPLIST)
		list_destroy(&rp->r_indelmap);
	nfs_free_r_path(rp);
	avl_destroy(&rp->r_dir);
	vn_invalid(vp);
	vn_free(vp);
	kmem_cache_free(rnode_cache, rp);
	VFS_RELE(vfsp);
}
3021 
3022 /*
3023  * Flush all vnodes in this (or every) vfs.
3024  * Used by nfs_sync and by nfs_unmount.
3025  */
3026 void
3027 rflush(struct vfs *vfsp, cred_t *cr)
3028 {
3029 	int index;
3030 	rnode_t *rp;
3031 	vnode_t *vp, **vplist;
3032 	long num, cnt;
3033 
3034 	/*
3035 	 * Check to see whether there is anything to do.
3036 	 */
3037 	num = rnew;
3038 	if (num == 0)
3039 		return;
3040 
3041 	/*
3042 	 * Allocate a slot for all currently active rnodes on the
3043 	 * supposition that they all may need flushing.
3044 	 */
3045 	vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
3046 	cnt = 0;
3047 
3048 	/*
3049 	 * Walk the hash queues looking for rnodes with page
3050 	 * lists associated with them.  Make a list of these
3051 	 * files.
3052 	 */
3053 	for (index = 0; index < rtablesize; index++) {
3054 		rw_enter(&rtable[index].r_lock, RW_READER);
3055 		for (rp = rtable[index].r_hashf;
3056 		    rp != (rnode_t *)(&rtable[index]);
3057 		    rp = rp->r_hashf) {
3058 			vp = RTOV(rp);
3059 			/*
3060 			 * Don't bother sync'ing a vp if it
3061 			 * is part of virtual swap device or
3062 			 * if VFS is read-only
3063 			 */
3064 			if (IS_SWAPVP(vp) || vn_is_readonly(vp))
3065 				continue;
3066 			/*
3067 			 * If flushing all mounted file systems or
3068 			 * the vnode belongs to this vfs, has pages
3069 			 * and is marked as either dirty or mmap'd,
3070 			 * hold and add this vnode to the list of
3071 			 * vnodes to flush.
3072 			 */
3073 			if ((vfsp == NULL || vp->v_vfsp == vfsp) &&
3074 			    vn_has_cached_data(vp) &&
3075 			    ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) {
3076 				VN_HOLD(vp);
3077 				vplist[cnt++] = vp;
3078 				if (cnt == num) {
3079 					rw_exit(&rtable[index].r_lock);
3080 					goto toomany;
3081 				}
3082 			}
3083 		}
3084 		rw_exit(&rtable[index].r_lock);
3085 	}
3086 toomany:
3087 
3088 	/*
3089 	 * Flush and release all of the files on the list.
3090 	 */
3091 	while (cnt-- > 0) {
3092 		vp = vplist[cnt];
3093 		(void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL);
3094 		VN_RELE(vp);
3095 	}
3096 
3097 	/*
3098 	 * Free the space allocated to hold the list.
3099 	 */
3100 	kmem_free(vplist, num * sizeof (*vplist));
3101 }
3102 
3103 /*
3104  * This probably needs to be larger than or equal to
3105  * log2(sizeof (struct rnode)) due to the way that rnodes are
3106  * allocated.
3107  */
3108 #define	ACACHE_SHIFT_BITS	9
3109 
3110 static int
3111 acachehash(rnode_t *rp, cred_t *cr)
3112 {
3113 
3114 	return ((((intptr_t)rp >> ACACHE_SHIFT_BITS) + crgetuid(cr)) &
3115 	    acachemask);
3116 }
3117 
3118 #ifdef DEBUG
3119 static long nfs_access_cache_hits = 0;
3120 static long nfs_access_cache_misses = 0;
3121 #endif
3122 
3123 nfs_access_type_t
3124 nfs_access_check(rnode_t *rp, uint32_t acc, cred_t *cr)
3125 {
3126 	vnode_t *vp;
3127 	acache_t *ap;
3128 	acache_hash_t *hp;
3129 	nfs_access_type_t all;
3130 
3131 	vp = RTOV(rp);
3132 	if (!ATTRCACHE_VALID(vp) || nfs_waitfor_purge_complete(vp))
3133 		return (NFS_ACCESS_UNKNOWN);
3134 
3135 	if (rp->r_acache != NULL) {
3136 		hp = &acache[acachehash(rp, cr)];
3137 		rw_enter(&hp->lock, RW_READER);
3138 		ap = hp->next;
3139 		while (ap != (acache_t *)hp) {
3140 			if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3141 				if ((ap->known & acc) == acc) {
3142 #ifdef DEBUG
3143 					nfs_access_cache_hits++;
3144 #endif
3145 					if ((ap->allowed & acc) == acc)
3146 						all = NFS_ACCESS_ALLOWED;
3147 					else
3148 						all = NFS_ACCESS_DENIED;
3149 				} else {
3150 #ifdef DEBUG
3151 					nfs_access_cache_misses++;
3152 #endif
3153 					all = NFS_ACCESS_UNKNOWN;
3154 				}
3155 				rw_exit(&hp->lock);
3156 				return (all);
3157 			}
3158 			ap = ap->next;
3159 		}
3160 		rw_exit(&hp->lock);
3161 	}
3162 
3163 #ifdef DEBUG
3164 	nfs_access_cache_misses++;
3165 #endif
3166 	return (NFS_ACCESS_UNKNOWN);
3167 }
3168 
3169 void
3170 nfs_access_cache(rnode_t *rp, uint32_t acc, uint32_t resacc, cred_t *cr)
3171 {
3172 	acache_t *ap;
3173 	acache_t *nap;
3174 	acache_hash_t *hp;
3175 
3176 	hp = &acache[acachehash(rp, cr)];
3177 
3178 	/*
3179 	 * Allocate now assuming that mostly an allocation will be
3180 	 * required.  This allows the allocation to happen without
3181 	 * holding the hash bucket locked.
3182 	 */
3183 	nap = kmem_cache_alloc(acache_cache, KM_NOSLEEP);
3184 	if (nap != NULL) {
3185 		nap->known = acc;
3186 		nap->allowed = resacc;
3187 		nap->rnode = rp;
3188 		crhold(cr);
3189 		nap->cred = cr;
3190 		nap->hashq = hp;
3191 	}
3192 
3193 	rw_enter(&hp->lock, RW_WRITER);
3194 
3195 	if (rp->r_acache != NULL) {
3196 		ap = hp->next;
3197 		while (ap != (acache_t *)hp) {
3198 			if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3199 				ap->known |= acc;
3200 				ap->allowed &= ~acc;
3201 				ap->allowed |= resacc;
3202 				rw_exit(&hp->lock);
3203 				if (nap != NULL) {
3204 					crfree(nap->cred);
3205 					kmem_cache_free(acache_cache, nap);
3206 				}
3207 				return;
3208 			}
3209 			ap = ap->next;
3210 		}
3211 	}
3212 
3213 	if (nap != NULL) {
3214 #ifdef DEBUG
3215 		clstat_debug.access.value.ui64++;
3216 #endif
3217 		nap->next = hp->next;
3218 		hp->next = nap;
3219 		nap->next->prev = nap;
3220 		nap->prev = (acache_t *)hp;
3221 
3222 		mutex_enter(&rp->r_statelock);
3223 		nap->list = rp->r_acache;
3224 		rp->r_acache = nap;
3225 		mutex_exit(&rp->r_statelock);
3226 	}
3227 
3228 	rw_exit(&hp->lock);
3229 }
3230 
3231 int
3232 nfs_access_purge_rp(rnode_t *rp)
3233 {
3234 	acache_t *ap;
3235 	acache_t *tmpap;
3236 	acache_t *rplist;
3237 
3238 	/*
3239 	 * If there aren't any cached entries, then there is nothing
3240 	 * to free.
3241 	 */
3242 	if (rp->r_acache == NULL)
3243 		return (0);
3244 
3245 	mutex_enter(&rp->r_statelock);
3246 	rplist = rp->r_acache;
3247 	rp->r_acache = NULL;
3248 	mutex_exit(&rp->r_statelock);
3249 
3250 	/*
3251 	 * Loop through each entry in the list pointed to in the
3252 	 * rnode.  Remove each of these entries from the hash
3253 	 * queue that it is on and remove it from the list in
3254 	 * the rnode.
3255 	 */
3256 	for (ap = rplist; ap != NULL; ap = tmpap) {
3257 		rw_enter(&ap->hashq->lock, RW_WRITER);
3258 		ap->prev->next = ap->next;
3259 		ap->next->prev = ap->prev;
3260 		rw_exit(&ap->hashq->lock);
3261 
3262 		tmpap = ap->list;
3263 		crfree(ap->cred);
3264 		kmem_cache_free(acache_cache, ap);
3265 #ifdef DEBUG
3266 		clstat_debug.access.value.ui64--;
3267 #endif
3268 	}
3269 
3270 	return (1);
3271 }
3272 
3273 static const char prefix[] = ".nfs";
3274 
3275 static kmutex_t newnum_lock;
3276 
3277 int
3278 newnum(void)
3279 {
3280 	static uint_t newnum = 0;
3281 	uint_t id;
3282 
3283 	mutex_enter(&newnum_lock);
3284 	if (newnum == 0)
3285 		newnum = gethrestime_sec() & 0xffff;
3286 	id = newnum++;
3287 	mutex_exit(&newnum_lock);
3288 	return (id);
3289 }
3290 
3291 char *
3292 newname(void)
3293 {
3294 	char *news;
3295 	char *s;
3296 	const char *p;
3297 	uint_t id;
3298 
3299 	id = newnum();
3300 	news = kmem_alloc(MAXNAMELEN, KM_SLEEP);
3301 	s = news;
3302 	p = prefix;
3303 	while (*p != '\0')
3304 		*s++ = *p++;
3305 	while (id != 0) {
3306 		*s++ = "0123456789ABCDEF"[id & 0x0f];
3307 		id >>= 4;
3308 	}
3309 	*s = '\0';
3310 	return (news);
3311 }
3312 
3313 /*
3314  * Snapshot callback for nfs:0:nfs_client as registered with the kstat
3315  * framework.
3316  */
3317 static int
3318 cl_snapshot(kstat_t *ksp, void *buf, int rw)
3319 {
3320 	ksp->ks_snaptime = gethrtime();
3321 	if (rw == KSTAT_WRITE) {
3322 		bcopy(buf, ksp->ks_private, sizeof (clstat_tmpl));
3323 #ifdef DEBUG
3324 		/*
3325 		 * Currently only the global zone can write to kstats, but we
3326 		 * add the check just for paranoia.
3327 		 */
3328 		if (INGLOBALZONE(curproc))
3329 			bcopy((char *)buf + sizeof (clstat_tmpl), &clstat_debug,
3330 			    sizeof (clstat_debug));
3331 #endif
3332 	} else {
3333 		bcopy(ksp->ks_private, buf, sizeof (clstat_tmpl));
3334 #ifdef DEBUG
3335 		/*
3336 		 * If we're displaying the "global" debug kstat values, we
3337 		 * display them as-is to all zones since in fact they apply to
3338 		 * the system as a whole.
3339 		 */
3340 		bcopy(&clstat_debug, (char *)buf + sizeof (clstat_tmpl),
3341 		    sizeof (clstat_debug));
3342 #endif
3343 	}
3344 	return (0);
3345 }
3346 
3347 static void *
3348 clinit_zone(zoneid_t zoneid)
3349 {
3350 	kstat_t *nfs_client_kstat;
3351 	struct nfs_clnt *nfscl;
3352 	uint_t ndata;
3353 
3354 	nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP);
3355 	mutex_init(&nfscl->nfscl_chtable_lock, NULL, MUTEX_DEFAULT, NULL);
3356 	nfscl->nfscl_chtable = NULL;
3357 	nfscl->nfscl_zoneid = zoneid;
3358 
3359 	bcopy(&clstat_tmpl, &nfscl->nfscl_stat, sizeof (clstat_tmpl));
3360 	ndata = sizeof (clstat_tmpl) / sizeof (kstat_named_t);
3361 #ifdef DEBUG
3362 	ndata += sizeof (clstat_debug) / sizeof (kstat_named_t);
3363 #endif
3364 	if ((nfs_client_kstat = kstat_create_zone("nfs", 0, "nfs_client",
3365 	    "misc", KSTAT_TYPE_NAMED, ndata,
3366 	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) {
3367 		nfs_client_kstat->ks_private = &nfscl->nfscl_stat;
3368 		nfs_client_kstat->ks_snapshot = cl_snapshot;
3369 		kstat_install(nfs_client_kstat);
3370 	}
3371 	mutex_enter(&nfs_clnt_list_lock);
3372 	list_insert_head(&nfs_clnt_list, nfscl);
3373 	mutex_exit(&nfs_clnt_list_lock);
3374 	return (nfscl);
3375 }
3376 
3377 /*ARGSUSED*/
3378 static void
3379 clfini_zone(zoneid_t zoneid, void *arg)
3380 {
3381 	struct nfs_clnt *nfscl = arg;
3382 	chhead_t *chp, *next;
3383 
3384 	if (nfscl == NULL)
3385 		return;
3386 	mutex_enter(&nfs_clnt_list_lock);
3387 	list_remove(&nfs_clnt_list, nfscl);
3388 	mutex_exit(&nfs_clnt_list_lock);
3389 	clreclaim_zone(nfscl, 0);
3390 	for (chp = nfscl->nfscl_chtable; chp != NULL; chp = next) {
3391 		ASSERT(chp->ch_list == NULL);
3392 		kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1);
3393 		next = chp->ch_next;
3394 		kmem_free(chp, sizeof (*chp));
3395 	}
3396 	kstat_delete_byname_zone("nfs", 0, "nfs_client", zoneid);
3397 	mutex_destroy(&nfscl->nfscl_chtable_lock);
3398 	kmem_free(nfscl, sizeof (*nfscl));
3399 }
3400 
3401 /*
3402  * Called by endpnt_destructor to make sure the client handles are
3403  * cleaned up before the RPC endpoints.  This becomes a no-op if
3404  * clfini_zone (above) is called first.  This function is needed
3405  * (rather than relying on clfini_zone to clean up) because the ZSD
3406  * callbacks have no ordering mechanism, so we have no way to ensure
3407  * that clfini_zone is called before endpnt_destructor.
3408  */
3409 void
3410 clcleanup_zone(zoneid_t zoneid)
3411 {
3412 	struct nfs_clnt *nfscl;
3413 
3414 	mutex_enter(&nfs_clnt_list_lock);
3415 	nfscl = list_head(&nfs_clnt_list);
3416 	for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) {
3417 		if (nfscl->nfscl_zoneid == zoneid) {
3418 			clreclaim_zone(nfscl, 0);
3419 			break;
3420 		}
3421 	}
3422 	mutex_exit(&nfs_clnt_list_lock);
3423 }
3424 
/*
 * One-time initialization of the data structures in this file: the rnode
 * hash table and kmem cache, the access cache hash table and kmem cache,
 * the client handle kmem cache, the per-zone client list and its zone
 * key, several mutexes, and the NFS major/minor device numbers.
 *
 * Always returns 0.
 */
int
nfs_subrinit(void)
{
	int i;
	ulong_t nrnode_max;

	/*
	 * Allocate and initialize the rnode hash queues
	 */
	/* An unset (non-positive) nrnode defaults to ncsize. */
	if (nrnode <= 0)
		nrnode = ncsize;
	/*
	 * Cap nrnode at the number of rnodes that fit in a quarter of
	 * what kmem_maxavail() reports.
	 */
	nrnode_max = (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode));
	if (nrnode > nrnode_max || (nrnode == 0 && ncsize == 0)) {
		zcmn_err(GLOBAL_ZONEID, CE_NOTE,
		    "setting nrnode to max value of %ld", nrnode_max);
		nrnode = nrnode_max;
	}

	/* The table size is a power of two so a mask can select a bucket. */
	rtablesize = 1 << highbit(nrnode / hashlen);
	rtablemask = rtablesize - 1;
	rtable = kmem_alloc(rtablesize * sizeof (*rtable), KM_SLEEP);
	for (i = 0; i < rtablesize; i++) {
		/* An empty queue points back at its own header. */
		rtable[i].r_hashf = (rnode_t *)(&rtable[i]);
		rtable[i].r_hashb = (rnode_t *)(&rtable[i]);
		rw_init(&rtable[i].r_lock, NULL, RW_DEFAULT, NULL);
	}
	/* nfs_reclaim is registered as the cache's reclaim callback. */
	rnode_cache = kmem_cache_create("rnode_cache", sizeof (rnode_t),
	    0, NULL, NULL, nfs_reclaim, NULL, NULL, 0);

	/*
	 * Allocate and initialize the access cache
	 */

	/*
	 * Initial guess is one access cache entry per rnode unless
	 * nacache is set to a non-zero value and then it is used to
	 * indicate a guess at the number of access cache entries.
	 */
	if (nacache > 0)
		acachesize = 1 << highbit(nacache / hashlen);
	else
		acachesize = rtablesize;
	acachemask = acachesize - 1;
	acache = kmem_alloc(acachesize * sizeof (*acache), KM_SLEEP);
	for (i = 0; i < acachesize; i++) {
		/* Empty bucket: next and prev refer to the header itself. */
		acache[i].next = (acache_t *)&acache[i];
		acache[i].prev = (acache_t *)&acache[i];
		rw_init(&acache[i].lock, NULL, RW_DEFAULT, NULL);
	}
	acache_cache = kmem_cache_create("nfs_access_cache",
	    sizeof (acache_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	/*
	 * Allocate and initialize the client handle cache
	 */
	chtab_cache = kmem_cache_create("client_handle_cache",
	    sizeof (struct chtab), 0, NULL, NULL, clreclaim, NULL, NULL, 0);
	/*
	 * Initialize the list of per-zone client handles (and associated data).
	 * This needs to be done before we call zone_key_create().
	 */
	list_create(&nfs_clnt_list, sizeof (struct nfs_clnt),
	    offsetof(struct nfs_clnt, nfscl_node));
	/*
	 * Initialize the zone_key for per-zone client handle lists.
	 */
	zone_key_create(&nfsclnt_zone_key, clinit_zone, NULL, clfini_zone);
	/*
	 * Initialize the various mutexes and reader/writer locks
	 */
	mutex_init(&rpfreelist_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&newnum_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&nfs_minor_lock, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * Assign unique major number for all nfs mounts
	 */
	if ((nfs_major = getudev()) == -1) {
		zcmn_err(GLOBAL_ZONEID, CE_WARN,
		    "nfs: init: can't get unique device number");
		/* Fall back to major number 0 rather than failing. */
		nfs_major = 0;
	}
	nfs_minor = 0;

	/* Apply the default jukebox delay unless one was already set. */
	if (nfs3_jukebox_delay == 0)
		nfs3_jukebox_delay = NFS3_JUKEBOX_DELAY;

	return (0);
}
3513 
3514 void
3515 nfs_subrfini(void)
3516 {
3517 	int i;
3518 
3519 	/*
3520 	 * Deallocate the rnode hash queues
3521 	 */
3522 	kmem_cache_destroy(rnode_cache);
3523 
3524 	for (i = 0; i < rtablesize; i++)
3525 		rw_destroy(&rtable[i].r_lock);
3526 	kmem_free(rtable, rtablesize * sizeof (*rtable));
3527 
3528 	/*
3529 	 * Deallocated the access cache
3530 	 */
3531 	kmem_cache_destroy(acache_cache);
3532 
3533 	for (i = 0; i < acachesize; i++)
3534 		rw_destroy(&acache[i].lock);
3535 	kmem_free(acache, acachesize * sizeof (*acache));
3536 
3537 	/*
3538 	 * Deallocate the client handle cache
3539 	 */
3540 	kmem_cache_destroy(chtab_cache);
3541 
3542 	/*
3543 	 * Destroy the various mutexes and reader/writer locks
3544 	 */
3545 	mutex_destroy(&rpfreelist_lock);
3546 	mutex_destroy(&newnum_lock);
3547 	mutex_destroy(&nfs_minor_lock);
3548 	(void) zone_key_delete(nfsclnt_zone_key);
3549 }
3550 
3551 enum nfsstat
3552 puterrno(int error)
3553 {
3554 
3555 	switch (error) {
3556 	case EOPNOTSUPP:
3557 		return (NFSERR_OPNOTSUPP);
3558 	case ENAMETOOLONG:
3559 		return (NFSERR_NAMETOOLONG);
3560 	case ENOTEMPTY:
3561 		return (NFSERR_NOTEMPTY);
3562 	case EDQUOT:
3563 		return (NFSERR_DQUOT);
3564 	case ESTALE:
3565 		return (NFSERR_STALE);
3566 	case EREMOTE:
3567 		return (NFSERR_REMOTE);
3568 	case ENOSYS:
3569 		return (NFSERR_OPNOTSUPP);
3570 	case EOVERFLOW:
3571 		return (NFSERR_INVAL);
3572 	default:
3573 		return ((enum nfsstat)error);
3574 	}
3575 	/* NOTREACHED */
3576 }
3577 
3578 int
3579 geterrno(enum nfsstat status)
3580 {
3581 
3582 	switch (status) {
3583 	case NFSERR_OPNOTSUPP:
3584 		return (EOPNOTSUPP);
3585 	case NFSERR_NAMETOOLONG:
3586 		return (ENAMETOOLONG);
3587 	case NFSERR_NOTEMPTY:
3588 		return (ENOTEMPTY);
3589 	case NFSERR_DQUOT:
3590 		return (EDQUOT);
3591 	case NFSERR_STALE:
3592 		return (ESTALE);
3593 	case NFSERR_REMOTE:
3594 		return (EREMOTE);
3595 	case NFSERR_WFLUSH:
3596 		return (EIO);
3597 	default:
3598 		return ((int)status);
3599 	}
3600 	/* NOTREACHED */
3601 }
3602 
3603 enum nfsstat3
3604 puterrno3(int error)
3605 {
3606 
3607 #ifdef DEBUG
3608 	switch (error) {
3609 	case 0:
3610 		return (NFS3_OK);
3611 	case EPERM:
3612 		return (NFS3ERR_PERM);
3613 	case ENOENT:
3614 		return (NFS3ERR_NOENT);
3615 	case EIO:
3616 		return (NFS3ERR_IO);
3617 	case ENXIO:
3618 		return (NFS3ERR_NXIO);
3619 	case EACCES:
3620 		return (NFS3ERR_ACCES);
3621 	case EEXIST:
3622 		return (NFS3ERR_EXIST);
3623 	case EXDEV:
3624 		return (NFS3ERR_XDEV);
3625 	case ENODEV:
3626 		return (NFS3ERR_NODEV);
3627 	case ENOTDIR:
3628 		return (NFS3ERR_NOTDIR);
3629 	case EISDIR:
3630 		return (NFS3ERR_ISDIR);
3631 	case EINVAL:
3632 		return (NFS3ERR_INVAL);
3633 	case EFBIG:
3634 		return (NFS3ERR_FBIG);
3635 	case ENOSPC:
3636 		return (NFS3ERR_NOSPC);
3637 	case EROFS:
3638 		return (NFS3ERR_ROFS);
3639 	case EMLINK:
3640 		return (NFS3ERR_MLINK);
3641 	case ENAMETOOLONG:
3642 		return (NFS3ERR_NAMETOOLONG);
3643 	case ENOTEMPTY:
3644 		return (NFS3ERR_NOTEMPTY);
3645 	case EDQUOT:
3646 		return (NFS3ERR_DQUOT);
3647 	case ESTALE:
3648 		return (NFS3ERR_STALE);
3649 	case EREMOTE:
3650 		return (NFS3ERR_REMOTE);
3651 	case ENOSYS:
3652 	case EOPNOTSUPP:
3653 		return (NFS3ERR_NOTSUPP);
3654 	case EOVERFLOW:
3655 		return (NFS3ERR_INVAL);
3656 	default:
3657 		zcmn_err(getzoneid(), CE_WARN,
3658 		    "puterrno3: got error %d", error);
3659 		return ((enum nfsstat3)error);
3660 	}
3661 #else
3662 	switch (error) {
3663 	case ENAMETOOLONG:
3664 		return (NFS3ERR_NAMETOOLONG);
3665 	case ENOTEMPTY:
3666 		return (NFS3ERR_NOTEMPTY);
3667 	case EDQUOT:
3668 		return (NFS3ERR_DQUOT);
3669 	case ESTALE:
3670 		return (NFS3ERR_STALE);
3671 	case ENOSYS:
3672 	case EOPNOTSUPP:
3673 		return (NFS3ERR_NOTSUPP);
3674 	case EREMOTE:
3675 		return (NFS3ERR_REMOTE);
3676 	case EOVERFLOW:
3677 		return (NFS3ERR_INVAL);
3678 	default:
3679 		return ((enum nfsstat3)error);
3680 	}
3681 #endif
3682 }
3683 
3684 int
3685 geterrno3(enum nfsstat3 status)
3686 {
3687 
3688 #ifdef DEBUG
3689 	switch (status) {
3690 	case NFS3_OK:
3691 		return (0);
3692 	case NFS3ERR_PERM:
3693 		return (EPERM);
3694 	case NFS3ERR_NOENT:
3695 		return (ENOENT);
3696 	case NFS3ERR_IO:
3697 		return (EIO);
3698 	case NFS3ERR_NXIO:
3699 		return (ENXIO);
3700 	case NFS3ERR_ACCES:
3701 		return (EACCES);
3702 	case NFS3ERR_EXIST:
3703 		return (EEXIST);
3704 	case NFS3ERR_XDEV:
3705 		return (EXDEV);
3706 	case NFS3ERR_NODEV:
3707 		return (ENODEV);
3708 	case NFS3ERR_NOTDIR:
3709 		return (ENOTDIR);
3710 	case NFS3ERR_ISDIR:
3711 		return (EISDIR);
3712 	case NFS3ERR_INVAL:
3713 		return (EINVAL);
3714 	case NFS3ERR_FBIG:
3715 		return (EFBIG);
3716 	case NFS3ERR_NOSPC:
3717 		return (ENOSPC);
3718 	case NFS3ERR_ROFS:
3719 		return (EROFS);
3720 	case NFS3ERR_MLINK:
3721 		return (EMLINK);
3722 	case NFS3ERR_NAMETOOLONG:
3723 		return (ENAMETOOLONG);
3724 	case NFS3ERR_NOTEMPTY:
3725 		return (ENOTEMPTY);
3726 	case NFS3ERR_DQUOT:
3727 		return (EDQUOT);
3728 	case NFS3ERR_STALE:
3729 		return (ESTALE);
3730 	case NFS3ERR_REMOTE:
3731 		return (EREMOTE);
3732 	case NFS3ERR_BADHANDLE:
3733 		return (ESTALE);
3734 	case NFS3ERR_NOT_SYNC:
3735 		return (EINVAL);
3736 	case NFS3ERR_BAD_COOKIE:
3737 		return (ENOENT);
3738 	case NFS3ERR_NOTSUPP:
3739 		return (EOPNOTSUPP);
3740 	case NFS3ERR_TOOSMALL:
3741 		return (EINVAL);
3742 	case NFS3ERR_SERVERFAULT:
3743 		return (EIO);
3744 	case NFS3ERR_BADTYPE:
3745 		return (EINVAL);
3746 	case NFS3ERR_JUKEBOX:
3747 		return (ENXIO);
3748 	default:
3749 		zcmn_err(getzoneid(), CE_WARN,
3750 		    "geterrno3: got status %d", status);
3751 		return ((int)status);
3752 	}
3753 #else
3754 	switch (status) {
3755 	case NFS3ERR_NAMETOOLONG:
3756 		return (ENAMETOOLONG);
3757 	case NFS3ERR_NOTEMPTY:
3758 		return (ENOTEMPTY);
3759 	case NFS3ERR_DQUOT:
3760 		return (EDQUOT);
3761 	case NFS3ERR_STALE:
3762 	case NFS3ERR_BADHANDLE:
3763 		return (ESTALE);
3764 	case NFS3ERR_NOTSUPP:
3765 		return (EOPNOTSUPP);
3766 	case NFS3ERR_REMOTE:
3767 		return (EREMOTE);
3768 	case NFS3ERR_NOT_SYNC:
3769 	case NFS3ERR_TOOSMALL:
3770 	case NFS3ERR_BADTYPE:
3771 		return (EINVAL);
3772 	case NFS3ERR_BAD_COOKIE:
3773 		return (ENOENT);
3774 	case NFS3ERR_SERVERFAULT:
3775 		return (EIO);
3776 	case NFS3ERR_JUKEBOX:
3777 		return (ENXIO);
3778 	default:
3779 		return ((int)status);
3780 	}
3781 #endif
3782 }
3783 
3784 rddir_cache *
3785 rddir_cache_alloc(int flags)
3786 {
3787 	rddir_cache *rc;
3788 
3789 	rc = kmem_alloc(sizeof (*rc), flags);
3790 	if (rc != NULL) {
3791 		rc->entries = NULL;
3792 		rc->flags = RDDIR;
3793 		cv_init(&rc->cv, NULL, CV_DEFAULT, NULL);
3794 		mutex_init(&rc->lock, NULL, MUTEX_DEFAULT, NULL);
3795 		rc->count = 1;
3796 #ifdef DEBUG
3797 		atomic_add_64(&clstat_debug.dirent.value.ui64, 1);
3798 #endif
3799 	}
3800 	return (rc);
3801 }
3802 
3803 static void
3804 rddir_cache_free(rddir_cache *rc)
3805 {
3806 
3807 #ifdef DEBUG
3808 	atomic_add_64(&clstat_debug.dirent.value.ui64, -1);
3809 #endif
3810 	if (rc->entries != NULL) {
3811 #ifdef DEBUG
3812 		rddir_cache_buf_free(rc->entries, rc->buflen);
3813 #else
3814 		kmem_free(rc->entries, rc->buflen);
3815 #endif
3816 	}
3817 	cv_destroy(&rc->cv);
3818 	mutex_destroy(&rc->lock);
3819 	kmem_free(rc, sizeof (*rc));
3820 }
3821 
3822 void
3823 rddir_cache_hold(rddir_cache *rc)
3824 {
3825 
3826 	mutex_enter(&rc->lock);
3827 	rc->count++;
3828 	mutex_exit(&rc->lock);
3829 }
3830 
3831 void
3832 rddir_cache_rele(rddir_cache *rc)
3833 {
3834 
3835 	mutex_enter(&rc->lock);
3836 	ASSERT(rc->count > 0);
3837 	if (--rc->count == 0) {
3838 		mutex_exit(&rc->lock);
3839 		rddir_cache_free(rc);
3840 	} else
3841 		mutex_exit(&rc->lock);
3842 }
3843 
#ifdef DEBUG
/*
 * DEBUG-only wrapper around kmem_alloc() which adds the size of each
 * successful allocation to the clstat_debug.dirents counter.
 */
char *
rddir_cache_buf_alloc(size_t size, int flags)
{
	char *buf = kmem_alloc(size, flags);

	if (buf == NULL)
		return (NULL);

	atomic_add_64(&clstat_debug.dirents.value.ui64, size);
	return (buf);
}

/*
 * DEBUG-only wrapper around kmem_free() which subtracts the size of the
 * buffer from the clstat_debug.dirents counter.
 */
void
rddir_cache_buf_free(void *addr, size_t size)
{

	atomic_add_64(&clstat_debug.dirents.value.ui64, -(int64_t)size);
	kmem_free(addr, size);
}
#endif
3864 
3865 static int
3866 nfs_free_data_reclaim(rnode_t *rp)
3867 {
3868 	char *contents;
3869 	int size;
3870 	vsecattr_t *vsp;
3871 	nfs3_pathconf_info *info;
3872 	int freed;
3873 	cred_t *cred;
3874 
3875 	/*
3876 	 * Free any held credentials and caches which
3877 	 * may be associated with this rnode.
3878 	 */
3879 	mutex_enter(&rp->r_statelock);
3880 	cred = rp->r_cred;
3881 	rp->r_cred = NULL;
3882 	contents = rp->r_symlink.contents;
3883 	size = rp->r_symlink.size;
3884 	rp->r_symlink.contents = NULL;
3885 	vsp = rp->r_secattr;
3886 	rp->r_secattr = NULL;
3887 	info = rp->r_pathconf;
3888 	rp->r_pathconf = NULL;
3889 	mutex_exit(&rp->r_statelock);
3890 
3891 	if (cred != NULL)
3892 		crfree(cred);
3893 
3894 	/*
3895 	 * Free the access cache entries.
3896 	 */
3897 	freed = nfs_access_purge_rp(rp);
3898 
3899 	if (!HAVE_RDDIR_CACHE(rp) &&
3900 	    contents == NULL &&
3901 	    vsp == NULL &&
3902 	    info == NULL)
3903 		return (freed);
3904 
3905 	/*
3906 	 * Free the readdir cache entries
3907 	 */
3908 	if (HAVE_RDDIR_CACHE(rp))
3909 		nfs_purge_rddir_cache(RTOV(rp));
3910 
3911 	/*
3912 	 * Free the symbolic link cache.
3913 	 */
3914 	if (contents != NULL) {
3915 
3916 		kmem_free((void *)contents, size);
3917 	}
3918 
3919 	/*
3920 	 * Free any cached ACL.
3921 	 */
3922 	if (vsp != NULL)
3923 		nfs_acl_free(vsp);
3924 
3925 	/*
3926 	 * Free any cached pathconf information.
3927 	 */
3928 	if (info != NULL)
3929 		kmem_free(info, sizeof (*info));
3930 
3931 	return (1);
3932 }
3933 
3934 static int
3935 nfs_active_data_reclaim(rnode_t *rp)
3936 {
3937 	char *contents;
3938 	int size;
3939 	vsecattr_t *vsp;
3940 	nfs3_pathconf_info *info;
3941 	int freed;
3942 
3943 	/*
3944 	 * Free any held credentials and caches which
3945 	 * may be associated with this rnode.
3946 	 */
3947 	if (!mutex_tryenter(&rp->r_statelock))
3948 		return (0);
3949 	contents = rp->r_symlink.contents;
3950 	size = rp->r_symlink.size;
3951 	rp->r_symlink.contents = NULL;
3952 	vsp = rp->r_secattr;
3953 	rp->r_secattr = NULL;
3954 	info = rp->r_pathconf;
3955 	rp->r_pathconf = NULL;
3956 	mutex_exit(&rp->r_statelock);
3957 
3958 	/*
3959 	 * Free the access cache entries.
3960 	 */
3961 	freed = nfs_access_purge_rp(rp);
3962 
3963 	if (!HAVE_RDDIR_CACHE(rp) &&
3964 	    contents == NULL &&
3965 	    vsp == NULL &&
3966 	    info == NULL)
3967 		return (freed);
3968 
3969 	/*
3970 	 * Free the readdir cache entries
3971 	 */
3972 	if (HAVE_RDDIR_CACHE(rp))
3973 		nfs_purge_rddir_cache(RTOV(rp));
3974 
3975 	/*
3976 	 * Free the symbolic link cache.
3977 	 */
3978 	if (contents != NULL) {
3979 
3980 		kmem_free((void *)contents, size);
3981 	}
3982 
3983 	/*
3984 	 * Free any cached ACL.
3985 	 */
3986 	if (vsp != NULL)
3987 		nfs_acl_free(vsp);
3988 
3989 	/*
3990 	 * Free any cached pathconf information.
3991 	 */
3992 	if (info != NULL)
3993 		kmem_free(info, sizeof (*info));
3994 
3995 	return (1);
3996 }
3997 
3998 static int
3999 nfs_free_reclaim(void)
4000 {
4001 	int freed;
4002 	rnode_t *rp;
4003 
4004 #ifdef DEBUG
4005 	clstat_debug.f_reclaim.value.ui64++;
4006 #endif
4007 	freed = 0;
4008 	mutex_enter(&rpfreelist_lock);
4009 	rp = rpfreelist;
4010 	if (rp != NULL) {
4011 		do {
4012 			if (nfs_free_data_reclaim(rp))
4013 				freed = 1;
4014 		} while ((rp = rp->r_freef) != rpfreelist);
4015 	}
4016 	mutex_exit(&rpfreelist_lock);
4017 	return (freed);
4018 }
4019 
4020 static int
4021 nfs_active_reclaim(void)
4022 {
4023 	int freed;
4024 	int index;
4025 	rnode_t *rp;
4026 
4027 #ifdef DEBUG
4028 	clstat_debug.a_reclaim.value.ui64++;
4029 #endif
4030 	freed = 0;
4031 	for (index = 0; index < rtablesize; index++) {
4032 		rw_enter(&rtable[index].r_lock, RW_READER);
4033 		for (rp = rtable[index].r_hashf;
4034 		    rp != (rnode_t *)(&rtable[index]);
4035 		    rp = rp->r_hashf) {
4036 			if (nfs_active_data_reclaim(rp))
4037 				freed = 1;
4038 		}
4039 		rw_exit(&rtable[index].r_lock);
4040 	}
4041 	return (freed);
4042 }
4043 
4044 static int
4045 nfs_rnode_reclaim(void)
4046 {
4047 	int freed;
4048 	rnode_t *rp;
4049 	vnode_t *vp;
4050 
4051 #ifdef DEBUG
4052 	clstat_debug.r_reclaim.value.ui64++;
4053 #endif
4054 	freed = 0;
4055 	mutex_enter(&rpfreelist_lock);
4056 	while ((rp = rpfreelist) != NULL) {
4057 		rp_rmfree(rp);
4058 		mutex_exit(&rpfreelist_lock);
4059 		if (rp->r_flags & RHASHED) {
4060 			vp = RTOV(rp);
4061 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4062 			mutex_enter(&vp->v_lock);
4063 			if (vp->v_count > 1) {
4064 				vp->v_count--;
4065 				mutex_exit(&vp->v_lock);
4066 				rw_exit(&rp->r_hashq->r_lock);
4067 				mutex_enter(&rpfreelist_lock);
4068 				continue;
4069 			}
4070 			mutex_exit(&vp->v_lock);
4071 			rp_rmhash_locked(rp);
4072 			rw_exit(&rp->r_hashq->r_lock);
4073 		}
4074 		/*
4075 		 * This call to rp_addfree will end up destroying the
4076 		 * rnode, but in a safe way with the appropriate set
4077 		 * of checks done.
4078 		 */
4079 		rp_addfree(rp, CRED());
4080 		mutex_enter(&rpfreelist_lock);
4081 	}
4082 	mutex_exit(&rpfreelist_lock);
4083 	return (freed);
4084 }
4085 
/*
 * Reclaim callback registered for rnode_cache in nfs_subrinit().  Tries
 * progressively more aggressive steps and stops at the first one that
 * reports having freed something: data cached on freelist rnodes, then
 * data cached on rnodes in the hash table, then the freelist rnodes
 * themselves.
 */
/*ARGSUSED*/
static void
nfs_reclaim(void *cdrarg)
{

#ifdef DEBUG
	clstat_debug.reclaim.value.ui64++;
#endif
	if (!nfs_free_reclaim() && !nfs_active_reclaim())
		(void) nfs_rnode_reclaim();
}
4102 
4103 /*
4104  * NFS client failover support
4105  *
4106  * Routines to copy filehandles
4107  */
4108 void
4109 nfscopyfh(caddr_t fhp, vnode_t *vp)
4110 {
4111 	fhandle_t *dest = (fhandle_t *)fhp;
4112 
4113 	if (dest != NULL)
4114 		*dest = *VTOFH(vp);
4115 }
4116 
4117 void
4118 nfs3copyfh(caddr_t fhp, vnode_t *vp)
4119 {
4120 	nfs_fh3 *dest = (nfs_fh3 *)fhp;
4121 
4122 	if (dest != NULL)
4123 		*dest = *VTOFH3(vp);
4124 }
4125 
4126 /*
4127  * NFS client failover support
4128  *
4129  * failover_safe() will test various conditions to ensure that
4130  * failover is permitted for this vnode.  It will be denied
4131  * if:
4132  *	1) the operation in progress does not support failover (NULL fi)
4133  *	2) there are no available replicas (NULL mi_servers->sv_next)
4134  *	3) any locks are outstanding on this file
4135  */
4136 static int
4137 failover_safe(failinfo_t *fi)
4138 {
4139 
4140 	/*
4141 	 * Does this op permit failover?
4142 	 */
4143 	if (fi == NULL || fi->vp == NULL)
4144 		return (0);
4145 
4146 	/*
4147 	 * Are there any alternates to failover to?
4148 	 */
4149 	if (VTOMI(fi->vp)->mi_servers->sv_next == NULL)
4150 		return (0);
4151 
4152 	/*
4153 	 * Disable check; we've forced local locking
4154 	 *
4155 	 * if (flk_has_remote_locks(fi->vp))
4156 	 *	return (0);
4157 	 */
4158 
4159 	/*
4160 	 * If we have no partial path, we can't do anything
4161 	 */
4162 	if (VTOR(fi->vp)->r_path == NULL)
4163 		return (0);
4164 
4165 	return (1);
4166 }
4167 
4168 #include <sys/thread.h>
4169 
4170 /*
4171  * NFS client failover support
4172  *
4173  * failover_newserver() will start a search for a new server,
4174  * preferably by starting an async thread to do the work.  If
4175  * someone is already doing this (recognizable by MI_BINDINPROG
4176  * being set), it will simply return and the calling thread
4177  * will queue on the mi_failover_cv condition variable.
4178  */
4179 static void
4180 failover_newserver(mntinfo_t *mi)
4181 {
4182 	/*
4183 	 * Check if someone else is doing this already
4184 	 */
4185 	mutex_enter(&mi->mi_lock);
4186 	if (mi->mi_flags & MI_BINDINPROG) {
4187 		mutex_exit(&mi->mi_lock);
4188 		return;
4189 	}
4190 	mi->mi_flags |= MI_BINDINPROG;
4191 
4192 	/*
4193 	 * Need to hold the vfs struct so that it can't be released
4194 	 * while the failover thread is selecting a new server.
4195 	 */
4196 	VFS_HOLD(mi->mi_vfsp);
4197 
4198 	/*
4199 	 * Start a thread to do the real searching.
4200 	 */
4201 	(void) zthread_create(NULL, 0, failover_thread, mi, 0, minclsyspri);
4202 
4203 	mutex_exit(&mi->mi_lock);
4204 }
4205 
/*
 * NFS client failover support
 *
 * failover_thread() will find a new server to replace the one
 * currently in use, wake up other threads waiting on this mount
 * point, and die.  It will start at the head of the server list
 * and poll servers until it finds one with an NFS server which is
 * registered and responds to a NULL procedure ping.
 *
 * On the way out it clears MI_BINDINPROG, records the chosen server
 * (if any) in mi_curr_serv, broadcasts on mi_failover_cv and releases
 * the vfs hold taken by failover_newserver().
 *
 * XXX failover_thread is unsafe within the scope of the
 * present model defined for cpr to suspend the system.
 * Specifically, over-the-wire calls made by the thread
 * are unsafe. The thread needs to be reevaluated in case of
 * future updates to the cpr suspend model.
 */
static void
failover_thread(mntinfo_t *mi)
{
	servinfo_t *svp = NULL;
	CLIENT *cl;
	enum clnt_stat status;
	struct timeval tv;
	int error;
	int oncethru = 0;
	callb_cpr_t cprinfo;
	rnode_t *rp;
	int index;
	char *srvnames;
	size_t srvnames_len;
	struct nfs_clnt *nfscl = NULL;
	zoneid_t zoneid = getzoneid();

#ifdef DEBUG
	/*
	 * This is currently only needed to access counters which exist on
	 * DEBUG kernels, hence we don't want to pay the penalty of the lookup
	 * on non-DEBUG kernels.
	 */
	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
	ASSERT(nfscl != NULL);
#endif

	/*
	 * Its safe to piggyback on the mi_lock since failover_newserver()
	 * code guarantees that there will be only one failover thread
	 * per mountinfo at any instance.
	 */
	CALLB_CPR_INIT(&cprinfo, &mi->mi_lock, callb_generic_cpr,
	    "failover_thread");

	/* Wait until mi_readers has dropped to zero before proceeding. */
	mutex_enter(&mi->mi_lock);
	while (mi->mi_readers) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(&mi->mi_failover_cv, &mi->mi_lock);
		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
	}
	mutex_exit(&mi->mi_lock);

	/* Timeout passed to each NULL procedure ping below. */
	tv.tv_sec = 2;
	tv.tv_usec = 0;

	/*
	 * Ping the null NFS procedure of every server in
	 * the list until one responds.  We always start
	 * at the head of the list and always skip the one
	 * that is current, since it's caused us a problem.
	 */
	while (svp == NULL) {
		for (svp = mi->mi_servers; svp; svp = svp->sv_next) {
			/*
			 * The current server is only skipped on the first
			 * pass; once oncethru is set it is pinged too.
			 */
			if (!oncethru && svp == mi->mi_curr_serv)
				continue;

			/*
			 * If the file system was forcibly umounted
			 * while trying to do a failover, then just
			 * give up on the failover.  It won't matter
			 * what the server is.
			 */
			if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
				svp = NULL;
				goto done;
			}

			error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
			    NFS_PROGRAM, NFS_VERSION, 0, 1, CRED(), &cl);
			if (error)
				continue;

			/* cl_nosignal is set for the call unless MI_INT. */
			if (!(mi->mi_flags & MI_INT))
				cl->cl_nosignal = TRUE;
			status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
			    xdr_void, NULL, tv);
			if (!(mi->mi_flags & MI_INT))
				cl->cl_nosignal = FALSE;
			AUTH_DESTROY(cl->cl_auth);
			CLNT_DESTROY(cl);
			if (status == RPC_SUCCESS) {
				if (svp == mi->mi_curr_serv) {
#ifdef DEBUG
					zcmn_err(zoneid, CE_NOTE,
			"NFS%d: failing over: selecting original server %s",
					    mi->mi_vers, svp->sv_hostname);
#else
					zcmn_err(zoneid, CE_NOTE,
			"NFS: failing over: selecting original server %s",
					    svp->sv_hostname);
#endif
				} else {
#ifdef DEBUG
					zcmn_err(zoneid, CE_NOTE,
				    "NFS%d: failing over from %s to %s",
					    mi->mi_vers,
					    mi->mi_curr_serv->sv_hostname,
					    svp->sv_hostname);
#else
					zcmn_err(zoneid, CE_NOTE,
				    "NFS: failing over from %s to %s",
					    mi->mi_curr_serv->sv_hostname,
					    svp->sv_hostname);
#endif
				}
				break;
			}
		}

		/* A full pass over the list found no responding server. */
		if (svp == NULL) {
			if (!oncethru) {
				/*
				 * srvnames is only assigned here; every
				 * later use is guarded by oncethru.
				 */
				srvnames = nfs_getsrvnames(mi, &srvnames_len);
#ifdef DEBUG
				zprintf(zoneid,
				    "NFS%d servers %s not responding "
				    "still trying\n", mi->mi_vers, srvnames);
#else
				zprintf(zoneid, "NFS servers %s not responding "
				    "still trying\n", srvnames);
#endif
				oncethru = 1;
			}
			/* Pause for hz ticks, marked CPR-safe, then retry. */
			mutex_enter(&mi->mi_lock);
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			mutex_exit(&mi->mi_lock);
			delay(hz);
			mutex_enter(&mi->mi_lock);
			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
			mutex_exit(&mi->mi_lock);
		}
	}

	if (oncethru) {
#ifdef DEBUG
		zprintf(zoneid, "NFS%d servers %s ok\n", mi->mi_vers, srvnames);
#else
		zprintf(zoneid, "NFS servers %s ok\n", srvnames);
#endif
	}

	/*
	 * When a different server was chosen, purge the DNLC for this
	 * file system and look up the rnode matching the old server's
	 * sv_fhandle.  If it exists, give it the new server's filehandle,
	 * discard its cached data and rehash it.
	 */
	if (svp != mi->mi_curr_serv) {
		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
		index = rtablehash(&mi->mi_curr_serv->sv_fhandle);
		rw_enter(&rtable[index].r_lock, RW_WRITER);
		rp = rfind(&rtable[index], &mi->mi_curr_serv->sv_fhandle,
		    mi->mi_vfsp);
		if (rp != NULL) {
			if (rp->r_flags & RHASHED)
				rp_rmhash_locked(rp);
			rw_exit(&rtable[index].r_lock);
			rp->r_server = svp;
			rp->r_fh = svp->sv_fhandle;
			(void) nfs_free_data_reclaim(rp);
			/* Rehash under the new filehandle. */
			index = rtablehash(&rp->r_fh);
			rp->r_hashq = &rtable[index];
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			vn_exists(RTOV(rp));
			rp_addhash(rp);
			rw_exit(&rp->r_hashq->r_lock);
			VN_RELE(RTOV(rp));
		} else
			rw_exit(&rtable[index].r_lock);
	}

done:
	if (oncethru)
		kmem_free(srvnames, srvnames_len);
	mutex_enter(&mi->mi_lock);
	mi->mi_flags &= ~MI_BINDINPROG;
	/* svp is NULL here only when the search was abandoned above. */
	if (svp != NULL) {
		mi->mi_curr_serv = svp;
		mi->mi_failover++;
#ifdef DEBUG
	nfscl->nfscl_stat.failover.value.ui64++;
#endif
	}
	cv_broadcast(&mi->mi_failover_cv);
	/* NOTE(review): CALLB_CPR_EXIT presumably drops mi_lock - confirm. */
	CALLB_CPR_EXIT(&cprinfo);
	VFS_RELE(mi->mi_vfsp);
	zthread_exit();
	/* NOTREACHED */
}
4404 
4405 /*
4406  * NFS client failover support
4407  *
4408  * failover_wait() will put the thread to sleep until MI_BINDINPROG
4409  * is cleared, meaning that failover is complete.  Called with
4410  * mi_lock mutex held.
4411  */
4412 static int
4413 failover_wait(mntinfo_t *mi)
4414 {
4415 	k_sigset_t smask;
4416 
4417 	/*
4418 	 * If someone else is hunting for a living server,
4419 	 * sleep until it's done.  After our sleep, we may
4420 	 * be bound to the right server and get off cheaply.
4421 	 */
4422 	while (mi->mi_flags & MI_BINDINPROG) {
4423 		/*
4424 		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
4425 		 * and SIGTERM. (Preserving the existing masks).
4426 		 * Mask out SIGINT if mount option nointr is specified.
4427 		 */
4428 		sigintr(&smask, (int)mi->mi_flags & MI_INT);
4429 		if (!cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock)) {
4430 			/*
4431 			 * restore original signal mask
4432 			 */
4433 			sigunintr(&smask);
4434 			return (EINTR);
4435 		}
4436 		/*
4437 		 * restore original signal mask
4438 		 */
4439 		sigunintr(&smask);
4440 	}
4441 	return (0);
4442 }
4443 
/*
 * NFS client failover support
 *
 * failover_remap() will do a partial pathname lookup and find the
 * desired vnode on the current server.  The interim vnode will be
 * discarded after we pilfer the new filehandle.
 *
 * For a root vnode (VROOT set) no lookup is performed; only the remap
 * counters are bumped and the copied filehandle (if any) is refreshed.
 *
 * Returns:
 *	0	on success, including the cases where the lookup found the
 *		very same vnode or VALID_FH(fi) shows nothing is left to do
 *	EINVAL	if fi, fi->vp or fi->lookupproc is NULL, or if the size or
 *		type of the file on the new server does not match
 *	other	any error returned by VFS_ROOT() or failover_lookup()
 *
 * Side effects:
 * - This routine will also update the filehandle in the args structure
 *    pointed to by the fi->fhp pointer if it is non-NULL (and
 *    fi->copyproc is non-NULL).  Note that the early "return (0)" paths
 *    below do not reach that update.
 */

static int
failover_remap(failinfo_t *fi)
{
	vnode_t *vp, *nvp, *rootvp;
	rnode_t *rp, *nrp;
	mntinfo_t *mi;
	int error;
#ifdef DEBUG
	struct nfs_clnt *nfscl;

	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
	ASSERT(nfscl != NULL);
#endif
	/*
	 * Sanity check
	 */
	if (fi == NULL || fi->vp == NULL || fi->lookupproc == NULL)
		return (EINVAL);
	vp = fi->vp;
	rp = VTOR(vp);
	mi = VTOMI(vp);

	if (!(vp->v_flag & VROOT)) {
		/*
		 * Given the root fh, use the path stored in
		 * the rnode to find the fh for the new server.
		 */
		error = VFS_ROOT(mi->mi_vfsp, &rootvp);
		if (error)
			return (error);

		error = failover_lookup(rp->r_path, rootvp,
		    fi->lookupproc, fi->xattrdirproc, &nvp);

		/* The root vnode was only needed as the lookup start point. */
		VN_RELE(rootvp);

		if (error)
			return (error);

		/*
		 * If we found the same rnode, we're done now
		 */
		if (nvp == vp) {
			/*
			 * The failed server and the new server may be
			 * physically the same OR may share the same disk
			 * subsystem.  In this case the file handle for a
			 * particular file path is not going to change, and
			 * given the same filehandle, lookup will always
			 * locate the same rnode as the existing one.
			 * All we might need to do is to update the r_server
			 * with the current servinfo.
			 */
			if (!VALID_FH(fi)) {
				rp->r_server = mi->mi_curr_serv;
			}
			VN_RELE(nvp);
			return (0);
		}

		/*
		 * Try to make it so that no one else will find this
		 * vnode because it is just a temporary to hold the
		 * new file handle until that file handle can be
		 * copied to the original vnode/rnode.
		 */
		nrp = VTOR(nvp);
		mutex_enter(&mi->mi_remap_lock);
		/*
		 * Some other thread could have raced in here and could
		 * have done the remap for this particular rnode before
		 * this thread here. Check for rp->r_server and
		 * mi->mi_curr_serv and return if they are same.
		 */
		if (VALID_FH(fi)) {
			mutex_exit(&mi->mi_remap_lock);
			VN_RELE(nvp);
			return (0);
		}

		/* Unhash the temporary rnode so lookups cannot find it. */
		if (nrp->r_flags & RHASHED)
			rp_rmhash(nrp);

		/*
		 * As a heuristic check on the validity of the new
		 * file, check that the size and type match against
		 * that we remember from the old version.
		 */
		if (rp->r_size != nrp->r_size || vp->v_type != nvp->v_type) {
			mutex_exit(&mi->mi_remap_lock);
			zcmn_err(mi->mi_zone->zone_id, CE_WARN,
			    "NFS replicas %s and %s: file %s not same.",
			    rp->r_server->sv_hostname,
			    nrp->r_server->sv_hostname, rp->r_path);
			VN_RELE(nvp);
			return (EINVAL);
		}

		/*
		 * snarf the filehandle from the new rnode
		 * then release it, again while updating the
		 * hash queues for the rnode.
		 */
		if (rp->r_flags & RHASHED)
			rp_rmhash(rp);
		rp->r_server = mi->mi_curr_serv;
		rp->r_fh = nrp->r_fh;
		/* The old rnode takes over the hash queue of the new handle. */
		rp->r_hashq = nrp->r_hashq;
		/*
		 * Copy the attributes from the new rnode to the old
		 * rnode.  This will help to reduce unnecessary page
		 * cache flushes.
		 */
		rp->r_attr = nrp->r_attr;
		rp->r_attrtime = nrp->r_attrtime;
		rp->r_mtime = nrp->r_mtime;
		(void) nfs_free_data_reclaim(rp);
		nfs_setswaplike(vp, &rp->r_attr);
		/* Re-insert the original rnode under its new filehandle. */
		rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
		rp_addhash(rp);
		rw_exit(&rp->r_hashq->r_lock);
		mutex_exit(&mi->mi_remap_lock);
		/* Done with the interim vnode. */
		VN_RELE(nvp);
	}

	/*
	 * Update successful failover remap count
	 */
	mutex_enter(&mi->mi_lock);
	mi->mi_remap++;
	mutex_exit(&mi->mi_lock);
#ifdef DEBUG
	nfscl->nfscl_stat.remap.value.ui64++;
#endif

	/*
	 * If we have a copied filehandle to update, do it now.
	 */
	if (fi->fhp != NULL && fi->copyproc != NULL)
		(*fi->copyproc)(fi->fhp, vp);

	return (0);
}
4598 
4599 /*
4600  * NFS client failover support
4601  *
4602  * We want a simple pathname lookup routine to parse the pieces
4603  * of path in rp->r_path.  We know that the path was a created
4604  * as rnodes were made, so we know we have only to deal with
4605  * paths that look like:
4606  *	dir1/dir2/dir3/file
4607  * Any evidence of anything like .., symlinks, and ENOTDIR
4608  * are hard errors, because they mean something in this filesystem
4609  * is different from the one we came from, or has changed under
4610  * us in some way.  If this is true, we want the failure.
4611  *
4612  * Extended attributes: if the filesystem is mounted with extended
4613  * attributes enabled (-o xattr), the attribute directory will be
4614  * represented in the r_path as the magic name XATTR_RPATH. So if
4615  * we see that name in the pathname, is must be because this node
4616  * is an extended attribute.  Therefore, look it up that way.
4617  */
4618 static int
4619 failover_lookup(char *path, vnode_t *root,
4620     int (*lookupproc)(vnode_t *, char *, vnode_t **, struct pathname *, int,
4621 	vnode_t *, cred_t *, int),
4622     int (*xattrdirproc)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
4623     vnode_t **new)
4624 {
4625 	vnode_t *dvp, *nvp;
4626 	int error = EINVAL;
4627 	char *s, *p, *tmppath;
4628 	size_t len;
4629 	mntinfo_t *mi;
4630 	bool_t xattr;
4631 
4632 	/* Make local copy of path */
4633 	len = strlen(path) + 1;
4634 	tmppath = kmem_alloc(len, KM_SLEEP);
4635 	(void) strcpy(tmppath, path);
4636 	s = tmppath;
4637 
4638 	dvp = root;
4639 	VN_HOLD(dvp);
4640 	mi = VTOMI(root);
4641 	xattr = mi->mi_flags & MI_EXTATTR;
4642 
4643 	do {
4644 		p = strchr(s, '/');
4645 		if (p != NULL)
4646 			*p = '\0';
4647 		if (xattr && strcmp(s, XATTR_RPATH) == 0) {
4648 			error = (*xattrdirproc)(dvp, &nvp, FALSE, CRED(),
4649 			    RFSCALL_SOFT);
4650 		} else {
4651 			error = (*lookupproc)(dvp, s, &nvp, NULL, 0, NULL,
4652 			    CRED(), RFSCALL_SOFT);
4653 		}
4654 		if (p != NULL)
4655 			*p++ = '/';
4656 		if (error) {
4657 			VN_RELE(dvp);
4658 			kmem_free(tmppath, len);
4659 			return (error);
4660 		}
4661 		s = p;
4662 		VN_RELE(dvp);
4663 		dvp = nvp;
4664 	} while (p != NULL);
4665 
4666 	if (nvp != NULL && new != NULL)
4667 		*new = nvp;
4668 	kmem_free(tmppath, len);
4669 	return (0);
4670 }
4671 
4672 /*
4673  * NFS client failover support
4674  *
4675  * sv_free() frees the malloc'd portion of a "servinfo_t".
4676  */
4677 void
4678 sv_free(servinfo_t *svp)
4679 {
4680 	servinfo_t *next;
4681 	struct knetconfig *knconf;
4682 
4683 	while (svp != NULL) {
4684 		next = svp->sv_next;
4685 		if (svp->sv_secdata)
4686 			sec_clnt_freeinfo(svp->sv_secdata);
4687 		if (svp->sv_hostname && svp->sv_hostnamelen > 0)
4688 			kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
4689 		knconf = svp->sv_knconf;
4690 		if (knconf != NULL) {
4691 			if (knconf->knc_protofmly != NULL)
4692 				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4693 			if (knconf->knc_proto != NULL)
4694 				kmem_free(knconf->knc_proto, KNC_STRSIZE);
4695 			kmem_free(knconf, sizeof (*knconf));
4696 		}
4697 		knconf = svp->sv_origknconf;
4698 		if (knconf != NULL) {
4699 			if (knconf->knc_protofmly != NULL)
4700 				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4701 			if (knconf->knc_proto != NULL)
4702 				kmem_free(knconf->knc_proto, KNC_STRSIZE);
4703 			kmem_free(knconf, sizeof (*knconf));
4704 		}
4705 		if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0)
4706 			kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen);
4707 		mutex_destroy(&svp->sv_lock);
4708 		kmem_free(svp, sizeof (*svp));
4709 		svp = next;
4710 	}
4711 }
4712 
4713 /*
4714  * Only can return non-zero if intr != 0.
4715  */
4716 int
4717 nfs_rw_enter_sig(nfs_rwlock_t *l, krw_t rw, int intr)
4718 {
4719 
4720 	mutex_enter(&l->lock);
4721 
4722 	/*
4723 	 * If this is a nested enter, then allow it.  There
4724 	 * must be as many exits as enters through.
4725 	 */
4726 	if (l->owner == curthread) {
4727 		/* lock is held for writing by current thread */
4728 		ASSERT(rw == RW_READER || rw == RW_WRITER);
4729 		l->count--;
4730 	} else if (rw == RW_READER) {
4731 		/*
4732 		 * While there is a writer active or writers waiting,
4733 		 * then wait for them to finish up and move on.  Then,
4734 		 * increment the count to indicate that a reader is
4735 		 * active.
4736 		 */
4737 		while (l->count < 0 || l->waiters > 0) {
4738 			if (intr) {
4739 				klwp_t *lwp = ttolwp(curthread);
4740 
4741 				if (lwp != NULL)
4742 					lwp->lwp_nostop++;
4743 				if (!cv_wait_sig(&l->cv, &l->lock)) {
4744 					if (lwp != NULL)
4745 						lwp->lwp_nostop--;
4746 					mutex_exit(&l->lock);
4747 					return (EINTR);
4748 				}
4749 				if (lwp != NULL)
4750 					lwp->lwp_nostop--;
4751 			} else
4752 				cv_wait(&l->cv, &l->lock);
4753 		}
4754 		ASSERT(l->count < INT_MAX);
4755 #ifdef	DEBUG
4756 		if ((l->count % 10000) == 9999)
4757 			cmn_err(CE_WARN, "nfs_rw_enter_sig: count %d on"
4758 			    "rwlock @ %p\n", l->count, (void *)&l);
4759 #endif
4760 		l->count++;
4761 	} else {
4762 		ASSERT(rw == RW_WRITER);
4763 		/*
4764 		 * While there are readers active or a writer
4765 		 * active, then wait for all of the readers
4766 		 * to finish or for the writer to finish.
4767 		 * Then, set the owner field to curthread and
4768 		 * decrement count to indicate that a writer
4769 		 * is active.
4770 		 */
4771 		while (l->count > 0 || l->owner != NULL) {
4772 			l->waiters++;
4773 			if (intr) {
4774 				klwp_t *lwp = ttolwp(curthread);
4775 
4776 				if (lwp != NULL)
4777 					lwp->lwp_nostop++;
4778 				if (!cv_wait_sig(&l->cv, &l->lock)) {
4779 					if (lwp != NULL)
4780 						lwp->lwp_nostop--;
4781 					l->waiters--;
4782 					cv_broadcast(&l->cv);
4783 					mutex_exit(&l->lock);
4784 					return (EINTR);
4785 				}
4786 				if (lwp != NULL)
4787 					lwp->lwp_nostop--;
4788 			} else
4789 				cv_wait(&l->cv, &l->lock);
4790 			l->waiters--;
4791 		}
4792 		l->owner = curthread;
4793 		l->count--;
4794 	}
4795 
4796 	mutex_exit(&l->lock);
4797 
4798 	return (0);
4799 }
4800 
4801 /*
4802  * If the lock is available, obtain it and return non-zero.  If there is
4803  * already a conflicting lock, return 0 immediately.
4804  */
4805 
4806 int
4807 nfs_rw_tryenter(nfs_rwlock_t *l, krw_t rw)
4808 {
4809 	mutex_enter(&l->lock);
4810 
4811 	/*
4812 	 * If this is a nested enter, then allow it.  There
4813 	 * must be as many exits as enters through.
4814 	 */
4815 	if (l->owner == curthread) {
4816 		/* lock is held for writing by current thread */
4817 		ASSERT(rw == RW_READER || rw == RW_WRITER);
4818 		l->count--;
4819 	} else if (rw == RW_READER) {
4820 		/*
4821 		 * If there is a writer active or writers waiting, deny the
4822 		 * lock.  Otherwise, bump the count of readers.
4823 		 */
4824 		if (l->count < 0 || l->waiters > 0) {
4825 			mutex_exit(&l->lock);
4826 			return (0);
4827 		}
4828 		l->count++;
4829 	} else {
4830 		ASSERT(rw == RW_WRITER);
4831 		/*
4832 		 * If there are readers active or a writer active, deny the
4833 		 * lock.  Otherwise, set the owner field to curthread and
4834 		 * decrement count to indicate that a writer is active.
4835 		 */
4836 		if (l->count > 0 || l->owner != NULL) {
4837 			mutex_exit(&l->lock);
4838 			return (0);
4839 		}
4840 		l->owner = curthread;
4841 		l->count--;
4842 	}
4843 
4844 	mutex_exit(&l->lock);
4845 
4846 	return (1);
4847 }
4848 
4849 void
4850 nfs_rw_exit(nfs_rwlock_t *l)
4851 {
4852 
4853 	mutex_enter(&l->lock);
4854 	/*
4855 	 * If this is releasing a writer lock, then increment count to
4856 	 * indicate that there is one less writer active.  If this was
4857 	 * the last of possibly nested writer locks, then clear the owner
4858 	 * field as well to indicate that there is no writer active
4859 	 * and wakeup any possible waiting writers or readers.
4860 	 *
4861 	 * If releasing a reader lock, then just decrement count to
4862 	 * indicate that there is one less reader active.  If this was
4863 	 * the last active reader and there are writer(s) waiting,
4864 	 * then wake up the first.
4865 	 */
4866 	if (l->owner != NULL) {
4867 		ASSERT(l->owner == curthread);
4868 		l->count++;
4869 		if (l->count == 0) {
4870 			l->owner = NULL;
4871 			cv_broadcast(&l->cv);
4872 		}
4873 	} else {
4874 		ASSERT(l->count > 0);
4875 		l->count--;
4876 		if (l->count == 0 && l->waiters > 0)
4877 			cv_broadcast(&l->cv);
4878 	}
4879 	mutex_exit(&l->lock);
4880 }
4881 
4882 int
4883 nfs_rw_lock_held(nfs_rwlock_t *l, krw_t rw)
4884 {
4885 
4886 	if (rw == RW_READER)
4887 		return (l->count > 0);
4888 	ASSERT(rw == RW_WRITER);
4889 	return (l->count < 0);
4890 }
4891 
4892 /* ARGSUSED */
4893 void
4894 nfs_rw_init(nfs_rwlock_t *l, char *name, krw_type_t type, void *arg)
4895 {
4896 
4897 	l->count = 0;
4898 	l->waiters = 0;
4899 	l->owner = NULL;
4900 	mutex_init(&l->lock, NULL, MUTEX_DEFAULT, NULL);
4901 	cv_init(&l->cv, NULL, CV_DEFAULT, NULL);
4902 }
4903 
4904 void
4905 nfs_rw_destroy(nfs_rwlock_t *l)
4906 {
4907 
4908 	mutex_destroy(&l->lock);
4909 	cv_destroy(&l->cv);
4910 }
4911 
4912 int
4913 nfs3_rddir_compar(const void *x, const void *y)
4914 {
4915 	rddir_cache *a = (rddir_cache *)x;
4916 	rddir_cache *b = (rddir_cache *)y;
4917 
4918 	if (a->nfs3_cookie == b->nfs3_cookie) {
4919 		if (a->buflen == b->buflen)
4920 			return (0);
4921 		if (a->buflen < b->buflen)
4922 			return (-1);
4923 		return (1);
4924 	}
4925 
4926 	if (a->nfs3_cookie < b->nfs3_cookie)
4927 		return (-1);
4928 
4929 	return (1);
4930 }
4931 
4932 int
4933 nfs_rddir_compar(const void *x, const void *y)
4934 {
4935 	rddir_cache *a = (rddir_cache *)x;
4936 	rddir_cache *b = (rddir_cache *)y;
4937 
4938 	if (a->nfs_cookie == b->nfs_cookie) {
4939 		if (a->buflen == b->buflen)
4940 			return (0);
4941 		if (a->buflen < b->buflen)
4942 			return (-1);
4943 		return (1);
4944 	}
4945 
4946 	if (a->nfs_cookie < b->nfs_cookie)
4947 		return (-1);
4948 
4949 	return (1);
4950 }
4951 
4952 static char *
4953 nfs_getsrvnames(mntinfo_t *mi, size_t *len)
4954 {
4955 	servinfo_t *s;
4956 	char *srvnames;
4957 	char *namep;
4958 	size_t length;
4959 
4960 	/*
4961 	 * Calculate the length of the string required to hold all
4962 	 * of the server names plus either a comma or a null
4963 	 * character following each individual one.
4964 	 */
4965 	length = 0;
4966 	for (s = mi->mi_servers; s != NULL; s = s->sv_next)
4967 		length += s->sv_hostnamelen;
4968 
4969 	srvnames = kmem_alloc(length, KM_SLEEP);
4970 
4971 	namep = srvnames;
4972 	for (s = mi->mi_servers; s != NULL; s = s->sv_next) {
4973 		(void) strcpy(namep, s->sv_hostname);
4974 		namep += s->sv_hostnamelen - 1;
4975 		*namep++ = ',';
4976 	}
4977 	*--namep = '\0';
4978 
4979 	*len = length;
4980 
4981 	return (srvnames);
4982 }
4983 
4984 /*
4985  * These two functions are temporary and designed for the upgrade-workaround
4986  * only.  They cannot be used for general zone-crossing NFS client support, and
4987  * will be removed shortly.
4988  *
4989  * When the workaround is enabled, all NFS traffic is forced into the global
4990  * zone.  These functions are called when the code needs to refer to the state
4991  * of the underlying network connection.  They're not called when the function
4992  * needs to refer to the state of the process that invoked the system call.
4993  * (E.g., when checking whether the zone is shutting down during the mount()
4994  * call.)
4995  */
4996 
4997 struct zone *
4998 nfs_zone(void)
4999 {
5000 	return (nfs_global_client_only != 0 ? global_zone : curproc->p_zone);
5001 }
5002 
5003 zoneid_t
5004 nfs_zoneid(void)
5005 {
5006 	return (nfs_global_client_only != 0 ? GLOBAL_ZONEID : getzoneid());
5007 }
5008 
/*
 * nfs_mount_label_policy:
 *	Determine whether the mount is allowed according to MAC check,
 *	by comparing (where appropriate) label of the remote server
 *	against the label of the zone being mounted into.
 *
 *	Arguments:
 *		vfsp   :	vfs being mounted; its mount point selects
 *				the zone whose label is used
 *		addr   :	address of the remote server
 *		knconf :	transport of the server; only NC_INET and
 *				NC_INET6 protocol families are label-checked
 *		cr     :	credentials of the requestor
 *
 *	Returns:
 *		 0 :	access allowed
 *		-1 :	read-only access allowed (i.e., read-down)
 *		>0 :	error code, such as EACCES
 */
int
nfs_mount_label_policy(vfs_t *vfsp, struct netbuf *addr,
    struct knetconfig *knconf, cred_t *cr)
{
	int		addr_type;
	void		*ipaddr;
	bslabel_t	*server_sl, *mntlabel;
	zone_t		*mntzone = NULL;
	ts_label_t	*zlabel;
	tsol_tpc_t	*tp;
	ts_label_t	*tsl = NULL;
	int		retv;

	/*
	 * Get the zone's label.  Each zone on a labeled system has a label.
	 * Note that mntzone is dereferenced unconditionally here, so it is
	 * expected to be non-NULL; the hold taken on zlabel is dropped at
	 * "out".
	 */
	mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
	zlabel = mntzone->zone_slabel;
	ASSERT(zlabel != NULL);
	label_hold(zlabel);

	/*
	 * Pick the IP address out of the server's socket address.  Any
	 * transport other than IPv4/IPv6 is allowed without a label check.
	 */
	if (strcmp(knconf->knc_protofmly, NC_INET) == 0) {
		addr_type = IPV4_VERSION;
		ipaddr = &((struct sockaddr_in *)addr->buf)->sin_addr;
	} else if (strcmp(knconf->knc_protofmly, NC_INET6) == 0) {
		addr_type = IPV6_VERSION;
		ipaddr = &((struct sockaddr_in6 *)addr->buf)->sin6_addr;
	} else {
		retv = 0;
		goto out;
	}

	retv = EACCES;				/* assume the worst */

	/*
	 * Next, get the assigned label of the remote server.
	 */
	tp = find_tpc(ipaddr, addr_type, B_FALSE);
	if (tp == NULL)
		goto out;			/* error getting host entry */

	if (tp->tpc_tp.tp_doi != zlabel->tsl_doi)
		goto rel_tpc;			/* invalid domain */
	if ((tp->tpc_tp.host_type != SUN_CIPSO) &&
	    (tp->tpc_tp.host_type != UNLABELED))
		goto rel_tpc;			/* invalid hosttype */

	if (tp->tpc_tp.host_type == SUN_CIPSO) {
		tsl = getflabel_cipso(vfsp);
		if (tsl == NULL)
			goto rel_tpc;		/* error getting server lbl */

		server_sl = label2bslabel(tsl);
	} else {	/* UNLABELED */
		server_sl = &tp->tpc_tp.tp_def_label;
	}

	mntlabel = label2bslabel(zlabel);

	/*
	 * Now compare labels to complete the MAC check.  If the labels
	 * are equal or if the requestor is in the global zone and has
	 * NET_MAC_AWARE, then allow read-write access.   (Except for
	 * mounts into the global zone itself; restrict these to
	 * read-only.)
	 *
	 * As coded, that first case is also restricted to read-only
	 * when the labels are not equal, i.e. when it was reached only
	 * through the global zone/NET_MAC_AWARE test.
	 *
	 * If the requestor is in some other zone, but his label
	 * dominates the server, then allow read-down.
	 *
	 * Otherwise, access is denied.
	 */
	if (blequal(mntlabel, server_sl) ||
	    (crgetzoneid(cr) == GLOBAL_ZONEID &&
	    getpflags(NET_MAC_AWARE, cr) != 0)) {
		if ((mntzone == global_zone) ||
		    !blequal(mntlabel, server_sl))
			retv = -1;		/* read-only */
		else
			retv = 0;		/* access OK */
	} else if (bldominates(mntlabel, server_sl)) {
		retv = -1;			/* read-only */
	} else {
		retv = EACCES;
	}

	/* tsl is only set on the SUN_CIPSO path. */
	if (tsl != NULL)
		label_rele(tsl);

rel_tpc:
	TPC_RELE(tp);
out:
	/* mntzone was already dereferenced above; this check is defensive. */
	if (mntzone)
		zone_rele(mntzone);
	label_rele(zlabel);
	return (retv);
}
5116 
5117 boolean_t
5118 nfs_has_ctty(void)
5119 {
5120 	boolean_t rv;
5121 	mutex_enter(&curproc->p_splock);
5122 	rv = (curproc->p_sessp->s_vp != NULL);
5123 	mutex_exit(&curproc->p_splock);
5124 	return (rv);
5125 }
5126 
5127 /*
5128  * TX NFS routine used by NFSv3 and NFSv4 to do label check
5129  * on client label and server's file object lable.
5130  */
5131 boolean_t
5132 do_rfs_label_check(bslabel_t *clabel, vnode_t *vp, int flag)
5133 {
5134 	bslabel_t *slabel;
5135 	ts_label_t *tslabel;
5136 	boolean_t result;
5137 
5138 	if ((tslabel = nfs_getflabel(vp)) == NULL) {
5139 		return (B_FALSE);
5140 	}
5141 	slabel = label2bslabel(tslabel);
5142 	DTRACE_PROBE4(tx__rfs__log__info__labelcheck, char *,
5143 	    "comparing server's file label(1) with client label(2) (vp(3))",
5144 	    bslabel_t *, slabel, bslabel_t *, clabel, vnode_t *, vp);
5145 
5146 	if (flag == EQUALITY_CHECK)
5147 		result = blequal(clabel, slabel);
5148 	else
5149 		result = bldominates(clabel, slabel);
5150 	label_rele(tslabel);
5151 	return (result);
5152 }
5153 
5154 /*
5155  * See if xattr directory to see if it has any generic user attributes
5156  */
5157 int
5158 do_xattr_exists_check(vnode_t *vp, ulong_t *valp, cred_t *cr)
5159 {
5160 	struct uio uio;
5161 	struct iovec iov;
5162 	char *dbuf;
5163 	struct dirent64 *dp;
5164 	size_t dlen = 8 * 1024;
5165 	size_t dbuflen;
5166 	int eof = 0;
5167 	int error;
5168 
5169 	*valp = 0;
5170 	dbuf = kmem_alloc(dlen, KM_SLEEP);
5171 	uio.uio_iov = &iov;
5172 	uio.uio_iovcnt = 1;
5173 	uio.uio_segflg = UIO_SYSSPACE;
5174 	uio.uio_fmode = 0;
5175 	uio.uio_extflg = UIO_COPY_CACHED;
5176 	uio.uio_loffset = 0;
5177 	uio.uio_resid = dlen;
5178 	iov.iov_base = dbuf;
5179 	iov.iov_len = dlen;
5180 	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
5181 	error = VOP_READDIR(vp, &uio, cr, &eof, NULL, 0);
5182 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
5183 
5184 	dbuflen = dlen - uio.uio_resid;
5185 
5186 	if (error || dbuflen == 0) {
5187 		kmem_free(dbuf, dlen);
5188 		return (error);
5189 	}
5190 
5191 	dp = (dirent64_t *)dbuf;
5192 
5193 	while ((intptr_t)dp < (intptr_t)dbuf + dbuflen) {
5194 		if (strcmp(dp->d_name, ".") == 0 ||
5195 		    strcmp(dp->d_name, "..") == 0 || strcmp(dp->d_name,
5196 		    VIEW_READWRITE) == 0 || strcmp(dp->d_name,
5197 		    VIEW_READONLY) == 0) {
5198 			dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen);
5199 			continue;
5200 		}
5201 
5202 		*valp = 1;
5203 		break;
5204 	}
5205 	kmem_free(dbuf, dlen);
5206 	return (0);
5207 }
5208