xref: /titanic_44/usr/src/uts/common/fs/nfs/nfs_subr.c (revision a576ab5b6e08c47732b3dedca9eaa8a8cbb85720)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  *
25  *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
26  *	All rights reserved.
27  */
28 
29 #pragma ident	"%Z%%M%	%I%	%E% SMI"
30 
31 #include <sys/param.h>
32 #include <sys/types.h>
33 #include <sys/systm.h>
34 #include <sys/cred_impl.h>
35 #include <sys/proc.h>
36 #include <sys/user.h>
37 #include <sys/time.h>
38 #include <sys/buf.h>
39 #include <sys/vfs.h>
40 #include <sys/vnode.h>
41 #include <sys/socket.h>
42 #include <sys/uio.h>
43 #include <sys/tiuser.h>
44 #include <sys/swap.h>
45 #include <sys/errno.h>
46 #include <sys/debug.h>
47 #include <sys/kmem.h>
48 #include <sys/kstat.h>
49 #include <sys/cmn_err.h>
50 #include <sys/vtrace.h>
51 #include <sys/session.h>
52 #include <sys/dnlc.h>
53 #include <sys/bitmap.h>
54 #include <sys/acl.h>
55 #include <sys/ddi.h>
56 #include <sys/pathname.h>
57 #include <sys/flock.h>
58 #include <sys/dirent.h>
59 #include <sys/flock.h>
60 #include <sys/callb.h>
61 #include <sys/atomic.h>
62 #include <sys/list.h>
63 #include <sys/tsol/tnet.h>
64 #include <sys/priv.h>
65 #include <sys/sdt.h>
66 
67 #include <inet/ip6.h>
68 
69 #include <rpc/types.h>
70 #include <rpc/xdr.h>
71 #include <rpc/auth.h>
72 #include <rpc/clnt.h>
73 
74 #include <nfs/nfs.h>
75 #include <nfs/nfs4.h>
76 #include <nfs/nfs_clnt.h>
77 #include <nfs/rnode.h>
78 #include <nfs/nfs_acl.h>
79 
80 #include <sys/tsol/label.h>
81 
82 /*
83  * The hash queues for the access to active and cached rnodes
84  * are organized as doubly linked lists.  A reader/writer lock
85  * for each hash bucket is used to control access and to synchronize
86  * lookups, additions, and deletions from the hash queue.
87  *
88  * The rnode freelist is organized as a doubly linked list with
89  * a head pointer.  Additions and deletions are synchronized via
90  * a single mutex.
91  *
92  * In order to add an rnode to the free list, it must be hashed into
93  * a hash queue and the exclusive lock to the hash queue be held.
94  * If an rnode is not hashed into a hash queue, then it is destroyed
95  * because it represents no valuable information that can be reused
96  * about the file.  The exclusive lock to the hash queue must be
97  * held in order to prevent a lookup in the hash queue from finding
98  * the rnode and using it and assuming that the rnode is not on the
99  * freelist.  The lookup in the hash queue will have the hash queue
100  * locked, either exclusive or shared.
101  *
102  * The vnode reference count for each rnode is not allowed to drop
103  * below 1.  This prevents external entities, such as the VM
104  * subsystem, from acquiring references to vnodes already on the
105  * freelist and then trying to place them back on the freelist
106  * when their reference is released.  This means that the when an
107  * rnode is looked up in the hash queues, then either the rnode
108  * is removed from the freelist and that reference is transferred to
109  * the new reference or the vnode reference count must be incremented
110  * accordingly.  The mutex for the freelist must be held in order to
111  * accurately test to see if the rnode is on the freelist or not.
112  * The hash queue lock might be held shared and it is possible that
113  * two different threads may race to remove the rnode from the
114  * freelist.  This race can be resolved by holding the mutex for the
115  * freelist.  Please note that the mutex for the freelist does not
116  * need to held if the rnode is not on the freelist.  It can not be
117  * placed on the freelist due to the requirement that the thread
118  * putting the rnode on the freelist must hold the exclusive lock
119  * to the hash queue and the thread doing the lookup in the hash
120  * queue is holding either a shared or exclusive lock to the hash
121  * queue.
122  *
123  * The lock ordering is:
124  *
125  *	hash bucket lock -> vnode lock
126  *	hash bucket lock -> freelist lock
127  */
128 static rhashq_t *rtable;
129 
130 static kmutex_t rpfreelist_lock;
131 static rnode_t *rpfreelist = NULL;
132 static long rnew = 0;
133 long nrnode = 0;
134 
135 static int rtablesize;
136 static int rtablemask;
137 
138 static int hashlen = 4;
139 
140 static struct kmem_cache *rnode_cache;
141 
142 /*
143  * Mutex to protect the following variables:
144  *	nfs_major
145  *	nfs_minor
146  */
147 kmutex_t nfs_minor_lock;
148 int nfs_major;
149 int nfs_minor;
150 
151 /* Do we allow preepoch (negative) time values otw? */
152 bool_t nfs_allow_preepoch_time = FALSE;	/* default: do not allow preepoch */
153 
154 /*
155  * Access cache
156  */
157 static acache_hash_t *acache;
158 static long nacache;	/* used strictly to size the number of hash queues */
159 
160 static int acachesize;
161 static int acachemask;
162 static struct kmem_cache *acache_cache;
163 
164 /*
165  * Client side utilities
166  */
167 
168 /*
169  * client side statistics
170  */
171 static const struct clstat clstat_tmpl = {
172 	{ "calls",	KSTAT_DATA_UINT64 },
173 	{ "badcalls",	KSTAT_DATA_UINT64 },
174 	{ "clgets",	KSTAT_DATA_UINT64 },
175 	{ "cltoomany",	KSTAT_DATA_UINT64 },
176 #ifdef DEBUG
177 	{ "clalloc",	KSTAT_DATA_UINT64 },
178 	{ "noresponse",	KSTAT_DATA_UINT64 },
179 	{ "failover",	KSTAT_DATA_UINT64 },
180 	{ "remap",	KSTAT_DATA_UINT64 },
181 #endif
182 };
183 
/*
 * System-wide statistics.  These describe the behavior of the system
 * as a whole and do not correspond to any one particular zone.  They
 * are only compiled into DEBUG kernels.
 */
#ifdef DEBUG
struct clstat_debug {
	/* number of allocated rnodes */
	kstat_named_t	nrnode;
	/* size of access cache */
	kstat_named_t	access;
	/* size of readdir cache */
	kstat_named_t	dirent;
	/* size of readdir buf cache */
	kstat_named_t	dirents;
	/* number of reclaims */
	kstat_named_t	reclaim;
	/* number of cl reclaims */
	kstat_named_t	clreclaim;
	/* number of free reclaims */
	kstat_named_t	f_reclaim;
	/* number of active reclaims */
	kstat_named_t	a_reclaim;
	/* number of rnode reclaims */
	kstat_named_t	r_reclaim;
	/* bytes used to store rpaths */
	kstat_named_t	rpath;
};

/* Initializers are positional and follow the member order above. */
static struct clstat_debug clstat_debug = {
	{ "nrnode",	KSTAT_DATA_UINT64 },
	{ "access",	KSTAT_DATA_UINT64 },
	{ "dirent",	KSTAT_DATA_UINT64 },
	{ "dirents",	KSTAT_DATA_UINT64 },
	{ "reclaim",	KSTAT_DATA_UINT64 },
	{ "clreclaim",	KSTAT_DATA_UINT64 },
	{ "f_reclaim",	KSTAT_DATA_UINT64 },
	{ "a_reclaim",	KSTAT_DATA_UINT64 },
	{ "r_reclaim",	KSTAT_DATA_UINT64 },
	{ "r_path",	KSTAT_DATA_UINT64 },
};
#endif	/* DEBUG */
213 
214 /*
215  * We keep a global list of per-zone client data, so we can clean up all zones
216  * if we get low on memory.
217  */
218 static list_t nfs_clnt_list;
219 static kmutex_t nfs_clnt_list_lock;
220 static zone_key_t nfsclnt_zone_key;
221 
222 static struct kmem_cache *chtab_cache;
223 
224 /*
225  * Some servers do not properly update the attributes of the
226  * directory when changes are made.  To allow interoperability
227  * with these broken servers, the nfs_disable_rddir_cache
228  * parameter must be set in /etc/system
229  */
230 int nfs_disable_rddir_cache = 0;
231 
232 int		clget(clinfo_t *, servinfo_t *, cred_t *, CLIENT **,
233 		    struct chtab **);
234 void		clfree(CLIENT *, struct chtab *);
235 static int	acl_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
236 		    struct chtab **, struct nfs_clnt *);
237 static int	nfs_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
238 		    struct chtab **, struct nfs_clnt *);
239 static void	clreclaim(void *);
240 static int	nfs_feedback(int, int, mntinfo_t *);
241 static int	rfscall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
242 		    caddr_t, cred_t *, int *, enum clnt_stat *, int,
243 		    failinfo_t *);
244 static int	aclcall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
245 		    caddr_t, cred_t *, int *, int, failinfo_t *);
246 static void	rinactive(rnode_t *, cred_t *);
247 static int	rtablehash(nfs_fhandle *);
248 static vnode_t	*make_rnode(nfs_fhandle *, rhashq_t *, struct vfs *,
249 		    struct vnodeops *,
250 		    int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
251 			cred_t *),
252 		    int (*)(const void *, const void *), int *, cred_t *,
253 		    char *, char *);
254 static void	rp_rmfree(rnode_t *);
255 static void	rp_addhash(rnode_t *);
256 static void	rp_rmhash_locked(rnode_t *);
257 static rnode_t	*rfind(rhashq_t *, nfs_fhandle *, struct vfs *);
258 static void	destroy_rnode(rnode_t *);
259 static void	rddir_cache_free(rddir_cache *);
260 static int	nfs_free_data_reclaim(rnode_t *);
261 static int	nfs_active_data_reclaim(rnode_t *);
262 static int	nfs_free_reclaim(void);
263 static int	nfs_active_reclaim(void);
264 static int	nfs_rnode_reclaim(void);
265 static void	nfs_reclaim(void *);
266 static int	failover_safe(failinfo_t *);
267 static void	failover_newserver(mntinfo_t *mi);
268 static void	failover_thread(mntinfo_t *mi);
269 static int	failover_wait(mntinfo_t *);
270 static int	failover_remap(failinfo_t *);
271 static int	failover_lookup(char *, vnode_t *,
272 		    int (*)(vnode_t *, char *, vnode_t **,
273 			struct pathname *, int, vnode_t *, cred_t *, int),
274 		    int (*)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
275 		    vnode_t **);
276 static void	nfs_free_r_path(rnode_t *);
277 static void	nfs_set_vroot(vnode_t *);
278 static char	*nfs_getsrvnames(mntinfo_t *, size_t *);
279 
280 /*
281  * from rpcsec module (common/rpcsec)
282  */
283 extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **);
284 extern void sec_clnt_freeh(AUTH *);
285 extern void sec_clnt_freeinfo(struct sec_data *);
286 
287 /*
288  * used in mount policy
289  */
290 extern ts_label_t *getflabel_cipso(vfs_t *);
291 
/*
 * EIO or EINTR are not recoverable errors.
 *
 * Evaluates to non-zero when `error' is anything other than EINTR or
 * EIO.  The argument and the whole expansion are parenthesized so that
 * the macro behaves like a function call no matter what expression is
 * passed in or what operators surround the invocation.  Note that the
 * argument is evaluated twice, so it must be free of side effects.
 */
#define	IS_RECOVERABLE_ERROR(error)	\
	(!(((error) == EINTR) || ((error) == EIO)))
296 
297 /*
298  * Common handle get program for NFS, NFS ACL, and NFS AUTH client.
299  */
300 static int
301 clget_impl(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
302     struct chtab **chp, struct nfs_clnt *nfscl)
303 {
304 	struct chhead *ch, *newch;
305 	struct chhead **plistp;
306 	struct chtab *cp;
307 	int error;
308 	k_sigset_t smask;
309 
310 	if (newcl == NULL || chp == NULL || ci == NULL)
311 		return (EINVAL);
312 
313 	*newcl = NULL;
314 	*chp = NULL;
315 
316 	/*
317 	 * Find an unused handle or create one
318 	 */
319 	newch = NULL;
320 	nfscl->nfscl_stat.clgets.value.ui64++;
321 top:
322 	/*
323 	 * Find the correct entry in the cache to check for free
324 	 * client handles.  The search is based on the RPC program
325 	 * number, program version number, dev_t for the transport
326 	 * device, and the protocol family.
327 	 */
328 	mutex_enter(&nfscl->nfscl_chtable_lock);
329 	plistp = &nfscl->nfscl_chtable;
330 	for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
331 		if (ch->ch_prog == ci->cl_prog &&
332 		    ch->ch_vers == ci->cl_vers &&
333 		    ch->ch_dev == svp->sv_knconf->knc_rdev &&
334 		    (strcmp(ch->ch_protofmly,
335 		    svp->sv_knconf->knc_protofmly) == 0))
336 			break;
337 		plistp = &ch->ch_next;
338 	}
339 
340 	/*
341 	 * If we didn't find a cache entry for this quadruple, then
342 	 * create one.  If we don't have one already preallocated,
343 	 * then drop the cache lock, create one, and then start over.
344 	 * If we did have a preallocated entry, then just add it to
345 	 * the front of the list.
346 	 */
347 	if (ch == NULL) {
348 		if (newch == NULL) {
349 			mutex_exit(&nfscl->nfscl_chtable_lock);
350 			newch = kmem_alloc(sizeof (*newch), KM_SLEEP);
351 			newch->ch_timesused = 0;
352 			newch->ch_prog = ci->cl_prog;
353 			newch->ch_vers = ci->cl_vers;
354 			newch->ch_dev = svp->sv_knconf->knc_rdev;
355 			newch->ch_protofmly = kmem_alloc(
356 			    strlen(svp->sv_knconf->knc_protofmly) + 1,
357 			    KM_SLEEP);
358 			(void) strcpy(newch->ch_protofmly,
359 			    svp->sv_knconf->knc_protofmly);
360 			newch->ch_list = NULL;
361 			goto top;
362 		}
363 		ch = newch;
364 		newch = NULL;
365 		ch->ch_next = nfscl->nfscl_chtable;
366 		nfscl->nfscl_chtable = ch;
367 	/*
368 	 * We found a cache entry, but if it isn't on the front of the
369 	 * list, then move it to the front of the list to try to take
370 	 * advantage of locality of operations.
371 	 */
372 	} else if (ch != nfscl->nfscl_chtable) {
373 		*plistp = ch->ch_next;
374 		ch->ch_next = nfscl->nfscl_chtable;
375 		nfscl->nfscl_chtable = ch;
376 	}
377 
378 	/*
379 	 * If there was a free client handle cached, then remove it
380 	 * from the list, init it, and use it.
381 	 */
382 	if (ch->ch_list != NULL) {
383 		cp = ch->ch_list;
384 		ch->ch_list = cp->ch_list;
385 		mutex_exit(&nfscl->nfscl_chtable_lock);
386 		if (newch != NULL) {
387 			kmem_free(newch->ch_protofmly,
388 			    strlen(newch->ch_protofmly) + 1);
389 			kmem_free(newch, sizeof (*newch));
390 		}
391 		(void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf,
392 		    &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr);
393 		error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
394 		    &cp->ch_client->cl_auth);
395 		if (error || cp->ch_client->cl_auth == NULL) {
396 			CLNT_DESTROY(cp->ch_client);
397 			kmem_cache_free(chtab_cache, cp);
398 			return ((error != 0) ? error : EINTR);
399 		}
400 		ch->ch_timesused++;
401 		*newcl = cp->ch_client;
402 		*chp = cp;
403 		return (0);
404 	}
405 
406 	/*
407 	 * There weren't any free client handles which fit, so allocate
408 	 * a new one and use that.
409 	 */
410 #ifdef DEBUG
411 	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, 1);
412 #endif
413 	mutex_exit(&nfscl->nfscl_chtable_lock);
414 
415 	nfscl->nfscl_stat.cltoomany.value.ui64++;
416 	if (newch != NULL) {
417 		kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1);
418 		kmem_free(newch, sizeof (*newch));
419 	}
420 
421 	cp = kmem_cache_alloc(chtab_cache, KM_SLEEP);
422 	cp->ch_head = ch;
423 
424 	sigintr(&smask, (int)ci->cl_flags & MI_INT);
425 	error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog,
426 	    ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client);
427 	sigunintr(&smask);
428 
429 	if (error != 0) {
430 		kmem_cache_free(chtab_cache, cp);
431 #ifdef DEBUG
432 		atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
433 #endif
434 		/*
435 		 * Warning is unnecessary if error is EINTR.
436 		 */
437 		if (error != EINTR) {
438 			nfs_cmn_err(error, CE_WARN,
439 			    "clget: couldn't create handle: %m\n");
440 		}
441 		return (error);
442 	}
443 	(void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL);
444 	auth_destroy(cp->ch_client->cl_auth);
445 	error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
446 	    &cp->ch_client->cl_auth);
447 	if (error || cp->ch_client->cl_auth == NULL) {
448 		CLNT_DESTROY(cp->ch_client);
449 		kmem_cache_free(chtab_cache, cp);
450 #ifdef DEBUG
451 		atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
452 #endif
453 		return ((error != 0) ? error : EINTR);
454 	}
455 	ch->ch_timesused++;
456 	*newcl = cp->ch_client;
457 	ASSERT(cp->ch_client->cl_nosignal == FALSE);
458 	*chp = cp;
459 	return (0);
460 }
461 
462 int
463 clget(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
464     struct chtab **chp)
465 {
466 	struct nfs_clnt *nfscl;
467 
468 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
469 	ASSERT(nfscl != NULL);
470 
471 	return (clget_impl(ci, svp, cr, newcl, chp, nfscl));
472 }
473 
474 static int
475 acl_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
476     struct chtab **chp, struct nfs_clnt *nfscl)
477 {
478 	clinfo_t ci;
479 	int error;
480 
481 	/*
482 	 * Set read buffer size to rsize
483 	 * and add room for RPC headers.
484 	 */
485 	ci.cl_readsize = mi->mi_tsize;
486 	if (ci.cl_readsize != 0)
487 		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
488 
489 	/*
490 	 * If soft mount and server is down just try once.
491 	 * meaning: do not retransmit.
492 	 */
493 	if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
494 		ci.cl_retrans = 0;
495 	else
496 		ci.cl_retrans = mi->mi_retrans;
497 
498 	ci.cl_prog = NFS_ACL_PROGRAM;
499 	ci.cl_vers = mi->mi_vers;
500 	ci.cl_flags = mi->mi_flags;
501 
502 	/*
503 	 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
504 	 * security flavor, the client tries to establish a security context
505 	 * by contacting the server. If the connection is timed out or reset,
506 	 * e.g. server reboot, we will try again.
507 	 */
508 	do {
509 		error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
510 
511 		if (error == 0)
512 			break;
513 
514 		/*
515 		 * For forced unmount or zone shutdown, bail out, no retry.
516 		 */
517 		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
518 			error = EIO;
519 			break;
520 		}
521 
522 		/* do not retry for softmount */
523 		if (!(mi->mi_flags & MI_HARD))
524 			break;
525 
526 		/* let the caller deal with the failover case */
527 		if (FAILOVER_MOUNT(mi))
528 			break;
529 
530 	} while (error == ETIMEDOUT || error == ECONNRESET);
531 
532 	return (error);
533 }
534 
535 static int
536 nfs_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
537     struct chtab **chp, struct nfs_clnt *nfscl)
538 {
539 	clinfo_t ci;
540 	int error;
541 
542 	/*
543 	 * Set read buffer size to rsize
544 	 * and add room for RPC headers.
545 	 */
546 	ci.cl_readsize = mi->mi_tsize;
547 	if (ci.cl_readsize != 0)
548 		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
549 
550 	/*
551 	 * If soft mount and server is down just try once.
552 	 * meaning: do not retransmit.
553 	 */
554 	if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
555 		ci.cl_retrans = 0;
556 	else
557 		ci.cl_retrans = mi->mi_retrans;
558 
559 	ci.cl_prog = mi->mi_prog;
560 	ci.cl_vers = mi->mi_vers;
561 	ci.cl_flags = mi->mi_flags;
562 
563 	/*
564 	 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
565 	 * security flavor, the client tries to establish a security context
566 	 * by contacting the server. If the connection is timed out or reset,
567 	 * e.g. server reboot, we will try again.
568 	 */
569 	do {
570 		error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
571 
572 		if (error == 0)
573 			break;
574 
575 		/*
576 		 * For forced unmount or zone shutdown, bail out, no retry.
577 		 */
578 		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
579 			error = EIO;
580 			break;
581 		}
582 
583 		/* do not retry for softmount */
584 		if (!(mi->mi_flags & MI_HARD))
585 			break;
586 
587 		/* let the caller deal with the failover case */
588 		if (FAILOVER_MOUNT(mi))
589 			break;
590 
591 	} while (error == ETIMEDOUT || error == ECONNRESET);
592 
593 	return (error);
594 }
595 
596 static void
597 clfree_impl(CLIENT *cl, struct chtab *cp, struct nfs_clnt *nfscl)
598 {
599 	if (cl->cl_auth != NULL) {
600 		sec_clnt_freeh(cl->cl_auth);
601 		cl->cl_auth = NULL;
602 	}
603 
604 	/*
605 	 * Timestamp this cache entry so that we know when it was last
606 	 * used.
607 	 */
608 	cp->ch_freed = gethrestime_sec();
609 
610 	/*
611 	 * Add the free client handle to the front of the list.
612 	 * This way, the list will be sorted in youngest to oldest
613 	 * order.
614 	 */
615 	mutex_enter(&nfscl->nfscl_chtable_lock);
616 	cp->ch_list = cp->ch_head->ch_list;
617 	cp->ch_head->ch_list = cp;
618 	mutex_exit(&nfscl->nfscl_chtable_lock);
619 }
620 
621 void
622 clfree(CLIENT *cl, struct chtab *cp)
623 {
624 	struct nfs_clnt *nfscl;
625 
626 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
627 	ASSERT(nfscl != NULL);
628 
629 	clfree_impl(cl, cp, nfscl);
630 }
631 
632 #define	CL_HOLDTIME	60	/* time to hold client handles */
633 
634 static void
635 clreclaim_zone(struct nfs_clnt *nfscl, uint_t cl_holdtime)
636 {
637 	struct chhead *ch;
638 	struct chtab *cp;	/* list of objects that can be reclaimed */
639 	struct chtab *cpe;
640 	struct chtab *cpl;
641 	struct chtab **cpp;
642 #ifdef DEBUG
643 	int n = 0;
644 #endif
645 
646 	/*
647 	 * Need to reclaim some memory, so step through the cache
648 	 * looking through the lists for entries which can be freed.
649 	 */
650 	cp = NULL;
651 
652 	mutex_enter(&nfscl->nfscl_chtable_lock);
653 
654 	/*
655 	 * Here we step through each non-NULL quadruple and start to
656 	 * construct the reclaim list pointed to by cp.  Note that
657 	 * cp will contain all eligible chtab entries.  When this traversal
658 	 * completes, chtab entries from the last quadruple will be at the
659 	 * front of cp and entries from previously inspected quadruples have
660 	 * been appended to the rear of cp.
661 	 */
662 	for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
663 		if (ch->ch_list == NULL)
664 			continue;
665 		/*
666 		 * Search each list for entries older then
667 		 * cl_holdtime seconds.  The lists are maintained
668 		 * in youngest to oldest order so that when the
669 		 * first entry is found which is old enough, then
670 		 * all of the rest of the entries on the list will
671 		 * be old enough as well.
672 		 */
673 		cpl = ch->ch_list;
674 		cpp = &ch->ch_list;
675 		while (cpl != NULL &&
676 		    cpl->ch_freed + cl_holdtime > gethrestime_sec()) {
677 			cpp = &cpl->ch_list;
678 			cpl = cpl->ch_list;
679 		}
680 		if (cpl != NULL) {
681 			*cpp = NULL;
682 			if (cp != NULL) {
683 				cpe = cpl;
684 				while (cpe->ch_list != NULL)
685 					cpe = cpe->ch_list;
686 				cpe->ch_list = cp;
687 			}
688 			cp = cpl;
689 		}
690 	}
691 
692 	mutex_exit(&nfscl->nfscl_chtable_lock);
693 
694 	/*
695 	 * If cp is empty, then there is nothing to reclaim here.
696 	 */
697 	if (cp == NULL)
698 		return;
699 
700 	/*
701 	 * Step through the list of entries to free, destroying each client
702 	 * handle and kmem_free'ing the memory for each entry.
703 	 */
704 	while (cp != NULL) {
705 #ifdef DEBUG
706 		n++;
707 #endif
708 		CLNT_DESTROY(cp->ch_client);
709 		cpl = cp->ch_list;
710 		kmem_cache_free(chtab_cache, cp);
711 		cp = cpl;
712 	}
713 
714 #ifdef DEBUG
715 	/*
716 	 * Update clalloc so that nfsstat shows the current number
717 	 * of allocated client handles.
718 	 */
719 	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n);
720 #endif
721 }
722 
723 /* ARGSUSED */
724 static void
725 clreclaim(void *all)
726 {
727 	struct nfs_clnt *nfscl;
728 
729 #ifdef DEBUG
730 	clstat_debug.clreclaim.value.ui64++;
731 #endif
732 	/*
733 	 * The system is low on memory; go through and try to reclaim some from
734 	 * every zone on the system.
735 	 */
736 	mutex_enter(&nfs_clnt_list_lock);
737 	nfscl = list_head(&nfs_clnt_list);
738 	for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl))
739 		clreclaim_zone(nfscl, CL_HOLDTIME);
740 	mutex_exit(&nfs_clnt_list_lock);
741 }
742 
/*
 * Minimum time-out values, indexed by call type.
 * The units are eighths of a second, which avoids multiplies.
 */
static unsigned int minimum_timeo[] = { 6, 7, 10 };

/*
 * Back off for the retransmission timeout.  MAXTIMO is expressed in
 * clock ticks (hz ticks per second).  backoff() doubles a timeout
 * which is still below MAXTIMO, capping the result at MAXTIMO, and
 * leaves any other timeout unchanged.
 */
#define	MAXTIMO	(20*hz)
#define	dobackoff(tim)	((((tim) << 1) <= MAXTIMO) ? ((tim) << 1) : MAXTIMO)
#define	backoff(tim)	(((tim) >= MAXTIMO) ? (tim) : dobackoff(tim))

/* minimum "chunk" of NFS IO */
#define	MIN_NFS_TSIZE 512
/* rtxcur we try to keep under */
#define	REDUCE_NFS_TIME (hz/2)
/* srtt we try to keep under (scaled*8) */
#define	INCREASE_NFS_TIME (hz/3*8)
761 
762 /*
763  * Function called when rfscall notices that we have been
764  * re-transmitting, or when we get a response without retransmissions.
765  * Return 1 if the transfer size was adjusted down - 0 if no change.
766  */
767 static int
768 nfs_feedback(int flag, int which, mntinfo_t *mi)
769 {
770 	int kind;
771 	int r = 0;
772 
773 	mutex_enter(&mi->mi_lock);
774 	if (flag == FEEDBACK_REXMIT1) {
775 		if (mi->mi_timers[NFS_CALLTYPES].rt_rtxcur != 0 &&
776 		    mi->mi_timers[NFS_CALLTYPES].rt_rtxcur < REDUCE_NFS_TIME)
777 			goto done;
778 		if (mi->mi_curread > MIN_NFS_TSIZE) {
779 			mi->mi_curread /= 2;
780 			if (mi->mi_curread < MIN_NFS_TSIZE)
781 				mi->mi_curread = MIN_NFS_TSIZE;
782 			r = 1;
783 		}
784 
785 		if (mi->mi_curwrite > MIN_NFS_TSIZE) {
786 			mi->mi_curwrite /= 2;
787 			if (mi->mi_curwrite < MIN_NFS_TSIZE)
788 				mi->mi_curwrite = MIN_NFS_TSIZE;
789 			r = 1;
790 		}
791 	} else if (flag == FEEDBACK_OK) {
792 		kind = mi->mi_timer_type[which];
793 		if (kind == 0 ||
794 		    mi->mi_timers[kind].rt_srtt >= INCREASE_NFS_TIME)
795 			goto done;
796 		if (kind == 1) {
797 			if (mi->mi_curread >= mi->mi_tsize)
798 				goto done;
799 			mi->mi_curread +=  MIN_NFS_TSIZE;
800 			if (mi->mi_curread > mi->mi_tsize/2)
801 				mi->mi_curread = mi->mi_tsize;
802 		} else if (kind == 2) {
803 			if (mi->mi_curwrite >= mi->mi_stsize)
804 				goto done;
805 			mi->mi_curwrite += MIN_NFS_TSIZE;
806 			if (mi->mi_curwrite > mi->mi_stsize/2)
807 				mi->mi_curwrite = mi->mi_stsize;
808 		}
809 	}
810 done:
811 	mutex_exit(&mi->mi_lock);
812 	return (r);
813 }
814 
815 #ifdef DEBUG
816 static int rfs2call_hits = 0;
817 static int rfs2call_misses = 0;
818 #endif
819 
820 int
821 rfs2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
822     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
823     enum nfsstat *statusp, int flags, failinfo_t *fi)
824 {
825 	int rpcerror;
826 	enum clnt_stat rpc_status;
827 
828 	ASSERT(statusp != NULL);
829 
830 	rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
831 	    cr, douprintf, &rpc_status, flags, fi);
832 	if (!rpcerror) {
833 		/*
834 		 * See crnetadjust() for comments.
835 		 */
836 		if (*statusp == NFSERR_ACCES &&
837 		    (cr = crnetadjust(cr)) != NULL) {
838 #ifdef DEBUG
839 			rfs2call_hits++;
840 #endif
841 			rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres,
842 			    resp, cr, douprintf, NULL, flags, fi);
843 			crfree(cr);
844 #ifdef DEBUG
845 			if (*statusp == NFSERR_ACCES)
846 				rfs2call_misses++;
847 #endif
848 		}
849 	} else if (rpc_status == RPC_PROCUNAVAIL) {
850 		*statusp = NFSERR_OPNOTSUPP;
851 		rpcerror = 0;
852 	}
853 
854 	return (rpcerror);
855 }
856 
/*
 * Ten seconds expressed in clock ticks.  The expansion is
 * parenthesized so that it stays a single value when the macro is
 * used inside a larger expression.
 */
#define	NFS3_JUKEBOX_DELAY	(10 * hz)

/* the delay passed to delay() by rfs3call() on NFS3ERR_JUKEBOX */
static clock_t nfs3_jukebox_delay = 0;

#ifdef DEBUG
/* number of calls retried with the credentials from crnetadjust() */
static int rfs3call_hits = 0;
/* number of those retries which still ended in NFS3ERR_ACCES */
static int rfs3call_misses = 0;
#endif
865 
866 int
867 rfs3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
868     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
869     nfsstat3 *statusp, int flags, failinfo_t *fi)
870 {
871 	int rpcerror;
872 	int user_informed;
873 
874 	user_informed = 0;
875 	do {
876 		rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
877 		    cr, douprintf, NULL, flags, fi);
878 		if (!rpcerror) {
879 			cred_t *crr;
880 			if (*statusp == NFS3ERR_JUKEBOX) {
881 				if (ttoproc(curthread) == &p0) {
882 					rpcerror = EAGAIN;
883 					break;
884 				}
885 				if (!user_informed) {
886 					user_informed = 1;
887 					uprintf(
888 		"file temporarily unavailable on the server, retrying...\n");
889 				}
890 				delay(nfs3_jukebox_delay);
891 			}
892 			/*
893 			 * See crnetadjust() for comments.
894 			 */
895 			else if (*statusp == NFS3ERR_ACCES &&
896 			    (crr = crnetadjust(cr)) != NULL) {
897 #ifdef DEBUG
898 				rfs3call_hits++;
899 #endif
900 				rpcerror = rfscall(mi, which, xdrargs, argsp,
901 				    xdrres, resp, crr, douprintf,
902 				    NULL, flags, fi);
903 
904 				crfree(crr);
905 #ifdef DEBUG
906 				if (*statusp == NFS3ERR_ACCES)
907 					rfs3call_misses++;
908 #endif
909 			}
910 		}
911 	} while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
912 
913 	return (rpcerror);
914 }
915 
/*
 * VALID_FH(fi) is true when the server recorded in the rnode of
 * fi->vp is the current server of the mount which fi->vp belongs to.
 *
 * INC_READERS(mi) and DEC_READERS(mi) adjust mi->mi_readers.
 * DEC_READERS additionally broadcasts on mi->mi_failover_cv when the
 * count reaches zero.  The callers in rfscall() hold mi->mi_lock
 * around both.
 *
 * Each macro parenthesizes its argument so that any pointer
 * expression can be passed in.  The argument is evaluated more than
 * once, so it must be free of side effects.
 */
#define	VALID_FH(fi)	\
	(VTOR((fi)->vp)->r_server == VTOMI((fi)->vp)->mi_curr_serv)
#define	INC_READERS(mi)		{ \
	(mi)->mi_readers++; \
}
#define	DEC_READERS(mi)		{ \
	(mi)->mi_readers--; \
	if ((mi)->mi_readers == 0) \
		cv_broadcast(&(mi)->mi_failover_cv); \
}
925 
926 static int
927 rfscall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
928     xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
929     enum clnt_stat *rpc_status, int flags, failinfo_t *fi)
930 {
931 	CLIENT *client;
932 	struct chtab *ch;
933 	cred_t *cr = icr;
934 	enum clnt_stat status;
935 	struct rpc_err rpcerr;
936 	struct timeval wait;
937 	int timeo;		/* in units of hz */
938 	int my_rsize, my_wsize;
939 	bool_t tryagain;
940 	bool_t cred_cloned = FALSE;
941 	k_sigset_t smask;
942 	servinfo_t *svp;
943 	struct nfs_clnt *nfscl;
944 	zoneid_t zoneid = getzoneid();
945 #ifdef DEBUG
946 	char *bufp;
947 #endif
948 
949 
950 	TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
951 	    "rfscall_start:which %d mi %p", which, mi);
952 
953 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
954 	ASSERT(nfscl != NULL);
955 
956 	nfscl->nfscl_stat.calls.value.ui64++;
957 	mi->mi_reqs[which].value.ui64++;
958 
959 	rpcerr.re_status = RPC_SUCCESS;
960 
961 	/*
962 	 * In case of forced unmount or zone shutdown, return EIO.
963 	 */
964 
965 	if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
966 		rpcerr.re_status = RPC_FAILED;
967 		rpcerr.re_errno = EIO;
968 		return (rpcerr.re_errno);
969 	}
970 
971 	/*
972 	 * Remember the transfer sizes in case
973 	 * nfs_feedback changes them underneath us.
974 	 */
975 	my_rsize = mi->mi_curread;
976 	my_wsize = mi->mi_curwrite;
977 
978 	/*
979 	 * NFS client failover support
980 	 *
981 	 * If this rnode is not in sync with the current server (VALID_FH),
982 	 * we'd like to do a remap to get in sync.  We can be interrupted
983 	 * in failover_remap(), and if so we'll bail.  Otherwise, we'll
984 	 * use the best info we have to try the RPC.  Part of that is
985 	 * unconditionally updating the filehandle copy kept for V3.
986 	 *
987 	 * Locking: INC_READERS/DEC_READERS is a poor man's interrruptible
988 	 * rw_enter(); we're trying to keep the current server from being
989 	 * changed on us until we're done with the remapping and have a
990 	 * matching client handle.  We don't want to sending a filehandle
991 	 * to the wrong host.
992 	 */
993 failoverretry:
994 	if (FAILOVER_MOUNT(mi)) {
995 		mutex_enter(&mi->mi_lock);
996 		if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
997 			if (failover_wait(mi)) {
998 				mutex_exit(&mi->mi_lock);
999 				return (EINTR);
1000 			}
1001 		}
1002 		INC_READERS(mi);
1003 		mutex_exit(&mi->mi_lock);
1004 		if (fi) {
1005 			if (!VALID_FH(fi) &&
1006 			    !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1007 				int remaperr;
1008 
1009 				svp = mi->mi_curr_serv;
1010 				remaperr = failover_remap(fi);
1011 				if (remaperr != 0) {
1012 #ifdef DEBUG
1013 					if (remaperr != EINTR)
1014 						nfs_cmn_err(remaperr, CE_WARN,
1015 					    "rfscall couldn't failover: %m");
1016 #endif
1017 					mutex_enter(&mi->mi_lock);
1018 					DEC_READERS(mi);
1019 					mutex_exit(&mi->mi_lock);
1020 					/*
1021 					 * If failover_remap returns ETIMEDOUT
1022 					 * and the filesystem is hard mounted
1023 					 * we have to retry the call with a new
1024 					 * server.
1025 					 */
1026 					if ((mi->mi_flags & MI_HARD) &&
1027 					    IS_RECOVERABLE_ERROR(remaperr)) {
1028 						if (svp == mi->mi_curr_serv)
1029 							failover_newserver(mi);
1030 						rpcerr.re_status = RPC_SUCCESS;
1031 						goto failoverretry;
1032 					}
1033 					rpcerr.re_errno = remaperr;
1034 					return (remaperr);
1035 				}
1036 			}
1037 			if (fi->fhp && fi->copyproc)
1038 				(*fi->copyproc)(fi->fhp, fi->vp);
1039 		}
1040 	}
1041 
1042 	/* For TSOL, use a new cred which has net_mac_aware flag */
1043 	if (!cred_cloned && is_system_labeled()) {
1044 		cred_cloned = TRUE;
1045 		cr = crdup(icr);
1046 		(void) setpflags(NET_MAC_AWARE, 1, cr);
1047 	}
1048 
1049 	/*
1050 	 * clget() calls clnt_tli_kinit() which clears the xid, so we
1051 	 * are guaranteed to reprocess the retry as a new request.
1052 	 */
1053 	svp = mi->mi_curr_serv;
1054 	rpcerr.re_errno = nfs_clget(mi, svp, cr, &client, &ch, nfscl);
1055 
1056 	if (FAILOVER_MOUNT(mi)) {
1057 		mutex_enter(&mi->mi_lock);
1058 		DEC_READERS(mi);
1059 		mutex_exit(&mi->mi_lock);
1060 
1061 		if ((rpcerr.re_errno == ETIMEDOUT ||
1062 		    rpcerr.re_errno == ECONNRESET) &&
1063 		    failover_safe(fi)) {
1064 			if (svp == mi->mi_curr_serv)
1065 				failover_newserver(mi);
1066 			goto failoverretry;
1067 		}
1068 	}
1069 	if (rpcerr.re_errno != 0)
1070 		return (rpcerr.re_errno);
1071 
1072 	if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1073 	    svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1074 		timeo = (mi->mi_timeo * hz) / 10;
1075 	} else {
1076 		mutex_enter(&mi->mi_lock);
1077 		timeo = CLNT_SETTIMERS(client,
1078 		    &(mi->mi_timers[mi->mi_timer_type[which]]),
1079 		    &(mi->mi_timers[NFS_CALLTYPES]),
1080 		    (minimum_timeo[mi->mi_call_type[which]]*hz)>>3,
1081 		    (void (*)())NULL, (caddr_t)mi, 0);
1082 		mutex_exit(&mi->mi_lock);
1083 	}
1084 
1085 	/*
1086 	 * If hard mounted fs, retry call forever unless hard error occurs.
1087 	 */
1088 	do {
1089 		tryagain = FALSE;
1090 
1091 		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1092 			status = RPC_FAILED;
1093 			rpcerr.re_status = RPC_FAILED;
1094 			rpcerr.re_errno = EIO;
1095 			break;
1096 		}
1097 
1098 		TICK_TO_TIMEVAL(timeo, &wait);
1099 
1100 		/*
1101 		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1102 		 * and SIGTERM. (Preserving the existing masks).
1103 		 * Mask out SIGINT if mount option nointr is specified.
1104 		 */
1105 		sigintr(&smask, (int)mi->mi_flags & MI_INT);
1106 		if (!(mi->mi_flags & MI_INT))
1107 			client->cl_nosignal = TRUE;
1108 
1109 		/*
1110 		 * If there is a current signal, then don't bother
1111 		 * even trying to send out the request because we
1112 		 * won't be able to block waiting for the response.
1113 		 * Simply assume RPC_INTR and get on with it.
1114 		 */
1115 		if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1116 			status = RPC_INTR;
1117 		else {
1118 			status = CLNT_CALL(client, which, xdrargs, argsp,
1119 			    xdrres, resp, wait);
1120 		}
1121 
1122 		if (!(mi->mi_flags & MI_INT))
1123 			client->cl_nosignal = FALSE;
1124 		/*
1125 		 * restore original signal mask
1126 		 */
1127 		sigunintr(&smask);
1128 
1129 		switch (status) {
1130 		case RPC_SUCCESS:
1131 			if ((mi->mi_flags & MI_DYNAMIC) &&
1132 			    mi->mi_timer_type[which] != 0 &&
1133 			    (mi->mi_curread != my_rsize ||
1134 			    mi->mi_curwrite != my_wsize))
1135 				(void) nfs_feedback(FEEDBACK_OK, which, mi);
1136 			break;
1137 
1138 		case RPC_INTR:
1139 			/*
1140 			 * There is no way to recover from this error,
1141 			 * even if mount option nointr is specified.
1142 			 * SIGKILL, for example, cannot be blocked.
1143 			 */
1144 			rpcerr.re_status = RPC_INTR;
1145 			rpcerr.re_errno = EINTR;
1146 			break;
1147 
1148 		case RPC_UDERROR:
1149 			/*
1150 			 * If the NFS server is local (vold) and
1151 			 * it goes away then we get RPC_UDERROR.
1152 			 * This is a retryable error, so we would
1153 			 * loop, so check to see if the specific
1154 			 * error was ECONNRESET, indicating that
1155 			 * target did not exist at all.  If so,
1156 			 * return with RPC_PROGUNAVAIL and
1157 			 * ECONNRESET to indicate why.
1158 			 */
1159 			CLNT_GETERR(client, &rpcerr);
1160 			if (rpcerr.re_errno == ECONNRESET) {
1161 				rpcerr.re_status = RPC_PROGUNAVAIL;
1162 				rpcerr.re_errno = ECONNRESET;
1163 				break;
1164 			}
1165 			/*FALLTHROUGH*/
1166 
1167 		default:		/* probably RPC_TIMEDOUT */
1168 			if (IS_UNRECOVERABLE_RPC(status))
1169 				break;
1170 
1171 			/*
1172 			 * increment server not responding count
1173 			 */
1174 			mutex_enter(&mi->mi_lock);
1175 			mi->mi_noresponse++;
1176 			mutex_exit(&mi->mi_lock);
1177 #ifdef DEBUG
1178 			nfscl->nfscl_stat.noresponse.value.ui64++;
1179 #endif
1180 
1181 			if (!(mi->mi_flags & MI_HARD)) {
1182 				if (!(mi->mi_flags & MI_SEMISOFT) ||
1183 				    (mi->mi_ss_call_type[which] == 0))
1184 					break;
1185 			}
1186 
1187 			/*
1188 			 * The call is in progress (over COTS).
1189 			 * Try the CLNT_CALL again, but don't
1190 			 * print a noisy error message.
1191 			 */
1192 			if (status == RPC_INPROGRESS) {
1193 				tryagain = TRUE;
1194 				break;
1195 			}
1196 
1197 			if (flags & RFSCALL_SOFT)
1198 				break;
1199 
1200 			/*
1201 			 * On zone shutdown, just move on.
1202 			 */
1203 			if (zone_status_get(curproc->p_zone) >=
1204 			    ZONE_IS_SHUTTING_DOWN) {
1205 				rpcerr.re_status = RPC_FAILED;
1206 				rpcerr.re_errno = EIO;
1207 				break;
1208 			}
1209 
1210 			/*
1211 			 * NFS client failover support
1212 			 *
1213 			 * If the current server just failed us, we'll
1214 			 * start the process of finding a new server.
1215 			 * After that, we can just retry.
1216 			 */
1217 			if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1218 				if (svp == mi->mi_curr_serv)
1219 					failover_newserver(mi);
1220 				clfree_impl(client, ch, nfscl);
1221 				goto failoverretry;
1222 			}
1223 
1224 			tryagain = TRUE;
1225 			timeo = backoff(timeo);
1226 			mutex_enter(&mi->mi_lock);
1227 			if (!(mi->mi_flags & MI_PRINTED)) {
1228 				mi->mi_flags |= MI_PRINTED;
1229 				mutex_exit(&mi->mi_lock);
1230 #ifdef DEBUG
1231 				zprintf(zoneid,
1232 			"NFS%d server %s not responding still trying\n",
1233 				    mi->mi_vers, svp->sv_hostname);
1234 #else
1235 				zprintf(zoneid,
1236 			"NFS server %s not responding still trying\n",
1237 				    svp->sv_hostname);
1238 #endif
1239 			} else
1240 				mutex_exit(&mi->mi_lock);
1241 			if (*douprintf && nfs_has_ctty()) {
1242 				*douprintf = 0;
1243 				if (!(mi->mi_flags & MI_NOPRINT))
1244 #ifdef DEBUG
1245 					uprintf(
1246 			    "NFS%d server %s not responding still trying\n",
1247 					    mi->mi_vers, svp->sv_hostname);
1248 #else
1249 					uprintf(
1250 			    "NFS server %s not responding still trying\n",
1251 					    svp->sv_hostname);
1252 #endif
1253 			}
1254 
1255 			/*
1256 			 * If doing dynamic adjustment of transfer
1257 			 * size and if it's a read or write call
1258 			 * and if the transfer size changed while
1259 			 * retransmitting or if the feedback routine
1260 			 * changed the transfer size,
1261 			 * then exit rfscall so that the transfer
1262 			 * size can be adjusted at the vnops level.
1263 			 */
1264 			if ((mi->mi_flags & MI_DYNAMIC) &&
1265 			    mi->mi_timer_type[which] != 0 &&
1266 			    (mi->mi_curread != my_rsize ||
1267 			    mi->mi_curwrite != my_wsize ||
1268 			    nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1269 				/*
1270 				 * On read or write calls, return
1271 				 * back to the vnode ops level if
1272 				 * the transfer size changed.
1273 				 */
1274 				clfree_impl(client, ch, nfscl);
1275 				if (cred_cloned)
1276 					crfree(cr);
1277 				return (ENFS_TRYAGAIN);
1278 			}
1279 		}
1280 	} while (tryagain);
1281 
1282 	if (status != RPC_SUCCESS) {
1283 		/*
1284 		 * Let soft mounts use the timed out message.
1285 		 */
1286 		if (status == RPC_INPROGRESS)
1287 			status = RPC_TIMEDOUT;
1288 		nfscl->nfscl_stat.badcalls.value.ui64++;
1289 		if (status != RPC_INTR) {
1290 			mutex_enter(&mi->mi_lock);
1291 			mi->mi_flags |= MI_DOWN;
1292 			mutex_exit(&mi->mi_lock);
1293 			CLNT_GETERR(client, &rpcerr);
1294 #ifdef DEBUG
1295 			bufp = clnt_sperror(client, svp->sv_hostname);
1296 			zprintf(zoneid, "NFS%d %s failed for %s\n",
1297 			    mi->mi_vers, mi->mi_rfsnames[which], bufp);
1298 			if (nfs_has_ctty()) {
1299 				if (!(mi->mi_flags & MI_NOPRINT)) {
1300 					uprintf("NFS%d %s failed for %s\n",
1301 					    mi->mi_vers, mi->mi_rfsnames[which],
1302 					    bufp);
1303 				}
1304 			}
1305 			kmem_free(bufp, MAXPATHLEN);
1306 #else
1307 			zprintf(zoneid,
1308 			    "NFS %s failed for server %s: error %d (%s)\n",
1309 			    mi->mi_rfsnames[which], svp->sv_hostname,
1310 			    status, clnt_sperrno(status));
1311 			if (nfs_has_ctty()) {
1312 				if (!(mi->mi_flags & MI_NOPRINT)) {
1313 					uprintf(
1314 				"NFS %s failed for server %s: error %d (%s)\n",
1315 					    mi->mi_rfsnames[which],
1316 					    svp->sv_hostname, status,
1317 					    clnt_sperrno(status));
1318 				}
1319 			}
1320 #endif
1321 			/*
1322 			 * when CLNT_CALL() fails with RPC_AUTHERROR,
1323 			 * re_errno is set appropriately depending on
1324 			 * the authentication error
1325 			 */
1326 			if (status == RPC_VERSMISMATCH ||
1327 			    status == RPC_PROGVERSMISMATCH)
1328 				rpcerr.re_errno = EIO;
1329 		}
1330 	} else {
1331 		/*
1332 		 * Test the value of mi_down and mi_printed without
1333 		 * holding the mi_lock mutex.  If they are both zero,
1334 		 * then it is okay to skip the down and printed
1335 		 * processing.  This saves on a mutex_enter and
1336 		 * mutex_exit pair for a normal, successful RPC.
1337 		 * This was just complete overhead.
1338 		 */
1339 		if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1340 			mutex_enter(&mi->mi_lock);
1341 			mi->mi_flags &= ~MI_DOWN;
1342 			if (mi->mi_flags & MI_PRINTED) {
1343 				mi->mi_flags &= ~MI_PRINTED;
1344 				mutex_exit(&mi->mi_lock);
1345 #ifdef DEBUG
1346 			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1347 				zprintf(zoneid, "NFS%d server %s ok\n",
1348 				    mi->mi_vers, svp->sv_hostname);
1349 #else
1350 			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1351 				zprintf(zoneid, "NFS server %s ok\n",
1352 				    svp->sv_hostname);
1353 #endif
1354 			} else
1355 				mutex_exit(&mi->mi_lock);
1356 		}
1357 
1358 		if (*douprintf == 0) {
1359 			if (!(mi->mi_flags & MI_NOPRINT))
1360 #ifdef DEBUG
1361 				if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1362 					uprintf("NFS%d server %s ok\n",
1363 					    mi->mi_vers, svp->sv_hostname);
1364 #else
1365 			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1366 				uprintf("NFS server %s ok\n", svp->sv_hostname);
1367 #endif
1368 			*douprintf = 1;
1369 		}
1370 	}
1371 
1372 	clfree_impl(client, ch, nfscl);
1373 	if (cred_cloned)
1374 		crfree(cr);
1375 
1376 	ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1377 
1378 	if (rpc_status != NULL)
1379 		*rpc_status = rpcerr.re_status;
1380 
1381 	TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1382 	    rpcerr.re_errno);
1383 
1384 	return (rpcerr.re_errno);
1385 }
1386 
1387 #ifdef DEBUG
1388 static int acl2call_hits = 0;
1389 static int acl2call_misses = 0;
1390 #endif
1391 
1392 int
1393 acl2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1394     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1395     enum nfsstat *statusp, int flags, failinfo_t *fi)
1396 {
1397 	int rpcerror;
1398 
1399 	rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1400 	    cr, douprintf, flags, fi);
1401 	if (!rpcerror) {
1402 		/*
1403 		 * See comments with crnetadjust().
1404 		 */
1405 		if (*statusp == NFSERR_ACCES &&
1406 		    (cr = crnetadjust(cr)) != NULL) {
1407 #ifdef DEBUG
1408 			acl2call_hits++;
1409 #endif
1410 			rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres,
1411 			    resp, cr, douprintf, flags, fi);
1412 			crfree(cr);
1413 #ifdef DEBUG
1414 			if (*statusp == NFSERR_ACCES)
1415 				acl2call_misses++;
1416 #endif
1417 		}
1418 	}
1419 
1420 	return (rpcerror);
1421 }
1422 
1423 #ifdef DEBUG
1424 static int acl3call_hits = 0;
1425 static int acl3call_misses = 0;
1426 #endif
1427 
1428 int
1429 acl3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1430     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1431     nfsstat3 *statusp, int flags, failinfo_t *fi)
1432 {
1433 	int rpcerror;
1434 	int user_informed;
1435 
1436 	user_informed = 0;
1437 
1438 	do {
1439 		rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1440 		    cr, douprintf, flags, fi);
1441 		if (!rpcerror) {
1442 			cred_t *crr;
1443 			if (*statusp == NFS3ERR_JUKEBOX) {
1444 				if (!user_informed) {
1445 					user_informed = 1;
1446 					uprintf(
1447 		"file temporarily unavailable on the server, retrying...\n");
1448 				}
1449 				delay(nfs3_jukebox_delay);
1450 			}
1451 			/*
1452 			 * See crnetadjust() for comments.
1453 			 */
1454 			else if (*statusp == NFS3ERR_ACCES &&
1455 			    (crr = crnetadjust(cr)) != NULL) {
1456 #ifdef DEBUG
1457 				acl3call_hits++;
1458 #endif
1459 				rpcerror = aclcall(mi, which, xdrargs, argsp,
1460 				    xdrres, resp, crr, douprintf, flags, fi);
1461 
1462 				crfree(crr);
1463 #ifdef DEBUG
1464 				if (*statusp == NFS3ERR_ACCES)
1465 					acl3call_misses++;
1466 #endif
1467 			}
1468 		}
1469 	} while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
1470 
1471 	return (rpcerror);
1472 }
1473 
1474 static int
1475 aclcall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1476     xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
1477     int flags, failinfo_t *fi)
1478 {
1479 	CLIENT *client;
1480 	struct chtab *ch;
1481 	cred_t *cr = icr;
1482 	bool_t cred_cloned = FALSE;
1483 	enum clnt_stat status;
1484 	struct rpc_err rpcerr;
1485 	struct timeval wait;
1486 	int timeo;		/* in units of hz */
1487 #if 0 /* notyet */
1488 	int my_rsize, my_wsize;
1489 #endif
1490 	bool_t tryagain;
1491 	k_sigset_t smask;
1492 	servinfo_t *svp;
1493 	struct nfs_clnt *nfscl;
1494 	zoneid_t zoneid = getzoneid();
1495 #ifdef DEBUG
1496 	char *bufp;
1497 #endif
1498 
1499 #if 0 /* notyet */
1500 	TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
1501 	    "rfscall_start:which %d mi %p", which, mi);
1502 #endif
1503 
1504 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
1505 	ASSERT(nfscl != NULL);
1506 
1507 	nfscl->nfscl_stat.calls.value.ui64++;
1508 	mi->mi_aclreqs[which].value.ui64++;
1509 
1510 	rpcerr.re_status = RPC_SUCCESS;
1511 
1512 	if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1513 		rpcerr.re_status = RPC_FAILED;
1514 		rpcerr.re_errno = EIO;
1515 		return (rpcerr.re_errno);
1516 	}
1517 
1518 #if 0 /* notyet */
1519 	/*
1520 	 * Remember the transfer sizes in case
1521 	 * nfs_feedback changes them underneath us.
1522 	 */
1523 	my_rsize = mi->mi_curread;
1524 	my_wsize = mi->mi_curwrite;
1525 #endif
1526 
1527 	/*
1528 	 * NFS client failover support
1529 	 *
1530 	 * If this rnode is not in sync with the current server (VALID_FH),
1531 	 * we'd like to do a remap to get in sync.  We can be interrupted
1532 	 * in failover_remap(), and if so we'll bail.  Otherwise, we'll
1533 	 * use the best info we have to try the RPC.  Part of that is
1534 	 * unconditionally updating the filehandle copy kept for V3.
1535 	 *
1536 	 * Locking: INC_READERS/DEC_READERS is a poor man's interrruptible
1537 	 * rw_enter(); we're trying to keep the current server from being
1538 	 * changed on us until we're done with the remapping and have a
1539 	 * matching client handle.  We don't want to sending a filehandle
1540 	 * to the wrong host.
1541 	 */
1542 failoverretry:
1543 	if (FAILOVER_MOUNT(mi)) {
1544 		mutex_enter(&mi->mi_lock);
1545 		if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1546 			if (failover_wait(mi)) {
1547 				mutex_exit(&mi->mi_lock);
1548 				return (EINTR);
1549 			}
1550 		}
1551 		INC_READERS(mi);
1552 		mutex_exit(&mi->mi_lock);
1553 		if (fi) {
1554 			if (!VALID_FH(fi) &&
1555 			    !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1556 				int remaperr;
1557 
1558 				svp = mi->mi_curr_serv;
1559 				remaperr = failover_remap(fi);
1560 				if (remaperr != 0) {
1561 #ifdef DEBUG
1562 					if (remaperr != EINTR)
1563 						nfs_cmn_err(remaperr, CE_WARN,
1564 					    "aclcall couldn't failover: %m");
1565 #endif
1566 					mutex_enter(&mi->mi_lock);
1567 					DEC_READERS(mi);
1568 					mutex_exit(&mi->mi_lock);
1569 
1570 					/*
1571 					 * If failover_remap returns ETIMEDOUT
1572 					 * and the filesystem is hard mounted
1573 					 * we have to retry the call with a new
1574 					 * server.
1575 					 */
1576 					if ((mi->mi_flags & MI_HARD) &&
1577 					    IS_RECOVERABLE_ERROR(remaperr)) {
1578 						if (svp == mi->mi_curr_serv)
1579 							failover_newserver(mi);
1580 						rpcerr.re_status = RPC_SUCCESS;
1581 						goto failoverretry;
1582 					}
1583 					return (remaperr);
1584 				}
1585 			}
1586 			if (fi->fhp && fi->copyproc)
1587 				(*fi->copyproc)(fi->fhp, fi->vp);
1588 		}
1589 	}
1590 
1591 	/* For TSOL, use a new cred which has net_mac_aware flag */
1592 	if (!cred_cloned && is_system_labeled()) {
1593 		cred_cloned = TRUE;
1594 		cr = crdup(icr);
1595 		(void) setpflags(NET_MAC_AWARE, 1, cr);
1596 	}
1597 
1598 	/*
1599 	 * acl_clget() calls clnt_tli_kinit() which clears the xid, so we
1600 	 * are guaranteed to reprocess the retry as a new request.
1601 	 */
1602 	svp = mi->mi_curr_serv;
1603 	rpcerr.re_errno = acl_clget(mi, svp, cr, &client, &ch, nfscl);
1604 	if (FAILOVER_MOUNT(mi)) {
1605 		mutex_enter(&mi->mi_lock);
1606 		DEC_READERS(mi);
1607 		mutex_exit(&mi->mi_lock);
1608 
1609 		if ((rpcerr.re_errno == ETIMEDOUT ||
1610 		    rpcerr.re_errno == ECONNRESET) &&
1611 		    failover_safe(fi)) {
1612 			if (svp == mi->mi_curr_serv)
1613 				failover_newserver(mi);
1614 			goto failoverretry;
1615 		}
1616 	}
1617 	if (rpcerr.re_errno != 0) {
1618 		if (cred_cloned)
1619 			crfree(cr);
1620 		return (rpcerr.re_errno);
1621 	}
1622 
1623 	if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1624 	    svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1625 		timeo = (mi->mi_timeo * hz) / 10;
1626 	} else {
1627 		mutex_enter(&mi->mi_lock);
1628 		timeo = CLNT_SETTIMERS(client,
1629 		    &(mi->mi_timers[mi->mi_acl_timer_type[which]]),
1630 		    &(mi->mi_timers[NFS_CALLTYPES]),
1631 		    (minimum_timeo[mi->mi_acl_call_type[which]]*hz)>>3,
1632 		    (void (*)()) 0, (caddr_t)mi, 0);
1633 		mutex_exit(&mi->mi_lock);
1634 	}
1635 
1636 	/*
1637 	 * If hard mounted fs, retry call forever unless hard error occurs.
1638 	 */
1639 	do {
1640 		tryagain = FALSE;
1641 
1642 		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1643 			status = RPC_FAILED;
1644 			rpcerr.re_status = RPC_FAILED;
1645 			rpcerr.re_errno = EIO;
1646 			break;
1647 		}
1648 
1649 		TICK_TO_TIMEVAL(timeo, &wait);
1650 
1651 		/*
1652 		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1653 		 * and SIGTERM. (Preserving the existing masks).
1654 		 * Mask out SIGINT if mount option nointr is specified.
1655 		 */
1656 		sigintr(&smask, (int)mi->mi_flags & MI_INT);
1657 		if (!(mi->mi_flags & MI_INT))
1658 			client->cl_nosignal = TRUE;
1659 
1660 		/*
1661 		 * If there is a current signal, then don't bother
1662 		 * even trying to send out the request because we
1663 		 * won't be able to block waiting for the response.
1664 		 * Simply assume RPC_INTR and get on with it.
1665 		 */
1666 		if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1667 			status = RPC_INTR;
1668 		else {
1669 			status = CLNT_CALL(client, which, xdrargs, argsp,
1670 			    xdrres, resp, wait);
1671 		}
1672 
1673 		if (!(mi->mi_flags & MI_INT))
1674 			client->cl_nosignal = FALSE;
1675 		/*
1676 		 * restore original signal mask
1677 		 */
1678 		sigunintr(&smask);
1679 
1680 		switch (status) {
1681 		case RPC_SUCCESS:
1682 #if 0 /* notyet */
1683 			if ((mi->mi_flags & MI_DYNAMIC) &&
1684 			    mi->mi_timer_type[which] != 0 &&
1685 			    (mi->mi_curread != my_rsize ||
1686 			    mi->mi_curwrite != my_wsize))
1687 				(void) nfs_feedback(FEEDBACK_OK, which, mi);
1688 #endif
1689 			break;
1690 
1691 		/*
1692 		 * Unfortunately, there are servers in the world which
1693 		 * are not coded correctly.  They are not prepared to
1694 		 * handle RPC requests to the NFS port which are not
1695 		 * NFS requests.  Thus, they may try to process the
1696 		 * NFS_ACL request as if it were an NFS request.  This
1697 		 * does not work.  Generally, an error will be generated
1698 		 * on the client because it will not be able to decode
1699 		 * the response from the server.  However, it seems
1700 		 * possible that the server may not be able to decode
1701 		 * the arguments.  Thus, the criteria for deciding
1702 		 * whether the server supports NFS_ACL or not is whether
1703 		 * the following RPC errors are returned from CLNT_CALL.
1704 		 */
1705 		case RPC_CANTDECODERES:
1706 		case RPC_PROGUNAVAIL:
1707 		case RPC_CANTDECODEARGS:
1708 		case RPC_PROGVERSMISMATCH:
1709 			mutex_enter(&mi->mi_lock);
1710 			mi->mi_flags &= ~(MI_ACL | MI_EXTATTR);
1711 			mutex_exit(&mi->mi_lock);
1712 			break;
1713 
1714 		/*
1715 		 * If the server supports NFS_ACL but not the new ops
1716 		 * for extended attributes, make sure we don't retry.
1717 		 */
1718 		case RPC_PROCUNAVAIL:
1719 			mutex_enter(&mi->mi_lock);
1720 			mi->mi_flags &= ~MI_EXTATTR;
1721 			mutex_exit(&mi->mi_lock);
1722 			break;
1723 
1724 		case RPC_INTR:
1725 			/*
1726 			 * There is no way to recover from this error,
1727 			 * even if mount option nointr is specified.
1728 			 * SIGKILL, for example, cannot be blocked.
1729 			 */
1730 			rpcerr.re_status = RPC_INTR;
1731 			rpcerr.re_errno = EINTR;
1732 			break;
1733 
1734 		case RPC_UDERROR:
1735 			/*
1736 			 * If the NFS server is local (vold) and
1737 			 * it goes away then we get RPC_UDERROR.
1738 			 * This is a retryable error, so we would
1739 			 * loop, so check to see if the specific
1740 			 * error was ECONNRESET, indicating that
1741 			 * target did not exist at all.  If so,
1742 			 * return with RPC_PROGUNAVAIL and
1743 			 * ECONNRESET to indicate why.
1744 			 */
1745 			CLNT_GETERR(client, &rpcerr);
1746 			if (rpcerr.re_errno == ECONNRESET) {
1747 				rpcerr.re_status = RPC_PROGUNAVAIL;
1748 				rpcerr.re_errno = ECONNRESET;
1749 				break;
1750 			}
1751 			/*FALLTHROUGH*/
1752 
1753 		default:		/* probably RPC_TIMEDOUT */
1754 			if (IS_UNRECOVERABLE_RPC(status))
1755 				break;
1756 
1757 			/*
1758 			 * increment server not responding count
1759 			 */
1760 			mutex_enter(&mi->mi_lock);
1761 			mi->mi_noresponse++;
1762 			mutex_exit(&mi->mi_lock);
1763 #ifdef DEBUG
1764 			nfscl->nfscl_stat.noresponse.value.ui64++;
1765 #endif
1766 
1767 			if (!(mi->mi_flags & MI_HARD)) {
1768 				if (!(mi->mi_flags & MI_SEMISOFT) ||
1769 				    (mi->mi_acl_ss_call_type[which] == 0))
1770 					break;
1771 			}
1772 
1773 			/*
1774 			 * The call is in progress (over COTS).
1775 			 * Try the CLNT_CALL again, but don't
1776 			 * print a noisy error message.
1777 			 */
1778 			if (status == RPC_INPROGRESS) {
1779 				tryagain = TRUE;
1780 				break;
1781 			}
1782 
1783 			if (flags & RFSCALL_SOFT)
1784 				break;
1785 
1786 			/*
1787 			 * On zone shutdown, just move on.
1788 			 */
1789 			if (zone_status_get(curproc->p_zone) >=
1790 			    ZONE_IS_SHUTTING_DOWN) {
1791 				rpcerr.re_status = RPC_FAILED;
1792 				rpcerr.re_errno = EIO;
1793 				break;
1794 			}
1795 
1796 			/*
1797 			 * NFS client failover support
1798 			 *
1799 			 * If the current server just failed us, we'll
1800 			 * start the process of finding a new server.
1801 			 * After that, we can just retry.
1802 			 */
1803 			if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1804 				if (svp == mi->mi_curr_serv)
1805 					failover_newserver(mi);
1806 				clfree_impl(client, ch, nfscl);
1807 				goto failoverretry;
1808 			}
1809 
1810 			tryagain = TRUE;
1811 			timeo = backoff(timeo);
1812 			mutex_enter(&mi->mi_lock);
1813 			if (!(mi->mi_flags & MI_PRINTED)) {
1814 				mi->mi_flags |= MI_PRINTED;
1815 				mutex_exit(&mi->mi_lock);
1816 #ifdef DEBUG
1817 				zprintf(zoneid,
1818 			"NFS_ACL%d server %s not responding still trying\n",
1819 				    mi->mi_vers, svp->sv_hostname);
1820 #else
1821 				zprintf(zoneid,
1822 			    "NFS server %s not responding still trying\n",
1823 				    svp->sv_hostname);
1824 #endif
1825 			} else
1826 				mutex_exit(&mi->mi_lock);
1827 			if (*douprintf && nfs_has_ctty()) {
1828 				*douprintf = 0;
1829 				if (!(mi->mi_flags & MI_NOPRINT))
1830 #ifdef DEBUG
1831 					uprintf(
1832 			"NFS_ACL%d server %s not responding still trying\n",
1833 					    mi->mi_vers, svp->sv_hostname);
1834 #else
1835 					uprintf(
1836 			    "NFS server %s not responding still trying\n",
1837 					    svp->sv_hostname);
1838 #endif
1839 			}
1840 
1841 #if 0 /* notyet */
1842 			/*
1843 			 * If doing dynamic adjustment of transfer
1844 			 * size and if it's a read or write call
1845 			 * and if the transfer size changed while
1846 			 * retransmitting or if the feedback routine
1847 			 * changed the transfer size,
1848 			 * then exit rfscall so that the transfer
1849 			 * size can be adjusted at the vnops level.
1850 			 */
1851 			if ((mi->mi_flags & MI_DYNAMIC) &&
1852 			    mi->mi_acl_timer_type[which] != 0 &&
1853 			    (mi->mi_curread != my_rsize ||
1854 			    mi->mi_curwrite != my_wsize ||
1855 			    nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1856 				/*
1857 				 * On read or write calls, return
1858 				 * back to the vnode ops level if
1859 				 * the transfer size changed.
1860 				 */
1861 				clfree_impl(client, ch, nfscl);
1862 				if (cred_cloned)
1863 					crfree(cr);
1864 				return (ENFS_TRYAGAIN);
1865 			}
1866 #endif
1867 		}
1868 	} while (tryagain);
1869 
1870 	if (status != RPC_SUCCESS) {
1871 		/*
1872 		 * Let soft mounts use the timed out message.
1873 		 */
1874 		if (status == RPC_INPROGRESS)
1875 			status = RPC_TIMEDOUT;
1876 		nfscl->nfscl_stat.badcalls.value.ui64++;
1877 		if (status == RPC_CANTDECODERES ||
1878 		    status == RPC_PROGUNAVAIL ||
1879 		    status == RPC_PROCUNAVAIL ||
1880 		    status == RPC_CANTDECODEARGS ||
1881 		    status == RPC_PROGVERSMISMATCH)
1882 			CLNT_GETERR(client, &rpcerr);
1883 		else if (status != RPC_INTR) {
1884 			mutex_enter(&mi->mi_lock);
1885 			mi->mi_flags |= MI_DOWN;
1886 			mutex_exit(&mi->mi_lock);
1887 			CLNT_GETERR(client, &rpcerr);
1888 #ifdef DEBUG
1889 			bufp = clnt_sperror(client, svp->sv_hostname);
1890 			zprintf(zoneid, "NFS_ACL%d %s failed for %s\n",
1891 			    mi->mi_vers, mi->mi_aclnames[which], bufp);
1892 			if (nfs_has_ctty()) {
1893 				if (!(mi->mi_flags & MI_NOPRINT)) {
1894 					uprintf("NFS_ACL%d %s failed for %s\n",
1895 					    mi->mi_vers, mi->mi_aclnames[which],
1896 					    bufp);
1897 				}
1898 			}
1899 			kmem_free(bufp, MAXPATHLEN);
1900 #else
1901 			zprintf(zoneid,
1902 			    "NFS %s failed for server %s: error %d (%s)\n",
1903 			    mi->mi_aclnames[which], svp->sv_hostname,
1904 			    status, clnt_sperrno(status));
1905 			if (nfs_has_ctty()) {
1906 				if (!(mi->mi_flags & MI_NOPRINT))
1907 					uprintf(
1908 				"NFS %s failed for server %s: error %d (%s)\n",
1909 					    mi->mi_aclnames[which],
1910 					    svp->sv_hostname, status,
1911 					    clnt_sperrno(status));
1912 			}
1913 #endif
1914 			/*
1915 			 * when CLNT_CALL() fails with RPC_AUTHERROR,
1916 			 * re_errno is set appropriately depending on
1917 			 * the authentication error
1918 			 */
1919 			if (status == RPC_VERSMISMATCH ||
1920 			    status == RPC_PROGVERSMISMATCH)
1921 				rpcerr.re_errno = EIO;
1922 		}
1923 	} else {
1924 		/*
1925 		 * Test the value of mi_down and mi_printed without
1926 		 * holding the mi_lock mutex.  If they are both zero,
1927 		 * then it is okay to skip the down and printed
1928 		 * processing.  This saves on a mutex_enter and
1929 		 * mutex_exit pair for a normal, successful RPC.
1930 		 * This was just complete overhead.
1931 		 */
1932 		if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1933 			mutex_enter(&mi->mi_lock);
1934 			mi->mi_flags &= ~MI_DOWN;
1935 			if (mi->mi_flags & MI_PRINTED) {
1936 				mi->mi_flags &= ~MI_PRINTED;
1937 				mutex_exit(&mi->mi_lock);
1938 #ifdef DEBUG
1939 				zprintf(zoneid, "NFS_ACL%d server %s ok\n",
1940 				    mi->mi_vers, svp->sv_hostname);
1941 #else
1942 				zprintf(zoneid, "NFS server %s ok\n",
1943 				    svp->sv_hostname);
1944 #endif
1945 			} else
1946 				mutex_exit(&mi->mi_lock);
1947 		}
1948 
1949 		if (*douprintf == 0) {
1950 			if (!(mi->mi_flags & MI_NOPRINT))
1951 #ifdef DEBUG
1952 				uprintf("NFS_ACL%d server %s ok\n",
1953 				    mi->mi_vers, svp->sv_hostname);
1954 #else
1955 				uprintf("NFS server %s ok\n", svp->sv_hostname);
1956 #endif
1957 			*douprintf = 1;
1958 		}
1959 	}
1960 
1961 	clfree_impl(client, ch, nfscl);
1962 	if (cred_cloned)
1963 		crfree(cr);
1964 
1965 	ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1966 
1967 #if 0 /* notyet */
1968 	TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1969 	    rpcerr.re_errno);
1970 #endif
1971 
1972 	return (rpcerr.re_errno);
1973 }
1974 
1975 int
1976 vattr_to_sattr(struct vattr *vap, struct nfssattr *sa)
1977 {
1978 	uint_t mask = vap->va_mask;
1979 
1980 	if (!(mask & AT_MODE))
1981 		sa->sa_mode = (uint32_t)-1;
1982 	else
1983 		sa->sa_mode = vap->va_mode;
1984 	if (!(mask & AT_UID))
1985 		sa->sa_uid = (uint32_t)-1;
1986 	else
1987 		sa->sa_uid = (uint32_t)vap->va_uid;
1988 	if (!(mask & AT_GID))
1989 		sa->sa_gid = (uint32_t)-1;
1990 	else
1991 		sa->sa_gid = (uint32_t)vap->va_gid;
1992 	if (!(mask & AT_SIZE))
1993 		sa->sa_size = (uint32_t)-1;
1994 	else
1995 		sa->sa_size = (uint32_t)vap->va_size;
1996 	if (!(mask & AT_ATIME))
1997 		sa->sa_atime.tv_sec = sa->sa_atime.tv_usec = (int32_t)-1;
1998 	else {
1999 		/* check time validity */
2000 		if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
2001 			return (EOVERFLOW);
2002 		}
2003 		sa->sa_atime.tv_sec = vap->va_atime.tv_sec;
2004 		sa->sa_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2005 	}
2006 	if (!(mask & AT_MTIME))
2007 		sa->sa_mtime.tv_sec = sa->sa_mtime.tv_usec = (int32_t)-1;
2008 	else {
2009 		/* check time validity */
2010 		if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2011 			return (EOVERFLOW);
2012 		}
2013 		sa->sa_mtime.tv_sec = vap->va_mtime.tv_sec;
2014 		sa->sa_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2015 	}
2016 	return (0);
2017 }
2018 
2019 int
2020 vattr_to_sattr3(struct vattr *vap, sattr3 *sa)
2021 {
2022 	uint_t mask = vap->va_mask;
2023 
2024 	if (!(mask & AT_MODE))
2025 		sa->mode.set_it = FALSE;
2026 	else {
2027 		sa->mode.set_it = TRUE;
2028 		sa->mode.mode = (mode3)vap->va_mode;
2029 	}
2030 	if (!(mask & AT_UID))
2031 		sa->uid.set_it = FALSE;
2032 	else {
2033 		sa->uid.set_it = TRUE;
2034 		sa->uid.uid = (uid3)vap->va_uid;
2035 	}
2036 	if (!(mask & AT_GID))
2037 		sa->gid.set_it = FALSE;
2038 	else {
2039 		sa->gid.set_it = TRUE;
2040 		sa->gid.gid = (gid3)vap->va_gid;
2041 	}
2042 	if (!(mask & AT_SIZE))
2043 		sa->size.set_it = FALSE;
2044 	else {
2045 		sa->size.set_it = TRUE;
2046 		sa->size.size = (size3)vap->va_size;
2047 	}
2048 	if (!(mask & AT_ATIME))
2049 		sa->atime.set_it = DONT_CHANGE;
2050 	else {
2051 		/* check time validity */
2052 		if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
2053 			return (EOVERFLOW);
2054 		}
2055 		sa->atime.set_it = SET_TO_CLIENT_TIME;
2056 		sa->atime.atime.seconds = (uint32)vap->va_atime.tv_sec;
2057 		sa->atime.atime.nseconds = (uint32)vap->va_atime.tv_nsec;
2058 	}
2059 	if (!(mask & AT_MTIME))
2060 		sa->mtime.set_it = DONT_CHANGE;
2061 	else {
2062 		/* check time validity */
2063 		if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2064 			return (EOVERFLOW);
2065 		}
2066 		sa->mtime.set_it = SET_TO_CLIENT_TIME;
2067 		sa->mtime.mtime.seconds = (uint32)vap->va_mtime.tv_sec;
2068 		sa->mtime.mtime.nseconds = (uint32)vap->va_mtime.tv_nsec;
2069 	}
2070 	return (0);
2071 }
2072 
2073 void
2074 setdiropargs(struct nfsdiropargs *da, char *nm, vnode_t *dvp)
2075 {
2076 
2077 	da->da_fhandle = VTOFH(dvp);
2078 	da->da_name = nm;
2079 	da->da_flags = 0;
2080 }
2081 
2082 void
2083 setdiropargs3(diropargs3 *da, char *nm, vnode_t *dvp)
2084 {
2085 
2086 	da->dirp = VTOFH3(dvp);
2087 	da->name = nm;
2088 }
2089 
2090 int
2091 setdirgid(vnode_t *dvp, gid_t *gidp, cred_t *cr)
2092 {
2093 	int error;
2094 	rnode_t *rp;
2095 	struct vattr va;
2096 
2097 	va.va_mask = AT_MODE | AT_GID;
2098 	error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2099 	if (error)
2100 		return (error);
2101 
2102 	/*
2103 	 * To determine the expected group-id of the created file:
2104 	 *  1)	If the filesystem was not mounted with the Old-BSD-compatible
2105 	 *	GRPID option, and the directory's set-gid bit is clear,
2106 	 *	then use the process's gid.
2107 	 *  2)	Otherwise, set the group-id to the gid of the parent directory.
2108 	 */
2109 	rp = VTOR(dvp);
2110 	mutex_enter(&rp->r_statelock);
2111 	if (!(VTOMI(dvp)->mi_flags & MI_GRPID) && !(va.va_mode & VSGID))
2112 		*gidp = crgetgid(cr);
2113 	else
2114 		*gidp = va.va_gid;
2115 	mutex_exit(&rp->r_statelock);
2116 	return (0);
2117 }
2118 
2119 int
2120 setdirmode(vnode_t *dvp, mode_t *omp, cred_t *cr)
2121 {
2122 	int error;
2123 	struct vattr va;
2124 
2125 	va.va_mask = AT_MODE;
2126 	error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2127 	if (error)
2128 		return (error);
2129 
2130 	/*
2131 	 * Modify the expected mode (om) so that the set-gid bit matches
2132 	 * that of the parent directory (dvp).
2133 	 */
2134 	if (va.va_mode & VSGID)
2135 		*omp |= VSGID;
2136 	else
2137 		*omp &= ~VSGID;
2138 	return (0);
2139 }
2140 
2141 void
2142 nfs_setswaplike(vnode_t *vp, vattr_t *vap)
2143 {
2144 
2145 	if (vp->v_type == VREG && (vap->va_mode & (VEXEC | VSVTX)) == VSVTX) {
2146 		if (!(vp->v_flag & VSWAPLIKE)) {
2147 			mutex_enter(&vp->v_lock);
2148 			vp->v_flag |= VSWAPLIKE;
2149 			mutex_exit(&vp->v_lock);
2150 		}
2151 	} else {
2152 		if (vp->v_flag & VSWAPLIKE) {
2153 			mutex_enter(&vp->v_lock);
2154 			vp->v_flag &= ~VSWAPLIKE;
2155 			mutex_exit(&vp->v_lock);
2156 		}
2157 	}
2158 }
2159 
2160 /*
2161  * Free the resources associated with an rnode.
2162  */
2163 static void
2164 rinactive(rnode_t *rp, cred_t *cr)
2165 {
2166 	vnode_t *vp;
2167 	cred_t *cred;
2168 	char *contents;
2169 	int size;
2170 	vsecattr_t *vsp;
2171 	int error;
2172 	nfs3_pathconf_info *info;
2173 
2174 	/*
2175 	 * Before freeing anything, wait until all asynchronous
2176 	 * activity is done on this rnode.  This will allow all
2177 	 * asynchronous read ahead and write behind i/o's to
2178 	 * finish.
2179 	 */
2180 	mutex_enter(&rp->r_statelock);
2181 	while (rp->r_count > 0)
2182 		cv_wait(&rp->r_cv, &rp->r_statelock);
2183 	mutex_exit(&rp->r_statelock);
2184 
2185 	/*
2186 	 * Flush and invalidate all pages associated with the vnode.
2187 	 */
2188 	vp = RTOV(rp);
2189 	if (vn_has_cached_data(vp)) {
2190 		ASSERT(vp->v_type != VCHR);
2191 		if ((rp->r_flags & RDIRTY) && !rp->r_error) {
2192 			error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL);
2193 			if (error && (error == ENOSPC || error == EDQUOT)) {
2194 				mutex_enter(&rp->r_statelock);
2195 				if (!rp->r_error)
2196 					rp->r_error = error;
2197 				mutex_exit(&rp->r_statelock);
2198 			}
2199 		}
2200 		nfs_invalidate_pages(vp, (u_offset_t)0, cr);
2201 	}
2202 
2203 	/*
2204 	 * Free any held credentials and caches which may be associated
2205 	 * with this rnode.
2206 	 */
2207 	mutex_enter(&rp->r_statelock);
2208 	cred = rp->r_cred;
2209 	rp->r_cred = NULL;
2210 	contents = rp->r_symlink.contents;
2211 	size = rp->r_symlink.size;
2212 	rp->r_symlink.contents = NULL;
2213 	vsp = rp->r_secattr;
2214 	rp->r_secattr = NULL;
2215 	info = rp->r_pathconf;
2216 	rp->r_pathconf = NULL;
2217 	mutex_exit(&rp->r_statelock);
2218 
2219 	/*
2220 	 * Free the held credential.
2221 	 */
2222 	if (cred != NULL)
2223 		crfree(cred);
2224 
2225 	/*
2226 	 * Free the access cache entries.
2227 	 */
2228 	(void) nfs_access_purge_rp(rp);
2229 
2230 	/*
2231 	 * Free the readdir cache entries.
2232 	 */
2233 	if (HAVE_RDDIR_CACHE(rp))
2234 		nfs_purge_rddir_cache(vp);
2235 
2236 	/*
2237 	 * Free the symbolic link cache.
2238 	 */
2239 	if (contents != NULL) {
2240 
2241 		kmem_free((void *)contents, size);
2242 	}
2243 
2244 	/*
2245 	 * Free any cached ACL.
2246 	 */
2247 	if (vsp != NULL)
2248 		nfs_acl_free(vsp);
2249 
2250 	/*
2251 	 * Free any cached pathconf information.
2252 	 */
2253 	if (info != NULL)
2254 		kmem_free(info, sizeof (*info));
2255 }
2256 
2257 /*
2258  * Return a vnode for the given NFS Version 2 file handle.
2259  * If no rnode exists for this fhandle, create one and put it
2260  * into the hash queues.  If the rnode for this fhandle
2261  * already exists, return it.
2262  *
2263  * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2264  */
2265 vnode_t *
2266 makenfsnode(fhandle_t *fh, struct nfsfattr *attr, struct vfs *vfsp,
2267     hrtime_t t, cred_t *cr, char *dnm, char *nm)
2268 {
2269 	int newnode;
2270 	int index;
2271 	vnode_t *vp;
2272 	nfs_fhandle nfh;
2273 	vattr_t va;
2274 
2275 	nfh.fh_len = NFS_FHSIZE;
2276 	bcopy(fh, nfh.fh_buf, NFS_FHSIZE);
2277 
2278 	index = rtablehash(&nfh);
2279 	rw_enter(&rtable[index].r_lock, RW_READER);
2280 
2281 	vp = make_rnode(&nfh, &rtable[index], vfsp, nfs_vnodeops,
2282 	    nfs_putapage, nfs_rddir_compar, &newnode, cr, dnm, nm);
2283 
2284 	if (attr != NULL) {
2285 		if (!newnode) {
2286 			rw_exit(&rtable[index].r_lock);
2287 			(void) nfs_cache_fattr(vp, attr, &va, t, cr);
2288 		} else {
2289 			if (attr->na_type < NFNON || attr->na_type > NFSOC)
2290 				vp->v_type = VBAD;
2291 			else
2292 				vp->v_type = n2v_type(attr);
2293 			/*
2294 			 * A translation here seems to be necessary
2295 			 * because this function can be called
2296 			 * with `attr' that has come from the wire,
2297 			 * and been operated on by vattr_to_nattr().
2298 			 * See nfsrootvp()->VOP_GETTATTR()->nfsgetattr()
2299 			 * ->nfs_getattr_otw()->rfscall()->vattr_to_nattr()
2300 			 * ->makenfsnode().
2301 			 */
2302 			if ((attr->na_rdev & 0xffff0000) == 0)
2303 				vp->v_rdev = nfsv2_expdev(attr->na_rdev);
2304 			else
2305 				vp->v_rdev = expldev(n2v_rdev(attr));
2306 			nfs_attrcache(vp, attr, t);
2307 			rw_exit(&rtable[index].r_lock);
2308 		}
2309 	} else {
2310 		if (newnode) {
2311 			PURGE_ATTRCACHE(vp);
2312 		}
2313 		rw_exit(&rtable[index].r_lock);
2314 	}
2315 
2316 	return (vp);
2317 }
2318 
2319 /*
2320  * Return a vnode for the given NFS Version 3 file handle.
2321  * If no rnode exists for this fhandle, create one and put it
2322  * into the hash queues.  If the rnode for this fhandle
2323  * already exists, return it.
2324  *
2325  * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2326  */
2327 vnode_t *
2328 makenfs3node_va(nfs_fh3 *fh, vattr_t *vap, struct vfs *vfsp, hrtime_t t,
2329     cred_t *cr, char *dnm, char *nm)
2330 {
2331 	int newnode;
2332 	int index;
2333 	vnode_t *vp;
2334 
2335 	index = rtablehash((nfs_fhandle *)fh);
2336 	rw_enter(&rtable[index].r_lock, RW_READER);
2337 
2338 	vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2339 	    nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2340 	    dnm, nm);
2341 
2342 	if (vap == NULL) {
2343 		if (newnode) {
2344 			PURGE_ATTRCACHE(vp);
2345 		}
2346 		rw_exit(&rtable[index].r_lock);
2347 		return (vp);
2348 	}
2349 
2350 	if (!newnode) {
2351 		rw_exit(&rtable[index].r_lock);
2352 		nfs_attr_cache(vp, vap, t, cr);
2353 	} else {
2354 		rnode_t *rp = VTOR(vp);
2355 
2356 		vp->v_type = vap->va_type;
2357 		vp->v_rdev = vap->va_rdev;
2358 
2359 		mutex_enter(&rp->r_statelock);
2360 		if (rp->r_mtime <= t)
2361 			nfs_attrcache_va(vp, vap);
2362 		mutex_exit(&rp->r_statelock);
2363 		rw_exit(&rtable[index].r_lock);
2364 	}
2365 
2366 	return (vp);
2367 }
2368 
2369 vnode_t *
2370 makenfs3node(nfs_fh3 *fh, fattr3 *attr, struct vfs *vfsp, hrtime_t t,
2371     cred_t *cr, char *dnm, char *nm)
2372 {
2373 	int newnode;
2374 	int index;
2375 	vnode_t *vp;
2376 	vattr_t va;
2377 
2378 	index = rtablehash((nfs_fhandle *)fh);
2379 	rw_enter(&rtable[index].r_lock, RW_READER);
2380 
2381 	vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2382 	    nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2383 	    dnm, nm);
2384 
2385 	if (attr == NULL) {
2386 		if (newnode) {
2387 			PURGE_ATTRCACHE(vp);
2388 		}
2389 		rw_exit(&rtable[index].r_lock);
2390 		return (vp);
2391 	}
2392 
2393 	if (!newnode) {
2394 		rw_exit(&rtable[index].r_lock);
2395 		(void) nfs3_cache_fattr3(vp, attr, &va, t, cr);
2396 	} else {
2397 		if (attr->type < NF3REG || attr->type > NF3FIFO)
2398 			vp->v_type = VBAD;
2399 		else
2400 			vp->v_type = nf3_to_vt[attr->type];
2401 		vp->v_rdev = makedevice(attr->rdev.specdata1,
2402 		    attr->rdev.specdata2);
2403 		nfs3_attrcache(vp, attr, t);
2404 		rw_exit(&rtable[index].r_lock);
2405 	}
2406 
2407 	return (vp);
2408 }
2409 
/*
 * Read this comment before making changes to rtablehash()!
 * This is a hash function in which seemingly obvious and harmless
 * changes can cause escalations costing million dollars!
 * Know what you are doing.
 *
 * rtablehash() implements Jenkins' one-at-a-time hash algorithm.  The
 * algorithm is currently detailed here:
 *
 *   http://burtleburtle.net/bob/hash/doobs.html
 *
 * Of course, the above link may not be valid by the time you are reading
 * this, but suffice it to say that the one-at-a-time algorithm works well in
 * almost all cases.  If you are changing the algorithm be sure to verify that
 * the hash algorithm still provides even distribution in all cases and with
 * any server returning filehandles in whatever order (sequential or random).
 *
 * The hash is computed over the first fh->fh_len bytes of fh->fh_buf and
 * the result is reduced with rtablemask, so the return value is an index
 * suitable for the rtable[] array of hash buckets.
 */
static int
rtablehash(nfs_fhandle *fh)
{
	ulong_t hash, len, i;
	char *key;

	key = fh->fh_buf;
	len = (ulong_t)fh->fh_len;
	/* mix in one file handle byte per iteration */
	for (hash = 0, i = 0; i < len; i++) {
		/*
		 * NOTE(review): key[] is plain char, so each byte is
		 * promoted with the platform's char signedness before
		 * being added -- do not "tidy" this without verifying
		 * the resulting distribution.
		 */
		hash += key[i];
		hash += (hash << 10);
		hash ^= (hash >> 6);
	}
	/* final avalanche steps of the one-at-a-time hash */
	hash += (hash << 3);
	hash ^= (hash >> 11);
	hash += (hash << 15);
	return (hash & rtablemask);
}
2445 
/*
 * Find the rnode for file handle fh on vfsp in hash bucket rhtp, or
 * build a new one and enter it into the bucket.
 *
 *	fh       - file handle being looked up
 *	rhtp     - hash bucket for fh; the caller must hold its r_lock
 *		   as a reader on entry
 *	vfsp     - filesystem the rnode belongs to
 *	vops     - vnode operations installed on a new vnode
 *	putapage - stored in r_putapage of a new rnode
 *	compar   - comparison function for the new rnode's r_dir AVL tree
 *	newnode  - set to 1 if a new rnode was entered into the bucket,
 *		   0 if an existing one was found
 *	cr       - credential passed to rinactive() / rp_addfree()
 *	dnm, nm  - directory and entry names, used to record r_path on
 *		   failover mounts; either may be NULL
 *
 * Locking on return: rhtp->r_lock is always held.  It is held as a
 * reader when an existing rnode is returned (*newnode == 0) and as a
 * writer when a new rnode was inserted (*newnode == 1).  The lock is
 * dropped and reacquired in between, which is why the lookup is
 * repeated before the insertion.
 *
 * The storage for a new rnode is either recycled from the head of the
 * rnode freelist (when the freelist is not empty and rnew >= nrnode)
 * or freshly allocated from rnode_cache.
 */
static vnode_t *
make_rnode(nfs_fhandle *fh, rhashq_t *rhtp, struct vfs *vfsp,
    struct vnodeops *vops,
    int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *),
    int (*compar)(const void *, const void *),
    int *newnode, cred_t *cr, char *dnm, char *nm)
{
	rnode_t *rp;
	rnode_t *trp;
	vnode_t *vp;
	mntinfo_t *mi;

	ASSERT(RW_READ_HELD(&rhtp->r_lock));

	mi = VFTOMI(vfsp);
start:
	/* fast path: the rnode is already in the hash bucket */
	if ((rp = rfind(rhtp, fh, vfsp)) != NULL) {
		vp = RTOV(rp);
		nfs_set_vroot(vp);
		*newnode = 0;
		return (vp);
	}
	/* not found: drop the bucket lock while storage is obtained */
	rw_exit(&rhtp->r_lock);

	mutex_enter(&rpfreelist_lock);
	if (rpfreelist != NULL && rnew >= nrnode) {
		/* recycle the rnode at the head of the freelist */
		rp = rpfreelist;
		rp_rmfree(rp);
		mutex_exit(&rpfreelist_lock);

		vp = RTOV(rp);

		if (rp->r_flags & RHASHED) {
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			/*
			 * A count above one means another reference
			 * exists, so the rnode cannot be recycled:
			 * give up this reference and start over.
			 */
			if (vp->v_count > 1) {
				vp->v_count--;
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				rw_enter(&rhtp->r_lock, RW_READER);
				goto start;
			}
			mutex_exit(&vp->v_lock);
			rp_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}

		/* release everything cached under the old identity */
		rinactive(rp, cr);

		/*
		 * Recheck the reference count now that rinactive() has
		 * run; if a reference appeared, drop ours and retry.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			vp->v_count--;
			mutex_exit(&vp->v_lock);
			rw_enter(&rhtp->r_lock, RW_READER);
			goto start;
		}
		mutex_exit(&vp->v_lock);
		vn_invalid(vp);
		/*
		 * destroy old locks before bzero'ing and
		 * recreating the locks below.
		 */
		nfs_rw_destroy(&rp->r_rwlock);
		nfs_rw_destroy(&rp->r_lkserlock);
		mutex_destroy(&rp->r_statelock);
		cv_destroy(&rp->r_cv);
		cv_destroy(&rp->r_commit.c_cv);
		nfs_free_r_path(rp);
		avl_destroy(&rp->r_dir);
		/*
		 * Make sure that if rnode is recycled then
		 * VFS count is decremented properly before
		 * reuse.
		 */
		VFS_RELE(vp->v_vfsp);
		vn_reinit(vp);
	} else {
		vnode_t *new_vp;

		mutex_exit(&rpfreelist_lock);

		/* allocate a brand new rnode/vnode pair */
		rp = kmem_cache_alloc(rnode_cache, KM_SLEEP);
		new_vp = vn_alloc(KM_SLEEP);

		/* rnew counts the rnodes currently allocated */
		atomic_add_long((ulong_t *)&rnew, 1);
#ifdef DEBUG
		clstat_debug.nrnode.value.ui64++;
#endif
		vp = new_vp;
	}

	/*
	 * Common initialization for recycled and new rnodes: wipe the
	 * rnode, reattach the vnode and recreate locks and condition
	 * variables.
	 */
	bzero(rp, sizeof (*rp));
	rp->r_vnode = vp;
	nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
	nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
	mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
	rp->r_fh.fh_len = fh->fh_len;
	bcopy(fh->fh_buf, rp->r_fh.fh_buf, fh->fh_len);
	rp->r_server = mi->mi_curr_serv;
	if (FAILOVER_MOUNT(mi)) {
		/*
		 * If replicated servers, stash pathnames
		 */
		if (dnm != NULL && nm != NULL) {
			char *s, *p;
			uint_t len;

			/* room for "dnm/nm": both names, '/' and NUL */
			len = (uint_t)(strlen(dnm) + strlen(nm) + 2);
			rp->r_path = kmem_alloc(len, KM_SLEEP);
#ifdef DEBUG
			clstat_debug.rpath.value.ui64 += len;
#endif
			s = rp->r_path;
			for (p = dnm; *p; p++)
				*s++ = *p;
			*s++ = '/';
			for (p = nm; *p; p++)
				*s++ = *p;
			*s = '\0';
		} else {
			/* special case for root */
			rp->r_path = kmem_alloc(2, KM_SLEEP);
#ifdef DEBUG
			clstat_debug.rpath.value.ui64 += 2;
#endif
			*rp->r_path = '.';
			*(rp->r_path + 1) = '\0';
		}
	}
	/* the rnode holds a reference on its vfs (see VFS_RELE above) */
	VFS_HOLD(vfsp);
	rp->r_putapage = putapage;
	rp->r_hashq = rhtp;
	rp->r_flags = RREADDIRPLUS;
	avl_create(&rp->r_dir, compar, sizeof (rddir_cache),
	    offsetof(rddir_cache, tree));
	vn_setops(vp, vops);
	vp->v_data = (caddr_t)rp;
	vp->v_vfsp = vfsp;
	vp->v_type = VNON;
	nfs_set_vroot(vp);

	/*
	 * There is a race condition if someone else
	 * alloc's the rnode while no locks are held, so we
	 * check again and recover if found.
	 */
	rw_enter(&rhtp->r_lock, RW_WRITER);
	if ((trp = rfind(rhtp, fh, vfsp)) != NULL) {
		/*
		 * Lost the race: return the rnode that is already
		 * hashed, hand the one just built to rp_addfree(), and
		 * come back holding the bucket lock as a reader.
		 */
		vp = RTOV(trp);
		nfs_set_vroot(vp);
		*newnode = 0;
		rw_exit(&rhtp->r_lock);
		rp_addfree(rp, cr);
		rw_enter(&rhtp->r_lock, RW_READER);
		return (vp);
	}
	/* insert the new rnode; the bucket lock stays held as writer */
	rp_addhash(rp);
	*newnode = 1;
	return (vp);
}
2608 
2609 static void
2610 nfs_set_vroot(vnode_t *vp)
2611 {
2612 	rnode_t *rp;
2613 	nfs_fhandle *rootfh;
2614 
2615 	rp = VTOR(vp);
2616 	rootfh = &rp->r_server->sv_fhandle;
2617 	if (rootfh->fh_len == rp->r_fh.fh_len &&
2618 	    bcmp(rootfh->fh_buf, rp->r_fh.fh_buf, rp->r_fh.fh_len) == 0) {
2619 		if (!(vp->v_flag & VROOT)) {
2620 			mutex_enter(&vp->v_lock);
2621 			vp->v_flag |= VROOT;
2622 			mutex_exit(&vp->v_lock);
2623 		}
2624 	}
2625 }
2626 
2627 static void
2628 nfs_free_r_path(rnode_t *rp)
2629 {
2630 	char *path;
2631 	size_t len;
2632 
2633 	path = rp->r_path;
2634 	if (path) {
2635 		rp->r_path = NULL;
2636 		len = strlen(path) + 1;
2637 		kmem_free(path, len);
2638 #ifdef DEBUG
2639 		clstat_debug.rpath.value.ui64 -= len;
2640 #endif
2641 	}
2642 }
2643 
/*
 * Put an rnode on the free list.
 *
 * Rnodes which were allocated above and beyond the normal limit
 * are immediately freed.
 *
 *	rp - rnode being released; its vnode must have a reference
 *	     count of at least one and the rnode must not already be
 *	     on the freelist
 *	cr - credential passed on to rinactive()
 *
 * On every path the caller's reference is consumed: it is either
 * dropped because another reference exists, destroyed together with
 * the rnode, or kept by the freelist entry.
 */
void
rp_addfree(rnode_t *rp, cred_t *cr)
{
	vnode_t *vp;
	struct vfs *vfsp;

	vp = RTOV(rp);
	ASSERT(vp->v_count >= 1);
	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);

	/*
	 * If we have too many rnodes allocated and there are no
	 * references to this rnode, or if the rnode is no longer
	 * accessible because it does not reside in the hash queues,
	 * or if an i/o error occurred while writing to the file,
	 * then just free it instead of putting it on the rnode
	 * freelist.
	 */
	vfsp = vp->v_vfsp;
	if (((rnew > nrnode || !(rp->r_flags & RHASHED) || rp->r_error ||
	    (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
		if (rp->r_flags & RHASHED) {
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			/*
			 * Someone else holds a reference: just drop
			 * ours and leave the rnode alone.
			 */
			if (vp->v_count > 1) {
				vp->v_count--;
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				return;
			}
			mutex_exit(&vp->v_lock);
			/* unhash so that no new lookups can find it */
			rp_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}

		rinactive(rp, cr);

		/*
		 * Recheck the vnode reference count.  We need to
		 * make sure that another reference has not been
		 * acquired while we were not holding v_lock.  The
		 * rnode is not in the rnode hash queues, so the
		 * only way for a reference to have been acquired
		 * is for a VOP_PUTPAGE because the rnode was marked
		 * with RDIRTY or for a modified page.  This
		 * reference may have been acquired before our call
		 * to rinactive.  The i/o may have been completed,
		 * thus allowing rinactive to complete, but the
		 * reference to the vnode may not have been released
		 * yet.  In any case, the rnode can not be destroyed
		 * until the other references to this vnode have been
		 * released.  The other references will take care of
		 * either destroying the rnode or placing it on the
		 * rnode freelist.  If there are no other references,
		 * then the rnode may be safely destroyed.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			vp->v_count--;
			mutex_exit(&vp->v_lock);
			return;
		}
		mutex_exit(&vp->v_lock);

		destroy_rnode(rp);
		return;
	}

	/*
	 * Lock the hash queue and then recheck the reference count
	 * to ensure that no other threads have acquired a reference
	 * to indicate that the rnode should not be placed on the
	 * freelist.  If another reference has been acquired, then
	 * just release this one and let the other thread complete
	 * the processing of adding this rnode to the freelist.
	 */
	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);

	mutex_enter(&vp->v_lock);
	if (vp->v_count > 1) {
		vp->v_count--;
		mutex_exit(&vp->v_lock);
		rw_exit(&rp->r_hashq->r_lock);
		return;
	}
	mutex_exit(&vp->v_lock);

	/*
	 * If there is no cached data or metadata for this file, then
	 * put the rnode on the front of the freelist so that it will
	 * be reused before other rnodes which may have cached data or
	 * metadata associated with them.
	 */
	mutex_enter(&rpfreelist_lock);
	if (rpfreelist == NULL) {
		/* first entry: a circular list of one */
		rp->r_freef = rp;
		rp->r_freeb = rp;
		rpfreelist = rp;
	} else {
		/*
		 * Link rp in just behind the current head, i.e. at the
		 * tail of the circular list ...
		 */
		rp->r_freef = rpfreelist;
		rp->r_freeb = rpfreelist->r_freeb;
		rpfreelist->r_freeb->r_freef = rp;
		rpfreelist->r_freeb = rp;
		/*
		 * ... and, when nothing is cached on it, make it the
		 * new head instead.
		 */
		if (!vn_has_cached_data(vp) &&
		    !HAVE_RDDIR_CACHE(rp) &&
		    rp->r_symlink.contents == NULL &&
		    rp->r_secattr == NULL &&
		    rp->r_pathconf == NULL)
			rpfreelist = rp;
	}
	mutex_exit(&rpfreelist_lock);

	rw_exit(&rp->r_hashq->r_lock);
}
2764 
2765 /*
2766  * Remove an rnode from the free list.
2767  *
2768  * The caller must be holding rpfreelist_lock and the rnode
2769  * must be on the freelist.
2770  */
2771 static void
2772 rp_rmfree(rnode_t *rp)
2773 {
2774 
2775 	ASSERT(MUTEX_HELD(&rpfreelist_lock));
2776 	ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);
2777 
2778 	if (rp == rpfreelist) {
2779 		rpfreelist = rp->r_freef;
2780 		if (rp == rpfreelist)
2781 			rpfreelist = NULL;
2782 	}
2783 
2784 	rp->r_freeb->r_freef = rp->r_freef;
2785 	rp->r_freef->r_freeb = rp->r_freeb;
2786 
2787 	rp->r_freef = rp->r_freeb = NULL;
2788 }
2789 
2790 /*
2791  * Put a rnode in the hash table.
2792  *
2793  * The caller must be holding the exclusive hash queue lock.
2794  */
2795 static void
2796 rp_addhash(rnode_t *rp)
2797 {
2798 
2799 	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2800 	ASSERT(!(rp->r_flags & RHASHED));
2801 
2802 	rp->r_hashf = rp->r_hashq->r_hashf;
2803 	rp->r_hashq->r_hashf = rp;
2804 	rp->r_hashb = (rnode_t *)rp->r_hashq;
2805 	rp->r_hashf->r_hashb = rp;
2806 
2807 	mutex_enter(&rp->r_statelock);
2808 	rp->r_flags |= RHASHED;
2809 	mutex_exit(&rp->r_statelock);
2810 }
2811 
2812 /*
2813  * Remove a rnode from the hash table.
2814  *
2815  * The caller must be holding the hash queue lock.
2816  */
2817 static void
2818 rp_rmhash_locked(rnode_t *rp)
2819 {
2820 
2821 	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2822 	ASSERT(rp->r_flags & RHASHED);
2823 
2824 	rp->r_hashb->r_hashf = rp->r_hashf;
2825 	rp->r_hashf->r_hashb = rp->r_hashb;
2826 
2827 	mutex_enter(&rp->r_statelock);
2828 	rp->r_flags &= ~RHASHED;
2829 	mutex_exit(&rp->r_statelock);
2830 }
2831 
2832 /*
2833  * Remove a rnode from the hash table.
2834  *
2835  * The caller must not be holding the hash queue lock.
2836  */
2837 void
2838 rp_rmhash(rnode_t *rp)
2839 {
2840 
2841 	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2842 	rp_rmhash_locked(rp);
2843 	rw_exit(&rp->r_hashq->r_lock);
2844 }
2845 
2846 /*
2847  * Lookup a rnode by fhandle.
2848  *
2849  * The caller must be holding the hash queue lock, either shared or exclusive.
2850  */
2851 static rnode_t *
2852 rfind(rhashq_t *rhtp, nfs_fhandle *fh, struct vfs *vfsp)
2853 {
2854 	rnode_t *rp;
2855 	vnode_t *vp;
2856 
2857 	ASSERT(RW_LOCK_HELD(&rhtp->r_lock));
2858 
2859 	for (rp = rhtp->r_hashf; rp != (rnode_t *)rhtp; rp = rp->r_hashf) {
2860 		vp = RTOV(rp);
2861 		if (vp->v_vfsp == vfsp &&
2862 		    rp->r_fh.fh_len == fh->fh_len &&
2863 		    bcmp(rp->r_fh.fh_buf, fh->fh_buf, fh->fh_len) == 0) {
2864 			/*
2865 			 * remove rnode from free list, if necessary.
2866 			 */
2867 			if (rp->r_freef != NULL) {
2868 				mutex_enter(&rpfreelist_lock);
2869 				/*
2870 				 * If the rnode is on the freelist,
2871 				 * then remove it and use that reference
2872 				 * as the new reference.  Otherwise,
2873 				 * need to increment the reference count.
2874 				 */
2875 				if (rp->r_freef != NULL) {
2876 					rp_rmfree(rp);
2877 					mutex_exit(&rpfreelist_lock);
2878 				} else {
2879 					mutex_exit(&rpfreelist_lock);
2880 					VN_HOLD(vp);
2881 				}
2882 			} else
2883 				VN_HOLD(vp);
2884 			return (rp);
2885 		}
2886 	}
2887 	return (NULL);
2888 }
2889 
2890 /*
2891  * Return 1 if there is a active vnode belonging to this vfs in the
2892  * rtable cache.
2893  *
2894  * Several of these checks are done without holding the usual
2895  * locks.  This is safe because destroy_rtable(), rp_addfree(),
2896  * etc. will redo the necessary checks before actually destroying
2897  * any rnodes.
2898  */
2899 int
2900 check_rtable(struct vfs *vfsp)
2901 {
2902 	int index;
2903 	rnode_t *rp;
2904 	vnode_t *vp;
2905 
2906 	for (index = 0; index < rtablesize; index++) {
2907 		rw_enter(&rtable[index].r_lock, RW_READER);
2908 		for (rp = rtable[index].r_hashf;
2909 		    rp != (rnode_t *)(&rtable[index]);
2910 		    rp = rp->r_hashf) {
2911 			vp = RTOV(rp);
2912 			if (vp->v_vfsp == vfsp) {
2913 				if (rp->r_freef == NULL ||
2914 				    (vn_has_cached_data(vp) &&
2915 				    (rp->r_flags & RDIRTY)) ||
2916 				    rp->r_count > 0) {
2917 					rw_exit(&rtable[index].r_lock);
2918 					return (1);
2919 				}
2920 			}
2921 		}
2922 		rw_exit(&rtable[index].r_lock);
2923 	}
2924 	return (0);
2925 }
2926 
2927 /*
2928  * Destroy inactive vnodes from the hash queues which belong to this
2929  * vfs.  It is essential that we destroy all inactive vnodes during a
2930  * forced unmount as well as during a normal unmount.
2931  */
2932 void
2933 destroy_rtable(struct vfs *vfsp, cred_t *cr)
2934 {
2935 	int index;
2936 	rnode_t *rp;
2937 	rnode_t *rlist;
2938 	rnode_t *r_hashf;
2939 	vnode_t *vp;
2940 
2941 	rlist = NULL;
2942 
2943 	for (index = 0; index < rtablesize; index++) {
2944 		rw_enter(&rtable[index].r_lock, RW_WRITER);
2945 		for (rp = rtable[index].r_hashf;
2946 		    rp != (rnode_t *)(&rtable[index]);
2947 		    rp = r_hashf) {
2948 			/* save the hash pointer before destroying */
2949 			r_hashf = rp->r_hashf;
2950 			vp = RTOV(rp);
2951 			if (vp->v_vfsp == vfsp) {
2952 				mutex_enter(&rpfreelist_lock);
2953 				if (rp->r_freef != NULL) {
2954 					rp_rmfree(rp);
2955 					mutex_exit(&rpfreelist_lock);
2956 					rp_rmhash_locked(rp);
2957 					rp->r_hashf = rlist;
2958 					rlist = rp;
2959 				} else
2960 					mutex_exit(&rpfreelist_lock);
2961 			}
2962 		}
2963 		rw_exit(&rtable[index].r_lock);
2964 	}
2965 
2966 	for (rp = rlist; rp != NULL; rp = rlist) {
2967 		rlist = rp->r_hashf;
2968 		/*
2969 		 * This call to rp_addfree will end up destroying the
2970 		 * rnode, but in a safe way with the appropriate set
2971 		 * of checks done.
2972 		 */
2973 		rp_addfree(rp, cr);
2974 	}
2975 
2976 }
2977 
/*
 * This routine destroys all the resources associated with the rnode
 * and then the rnode itself.
 *
 * The caller must hold the only reference to the vnode, and the rnode
 * must be idle: no outstanding r_count, no r_lmpl list, no mappings,
 * not hashed and not on the freelist (all of which are asserted).
 * The hold that the rnode had on its vfs is released last.
 */
static void
destroy_rnode(rnode_t *rp)
{
	vnode_t *vp;
	vfs_t *vfsp;

	vp = RTOV(rp);
	/* remember the vfs now; vp is freed before the VFS_RELE below */
	vfsp = vp->v_vfsp;

	ASSERT(vp->v_count == 1);
	ASSERT(rp->r_count == 0);
	ASSERT(rp->r_lmpl == NULL);
	ASSERT(rp->r_mapcnt == 0);
	ASSERT(!(rp->r_flags & RHASHED));
	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
	/* one fewer rnode allocated */
	atomic_add_long((ulong_t *)&rnew, -1);
#ifdef DEBUG
	clstat_debug.nrnode.value.ui64--;
#endif
	nfs_rw_destroy(&rp->r_rwlock);
	nfs_rw_destroy(&rp->r_lkserlock);
	mutex_destroy(&rp->r_statelock);
	cv_destroy(&rp->r_cv);
	cv_destroy(&rp->r_commit.c_cv);
	/* r_indelmap is only torn down when RDELMAPLIST is set */
	if (rp->r_flags & RDELMAPLIST)
		list_destroy(&rp->r_indelmap);
	nfs_free_r_path(rp);
	avl_destroy(&rp->r_dir);
	vn_invalid(vp);
	vn_free(vp);
	kmem_cache_free(rnode_cache, rp);
	VFS_RELE(vfsp);
}
3015 
3016 /*
3017  * Flush all vnodes in this (or every) vfs.
3018  * Used by nfs_sync and by nfs_unmount.
3019  */
3020 void
3021 rflush(struct vfs *vfsp, cred_t *cr)
3022 {
3023 	int index;
3024 	rnode_t *rp;
3025 	vnode_t *vp, **vplist;
3026 	long num, cnt;
3027 
3028 	/*
3029 	 * Check to see whether there is anything to do.
3030 	 */
3031 	num = rnew;
3032 	if (num == 0)
3033 		return;
3034 
3035 	/*
3036 	 * Allocate a slot for all currently active rnodes on the
3037 	 * supposition that they all may need flushing.
3038 	 */
3039 	vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
3040 	cnt = 0;
3041 
3042 	/*
3043 	 * Walk the hash queues looking for rnodes with page
3044 	 * lists associated with them.  Make a list of these
3045 	 * files.
3046 	 */
3047 	for (index = 0; index < rtablesize; index++) {
3048 		rw_enter(&rtable[index].r_lock, RW_READER);
3049 		for (rp = rtable[index].r_hashf;
3050 		    rp != (rnode_t *)(&rtable[index]);
3051 		    rp = rp->r_hashf) {
3052 			vp = RTOV(rp);
3053 			/*
3054 			 * Don't bother sync'ing a vp if it
3055 			 * is part of virtual swap device or
3056 			 * if VFS is read-only
3057 			 */
3058 			if (IS_SWAPVP(vp) || vn_is_readonly(vp))
3059 				continue;
3060 			/*
3061 			 * If flushing all mounted file systems or
3062 			 * the vnode belongs to this vfs, has pages
3063 			 * and is marked as either dirty or mmap'd,
3064 			 * hold and add this vnode to the list of
3065 			 * vnodes to flush.
3066 			 */
3067 			if ((vfsp == NULL || vp->v_vfsp == vfsp) &&
3068 			    vn_has_cached_data(vp) &&
3069 			    ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) {
3070 				VN_HOLD(vp);
3071 				vplist[cnt++] = vp;
3072 				if (cnt == num) {
3073 					rw_exit(&rtable[index].r_lock);
3074 					goto toomany;
3075 				}
3076 			}
3077 		}
3078 		rw_exit(&rtable[index].r_lock);
3079 	}
3080 toomany:
3081 
3082 	/*
3083 	 * Flush and release all of the files on the list.
3084 	 */
3085 	while (cnt-- > 0) {
3086 		vp = vplist[cnt];
3087 		(void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL);
3088 		VN_RELE(vp);
3089 	}
3090 
3091 	/*
3092 	 * Free the space allocated to hold the list.
3093 	 */
3094 	kmem_free(vplist, num * sizeof (*vplist));
3095 }
3096 
3097 /*
3098  * This probably needs to be larger than or equal to
3099  * log2(sizeof (struct rnode)) due to the way that rnodes are
3100  * allocated.
3101  */
3102 #define	ACACHE_SHIFT_BITS	9
3103 
3104 static int
3105 acachehash(rnode_t *rp, cred_t *cr)
3106 {
3107 
3108 	return ((((intptr_t)rp >> ACACHE_SHIFT_BITS) + crgetuid(cr)) &
3109 	    acachemask);
3110 }
3111 
#ifdef DEBUG
/*
 * DEBUG-only access cache statistics, updated by nfs_access_check():
 * a hit is counted when a matching entry already knows every requested
 * access bit, a miss when the bits are not all known or when no
 * matching entry exists.  The counters are incremented without any
 * dedicated lock.
 */
static long nfs_access_cache_hits = 0;
static long nfs_access_cache_misses = 0;
#endif
3116 
3117 nfs_access_type_t
3118 nfs_access_check(rnode_t *rp, uint32_t acc, cred_t *cr)
3119 {
3120 	vnode_t *vp;
3121 	acache_t *ap;
3122 	acache_hash_t *hp;
3123 	nfs_access_type_t all;
3124 
3125 	vp = RTOV(rp);
3126 	if (!ATTRCACHE_VALID(vp) || nfs_waitfor_purge_complete(vp))
3127 		return (NFS_ACCESS_UNKNOWN);
3128 
3129 	if (rp->r_acache != NULL) {
3130 		hp = &acache[acachehash(rp, cr)];
3131 		rw_enter(&hp->lock, RW_READER);
3132 		ap = hp->next;
3133 		while (ap != (acache_t *)hp) {
3134 			if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3135 				if ((ap->known & acc) == acc) {
3136 #ifdef DEBUG
3137 					nfs_access_cache_hits++;
3138 #endif
3139 					if ((ap->allowed & acc) == acc)
3140 						all = NFS_ACCESS_ALLOWED;
3141 					else
3142 						all = NFS_ACCESS_DENIED;
3143 				} else {
3144 #ifdef DEBUG
3145 					nfs_access_cache_misses++;
3146 #endif
3147 					all = NFS_ACCESS_UNKNOWN;
3148 				}
3149 				rw_exit(&hp->lock);
3150 				return (all);
3151 			}
3152 			ap = ap->next;
3153 		}
3154 		rw_exit(&hp->lock);
3155 	}
3156 
3157 #ifdef DEBUG
3158 	nfs_access_cache_misses++;
3159 #endif
3160 	return (NFS_ACCESS_UNKNOWN);
3161 }
3162 
3163 void
3164 nfs_access_cache(rnode_t *rp, uint32_t acc, uint32_t resacc, cred_t *cr)
3165 {
3166 	acache_t *ap;
3167 	acache_t *nap;
3168 	acache_hash_t *hp;
3169 
3170 	hp = &acache[acachehash(rp, cr)];
3171 
3172 	/*
3173 	 * Allocate now assuming that mostly an allocation will be
3174 	 * required.  This allows the allocation to happen without
3175 	 * holding the hash bucket locked.
3176 	 */
3177 	nap = kmem_cache_alloc(acache_cache, KM_NOSLEEP);
3178 	if (nap != NULL) {
3179 		nap->known = acc;
3180 		nap->allowed = resacc;
3181 		nap->rnode = rp;
3182 		crhold(cr);
3183 		nap->cred = cr;
3184 		nap->hashq = hp;
3185 	}
3186 
3187 	rw_enter(&hp->lock, RW_WRITER);
3188 
3189 	if (rp->r_acache != NULL) {
3190 		ap = hp->next;
3191 		while (ap != (acache_t *)hp) {
3192 			if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3193 				ap->known |= acc;
3194 				ap->allowed &= ~acc;
3195 				ap->allowed |= resacc;
3196 				rw_exit(&hp->lock);
3197 				if (nap != NULL) {
3198 					crfree(nap->cred);
3199 					kmem_cache_free(acache_cache, nap);
3200 				}
3201 				return;
3202 			}
3203 			ap = ap->next;
3204 		}
3205 	}
3206 
3207 	if (nap != NULL) {
3208 #ifdef DEBUG
3209 		clstat_debug.access.value.ui64++;
3210 #endif
3211 		nap->next = hp->next;
3212 		hp->next = nap;
3213 		nap->next->prev = nap;
3214 		nap->prev = (acache_t *)hp;
3215 
3216 		mutex_enter(&rp->r_statelock);
3217 		nap->list = rp->r_acache;
3218 		rp->r_acache = nap;
3219 		mutex_exit(&rp->r_statelock);
3220 	}
3221 
3222 	rw_exit(&hp->lock);
3223 }
3224 
3225 int
3226 nfs_access_purge_rp(rnode_t *rp)
3227 {
3228 	acache_t *ap;
3229 	acache_t *tmpap;
3230 	acache_t *rplist;
3231 
3232 	/*
3233 	 * If there aren't any cached entries, then there is nothing
3234 	 * to free.
3235 	 */
3236 	if (rp->r_acache == NULL)
3237 		return (0);
3238 
3239 	mutex_enter(&rp->r_statelock);
3240 	rplist = rp->r_acache;
3241 	rp->r_acache = NULL;
3242 	mutex_exit(&rp->r_statelock);
3243 
3244 	/*
3245 	 * Loop through each entry in the list pointed to in the
3246 	 * rnode.  Remove each of these entries from the hash
3247 	 * queue that it is on and remove it from the list in
3248 	 * the rnode.
3249 	 */
3250 	for (ap = rplist; ap != NULL; ap = tmpap) {
3251 		rw_enter(&ap->hashq->lock, RW_WRITER);
3252 		ap->prev->next = ap->next;
3253 		ap->next->prev = ap->prev;
3254 		rw_exit(&ap->hashq->lock);
3255 
3256 		tmpap = ap->list;
3257 		crfree(ap->cred);
3258 		kmem_cache_free(acache_cache, ap);
3259 #ifdef DEBUG
3260 		clstat_debug.access.value.ui64--;
3261 #endif
3262 	}
3263 
3264 	return (1);
3265 }
3266 
/* Leading characters of every name generated by newname(). */
static const char prefix[] = ".nfs";

/*
 * Serializes updates to the counter inside newnum().  Initialized in
 * nfs_subrinit() and destroyed in nfs_subrfini().
 */
static kmutex_t newnum_lock;
3270 
3271 int
3272 newnum(void)
3273 {
3274 	static uint_t newnum = 0;
3275 	uint_t id;
3276 
3277 	mutex_enter(&newnum_lock);
3278 	if (newnum == 0)
3279 		newnum = gethrestime_sec() & 0xffff;
3280 	id = newnum++;
3281 	mutex_exit(&newnum_lock);
3282 	return (id);
3283 }
3284 
3285 char *
3286 newname(void)
3287 {
3288 	char *news;
3289 	char *s;
3290 	const char *p;
3291 	uint_t id;
3292 
3293 	id = newnum();
3294 	news = kmem_alloc(MAXNAMELEN, KM_SLEEP);
3295 	s = news;
3296 	p = prefix;
3297 	while (*p != '\0')
3298 		*s++ = *p++;
3299 	while (id != 0) {
3300 		*s++ = "0123456789ABCDEF"[id & 0x0f];
3301 		id >>= 4;
3302 	}
3303 	*s = '\0';
3304 	return (news);
3305 }
3306 
/*
 * Convert the leading run of decimal digits in cp to an int.
 *
 * Conversion stops at the first character that is not '0'..'9' (or at
 * the terminating NUL), so trailing non-digit characters are ignored
 * rather than being folded into the result.  An empty string, or one
 * that does not start with a digit, yields 0.  No sign, leading white
 * space or overflow handling is performed.
 */
int
nfs_atoi(char *cp)
{
	int n = 0;

	for (; *cp >= '0' && *cp <= '9'; cp++)
		n = n * 10 + (*cp - '0');

	return (n);
}
3320 
3321 /*
3322  * Snapshot callback for nfs:0:nfs_client as registered with the kstat
3323  * framework.
3324  */
3325 static int
3326 cl_snapshot(kstat_t *ksp, void *buf, int rw)
3327 {
3328 	ksp->ks_snaptime = gethrtime();
3329 	if (rw == KSTAT_WRITE) {
3330 		bcopy(buf, ksp->ks_private, sizeof (clstat_tmpl));
3331 #ifdef DEBUG
3332 		/*
3333 		 * Currently only the global zone can write to kstats, but we
3334 		 * add the check just for paranoia.
3335 		 */
3336 		if (INGLOBALZONE(curproc))
3337 			bcopy((char *)buf + sizeof (clstat_tmpl), &clstat_debug,
3338 			    sizeof (clstat_debug));
3339 #endif
3340 	} else {
3341 		bcopy(ksp->ks_private, buf, sizeof (clstat_tmpl));
3342 #ifdef DEBUG
3343 		/*
3344 		 * If we're displaying the "global" debug kstat values, we
3345 		 * display them as-is to all zones since in fact they apply to
3346 		 * the system as a whole.
3347 		 */
3348 		bcopy(&clstat_debug, (char *)buf + sizeof (clstat_tmpl),
3349 		    sizeof (clstat_debug));
3350 #endif
3351 	}
3352 	return (0);
3353 }
3354 
3355 static void *
3356 clinit_zone(zoneid_t zoneid)
3357 {
3358 	kstat_t *nfs_client_kstat;
3359 	struct nfs_clnt *nfscl;
3360 	uint_t ndata;
3361 
3362 	nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP);
3363 	mutex_init(&nfscl->nfscl_chtable_lock, NULL, MUTEX_DEFAULT, NULL);
3364 	nfscl->nfscl_chtable = NULL;
3365 	nfscl->nfscl_zoneid = zoneid;
3366 
3367 	bcopy(&clstat_tmpl, &nfscl->nfscl_stat, sizeof (clstat_tmpl));
3368 	ndata = sizeof (clstat_tmpl) / sizeof (kstat_named_t);
3369 #ifdef DEBUG
3370 	ndata += sizeof (clstat_debug) / sizeof (kstat_named_t);
3371 #endif
3372 	if ((nfs_client_kstat = kstat_create_zone("nfs", 0, "nfs_client",
3373 	    "misc", KSTAT_TYPE_NAMED, ndata,
3374 	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) {
3375 		nfs_client_kstat->ks_private = &nfscl->nfscl_stat;
3376 		nfs_client_kstat->ks_snapshot = cl_snapshot;
3377 		kstat_install(nfs_client_kstat);
3378 	}
3379 	mutex_enter(&nfs_clnt_list_lock);
3380 	list_insert_head(&nfs_clnt_list, nfscl);
3381 	mutex_exit(&nfs_clnt_list_lock);
3382 	return (nfscl);
3383 }
3384 
3385 /*ARGSUSED*/
3386 static void
3387 clfini_zone(zoneid_t zoneid, void *arg)
3388 {
3389 	struct nfs_clnt *nfscl = arg;
3390 	chhead_t *chp, *next;
3391 
3392 	if (nfscl == NULL)
3393 		return;
3394 	mutex_enter(&nfs_clnt_list_lock);
3395 	list_remove(&nfs_clnt_list, nfscl);
3396 	mutex_exit(&nfs_clnt_list_lock);
3397 	clreclaim_zone(nfscl, 0);
3398 	for (chp = nfscl->nfscl_chtable; chp != NULL; chp = next) {
3399 		ASSERT(chp->ch_list == NULL);
3400 		kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1);
3401 		next = chp->ch_next;
3402 		kmem_free(chp, sizeof (*chp));
3403 	}
3404 	kstat_delete_byname_zone("nfs", 0, "nfs_client", zoneid);
3405 	mutex_destroy(&nfscl->nfscl_chtable_lock);
3406 	kmem_free(nfscl, sizeof (*nfscl));
3407 }
3408 
3409 /*
3410  * Called by endpnt_destructor to make sure the client handles are
3411  * cleaned up before the RPC endpoints.  This becomes a no-op if
3412  * clfini_zone (above) is called first.  This function is needed
3413  * (rather than relying on clfini_zone to clean up) because the ZSD
3414  * callbacks have no ordering mechanism, so we have no way to ensure
3415  * that clfini_zone is called before endpnt_destructor.
3416  */
3417 void
3418 clcleanup_zone(zoneid_t zoneid)
3419 {
3420 	struct nfs_clnt *nfscl;
3421 
3422 	mutex_enter(&nfs_clnt_list_lock);
3423 	nfscl = list_head(&nfs_clnt_list);
3424 	for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) {
3425 		if (nfscl->nfscl_zoneid == zoneid) {
3426 			clreclaim_zone(nfscl, 0);
3427 			break;
3428 		}
3429 	}
3430 	mutex_exit(&nfs_clnt_list_lock);
3431 }
3432 
/*
 * One-time initialization of the NFS client support data structures:
 * the rnode hash table and rnode kmem cache, the access cache hash
 * table and its kmem cache, the client handle kmem cache, the per-zone
 * client list and zone key, several locks, and the NFS major/minor
 * device numbers.
 *
 * Always returns 0.
 */
int
nfs_subrinit(void)
{
	int i;
	ulong_t nrnode_max;

	/*
	 * Allocate and initialize the rnode hash queues
	 */
	/* A non-positive nrnode tunable defaults to ncsize. */
	if (nrnode <= 0)
		nrnode = ncsize;
	/* Cap nrnode so that rnodes fit in a quarter of kmem_maxavail(). */
	nrnode_max = (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode));
	if (nrnode > nrnode_max || (nrnode == 0 && ncsize == 0)) {
		zcmn_err(GLOBAL_ZONEID, CE_NOTE,
		    "setting nrnode to max value of %ld", nrnode_max);
		nrnode = nrnode_max;
	}

	/*
	 * The table size is a power of two so that rtablemask
	 * (size - 1) can be used as a bit mask.
	 */
	rtablesize = 1 << highbit(nrnode / hashlen);
	rtablemask = rtablesize - 1;
	rtable = kmem_alloc(rtablesize * sizeof (*rtable), KM_SLEEP);
	for (i = 0; i < rtablesize; i++) {
		/* An empty bucket's links point back at the bucket itself. */
		rtable[i].r_hashf = (rnode_t *)(&rtable[i]);
		rtable[i].r_hashb = (rnode_t *)(&rtable[i]);
		rw_init(&rtable[i].r_lock, NULL, RW_DEFAULT, NULL);
	}
	/* nfs_reclaim is supplied as the cache's reclaim callback. */
	rnode_cache = kmem_cache_create("rnode_cache", sizeof (rnode_t),
	    0, NULL, NULL, nfs_reclaim, NULL, NULL, 0);

	/*
	 * Allocate and initialize the access cache
	 */

	/*
	 * Initial guess is one access cache entry per rnode unless
	 * nacache is set to a non-zero value and then it is used to
	 * indicate a guess at the number of access cache entries.
	 */
	if (nacache > 0)
		acachesize = 1 << highbit(nacache / hashlen);
	else
		acachesize = rtablesize;
	acachemask = acachesize - 1;
	acache = kmem_alloc(acachesize * sizeof (*acache), KM_SLEEP);
	for (i = 0; i < acachesize; i++) {
		/* Empty circular chain: the bucket head links to itself. */
		acache[i].next = (acache_t *)&acache[i];
		acache[i].prev = (acache_t *)&acache[i];
		rw_init(&acache[i].lock, NULL, RW_DEFAULT, NULL);
	}
	acache_cache = kmem_cache_create("nfs_access_cache",
	    sizeof (acache_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	/*
	 * Allocate and initialize the client handle cache
	 */
	chtab_cache = kmem_cache_create("client_handle_cache",
	    sizeof (struct chtab), 0, NULL, NULL, clreclaim, NULL, NULL, 0);
	/*
	 * Initialize the list of per-zone client handles (and associated data).
	 * This needs to be done before we call zone_key_create().
	 */
	list_create(&nfs_clnt_list, sizeof (struct nfs_clnt),
	    offsetof(struct nfs_clnt, nfscl_node));
	/*
	 * Initialize the zone_key for per-zone client handle lists.
	 */
	zone_key_create(&nfsclnt_zone_key, clinit_zone, NULL, clfini_zone);
	/*
	 * Initialize the various mutexes and reader/writer locks
	 */
	mutex_init(&rpfreelist_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&newnum_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&nfs_minor_lock, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * Assign unique major number for all nfs mounts
	 */
	if ((nfs_major = getudev()) == -1) {
		/* Warn and fall back to major number 0. */
		zcmn_err(GLOBAL_ZONEID, CE_WARN,
		    "nfs: init: can't get unique device number");
		nfs_major = 0;
	}
	nfs_minor = 0;

	/* Apply the default jukebox delay if none has been set. */
	if (nfs3_jukebox_delay == 0)
		nfs3_jukebox_delay = NFS3_JUKEBOX_DELAY;

	return (0);
}
3521 
3522 void
3523 nfs_subrfini(void)
3524 {
3525 	int i;
3526 
3527 	/*
3528 	 * Deallocate the rnode hash queues
3529 	 */
3530 	kmem_cache_destroy(rnode_cache);
3531 
3532 	for (i = 0; i < rtablesize; i++)
3533 		rw_destroy(&rtable[i].r_lock);
3534 	kmem_free(rtable, rtablesize * sizeof (*rtable));
3535 
3536 	/*
3537 	 * Deallocated the access cache
3538 	 */
3539 	kmem_cache_destroy(acache_cache);
3540 
3541 	for (i = 0; i < acachesize; i++)
3542 		rw_destroy(&acache[i].lock);
3543 	kmem_free(acache, acachesize * sizeof (*acache));
3544 
3545 	/*
3546 	 * Deallocate the client handle cache
3547 	 */
3548 	kmem_cache_destroy(chtab_cache);
3549 
3550 	/*
3551 	 * Destroy the various mutexes and reader/writer locks
3552 	 */
3553 	mutex_destroy(&rpfreelist_lock);
3554 	mutex_destroy(&newnum_lock);
3555 	mutex_destroy(&nfs_minor_lock);
3556 	(void) zone_key_delete(nfsclnt_zone_key);
3557 }
3558 
3559 enum nfsstat
3560 puterrno(int error)
3561 {
3562 
3563 	switch (error) {
3564 	case EOPNOTSUPP:
3565 		return (NFSERR_OPNOTSUPP);
3566 	case ENAMETOOLONG:
3567 		return (NFSERR_NAMETOOLONG);
3568 	case ENOTEMPTY:
3569 		return (NFSERR_NOTEMPTY);
3570 	case EDQUOT:
3571 		return (NFSERR_DQUOT);
3572 	case ESTALE:
3573 		return (NFSERR_STALE);
3574 	case EREMOTE:
3575 		return (NFSERR_REMOTE);
3576 	case ENOSYS:
3577 		return (NFSERR_OPNOTSUPP);
3578 	case EOVERFLOW:
3579 		return (NFSERR_INVAL);
3580 	default:
3581 		return ((enum nfsstat)error);
3582 	}
3583 	/* NOTREACHED */
3584 }
3585 
3586 int
3587 geterrno(enum nfsstat status)
3588 {
3589 
3590 	switch (status) {
3591 	case NFSERR_OPNOTSUPP:
3592 		return (EOPNOTSUPP);
3593 	case NFSERR_NAMETOOLONG:
3594 		return (ENAMETOOLONG);
3595 	case NFSERR_NOTEMPTY:
3596 		return (ENOTEMPTY);
3597 	case NFSERR_DQUOT:
3598 		return (EDQUOT);
3599 	case NFSERR_STALE:
3600 		return (ESTALE);
3601 	case NFSERR_REMOTE:
3602 		return (EREMOTE);
3603 	case NFSERR_WFLUSH:
3604 		return (EIO);
3605 	default:
3606 		return ((int)status);
3607 	}
3608 	/* NOTREACHED */
3609 }
3610 
3611 enum nfsstat3
3612 puterrno3(int error)
3613 {
3614 
3615 #ifdef DEBUG
3616 	switch (error) {
3617 	case 0:
3618 		return (NFS3_OK);
3619 	case EPERM:
3620 		return (NFS3ERR_PERM);
3621 	case ENOENT:
3622 		return (NFS3ERR_NOENT);
3623 	case EIO:
3624 		return (NFS3ERR_IO);
3625 	case ENXIO:
3626 		return (NFS3ERR_NXIO);
3627 	case EACCES:
3628 		return (NFS3ERR_ACCES);
3629 	case EEXIST:
3630 		return (NFS3ERR_EXIST);
3631 	case EXDEV:
3632 		return (NFS3ERR_XDEV);
3633 	case ENODEV:
3634 		return (NFS3ERR_NODEV);
3635 	case ENOTDIR:
3636 		return (NFS3ERR_NOTDIR);
3637 	case EISDIR:
3638 		return (NFS3ERR_ISDIR);
3639 	case EINVAL:
3640 		return (NFS3ERR_INVAL);
3641 	case EFBIG:
3642 		return (NFS3ERR_FBIG);
3643 	case ENOSPC:
3644 		return (NFS3ERR_NOSPC);
3645 	case EROFS:
3646 		return (NFS3ERR_ROFS);
3647 	case EMLINK:
3648 		return (NFS3ERR_MLINK);
3649 	case ENAMETOOLONG:
3650 		return (NFS3ERR_NAMETOOLONG);
3651 	case ENOTEMPTY:
3652 		return (NFS3ERR_NOTEMPTY);
3653 	case EDQUOT:
3654 		return (NFS3ERR_DQUOT);
3655 	case ESTALE:
3656 		return (NFS3ERR_STALE);
3657 	case EREMOTE:
3658 		return (NFS3ERR_REMOTE);
3659 	case ENOSYS:
3660 	case EOPNOTSUPP:
3661 		return (NFS3ERR_NOTSUPP);
3662 	case EOVERFLOW:
3663 		return (NFS3ERR_INVAL);
3664 	default:
3665 		zcmn_err(getzoneid(), CE_WARN,
3666 		    "puterrno3: got error %d", error);
3667 		return ((enum nfsstat3)error);
3668 	}
3669 #else
3670 	switch (error) {
3671 	case ENAMETOOLONG:
3672 		return (NFS3ERR_NAMETOOLONG);
3673 	case ENOTEMPTY:
3674 		return (NFS3ERR_NOTEMPTY);
3675 	case EDQUOT:
3676 		return (NFS3ERR_DQUOT);
3677 	case ESTALE:
3678 		return (NFS3ERR_STALE);
3679 	case ENOSYS:
3680 	case EOPNOTSUPP:
3681 		return (NFS3ERR_NOTSUPP);
3682 	case EREMOTE:
3683 		return (NFS3ERR_REMOTE);
3684 	case EOVERFLOW:
3685 		return (NFS3ERR_INVAL);
3686 	default:
3687 		return ((enum nfsstat3)error);
3688 	}
3689 #endif
3690 }
3691 
3692 int
3693 geterrno3(enum nfsstat3 status)
3694 {
3695 
3696 #ifdef DEBUG
3697 	switch (status) {
3698 	case NFS3_OK:
3699 		return (0);
3700 	case NFS3ERR_PERM:
3701 		return (EPERM);
3702 	case NFS3ERR_NOENT:
3703 		return (ENOENT);
3704 	case NFS3ERR_IO:
3705 		return (EIO);
3706 	case NFS3ERR_NXIO:
3707 		return (ENXIO);
3708 	case NFS3ERR_ACCES:
3709 		return (EACCES);
3710 	case NFS3ERR_EXIST:
3711 		return (EEXIST);
3712 	case NFS3ERR_XDEV:
3713 		return (EXDEV);
3714 	case NFS3ERR_NODEV:
3715 		return (ENODEV);
3716 	case NFS3ERR_NOTDIR:
3717 		return (ENOTDIR);
3718 	case NFS3ERR_ISDIR:
3719 		return (EISDIR);
3720 	case NFS3ERR_INVAL:
3721 		return (EINVAL);
3722 	case NFS3ERR_FBIG:
3723 		return (EFBIG);
3724 	case NFS3ERR_NOSPC:
3725 		return (ENOSPC);
3726 	case NFS3ERR_ROFS:
3727 		return (EROFS);
3728 	case NFS3ERR_MLINK:
3729 		return (EMLINK);
3730 	case NFS3ERR_NAMETOOLONG:
3731 		return (ENAMETOOLONG);
3732 	case NFS3ERR_NOTEMPTY:
3733 		return (ENOTEMPTY);
3734 	case NFS3ERR_DQUOT:
3735 		return (EDQUOT);
3736 	case NFS3ERR_STALE:
3737 		return (ESTALE);
3738 	case NFS3ERR_REMOTE:
3739 		return (EREMOTE);
3740 	case NFS3ERR_BADHANDLE:
3741 		return (ESTALE);
3742 	case NFS3ERR_NOT_SYNC:
3743 		return (EINVAL);
3744 	case NFS3ERR_BAD_COOKIE:
3745 		return (ENOENT);
3746 	case NFS3ERR_NOTSUPP:
3747 		return (EOPNOTSUPP);
3748 	case NFS3ERR_TOOSMALL:
3749 		return (EINVAL);
3750 	case NFS3ERR_SERVERFAULT:
3751 		return (EIO);
3752 	case NFS3ERR_BADTYPE:
3753 		return (EINVAL);
3754 	case NFS3ERR_JUKEBOX:
3755 		return (ENXIO);
3756 	default:
3757 		zcmn_err(getzoneid(), CE_WARN,
3758 		    "geterrno3: got status %d", status);
3759 		return ((int)status);
3760 	}
3761 #else
3762 	switch (status) {
3763 	case NFS3ERR_NAMETOOLONG:
3764 		return (ENAMETOOLONG);
3765 	case NFS3ERR_NOTEMPTY:
3766 		return (ENOTEMPTY);
3767 	case NFS3ERR_DQUOT:
3768 		return (EDQUOT);
3769 	case NFS3ERR_STALE:
3770 	case NFS3ERR_BADHANDLE:
3771 		return (ESTALE);
3772 	case NFS3ERR_NOTSUPP:
3773 		return (EOPNOTSUPP);
3774 	case NFS3ERR_REMOTE:
3775 		return (EREMOTE);
3776 	case NFS3ERR_NOT_SYNC:
3777 	case NFS3ERR_TOOSMALL:
3778 	case NFS3ERR_BADTYPE:
3779 		return (EINVAL);
3780 	case NFS3ERR_BAD_COOKIE:
3781 		return (ENOENT);
3782 	case NFS3ERR_SERVERFAULT:
3783 		return (EIO);
3784 	case NFS3ERR_JUKEBOX:
3785 		return (ENXIO);
3786 	default:
3787 		return ((int)status);
3788 	}
3789 #endif
3790 }
3791 
3792 rddir_cache *
3793 rddir_cache_alloc(int flags)
3794 {
3795 	rddir_cache *rc;
3796 
3797 	rc = kmem_alloc(sizeof (*rc), flags);
3798 	if (rc != NULL) {
3799 		rc->entries = NULL;
3800 		rc->flags = RDDIR;
3801 		cv_init(&rc->cv, NULL, CV_DEFAULT, NULL);
3802 		mutex_init(&rc->lock, NULL, MUTEX_DEFAULT, NULL);
3803 		rc->count = 1;
3804 #ifdef DEBUG
3805 		atomic_add_64(&clstat_debug.dirent.value.ui64, 1);
3806 #endif
3807 	}
3808 	return (rc);
3809 }
3810 
3811 static void
3812 rddir_cache_free(rddir_cache *rc)
3813 {
3814 
3815 #ifdef DEBUG
3816 	atomic_add_64(&clstat_debug.dirent.value.ui64, -1);
3817 #endif
3818 	if (rc->entries != NULL) {
3819 #ifdef DEBUG
3820 		rddir_cache_buf_free(rc->entries, rc->buflen);
3821 #else
3822 		kmem_free(rc->entries, rc->buflen);
3823 #endif
3824 	}
3825 	cv_destroy(&rc->cv);
3826 	mutex_destroy(&rc->lock);
3827 	kmem_free(rc, sizeof (*rc));
3828 }
3829 
3830 void
3831 rddir_cache_hold(rddir_cache *rc)
3832 {
3833 
3834 	mutex_enter(&rc->lock);
3835 	rc->count++;
3836 	mutex_exit(&rc->lock);
3837 }
3838 
3839 void
3840 rddir_cache_rele(rddir_cache *rc)
3841 {
3842 
3843 	mutex_enter(&rc->lock);
3844 	ASSERT(rc->count > 0);
3845 	if (--rc->count == 0) {
3846 		mutex_exit(&rc->lock);
3847 		rddir_cache_free(rc);
3848 	} else
3849 		mutex_exit(&rc->lock);
3850 }
3851 
3852 #ifdef DEBUG
3853 char *
3854 rddir_cache_buf_alloc(size_t size, int flags)
3855 {
3856 	char *rc;
3857 
3858 	rc = kmem_alloc(size, flags);
3859 	if (rc != NULL)
3860 		atomic_add_64(&clstat_debug.dirents.value.ui64, size);
3861 	return (rc);
3862 }
3863 
3864 void
3865 rddir_cache_buf_free(void *addr, size_t size)
3866 {
3867 
3868 	atomic_add_64(&clstat_debug.dirents.value.ui64, -(int64_t)size);
3869 	kmem_free(addr, size);
3870 }
3871 #endif
3872 
3873 static int
3874 nfs_free_data_reclaim(rnode_t *rp)
3875 {
3876 	char *contents;
3877 	int size;
3878 	vsecattr_t *vsp;
3879 	nfs3_pathconf_info *info;
3880 	int freed;
3881 	cred_t *cred;
3882 
3883 	/*
3884 	 * Free any held credentials and caches which
3885 	 * may be associated with this rnode.
3886 	 */
3887 	mutex_enter(&rp->r_statelock);
3888 	cred = rp->r_cred;
3889 	rp->r_cred = NULL;
3890 	contents = rp->r_symlink.contents;
3891 	size = rp->r_symlink.size;
3892 	rp->r_symlink.contents = NULL;
3893 	vsp = rp->r_secattr;
3894 	rp->r_secattr = NULL;
3895 	info = rp->r_pathconf;
3896 	rp->r_pathconf = NULL;
3897 	mutex_exit(&rp->r_statelock);
3898 
3899 	if (cred != NULL)
3900 		crfree(cred);
3901 
3902 	/*
3903 	 * Free the access cache entries.
3904 	 */
3905 	freed = nfs_access_purge_rp(rp);
3906 
3907 	if (!HAVE_RDDIR_CACHE(rp) &&
3908 	    contents == NULL &&
3909 	    vsp == NULL &&
3910 	    info == NULL)
3911 		return (freed);
3912 
3913 	/*
3914 	 * Free the readdir cache entries
3915 	 */
3916 	if (HAVE_RDDIR_CACHE(rp))
3917 		nfs_purge_rddir_cache(RTOV(rp));
3918 
3919 	/*
3920 	 * Free the symbolic link cache.
3921 	 */
3922 	if (contents != NULL) {
3923 
3924 		kmem_free((void *)contents, size);
3925 	}
3926 
3927 	/*
3928 	 * Free any cached ACL.
3929 	 */
3930 	if (vsp != NULL)
3931 		nfs_acl_free(vsp);
3932 
3933 	/*
3934 	 * Free any cached pathconf information.
3935 	 */
3936 	if (info != NULL)
3937 		kmem_free(info, sizeof (*info));
3938 
3939 	return (1);
3940 }
3941 
3942 static int
3943 nfs_active_data_reclaim(rnode_t *rp)
3944 {
3945 	char *contents;
3946 	int size;
3947 	vsecattr_t *vsp;
3948 	nfs3_pathconf_info *info;
3949 	int freed;
3950 
3951 	/*
3952 	 * Free any held credentials and caches which
3953 	 * may be associated with this rnode.
3954 	 */
3955 	if (!mutex_tryenter(&rp->r_statelock))
3956 		return (0);
3957 	contents = rp->r_symlink.contents;
3958 	size = rp->r_symlink.size;
3959 	rp->r_symlink.contents = NULL;
3960 	vsp = rp->r_secattr;
3961 	rp->r_secattr = NULL;
3962 	info = rp->r_pathconf;
3963 	rp->r_pathconf = NULL;
3964 	mutex_exit(&rp->r_statelock);
3965 
3966 	/*
3967 	 * Free the access cache entries.
3968 	 */
3969 	freed = nfs_access_purge_rp(rp);
3970 
3971 	if (!HAVE_RDDIR_CACHE(rp) &&
3972 	    contents == NULL &&
3973 	    vsp == NULL &&
3974 	    info == NULL)
3975 		return (freed);
3976 
3977 	/*
3978 	 * Free the readdir cache entries
3979 	 */
3980 	if (HAVE_RDDIR_CACHE(rp))
3981 		nfs_purge_rddir_cache(RTOV(rp));
3982 
3983 	/*
3984 	 * Free the symbolic link cache.
3985 	 */
3986 	if (contents != NULL) {
3987 
3988 		kmem_free((void *)contents, size);
3989 	}
3990 
3991 	/*
3992 	 * Free any cached ACL.
3993 	 */
3994 	if (vsp != NULL)
3995 		nfs_acl_free(vsp);
3996 
3997 	/*
3998 	 * Free any cached pathconf information.
3999 	 */
4000 	if (info != NULL)
4001 		kmem_free(info, sizeof (*info));
4002 
4003 	return (1);
4004 }
4005 
4006 static int
4007 nfs_free_reclaim(void)
4008 {
4009 	int freed;
4010 	rnode_t *rp;
4011 
4012 #ifdef DEBUG
4013 	clstat_debug.f_reclaim.value.ui64++;
4014 #endif
4015 	freed = 0;
4016 	mutex_enter(&rpfreelist_lock);
4017 	rp = rpfreelist;
4018 	if (rp != NULL) {
4019 		do {
4020 			if (nfs_free_data_reclaim(rp))
4021 				freed = 1;
4022 		} while ((rp = rp->r_freef) != rpfreelist);
4023 	}
4024 	mutex_exit(&rpfreelist_lock);
4025 	return (freed);
4026 }
4027 
4028 static int
4029 nfs_active_reclaim(void)
4030 {
4031 	int freed;
4032 	int index;
4033 	rnode_t *rp;
4034 
4035 #ifdef DEBUG
4036 	clstat_debug.a_reclaim.value.ui64++;
4037 #endif
4038 	freed = 0;
4039 	for (index = 0; index < rtablesize; index++) {
4040 		rw_enter(&rtable[index].r_lock, RW_READER);
4041 		for (rp = rtable[index].r_hashf;
4042 		    rp != (rnode_t *)(&rtable[index]);
4043 		    rp = rp->r_hashf) {
4044 			if (nfs_active_data_reclaim(rp))
4045 				freed = 1;
4046 		}
4047 		rw_exit(&rtable[index].r_lock);
4048 	}
4049 	return (freed);
4050 }
4051 
/*
 * Drain the rnode freelist, handing each rnode to rp_addfree() so that
 * it can be destroyed.
 *
 * NOTE(review): "freed" is initialized to 0 and never modified, so this
 * function currently always returns 0 regardless of how many rnodes
 * were processed.
 */
static int
nfs_rnode_reclaim(void)
{
	int freed;
	rnode_t *rp;
	vnode_t *vp;

#ifdef DEBUG
	clstat_debug.r_reclaim.value.ui64++;
#endif
	freed = 0;
	mutex_enter(&rpfreelist_lock);
	while ((rp = rpfreelist) != NULL) {
		/*
		 * Take the rnode off the freelist, then drop the freelist
		 * lock while working on it.
		 */
		rp_rmfree(rp);
		mutex_exit(&rpfreelist_lock);
		if (rp->r_flags & RHASHED) {
			vp = RTOV(rp);
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			if (vp->v_count > 1) {
				/*
				 * The vnode has more than one hold;
				 * presumably someone else picked it up
				 * through the hash.  Drop one hold, leave
				 * the rnode hashed and move on to the next
				 * freelist entry.
				 */
				vp->v_count--;
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				mutex_enter(&rpfreelist_lock);
				continue;
			}
			mutex_exit(&vp->v_lock);
			rp_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}
		/*
		 * This call to rp_addfree will end up destroying the
		 * rnode, but in a safe way with the appropriate set
		 * of checks done.
		 */
		rp_addfree(rp, CRED());
		/* Retake the freelist lock before re-reading its head. */
		mutex_enter(&rpfreelist_lock);
	}
	mutex_exit(&rpfreelist_lock);
	return (freed);
}
4093 
/*
 * Reclaim callback for the rnode kmem cache (see nfs_subrinit()).
 *
 * Tries progressively more drastic measures and stops at the first one
 * that reports having freed something: first cached data on freelist
 * rnodes, then cached data on hashed rnodes, and finally the freelist
 * rnodes themselves.
 */
/*ARGSUSED*/
static void
nfs_reclaim(void *cdrarg)
{
#ifdef DEBUG
	clstat_debug.reclaim.value.ui64++;
#endif
	if (nfs_free_reclaim() || nfs_active_reclaim())
		return;

	(void) nfs_rnode_reclaim();
}
4110 
4111 /*
4112  * NFS client failover support
4113  *
4114  * Routines to copy filehandles
4115  */
4116 void
4117 nfscopyfh(caddr_t fhp, vnode_t *vp)
4118 {
4119 	fhandle_t *dest = (fhandle_t *)fhp;
4120 
4121 	if (dest != NULL)
4122 		*dest = *VTOFH(vp);
4123 }
4124 
4125 void
4126 nfs3copyfh(caddr_t fhp, vnode_t *vp)
4127 {
4128 	nfs_fh3 *dest = (nfs_fh3 *)fhp;
4129 
4130 	if (dest != NULL)
4131 		*dest = *VTOFH3(vp);
4132 }
4133 
4134 /*
4135  * NFS client failover support
4136  *
4137  * failover_safe() will test various conditions to ensure that
4138  * failover is permitted for this vnode.  It will be denied
4139  * if:
4140  *	1) the operation in progress does not support failover (NULL fi)
4141  *	2) there are no available replicas (NULL mi_servers->sv_next)
4142  *	3) any locks are outstanding on this file
4143  */
4144 static int
4145 failover_safe(failinfo_t *fi)
4146 {
4147 
4148 	/*
4149 	 * Does this op permit failover?
4150 	 */
4151 	if (fi == NULL || fi->vp == NULL)
4152 		return (0);
4153 
4154 	/*
4155 	 * Are there any alternates to failover to?
4156 	 */
4157 	if (VTOMI(fi->vp)->mi_servers->sv_next == NULL)
4158 		return (0);
4159 
4160 	/*
4161 	 * Disable check; we've forced local locking
4162 	 *
4163 	 * if (flk_has_remote_locks(fi->vp))
4164 	 *	return (0);
4165 	 */
4166 
4167 	/*
4168 	 * If we have no partial path, we can't do anything
4169 	 */
4170 	if (VTOR(fi->vp)->r_path == NULL)
4171 		return (0);
4172 
4173 	return (1);
4174 }
4175 
4176 #include <sys/thread.h>
4177 
4178 /*
4179  * NFS client failover support
4180  *
4181  * failover_newserver() will start a search for a new server,
4182  * preferably by starting an async thread to do the work.  If
4183  * someone is already doing this (recognizable by MI_BINDINPROG
4184  * being set), it will simply return and the calling thread
4185  * will queue on the mi_failover_cv condition variable.
4186  */
4187 static void
4188 failover_newserver(mntinfo_t *mi)
4189 {
4190 	/*
4191 	 * Check if someone else is doing this already
4192 	 */
4193 	mutex_enter(&mi->mi_lock);
4194 	if (mi->mi_flags & MI_BINDINPROG) {
4195 		mutex_exit(&mi->mi_lock);
4196 		return;
4197 	}
4198 	mi->mi_flags |= MI_BINDINPROG;
4199 
4200 	/*
4201 	 * Need to hold the vfs struct so that it can't be released
4202 	 * while the failover thread is selecting a new server.
4203 	 */
4204 	VFS_HOLD(mi->mi_vfsp);
4205 
4206 	/*
4207 	 * Start a thread to do the real searching.
4208 	 */
4209 	(void) zthread_create(NULL, 0, failover_thread, mi, 0, minclsyspri);
4210 
4211 	mutex_exit(&mi->mi_lock);
4212 }
4213 
/*
 * NFS client failover support
 *
 * failover_thread() will find a new server to replace the one
 * currently in use, wake up other threads waiting on this mount
 * point, and die.  It will start at the head of the server list
 * and poll servers until it finds one with an NFS server which is
 * registered and responds to a NULL procedure ping.
 *
 * On exit it clears MI_BINDINPROG, records the chosen server (if any)
 * in mi_curr_serv, broadcasts on mi_failover_cv and releases the vfs
 * hold taken by failover_newserver().
 *
 * XXX failover_thread is unsafe within the scope of the
 * present model defined for cpr to suspend the system.
 * Specifically, over-the-wire calls made by the thread
 * are unsafe. The thread needs to be reevaluated in case of
 * future updates to the cpr suspend model.
 */
static void
failover_thread(mntinfo_t *mi)
{
	servinfo_t *svp = NULL;
	CLIENT *cl;
	enum clnt_stat status;
	struct timeval tv;
	int error;
	int oncethru = 0;	/* set once a full pass found no server */
	callb_cpr_t cprinfo;
	rnode_t *rp;
	int index;
	char *srvnames;		/* only valid once oncethru is set */
	size_t srvnames_len;
	struct nfs_clnt *nfscl = NULL;
	zoneid_t zoneid = getzoneid();

#ifdef DEBUG
	/*
	 * This is currently only needed to access counters which exist on
	 * DEBUG kernels, hence we don't want to pay the penalty of the lookup
	 * on non-DEBUG kernels.
	 */
	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
	ASSERT(nfscl != NULL);
#endif

	/*
	 * Its safe to piggyback on the mi_lock since failover_newserver()
	 * code guarantees that there will be only one failover thread
	 * per mountinfo at any instance.
	 */
	CALLB_CPR_INIT(&cprinfo, &mi->mi_lock, callb_generic_cpr,
	    "failover_thread");

	/* Wait until mi_readers has dropped to zero before proceeding. */
	mutex_enter(&mi->mi_lock);
	while (mi->mi_readers) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(&mi->mi_failover_cv, &mi->mi_lock);
		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
	}
	mutex_exit(&mi->mi_lock);

	/* Timeout passed to each NULL procedure ping below. */
	tv.tv_sec = 2;
	tv.tv_usec = 0;

	/*
	 * Ping the null NFS procedure of every server in
	 * the list until one responds.  We always start
	 * at the head of the list and always skip the one
	 * that is current, since it's caused us a problem.
	 * (The current server is only skipped on the first pass;
	 * once oncethru is set it is pinged like any other.)
	 */
	while (svp == NULL) {
		for (svp = mi->mi_servers; svp; svp = svp->sv_next) {
			if (!oncethru && svp == mi->mi_curr_serv)
				continue;

			/*
			 * If the file system was forcibly umounted
			 * while trying to do a failover, then just
			 * give up on the failover.  It won't matter
			 * what the server is.
			 */
			if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
				svp = NULL;
				goto done;
			}

			error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
			    NFS_PROGRAM, NFS_VERSION, 0, 1, CRED(), &cl);
			if (error)
				continue;

			/*
			 * Unless the mount is interruptible (MI_INT), set
			 * cl_nosignal for the duration of the call.
			 */
			if (!(mi->mi_flags & MI_INT))
				cl->cl_nosignal = TRUE;
			status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
			    xdr_void, NULL, tv);
			if (!(mi->mi_flags & MI_INT))
				cl->cl_nosignal = FALSE;
			AUTH_DESTROY(cl->cl_auth);
			CLNT_DESTROY(cl);
			if (status == RPC_SUCCESS) {
				if (svp == mi->mi_curr_serv) {
#ifdef DEBUG
					zcmn_err(zoneid, CE_NOTE,
			"NFS%d: failing over: selecting original server %s",
					    mi->mi_vers, svp->sv_hostname);
#else
					zcmn_err(zoneid, CE_NOTE,
			"NFS: failing over: selecting original server %s",
					    svp->sv_hostname);
#endif
				} else {
#ifdef DEBUG
					zcmn_err(zoneid, CE_NOTE,
				    "NFS%d: failing over from %s to %s",
					    mi->mi_vers,
					    mi->mi_curr_serv->sv_hostname,
					    svp->sv_hostname);
#else
					zcmn_err(zoneid, CE_NOTE,
				    "NFS: failing over from %s to %s",
					    mi->mi_curr_serv->sv_hostname,
					    svp->sv_hostname);
#endif
				}
				/* svp is left pointing at the responder. */
				break;
			}
		}

		if (svp == NULL) {
			/*
			 * A whole pass found nobody.  Report it the first
			 * time only, then pause for hz ticks and try again.
			 */
			if (!oncethru) {
				srvnames = nfs_getsrvnames(mi, &srvnames_len);
#ifdef DEBUG
				zprintf(zoneid,
				    "NFS%d servers %s not responding "
				    "still trying\n", mi->mi_vers, srvnames);
#else
				zprintf(zoneid, "NFS servers %s not responding "
				    "still trying\n", srvnames);
#endif
				oncethru = 1;
			}
			mutex_enter(&mi->mi_lock);
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			mutex_exit(&mi->mi_lock);
			delay(hz);
			mutex_enter(&mi->mi_lock);
			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
			mutex_exit(&mi->mi_lock);
		}
	}

	if (oncethru) {
#ifdef DEBUG
		zprintf(zoneid, "NFS%d servers %s ok\n", mi->mi_vers, srvnames);
#else
		zprintf(zoneid, "NFS servers %s ok\n", srvnames);
#endif
	}

	if (svp != mi->mi_curr_serv) {
		/*
		 * Switching servers: purge the DNLC for this vfs and, if
		 * an rnode exists for the old server's sv_fhandle, give it
		 * the new server's file handle and rehash it.
		 */
		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
		index = rtablehash(&mi->mi_curr_serv->sv_fhandle);
		rw_enter(&rtable[index].r_lock, RW_WRITER);
		rp = rfind(&rtable[index], &mi->mi_curr_serv->sv_fhandle,
		    mi->mi_vfsp);
		if (rp != NULL) {
			if (rp->r_flags & RHASHED)
				rp_rmhash_locked(rp);
			rw_exit(&rtable[index].r_lock);
			rp->r_server = svp;
			rp->r_fh = svp->sv_fhandle;
			/* Drop data cached from the old server. */
			(void) nfs_free_data_reclaim(rp);
			index = rtablehash(&rp->r_fh);
			rp->r_hashq = &rtable[index];
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			vn_exists(RTOV(rp));
			rp_addhash(rp);
			rw_exit(&rp->r_hashq->r_lock);
			/* presumably releases the hold taken by rfind() */
			VN_RELE(RTOV(rp));
		} else
			rw_exit(&rtable[index].r_lock);
	}

done:
	/* srvnames was only allocated if the "not responding" path ran. */
	if (oncethru)
		kmem_free(srvnames, srvnames_len);
	mutex_enter(&mi->mi_lock);
	mi->mi_flags &= ~MI_BINDINPROG;
	if (svp != NULL) {
		mi->mi_curr_serv = svp;
		mi->mi_failover++;
#ifdef DEBUG
	nfscl->nfscl_stat.failover.value.ui64++;
#endif
	}
	/* Wake every thread waiting for the failover to finish. */
	cv_broadcast(&mi->mi_failover_cv);
	CALLB_CPR_EXIT(&cprinfo);
	VFS_RELE(mi->mi_vfsp);
	zthread_exit();
	/* NOTREACHED */
}
4412 
4413 /*
4414  * NFS client failover support
4415  *
4416  * failover_wait() will put the thread to sleep until MI_BINDINPROG
4417  * is cleared, meaning that failover is complete.  Called with
4418  * mi_lock mutex held.
4419  */
4420 static int
4421 failover_wait(mntinfo_t *mi)
4422 {
4423 	k_sigset_t smask;
4424 
4425 	/*
4426 	 * If someone else is hunting for a living server,
4427 	 * sleep until it's done.  After our sleep, we may
4428 	 * be bound to the right server and get off cheaply.
4429 	 */
4430 	while (mi->mi_flags & MI_BINDINPROG) {
4431 		/*
4432 		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
4433 		 * and SIGTERM. (Preserving the existing masks).
4434 		 * Mask out SIGINT if mount option nointr is specified.
4435 		 */
4436 		sigintr(&smask, (int)mi->mi_flags & MI_INT);
4437 		if (!cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock)) {
4438 			/*
4439 			 * restore original signal mask
4440 			 */
4441 			sigunintr(&smask);
4442 			return (EINTR);
4443 		}
4444 		/*
4445 		 * restore original signal mask
4446 		 */
4447 		sigunintr(&smask);
4448 	}
4449 	return (0);
4450 }
4451 
4452 /*
4453  * NFS client failover support
4454  *
4455  * failover_remap() will do a partial pathname lookup and find the
4456  * desired vnode on the current server.  The interim vnode will be
4457  * discarded after we pilfer the new filehandle.
4458  *
4459  * Side effects:
4460  * - This routine will also update the filehandle in the args structure
4461  *    pointed to by the fi->fhp pointer if it is non-NULL.
4462  */
4463 
4464 static int
4465 failover_remap(failinfo_t *fi)
4466 {
4467 	vnode_t *vp, *nvp, *rootvp;
4468 	rnode_t *rp, *nrp;
4469 	mntinfo_t *mi;
4470 	int error;
4471 #ifdef DEBUG
4472 	struct nfs_clnt *nfscl;
4473 
4474 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
4475 	ASSERT(nfscl != NULL);
4476 #endif
4477 	/*
4478 	 * Sanity check
4479 	 */
4480 	if (fi == NULL || fi->vp == NULL || fi->lookupproc == NULL)
4481 		return (EINVAL);
4482 	vp = fi->vp;
4483 	rp = VTOR(vp);
4484 	mi = VTOMI(vp);
4485 
4486 	if (!(vp->v_flag & VROOT)) {
4487 		/*
4488 		 * Given the root fh, use the path stored in
4489 		 * the rnode to find the fh for the new server.
4490 		 */
4491 		error = VFS_ROOT(mi->mi_vfsp, &rootvp);
4492 		if (error)
4493 			return (error);
4494 
4495 		error = failover_lookup(rp->r_path, rootvp,
4496 		    fi->lookupproc, fi->xattrdirproc, &nvp);
4497 
4498 		VN_RELE(rootvp);
4499 
4500 		if (error)
4501 			return (error);
4502 
4503 		/*
4504 		 * If we found the same rnode, we're done now
4505 		 */
4506 		if (nvp == vp) {
4507 			/*
4508 			 * Failed and the new server may physically be same
4509 			 * OR may share a same disk subsystem. In this case
4510 			 * file handle for a particular file path is not going
4511 			 * to change, given the same filehandle lookup will
4512 			 * always locate the same rnode as the existing one.
4513 			 * All we might need to do is to update the r_server
4514 			 * with the current servinfo.
4515 			 */
4516 			if (!VALID_FH(fi)) {
4517 				rp->r_server = mi->mi_curr_serv;
4518 			}
4519 			VN_RELE(nvp);
4520 			return (0);
4521 		}
4522 
4523 		/*
4524 		 * Try to make it so that no one else will find this
4525 		 * vnode because it is just a temporary to hold the
4526 		 * new file handle until that file handle can be
4527 		 * copied to the original vnode/rnode.
4528 		 */
4529 		nrp = VTOR(nvp);
4530 		mutex_enter(&mi->mi_remap_lock);
4531 		/*
4532 		 * Some other thread could have raced in here and could
4533 		 * have done the remap for this particular rnode before
4534 		 * this thread here. Check for rp->r_server and
4535 		 * mi->mi_curr_serv and return if they are same.
4536 		 */
4537 		if (VALID_FH(fi)) {
4538 			mutex_exit(&mi->mi_remap_lock);
4539 			VN_RELE(nvp);
4540 			return (0);
4541 		}
4542 
4543 		if (nrp->r_flags & RHASHED)
4544 			rp_rmhash(nrp);
4545 
4546 		/*
4547 		 * As a heuristic check on the validity of the new
4548 		 * file, check that the size and type match against
4549 		 * that we remember from the old version.
4550 		 */
4551 		if (rp->r_size != nrp->r_size || vp->v_type != nvp->v_type) {
4552 			mutex_exit(&mi->mi_remap_lock);
4553 			zcmn_err(mi->mi_zone->zone_id, CE_WARN,
4554 			    "NFS replicas %s and %s: file %s not same.",
4555 			    rp->r_server->sv_hostname,
4556 			    nrp->r_server->sv_hostname, rp->r_path);
4557 			VN_RELE(nvp);
4558 			return (EINVAL);
4559 		}
4560 
4561 		/*
4562 		 * snarf the filehandle from the new rnode
4563 		 * then release it, again while updating the
4564 		 * hash queues for the rnode.
4565 		 */
4566 		if (rp->r_flags & RHASHED)
4567 			rp_rmhash(rp);
4568 		rp->r_server = mi->mi_curr_serv;
4569 		rp->r_fh = nrp->r_fh;
4570 		rp->r_hashq = nrp->r_hashq;
4571 		/*
4572 		 * Copy the attributes from the new rnode to the old
4573 		 * rnode.  This will help to reduce unnecessary page
4574 		 * cache flushes.
4575 		 */
4576 		rp->r_attr = nrp->r_attr;
4577 		rp->r_attrtime = nrp->r_attrtime;
4578 		rp->r_mtime = nrp->r_mtime;
4579 		(void) nfs_free_data_reclaim(rp);
4580 		nfs_setswaplike(vp, &rp->r_attr);
4581 		rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4582 		rp_addhash(rp);
4583 		rw_exit(&rp->r_hashq->r_lock);
4584 		mutex_exit(&mi->mi_remap_lock);
4585 		VN_RELE(nvp);
4586 	}
4587 
4588 	/*
4589 	 * Update successful failover remap count
4590 	 */
4591 	mutex_enter(&mi->mi_lock);
4592 	mi->mi_remap++;
4593 	mutex_exit(&mi->mi_lock);
4594 #ifdef DEBUG
4595 	nfscl->nfscl_stat.remap.value.ui64++;
4596 #endif
4597 
4598 	/*
4599 	 * If we have a copied filehandle to update, do it now.
4600 	 */
4601 	if (fi->fhp != NULL && fi->copyproc != NULL)
4602 		(*fi->copyproc)(fi->fhp, vp);
4603 
4604 	return (0);
4605 }
4606 
4607 /*
4608  * NFS client failover support
4609  *
4610  * We want a simple pathname lookup routine to parse the pieces
4611  * of path in rp->r_path.  We know that the path was a created
4612  * as rnodes were made, so we know we have only to deal with
4613  * paths that look like:
4614  *	dir1/dir2/dir3/file
4615  * Any evidence of anything like .., symlinks, and ENOTDIR
4616  * are hard errors, because they mean something in this filesystem
4617  * is different from the one we came from, or has changed under
4618  * us in some way.  If this is true, we want the failure.
4619  *
4620  * Extended attributes: if the filesystem is mounted with extended
4621  * attributes enabled (-o xattr), the attribute directory will be
4622  * represented in the r_path as the magic name XATTR_RPATH. So if
4623  * we see that name in the pathname, is must be because this node
4624  * is an extended attribute.  Therefore, look it up that way.
4625  */
4626 static int
4627 failover_lookup(char *path, vnode_t *root,
4628     int (*lookupproc)(vnode_t *, char *, vnode_t **, struct pathname *, int,
4629 	vnode_t *, cred_t *, int),
4630     int (*xattrdirproc)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
4631     vnode_t **new)
4632 {
4633 	vnode_t *dvp, *nvp;
4634 	int error = EINVAL;
4635 	char *s, *p, *tmppath;
4636 	size_t len;
4637 	mntinfo_t *mi;
4638 	bool_t xattr;
4639 
4640 	/* Make local copy of path */
4641 	len = strlen(path) + 1;
4642 	tmppath = kmem_alloc(len, KM_SLEEP);
4643 	(void) strcpy(tmppath, path);
4644 	s = tmppath;
4645 
4646 	dvp = root;
4647 	VN_HOLD(dvp);
4648 	mi = VTOMI(root);
4649 	xattr = mi->mi_flags & MI_EXTATTR;
4650 
4651 	do {
4652 		p = strchr(s, '/');
4653 		if (p != NULL)
4654 			*p = '\0';
4655 		if (xattr && strcmp(s, XATTR_RPATH) == 0) {
4656 			error = (*xattrdirproc)(dvp, &nvp, FALSE, CRED(),
4657 			    RFSCALL_SOFT);
4658 		} else {
4659 			error = (*lookupproc)(dvp, s, &nvp, NULL, 0, NULL,
4660 			    CRED(), RFSCALL_SOFT);
4661 		}
4662 		if (p != NULL)
4663 			*p++ = '/';
4664 		if (error) {
4665 			VN_RELE(dvp);
4666 			kmem_free(tmppath, len);
4667 			return (error);
4668 		}
4669 		s = p;
4670 		VN_RELE(dvp);
4671 		dvp = nvp;
4672 	} while (p != NULL);
4673 
4674 	if (nvp != NULL && new != NULL)
4675 		*new = nvp;
4676 	kmem_free(tmppath, len);
4677 	return (0);
4678 }
4679 
4680 /*
4681  * NFS client failover support
4682  *
4683  * sv_free() frees the malloc'd portion of a "servinfo_t".
4684  */
4685 void
4686 sv_free(servinfo_t *svp)
4687 {
4688 	servinfo_t *next;
4689 	struct knetconfig *knconf;
4690 
4691 	while (svp != NULL) {
4692 		next = svp->sv_next;
4693 		if (svp->sv_secdata)
4694 			sec_clnt_freeinfo(svp->sv_secdata);
4695 		if (svp->sv_hostname && svp->sv_hostnamelen > 0)
4696 			kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
4697 		knconf = svp->sv_knconf;
4698 		if (knconf != NULL) {
4699 			if (knconf->knc_protofmly != NULL)
4700 				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4701 			if (knconf->knc_proto != NULL)
4702 				kmem_free(knconf->knc_proto, KNC_STRSIZE);
4703 			kmem_free(knconf, sizeof (*knconf));
4704 		}
4705 		knconf = svp->sv_origknconf;
4706 		if (knconf != NULL) {
4707 			if (knconf->knc_protofmly != NULL)
4708 				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4709 			if (knconf->knc_proto != NULL)
4710 				kmem_free(knconf->knc_proto, KNC_STRSIZE);
4711 			kmem_free(knconf, sizeof (*knconf));
4712 		}
4713 		if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0)
4714 			kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen);
4715 		mutex_destroy(&svp->sv_lock);
4716 		kmem_free(svp, sizeof (*svp));
4717 		svp = next;
4718 	}
4719 }
4720 
4721 /*
4722  * Only can return non-zero if intr != 0.
4723  */
4724 int
4725 nfs_rw_enter_sig(nfs_rwlock_t *l, krw_t rw, int intr)
4726 {
4727 
4728 	mutex_enter(&l->lock);
4729 
4730 	/*
4731 	 * If this is a nested enter, then allow it.  There
4732 	 * must be as many exits as enters through.
4733 	 */
4734 	if (l->owner == curthread) {
4735 		/* lock is held for writing by current thread */
4736 		ASSERT(rw == RW_READER || rw == RW_WRITER);
4737 		l->count--;
4738 	} else if (rw == RW_READER) {
4739 		/*
4740 		 * While there is a writer active or writers waiting,
4741 		 * then wait for them to finish up and move on.  Then,
4742 		 * increment the count to indicate that a reader is
4743 		 * active.
4744 		 */
4745 		while (l->count < 0 || l->waiters > 0) {
4746 			if (intr) {
4747 				klwp_t *lwp = ttolwp(curthread);
4748 
4749 				if (lwp != NULL)
4750 					lwp->lwp_nostop++;
4751 				if (!cv_wait_sig(&l->cv, &l->lock)) {
4752 					if (lwp != NULL)
4753 						lwp->lwp_nostop--;
4754 					mutex_exit(&l->lock);
4755 					return (EINTR);
4756 				}
4757 				if (lwp != NULL)
4758 					lwp->lwp_nostop--;
4759 			} else
4760 				cv_wait(&l->cv, &l->lock);
4761 		}
4762 		ASSERT(l->count < INT_MAX);
4763 #ifdef	DEBUG
4764 		if ((l->count % 10000) == 9999)
4765 			cmn_err(CE_WARN, "nfs_rw_enter_sig: count %d on"
4766 			    "rwlock @ %p\n", l->count, (void *)&l);
4767 #endif
4768 		l->count++;
4769 	} else {
4770 		ASSERT(rw == RW_WRITER);
4771 		/*
4772 		 * While there are readers active or a writer
4773 		 * active, then wait for all of the readers
4774 		 * to finish or for the writer to finish.
4775 		 * Then, set the owner field to curthread and
4776 		 * decrement count to indicate that a writer
4777 		 * is active.
4778 		 */
4779 		while (l->count > 0 || l->owner != NULL) {
4780 			l->waiters++;
4781 			if (intr) {
4782 				klwp_t *lwp = ttolwp(curthread);
4783 
4784 				if (lwp != NULL)
4785 					lwp->lwp_nostop++;
4786 				if (!cv_wait_sig(&l->cv, &l->lock)) {
4787 					if (lwp != NULL)
4788 						lwp->lwp_nostop--;
4789 					l->waiters--;
4790 					cv_broadcast(&l->cv);
4791 					mutex_exit(&l->lock);
4792 					return (EINTR);
4793 				}
4794 				if (lwp != NULL)
4795 					lwp->lwp_nostop--;
4796 			} else
4797 				cv_wait(&l->cv, &l->lock);
4798 			l->waiters--;
4799 		}
4800 		l->owner = curthread;
4801 		l->count--;
4802 	}
4803 
4804 	mutex_exit(&l->lock);
4805 
4806 	return (0);
4807 }
4808 
4809 /*
4810  * If the lock is available, obtain it and return non-zero.  If there is
4811  * already a conflicting lock, return 0 immediately.
4812  */
4813 
4814 int
4815 nfs_rw_tryenter(nfs_rwlock_t *l, krw_t rw)
4816 {
4817 	mutex_enter(&l->lock);
4818 
4819 	/*
4820 	 * If this is a nested enter, then allow it.  There
4821 	 * must be as many exits as enters through.
4822 	 */
4823 	if (l->owner == curthread) {
4824 		/* lock is held for writing by current thread */
4825 		ASSERT(rw == RW_READER || rw == RW_WRITER);
4826 		l->count--;
4827 	} else if (rw == RW_READER) {
4828 		/*
4829 		 * If there is a writer active or writers waiting, deny the
4830 		 * lock.  Otherwise, bump the count of readers.
4831 		 */
4832 		if (l->count < 0 || l->waiters > 0) {
4833 			mutex_exit(&l->lock);
4834 			return (0);
4835 		}
4836 		l->count++;
4837 	} else {
4838 		ASSERT(rw == RW_WRITER);
4839 		/*
4840 		 * If there are readers active or a writer active, deny the
4841 		 * lock.  Otherwise, set the owner field to curthread and
4842 		 * decrement count to indicate that a writer is active.
4843 		 */
4844 		if (l->count > 0 || l->owner != NULL) {
4845 			mutex_exit(&l->lock);
4846 			return (0);
4847 		}
4848 		l->owner = curthread;
4849 		l->count--;
4850 	}
4851 
4852 	mutex_exit(&l->lock);
4853 
4854 	return (1);
4855 }
4856 
4857 void
4858 nfs_rw_exit(nfs_rwlock_t *l)
4859 {
4860 
4861 	mutex_enter(&l->lock);
4862 	/*
4863 	 * If this is releasing a writer lock, then increment count to
4864 	 * indicate that there is one less writer active.  If this was
4865 	 * the last of possibly nested writer locks, then clear the owner
4866 	 * field as well to indicate that there is no writer active
4867 	 * and wakeup any possible waiting writers or readers.
4868 	 *
4869 	 * If releasing a reader lock, then just decrement count to
4870 	 * indicate that there is one less reader active.  If this was
4871 	 * the last active reader and there are writer(s) waiting,
4872 	 * then wake up the first.
4873 	 */
4874 	if (l->owner != NULL) {
4875 		ASSERT(l->owner == curthread);
4876 		l->count++;
4877 		if (l->count == 0) {
4878 			l->owner = NULL;
4879 			cv_broadcast(&l->cv);
4880 		}
4881 	} else {
4882 		ASSERT(l->count > 0);
4883 		l->count--;
4884 		if (l->count == 0 && l->waiters > 0)
4885 			cv_broadcast(&l->cv);
4886 	}
4887 	mutex_exit(&l->lock);
4888 }
4889 
4890 int
4891 nfs_rw_lock_held(nfs_rwlock_t *l, krw_t rw)
4892 {
4893 
4894 	if (rw == RW_READER)
4895 		return (l->count > 0);
4896 	ASSERT(rw == RW_WRITER);
4897 	return (l->count < 0);
4898 }
4899 
4900 /* ARGSUSED */
4901 void
4902 nfs_rw_init(nfs_rwlock_t *l, char *name, krw_type_t type, void *arg)
4903 {
4904 
4905 	l->count = 0;
4906 	l->waiters = 0;
4907 	l->owner = NULL;
4908 	mutex_init(&l->lock, NULL, MUTEX_DEFAULT, NULL);
4909 	cv_init(&l->cv, NULL, CV_DEFAULT, NULL);
4910 }
4911 
4912 void
4913 nfs_rw_destroy(nfs_rwlock_t *l)
4914 {
4915 
4916 	mutex_destroy(&l->lock);
4917 	cv_destroy(&l->cv);
4918 }
4919 
4920 int
4921 nfs3_rddir_compar(const void *x, const void *y)
4922 {
4923 	rddir_cache *a = (rddir_cache *)x;
4924 	rddir_cache *b = (rddir_cache *)y;
4925 
4926 	if (a->nfs3_cookie == b->nfs3_cookie) {
4927 		if (a->buflen == b->buflen)
4928 			return (0);
4929 		if (a->buflen < b->buflen)
4930 			return (-1);
4931 		return (1);
4932 	}
4933 
4934 	if (a->nfs3_cookie < b->nfs3_cookie)
4935 		return (-1);
4936 
4937 	return (1);
4938 }
4939 
4940 int
4941 nfs_rddir_compar(const void *x, const void *y)
4942 {
4943 	rddir_cache *a = (rddir_cache *)x;
4944 	rddir_cache *b = (rddir_cache *)y;
4945 
4946 	if (a->nfs_cookie == b->nfs_cookie) {
4947 		if (a->buflen == b->buflen)
4948 			return (0);
4949 		if (a->buflen < b->buflen)
4950 			return (-1);
4951 		return (1);
4952 	}
4953 
4954 	if (a->nfs_cookie < b->nfs_cookie)
4955 		return (-1);
4956 
4957 	return (1);
4958 }
4959 
4960 static char *
4961 nfs_getsrvnames(mntinfo_t *mi, size_t *len)
4962 {
4963 	servinfo_t *s;
4964 	char *srvnames;
4965 	char *namep;
4966 	size_t length;
4967 
4968 	/*
4969 	 * Calculate the length of the string required to hold all
4970 	 * of the server names plus either a comma or a null
4971 	 * character following each individual one.
4972 	 */
4973 	length = 0;
4974 	for (s = mi->mi_servers; s != NULL; s = s->sv_next)
4975 		length += s->sv_hostnamelen;
4976 
4977 	srvnames = kmem_alloc(length, KM_SLEEP);
4978 
4979 	namep = srvnames;
4980 	for (s = mi->mi_servers; s != NULL; s = s->sv_next) {
4981 		(void) strcpy(namep, s->sv_hostname);
4982 		namep += s->sv_hostnamelen - 1;
4983 		*namep++ = ',';
4984 	}
4985 	*--namep = '\0';
4986 
4987 	*len = length;
4988 
4989 	return (srvnames);
4990 }
4991 
4992 /*
4993  * These two functions are temporary and designed for the upgrade-workaround
4994  * only.  They cannot be used for general zone-crossing NFS client support, and
4995  * will be removed shortly.
4996  *
4997  * When the workaround is enabled, all NFS traffic is forced into the global
4998  * zone.  These functions are called when the code needs to refer to the state
4999  * of the underlying network connection.  They're not called when the function
5000  * needs to refer to the state of the process that invoked the system call.
5001  * (E.g., when checking whether the zone is shutting down during the mount()
5002  * call.)
5003  */
5004 
5005 struct zone *
5006 nfs_zone(void)
5007 {
5008 	return (nfs_global_client_only != 0 ? global_zone : curproc->p_zone);
5009 }
5010 
5011 zoneid_t
5012 nfs_zoneid(void)
5013 {
5014 	return (nfs_global_client_only != 0 ? GLOBAL_ZONEID : getzoneid());
5015 }
5016 
5017 /*
5018  * nfs_mount_label_policy:
5019  *	Determine whether the mount is allowed according to MAC check,
5020  *	by comparing (where appropriate) label of the remote server
5021  *	against the label of the zone being mounted into.
5022  *
5023  *	Returns:
5024  *		 0 :	access allowed
5025  *		-1 :	read-only access allowed (i.e., read-down)
5026  *		>0 :	error code, such as EACCES
5027  */
5028 int
5029 nfs_mount_label_policy(vfs_t *vfsp, struct netbuf *addr,
5030     struct knetconfig *knconf, cred_t *cr)
5031 {
5032 	int		addr_type;
5033 	void		*ipaddr;
5034 	bslabel_t	*server_sl, *mntlabel;
5035 	zone_t		*mntzone = NULL;
5036 	ts_label_t	*zlabel;
5037 	tsol_tpc_t	*tp;
5038 	ts_label_t	*tsl = NULL;
5039 	int		retv;
5040 
5041 	/*
5042 	 * Get the zone's label.  Each zone on a labeled system has a label.
5043 	 */
5044 	mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
5045 	zlabel = mntzone->zone_slabel;
5046 	ASSERT(zlabel != NULL);
5047 	label_hold(zlabel);
5048 
5049 	if (strcmp(knconf->knc_protofmly, NC_INET) == 0) {
5050 		addr_type = IPV4_VERSION;
5051 		ipaddr = &((struct sockaddr_in *)addr->buf)->sin_addr;
5052 	} else if (strcmp(knconf->knc_protofmly, NC_INET6) == 0) {
5053 		addr_type = IPV6_VERSION;
5054 		ipaddr = &((struct sockaddr_in6 *)addr->buf)->sin6_addr;
5055 	} else {
5056 		retv = 0;
5057 		goto out;
5058 	}
5059 
5060 	retv = EACCES;				/* assume the worst */
5061 
5062 	/*
5063 	 * Next, get the assigned label of the remote server.
5064 	 */
5065 	tp = find_tpc(ipaddr, addr_type, B_FALSE);
5066 	if (tp == NULL)
5067 		goto out;			/* error getting host entry */
5068 
5069 	if (tp->tpc_tp.tp_doi != zlabel->tsl_doi)
5070 		goto rel_tpc;			/* invalid domain */
5071 	if ((tp->tpc_tp.host_type != SUN_CIPSO) &&
5072 	    (tp->tpc_tp.host_type != UNLABELED))
5073 		goto rel_tpc;			/* invalid hosttype */
5074 
5075 	if (tp->tpc_tp.host_type == SUN_CIPSO) {
5076 		tsl = getflabel_cipso(vfsp);
5077 		if (tsl == NULL)
5078 			goto rel_tpc;		/* error getting server lbl */
5079 
5080 		server_sl = label2bslabel(tsl);
5081 	} else {	/* UNLABELED */
5082 		server_sl = &tp->tpc_tp.tp_def_label;
5083 	}
5084 
5085 	mntlabel = label2bslabel(zlabel);
5086 
5087 	/*
5088 	 * Now compare labels to complete the MAC check.  If the labels
5089 	 * are equal or if the requestor is in the global zone and has
5090 	 * NET_MAC_AWARE, then allow read-write access.   (Except for
5091 	 * mounts into the global zone itself; restrict these to
5092 	 * read-only.)
5093 	 *
5094 	 * If the requestor is in some other zone, but his label
5095 	 * dominates the server, then allow read-down.
5096 	 *
5097 	 * Otherwise, access is denied.
5098 	 */
5099 	if (blequal(mntlabel, server_sl) ||
5100 	    (crgetzoneid(cr) == GLOBAL_ZONEID &&
5101 	    getpflags(NET_MAC_AWARE, cr) != 0)) {
5102 		if ((mntzone == global_zone) ||
5103 		    !blequal(mntlabel, server_sl))
5104 			retv = -1;		/* read-only */
5105 		else
5106 			retv = 0;		/* access OK */
5107 	} else if (bldominates(mntlabel, server_sl)) {
5108 		retv = -1;			/* read-only */
5109 	} else {
5110 		retv = EACCES;
5111 	}
5112 
5113 	if (tsl != NULL)
5114 		label_rele(tsl);
5115 
5116 rel_tpc:
5117 	TPC_RELE(tp);
5118 out:
5119 	if (mntzone)
5120 		zone_rele(mntzone);
5121 	label_rele(zlabel);
5122 	return (retv);
5123 }
5124 
5125 boolean_t
5126 nfs_has_ctty(void)
5127 {
5128 	boolean_t rv;
5129 	mutex_enter(&curproc->p_splock);
5130 	rv = (curproc->p_sessp->s_vp != NULL);
5131 	mutex_exit(&curproc->p_splock);
5132 	return (rv);
5133 }
5134 
5135 /*
5136  * TX NFS routine used by NFSv3 and NFSv4 to do label check
5137  * on client label and server's file object lable.
5138  */
5139 boolean_t
5140 do_rfs_label_check(bslabel_t *clabel, vnode_t *vp, int flag)
5141 {
5142 	bslabel_t *slabel;
5143 	ts_label_t *tslabel;
5144 	boolean_t result;
5145 
5146 	if ((tslabel = nfs_getflabel(vp)) == NULL) {
5147 		return (B_FALSE);
5148 	}
5149 	slabel = label2bslabel(tslabel);
5150 	DTRACE_PROBE4(tx__rfs__log__info__labelcheck, char *,
5151 	    "comparing server's file label(1) with client label(2) (vp(3))",
5152 	    bslabel_t *, slabel, bslabel_t *, clabel, vnode_t *, vp);
5153 
5154 	if (flag == EQUALITY_CHECK)
5155 		result = blequal(clabel, slabel);
5156 	else
5157 		result = bldominates(clabel, slabel);
5158 	label_rele(tslabel);
5159 	return (result);
5160 }
5161