xref: /titanic_41/usr/src/uts/common/fs/nfs/nfs_subr.c (revision 8275a87e46b79352e8c1a918b91373159c477438)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  *
25  *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
26  *	All rights reserved.
27  */
28 
29 #pragma ident	"%Z%%M%	%I%	%E% SMI"
30 
31 #include <sys/param.h>
32 #include <sys/types.h>
33 #include <sys/systm.h>
34 #include <sys/cred_impl.h>
35 #include <sys/proc.h>
36 #include <sys/user.h>
37 #include <sys/time.h>
38 #include <sys/buf.h>
39 #include <sys/vfs.h>
40 #include <sys/vnode.h>
41 #include <sys/socket.h>
42 #include <sys/uio.h>
43 #include <sys/tiuser.h>
44 #include <sys/swap.h>
45 #include <sys/errno.h>
46 #include <sys/debug.h>
47 #include <sys/kmem.h>
48 #include <sys/kstat.h>
49 #include <sys/cmn_err.h>
50 #include <sys/vtrace.h>
51 #include <sys/session.h>
52 #include <sys/dnlc.h>
53 #include <sys/bitmap.h>
54 #include <sys/acl.h>
55 #include <sys/ddi.h>
56 #include <sys/pathname.h>
57 #include <sys/flock.h>
58 #include <sys/dirent.h>
59 #include <sys/flock.h>
60 #include <sys/callb.h>
61 #include <sys/atomic.h>
62 #include <sys/list.h>
63 #include <sys/tsol/tnet.h>
64 #include <sys/priv.h>
65 
66 #include <inet/ip6.h>
67 
68 #include <rpc/types.h>
69 #include <rpc/xdr.h>
70 #include <rpc/auth.h>
71 #include <rpc/clnt.h>
72 
73 #include <nfs/nfs.h>
74 #include <nfs/nfs4.h>
75 #include <nfs/nfs_clnt.h>
76 #include <nfs/rnode.h>
77 #include <nfs/nfs_acl.h>
78 
79 /*
80  * The hash queues for the access to active and cached rnodes
81  * are organized as doubly linked lists.  A reader/writer lock
82  * for each hash bucket is used to control access and to synchronize
83  * lookups, additions, and deletions from the hash queue.
84  *
85  * The rnode freelist is organized as a doubly linked list with
86  * a head pointer.  Additions and deletions are synchronized via
87  * a single mutex.
88  *
89  * In order to add an rnode to the free list, it must be hashed into
90  * a hash queue and the exclusive lock to the hash queue be held.
91  * If an rnode is not hashed into a hash queue, then it is destroyed
92  * because it represents no valuable information that can be reused
93  * about the file.  The exclusive lock to the hash queue must be
94  * held in order to prevent a lookup in the hash queue from finding
95  * the rnode and using it and assuming that the rnode is not on the
96  * freelist.  The lookup in the hash queue will have the hash queue
97  * locked, either exclusive or shared.
98  *
99  * The vnode reference count for each rnode is not allowed to drop
100  * below 1.  This prevents external entities, such as the VM
101  * subsystem, from acquiring references to vnodes already on the
102  * freelist and then trying to place them back on the freelist
103  * when their reference is released.  This means that when an
104  * rnode is looked up in the hash queues, then either the rnode
105  * is removed from the freelist and that reference is transferred to
106  * the new reference or the vnode reference count must be incremented
107  * accordingly.  The mutex for the freelist must be held in order to
108  * accurately test to see if the rnode is on the freelist or not.
109  * The hash queue lock might be held shared and it is possible that
110  * two different threads may race to remove the rnode from the
111  * freelist.  This race can be resolved by holding the mutex for the
112  * freelist.  Please note that the mutex for the freelist does not
113  * need to be held if the rnode is not on the freelist.  It can not be
114  * placed on the freelist due to the requirement that the thread
115  * putting the rnode on the freelist must hold the exclusive lock
116  * to the hash queue and the thread doing the lookup in the hash
117  * queue is holding either a shared or exclusive lock to the hash
118  * queue.
119  *
120  * The lock ordering is:
121  *
122  *	hash bucket lock -> vnode lock
123  *	hash bucket lock -> freelist lock
124  */
/*
 * NOTE(review): none of the variables below are referenced in the part of
 * the file reviewed here.  The per-line notes are inferred from the locking
 * discussion above and from the names; confirm against their users.
 */
125 static rhashq_t *rtable;	/* presumably the rnode hash bucket array */
126 
127 static kmutex_t rpfreelist_lock;	/* presumably the freelist mutex */
128 static rnode_t *rpfreelist = NULL;	/* presumably the freelist head */
129 static long rnew = 0;
130 long nrnode = 0;	/* not static: visible outside this file */
131 
132 static int rtablesize;	/* presumably the number of hash buckets */
133 static int rtablemask;	/* presumably the mask used to index rtable */
134 
135 static int hashlen = 4;
136 
137 static struct kmem_cache *rnode_cache;	/* presumably for rnode_t - TODO confirm */
138 
139 /*
140  * Mutex to protect the following variables:
141  *	nfs_major
142  *	nfs_minor
143  */
144 kmutex_t nfs_minor_lock;	/* guards nfs_major and nfs_minor (see above) */
145 int nfs_major;
146 int nfs_minor;
147 
148 /* Do we allow preepoch (negative) time values otw? */
/* NOTE(review): "otw" presumably abbreviates "over the wire" - confirm. */
149 bool_t nfs_allow_preepoch_time = FALSE;	/* default: do not allow preepoch */
150 
151 /*
152  * Access cache
153  */
/*
 * NOTE(review): these are not referenced in the part of the file reviewed
 * here; the notes below are inferred from the names - confirm.
 */
154 static acache_hash_t *acache;	/* presumably the hash bucket array */
155 static long nacache;	/* used strictly to size the number of hash queues */
156 
157 static int acachesize;	/* presumably the number of hash buckets */
158 static int acachemask;	/* presumably the mask used to index acache */
159 static struct kmem_cache *acache_cache;
160 
161 /*
162  * Client side utilities
163  */
164 
165 /*
166  * client side statistics
167  */
/*
 * Template of named kstat counters; every entry is a 64-bit unsigned
 * value.  The initializer is positional, so the order here has to follow
 * the field order of struct clstat.  The last four counters exist only in
 * DEBUG kernels.
 * NOTE(review): presumably copied into each zone's nfscl_stat - confirm.
 */
168 static const struct clstat clstat_tmpl = {
169 	{ "calls",	KSTAT_DATA_UINT64 },
170 	{ "badcalls",	KSTAT_DATA_UINT64 },
171 	{ "clgets",	KSTAT_DATA_UINT64 },
172 	{ "cltoomany",	KSTAT_DATA_UINT64 },
173 #ifdef DEBUG
174 	{ "clalloc",	KSTAT_DATA_UINT64 },
175 	{ "noresponse",	KSTAT_DATA_UINT64 },
176 	{ "failover",	KSTAT_DATA_UINT64 },
177 	{ "remap",	KSTAT_DATA_UINT64 },
178 #endif
179 };
180 
181 /*
182  * The following are statistics that describe behavior of the system as a whole
183  * and do not correspond to any one particular zone.
184  */
185 #ifdef DEBUG
/*
 * The initializer below is positional: its entries must stay in the same
 * order as the fields of struct clstat_debug.  Note that the last field is
 * named rpath while the kstat name it is published under is "r_path".
 */
186 static struct clstat_debug {
187 	kstat_named_t	nrnode;			/* number of allocated rnodes */
188 	kstat_named_t	access;			/* size of access cache */
189 	kstat_named_t	dirent;			/* size of readdir cache */
190 	kstat_named_t	dirents;		/* size of readdir buf cache */
191 	kstat_named_t	reclaim;		/* number of reclaims */
192 	kstat_named_t	clreclaim;		/* number of cl reclaims */
193 	kstat_named_t	f_reclaim;		/* number of free reclaims */
194 	kstat_named_t	a_reclaim;		/* number of active reclaims */
195 	kstat_named_t	r_reclaim;		/* number of rnode reclaims */
196 	kstat_named_t	rpath;			/* bytes used to store rpaths */
197 } clstat_debug = {
198 	{ "nrnode",	KSTAT_DATA_UINT64 },
199 	{ "access",	KSTAT_DATA_UINT64 },
200 	{ "dirent",	KSTAT_DATA_UINT64 },
201 	{ "dirents",	KSTAT_DATA_UINT64 },
202 	{ "reclaim",	KSTAT_DATA_UINT64 },
203 	{ "clreclaim",	KSTAT_DATA_UINT64 },
204 	{ "f_reclaim",	KSTAT_DATA_UINT64 },
205 	{ "a_reclaim",	KSTAT_DATA_UINT64 },
206 	{ "r_reclaim",	KSTAT_DATA_UINT64 },
207 	{ "r_path",	KSTAT_DATA_UINT64 },
208 };
209 #endif	/* DEBUG */
210 
211 /*
212  * We keep a global list of per-zone client data, so we can clean up all zones
213  * if we get low on memory.
214  */
215 static list_t nfs_clnt_list;	/* walked by clreclaim() */
216 static kmutex_t nfs_clnt_list_lock;	/* held while walking nfs_clnt_list */
217 static zone_key_t nfsclnt_zone_key;	/* zone_getspecific() key for nfs_clnt */
218 
/* kmem cache that struct chtab entries are allocated from and freed to. */
219 static struct kmem_cache *chtab_cache;
220 
221 /*
222  * Some servers do not properly update the attributes of the
223  * directory when changes are made.  To allow interoperability
224  * with these broken servers, the nfs_disable_rddir_cache
225  * parameter must be set in /etc/system
226  */
/* NOTE(review): presumably any nonzero value disables the cache - confirm. */
227 int nfs_disable_rddir_cache = 0;
228 
/*
 * Forward declarations.  clget() and clfree() are the only two in this
 * list that are not static, i.e. the only ones callable from other files.
 */
229 int		clget(clinfo_t *, servinfo_t *, cred_t *, CLIENT **,
230 		    struct chtab **);
231 void		clfree(CLIENT *, struct chtab *);
232 static int	acl_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
233 		    struct chtab **, struct nfs_clnt *);
234 static int	nfs_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
235 		    struct chtab **, struct nfs_clnt *);
236 static void	clreclaim(void *);
237 static int	nfs_feedback(int, int, mntinfo_t *);
238 static int	rfscall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
239 		    caddr_t, cred_t *, int *, enum clnt_stat *, int,
240 		    failinfo_t *);
241 static int	aclcall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
242 		    caddr_t, cred_t *, int *, int, failinfo_t *);
243 static void	rinactive(rnode_t *, cred_t *);
244 static int	rtablehash(nfs_fhandle *);
245 static vnode_t	*make_rnode(nfs_fhandle *, rhashq_t *, struct vfs *,
246 		    struct vnodeops *,
247 		    int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
248 			cred_t *),
249 		    int (*)(const void *, const void *), int *, cred_t *,
250 		    char *, char *);
251 static void	rp_rmfree(rnode_t *);
252 static void	rp_addhash(rnode_t *);
253 static void	rp_rmhash_locked(rnode_t *);
254 static rnode_t	*rfind(rhashq_t *, nfs_fhandle *, struct vfs *);
255 static void	destroy_rnode(rnode_t *);
256 static void	rddir_cache_free(rddir_cache *);
257 static int	nfs_free_data_reclaim(rnode_t *);
258 static int	nfs_active_data_reclaim(rnode_t *);
259 static int	nfs_free_reclaim(void);
260 static int	nfs_active_reclaim(void);
261 static int	nfs_rnode_reclaim(void);
262 static void	nfs_reclaim(void *);
263 static int	failover_safe(failinfo_t *);
264 static void	failover_newserver(mntinfo_t *mi);
265 static void	failover_thread(mntinfo_t *mi);
266 static int	failover_wait(mntinfo_t *);
267 static int	failover_remap(failinfo_t *);
268 static int	failover_lookup(char *, vnode_t *,
269 		    int (*)(vnode_t *, char *, vnode_t **,
270 			struct pathname *, int, vnode_t *, cred_t *, int),
271 		    int (*)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
272 		    vnode_t **);
273 static void	nfs_free_r_path(rnode_t *);
274 static void	nfs_set_vroot(vnode_t *);
275 static char	*nfs_getsrvnames(mntinfo_t *, size_t *);
276 
277 /*
278  * from rpcsec module (common/rpcsec)
279  */
/*
 * sec_clnt_geth() is what clget_impl() calls to attach an auth handle to a
 * client handle; sec_clnt_freeh() is what clfree_impl() calls to release it.
 */
280 extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **);
281 extern void sec_clnt_freeh(AUTH *);
282 extern void sec_clnt_freeinfo(struct sec_data *);
283 
284 /*
285  * used in mount policy
286  */
287 extern ts_label_t *getflabel_cipso(vfs_t *);
288 
/*
 * EIO or EINTR are not recoverable errors.
 *
 * Evaluates to nonzero when `error' is anything other than EINTR or EIO.
 * Both the argument and the whole expansion are parenthesized so that the
 * macro behaves as a single primary expression whatever it is handed and
 * wherever it is used.  Note that `error' is evaluated twice.
 */
#define	IS_RECOVERABLE_ERROR(error)	\
	(!(((error) == EINTR) || ((error) == EIO)))
293 
294 /*
295  * Common handle get program for NFS, NFS ACL, and NFS AUTH client.
296  */
297 static int
298 clget_impl(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
299     struct chtab **chp, struct nfs_clnt *nfscl)
300 {
301 	struct chhead *ch, *newch;
302 	struct chhead **plistp;
303 	struct chtab *cp;
304 	int error;
305 	k_sigset_t smask;
306 
307 	if (newcl == NULL || chp == NULL || ci == NULL)
308 		return (EINVAL);
309 
310 	*newcl = NULL;
311 	*chp = NULL;
312 
313 	/*
314 	 * Find an unused handle or create one
315 	 */
316 	newch = NULL;
317 	nfscl->nfscl_stat.clgets.value.ui64++;
318 top:
319 	/*
320 	 * Find the correct entry in the cache to check for free
321 	 * client handles.  The search is based on the RPC program
322 	 * number, program version number, dev_t for the transport
323 	 * device, and the protocol family.
324 	 */
325 	mutex_enter(&nfscl->nfscl_chtable_lock);
326 	plistp = &nfscl->nfscl_chtable;
327 	for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
328 		if (ch->ch_prog == ci->cl_prog &&
329 		    ch->ch_vers == ci->cl_vers &&
330 		    ch->ch_dev == svp->sv_knconf->knc_rdev &&
331 		    (strcmp(ch->ch_protofmly,
332 		    svp->sv_knconf->knc_protofmly) == 0))
333 			break;
334 		plistp = &ch->ch_next;
335 	}
336 
337 	/*
338 	 * If we didn't find a cache entry for this quadruple, then
339 	 * create one.  If we don't have one already preallocated,
340 	 * then drop the cache lock, create one, and then start over.
341 	 * If we did have a preallocated entry, then just add it to
342 	 * the front of the list.
343 	 */
344 	if (ch == NULL) {
345 		if (newch == NULL) {
346 			mutex_exit(&nfscl->nfscl_chtable_lock);
347 			newch = kmem_alloc(sizeof (*newch), KM_SLEEP);
348 			newch->ch_timesused = 0;
349 			newch->ch_prog = ci->cl_prog;
350 			newch->ch_vers = ci->cl_vers;
351 			newch->ch_dev = svp->sv_knconf->knc_rdev;
352 			newch->ch_protofmly = kmem_alloc(
353 			    strlen(svp->sv_knconf->knc_protofmly) + 1,
354 			    KM_SLEEP);
355 			(void) strcpy(newch->ch_protofmly,
356 			    svp->sv_knconf->knc_protofmly);
357 			newch->ch_list = NULL;
358 			goto top;
359 		}
360 		ch = newch;
361 		newch = NULL;
362 		ch->ch_next = nfscl->nfscl_chtable;
363 		nfscl->nfscl_chtable = ch;
364 	/*
365 	 * We found a cache entry, but if it isn't on the front of the
366 	 * list, then move it to the front of the list to try to take
367 	 * advantage of locality of operations.
368 	 */
369 	} else if (ch != nfscl->nfscl_chtable) {
370 		*plistp = ch->ch_next;
371 		ch->ch_next = nfscl->nfscl_chtable;
372 		nfscl->nfscl_chtable = ch;
373 	}
374 
375 	/*
376 	 * If there was a free client handle cached, then remove it
377 	 * from the list, init it, and use it.
378 	 */
379 	if (ch->ch_list != NULL) {
380 		cp = ch->ch_list;
381 		ch->ch_list = cp->ch_list;
382 		mutex_exit(&nfscl->nfscl_chtable_lock);
383 		if (newch != NULL) {
384 			kmem_free(newch->ch_protofmly,
385 			    strlen(newch->ch_protofmly) + 1);
386 			kmem_free(newch, sizeof (*newch));
387 		}
388 		(void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf,
389 		    &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr);
390 		error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
391 		    &cp->ch_client->cl_auth);
392 		if (error || cp->ch_client->cl_auth == NULL) {
393 			CLNT_DESTROY(cp->ch_client);
394 			kmem_cache_free(chtab_cache, cp);
395 			return ((error != 0) ? error : EINTR);
396 		}
397 		ch->ch_timesused++;
398 		*newcl = cp->ch_client;
399 		*chp = cp;
400 		return (0);
401 	}
402 
403 	/*
404 	 * There weren't any free client handles which fit, so allocate
405 	 * a new one and use that.
406 	 */
407 #ifdef DEBUG
408 	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, 1);
409 #endif
410 	mutex_exit(&nfscl->nfscl_chtable_lock);
411 
412 	nfscl->nfscl_stat.cltoomany.value.ui64++;
413 	if (newch != NULL) {
414 		kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1);
415 		kmem_free(newch, sizeof (*newch));
416 	}
417 
418 	cp = kmem_cache_alloc(chtab_cache, KM_SLEEP);
419 	cp->ch_head = ch;
420 
421 	sigintr(&smask, (int)ci->cl_flags & MI_INT);
422 	error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog,
423 	    ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client);
424 	sigunintr(&smask);
425 
426 	if (error != 0) {
427 		kmem_cache_free(chtab_cache, cp);
428 #ifdef DEBUG
429 		atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
430 #endif
431 		/*
432 		 * Warning is unnecessary if error is EINTR.
433 		 */
434 		if (error != EINTR) {
435 			nfs_cmn_err(error, CE_WARN,
436 			    "clget: couldn't create handle: %m\n");
437 		}
438 		return (error);
439 	}
440 	(void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL);
441 	auth_destroy(cp->ch_client->cl_auth);
442 	error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
443 	    &cp->ch_client->cl_auth);
444 	if (error || cp->ch_client->cl_auth == NULL) {
445 		CLNT_DESTROY(cp->ch_client);
446 		kmem_cache_free(chtab_cache, cp);
447 #ifdef DEBUG
448 		atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
449 #endif
450 		return ((error != 0) ? error : EINTR);
451 	}
452 	ch->ch_timesused++;
453 	*newcl = cp->ch_client;
454 	ASSERT(cp->ch_client->cl_nosignal == FALSE);
455 	*chp = cp;
456 	return (0);
457 }
458 
459 int
460 clget(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
461     struct chtab **chp)
462 {
463 	struct nfs_clnt *nfscl;
464 
465 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
466 	ASSERT(nfscl != NULL);
467 
468 	return (clget_impl(ci, svp, cr, newcl, chp, nfscl));
469 }
470 
471 static int
472 acl_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
473     struct chtab **chp, struct nfs_clnt *nfscl)
474 {
475 	clinfo_t ci;
476 	int error;
477 
478 	/*
479 	 * Set read buffer size to rsize
480 	 * and add room for RPC headers.
481 	 */
482 	ci.cl_readsize = mi->mi_tsize;
483 	if (ci.cl_readsize != 0)
484 		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
485 
486 	/*
487 	 * If soft mount and server is down just try once.
488 	 * meaning: do not retransmit.
489 	 */
490 	if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
491 		ci.cl_retrans = 0;
492 	else
493 		ci.cl_retrans = mi->mi_retrans;
494 
495 	ci.cl_prog = NFS_ACL_PROGRAM;
496 	ci.cl_vers = mi->mi_vers;
497 	ci.cl_flags = mi->mi_flags;
498 
499 	/*
500 	 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
501 	 * security flavor, the client tries to establish a security context
502 	 * by contacting the server. If the connection is timed out or reset,
503 	 * e.g. server reboot, we will try again.
504 	 */
505 	do {
506 		error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
507 
508 		if (error == 0)
509 			break;
510 
511 		/*
512 		 * For forced unmount or zone shutdown, bail out, no retry.
513 		 */
514 		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
515 			error = EIO;
516 			break;
517 		}
518 
519 		/* do not retry for softmount */
520 		if (!(mi->mi_flags & MI_HARD))
521 			break;
522 
523 		/* let the caller deal with the failover case */
524 		if (FAILOVER_MOUNT(mi))
525 			break;
526 
527 	} while (error == ETIMEDOUT || error == ECONNRESET);
528 
529 	return (error);
530 }
531 
532 static int
533 nfs_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
534     struct chtab **chp, struct nfs_clnt *nfscl)
535 {
536 	clinfo_t ci;
537 	int error;
538 
539 	/*
540 	 * Set read buffer size to rsize
541 	 * and add room for RPC headers.
542 	 */
543 	ci.cl_readsize = mi->mi_tsize;
544 	if (ci.cl_readsize != 0)
545 		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
546 
547 	/*
548 	 * If soft mount and server is down just try once.
549 	 * meaning: do not retransmit.
550 	 */
551 	if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
552 		ci.cl_retrans = 0;
553 	else
554 		ci.cl_retrans = mi->mi_retrans;
555 
556 	ci.cl_prog = mi->mi_prog;
557 	ci.cl_vers = mi->mi_vers;
558 	ci.cl_flags = mi->mi_flags;
559 
560 	/*
561 	 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
562 	 * security flavor, the client tries to establish a security context
563 	 * by contacting the server. If the connection is timed out or reset,
564 	 * e.g. server reboot, we will try again.
565 	 */
566 	do {
567 		error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
568 
569 		if (error == 0)
570 			break;
571 
572 		/*
573 		 * For forced unmount or zone shutdown, bail out, no retry.
574 		 */
575 		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
576 			error = EIO;
577 			break;
578 		}
579 
580 		/* do not retry for softmount */
581 		if (!(mi->mi_flags & MI_HARD))
582 			break;
583 
584 		/* let the caller deal with the failover case */
585 		if (FAILOVER_MOUNT(mi))
586 			break;
587 
588 	} while (error == ETIMEDOUT || error == ECONNRESET);
589 
590 	return (error);
591 }
592 
593 static void
594 clfree_impl(CLIENT *cl, struct chtab *cp, struct nfs_clnt *nfscl)
595 {
596 	if (cl->cl_auth != NULL) {
597 		sec_clnt_freeh(cl->cl_auth);
598 		cl->cl_auth = NULL;
599 	}
600 
601 	/*
602 	 * Timestamp this cache entry so that we know when it was last
603 	 * used.
604 	 */
605 	cp->ch_freed = gethrestime_sec();
606 
607 	/*
608 	 * Add the free client handle to the front of the list.
609 	 * This way, the list will be sorted in youngest to oldest
610 	 * order.
611 	 */
612 	mutex_enter(&nfscl->nfscl_chtable_lock);
613 	cp->ch_list = cp->ch_head->ch_list;
614 	cp->ch_head->ch_list = cp;
615 	mutex_exit(&nfscl->nfscl_chtable_lock);
616 }
617 
618 void
619 clfree(CLIENT *cl, struct chtab *cp)
620 {
621 	struct nfs_clnt *nfscl;
622 
623 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
624 	ASSERT(nfscl != NULL);
625 
626 	clfree_impl(cl, cp, nfscl);
627 }
628 
629 #define	CL_HOLDTIME	60	/* time to hold client handles */
630 
631 static void
632 clreclaim_zone(struct nfs_clnt *nfscl, uint_t cl_holdtime)
633 {
634 	struct chhead *ch;
635 	struct chtab *cp;	/* list of objects that can be reclaimed */
636 	struct chtab *cpe;
637 	struct chtab *cpl;
638 	struct chtab **cpp;
639 #ifdef DEBUG
640 	int n = 0;
641 #endif
642 
643 	/*
644 	 * Need to reclaim some memory, so step through the cache
645 	 * looking through the lists for entries which can be freed.
646 	 */
647 	cp = NULL;
648 
649 	mutex_enter(&nfscl->nfscl_chtable_lock);
650 
651 	/*
652 	 * Here we step through each non-NULL quadruple and start to
653 	 * construct the reclaim list pointed to by cp.  Note that
654 	 * cp will contain all eligible chtab entries.  When this traversal
655 	 * completes, chtab entries from the last quadruple will be at the
656 	 * front of cp and entries from previously inspected quadruples have
657 	 * been appended to the rear of cp.
658 	 */
659 	for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
660 		if (ch->ch_list == NULL)
661 			continue;
662 		/*
663 		 * Search each list for entries older then
664 		 * cl_holdtime seconds.  The lists are maintained
665 		 * in youngest to oldest order so that when the
666 		 * first entry is found which is old enough, then
667 		 * all of the rest of the entries on the list will
668 		 * be old enough as well.
669 		 */
670 		cpl = ch->ch_list;
671 		cpp = &ch->ch_list;
672 		while (cpl != NULL &&
673 		    cpl->ch_freed + cl_holdtime > gethrestime_sec()) {
674 			cpp = &cpl->ch_list;
675 			cpl = cpl->ch_list;
676 		}
677 		if (cpl != NULL) {
678 			*cpp = NULL;
679 			if (cp != NULL) {
680 				cpe = cpl;
681 				while (cpe->ch_list != NULL)
682 					cpe = cpe->ch_list;
683 				cpe->ch_list = cp;
684 			}
685 			cp = cpl;
686 		}
687 	}
688 
689 	mutex_exit(&nfscl->nfscl_chtable_lock);
690 
691 	/*
692 	 * If cp is empty, then there is nothing to reclaim here.
693 	 */
694 	if (cp == NULL)
695 		return;
696 
697 	/*
698 	 * Step through the list of entries to free, destroying each client
699 	 * handle and kmem_free'ing the memory for each entry.
700 	 */
701 	while (cp != NULL) {
702 #ifdef DEBUG
703 		n++;
704 #endif
705 		CLNT_DESTROY(cp->ch_client);
706 		cpl = cp->ch_list;
707 		kmem_cache_free(chtab_cache, cp);
708 		cp = cpl;
709 	}
710 
711 #ifdef DEBUG
712 	/*
713 	 * Update clalloc so that nfsstat shows the current number
714 	 * of allocated client handles.
715 	 */
716 	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n);
717 #endif
718 }
719 
720 /* ARGSUSED */
721 static void
722 clreclaim(void *all)
723 {
724 	struct nfs_clnt *nfscl;
725 
726 #ifdef DEBUG
727 	clstat_debug.clreclaim.value.ui64++;
728 #endif
729 	/*
730 	 * The system is low on memory; go through and try to reclaim some from
731 	 * every zone on the system.
732 	 */
733 	mutex_enter(&nfs_clnt_list_lock);
734 	nfscl = list_head(&nfs_clnt_list);
735 	for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl))
736 		clreclaim_zone(nfscl, CL_HOLDTIME);
737 	mutex_exit(&nfs_clnt_list_lock);
738 }
739 
740 /*
741  * Minimum time-out values indexed by call type
742  * These units are in "eighths" of a second to avoid multiplies
743  */
/*
 * rfscall() picks an entry with mi_call_type[which] and converts it to
 * clock ticks as (value * hz) >> 3.
 */
744 static unsigned int minimum_timeo[] = {
745 	6, 7, 10
746 };
747 
/*
 * Retransmission timeout backoff.  MAXTIMO, the ceiling, is 20 * hz.
 * backoff() doubles a timeout that is still below the ceiling and clamps
 * the result to MAXTIMO; a timeout already at or above the ceiling is
 * returned unchanged.  Both macros evaluate their argument more than once.
 */
#define	MAXTIMO	(20*hz)
#define	dobackoff(tim)	((MAXTIMO < ((tim) << 1)) ? MAXTIMO : ((tim) << 1))
#define	backoff(tim)	(((tim) >= MAXTIMO) ? (tim) : dobackoff(tim))

#define	MIN_NFS_TSIZE 512	/* smallest NFS I/O transfer size */
#define	REDUCE_NFS_TIME (hz/2)	/* keep rtxcur below this */
#define	INCREASE_NFS_TIME (hz/3*8) /* keep srtt below this (scaled*8) */
758 
759 /*
760  * Function called when rfscall notices that we have been
761  * re-transmitting, or when we get a response without retransmissions.
762  * Return 1 if the transfer size was adjusted down - 0 if no change.
763  */
764 static int
765 nfs_feedback(int flag, int which, mntinfo_t *mi)
766 {
767 	int kind;
768 	int r = 0;
769 
770 	mutex_enter(&mi->mi_lock);
771 	if (flag == FEEDBACK_REXMIT1) {
772 		if (mi->mi_timers[NFS_CALLTYPES].rt_rtxcur != 0 &&
773 		    mi->mi_timers[NFS_CALLTYPES].rt_rtxcur < REDUCE_NFS_TIME)
774 			goto done;
775 		if (mi->mi_curread > MIN_NFS_TSIZE) {
776 			mi->mi_curread /= 2;
777 			if (mi->mi_curread < MIN_NFS_TSIZE)
778 				mi->mi_curread = MIN_NFS_TSIZE;
779 			r = 1;
780 		}
781 
782 		if (mi->mi_curwrite > MIN_NFS_TSIZE) {
783 			mi->mi_curwrite /= 2;
784 			if (mi->mi_curwrite < MIN_NFS_TSIZE)
785 				mi->mi_curwrite = MIN_NFS_TSIZE;
786 			r = 1;
787 		}
788 	} else if (flag == FEEDBACK_OK) {
789 		kind = mi->mi_timer_type[which];
790 		if (kind == 0 ||
791 		    mi->mi_timers[kind].rt_srtt >= INCREASE_NFS_TIME)
792 			goto done;
793 		if (kind == 1) {
794 			if (mi->mi_curread >= mi->mi_tsize)
795 				goto done;
796 			mi->mi_curread +=  MIN_NFS_TSIZE;
797 			if (mi->mi_curread > mi->mi_tsize/2)
798 				mi->mi_curread = mi->mi_tsize;
799 		} else if (kind == 2) {
800 			if (mi->mi_curwrite >= mi->mi_stsize)
801 				goto done;
802 			mi->mi_curwrite += MIN_NFS_TSIZE;
803 			if (mi->mi_curwrite > mi->mi_stsize/2)
804 				mi->mi_curwrite = mi->mi_stsize;
805 		}
806 	}
807 done:
808 	mutex_exit(&mi->mi_lock);
809 	return (r);
810 }
811 
812 #ifdef DEBUG
813 static int rfs2call_hits = 0;
814 static int rfs2call_misses = 0;
815 #endif
816 
817 int
818 rfs2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
819     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
820     enum nfsstat *statusp, int flags, failinfo_t *fi)
821 {
822 	int rpcerror;
823 	enum clnt_stat rpc_status;
824 
825 	ASSERT(statusp != NULL);
826 
827 	rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
828 	    cr, douprintf, &rpc_status, flags, fi);
829 	if (!rpcerror) {
830 		/*
831 		 * See crnetadjust() for comments.
832 		 */
833 		if (*statusp == NFSERR_ACCES &&
834 		    (cr = crnetadjust(cr)) != NULL) {
835 #ifdef DEBUG
836 			rfs2call_hits++;
837 #endif
838 			rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres,
839 			    resp, cr, douprintf, NULL, flags, fi);
840 			crfree(cr);
841 #ifdef DEBUG
842 			if (*statusp == NFSERR_ACCES)
843 				rfs2call_misses++;
844 #endif
845 		}
846 	} else if (rpc_status == RPC_PROCUNAVAIL) {
847 		*statusp = NFSERR_OPNOTSUPP;
848 		rpcerror = 0;
849 	}
850 
851 	return (rpcerror);
852 }
853 
/*
 * Ten seconds worth of clock ticks.  Parenthesized so that the expansion
 * stays a single operand wherever it is used (e.g. x / NFS3_JUKEBOX_DELAY).
 * NOTE(review): presumably the value assigned to nfs3_jukebox_delay during
 * initialization elsewhere in this file - confirm.
 */
#define	NFS3_JUKEBOX_DELAY	(10 * hz)
855 
856 static clock_t nfs3_jukebox_delay = 0;
857 
858 #ifdef DEBUG
859 static int rfs3call_hits = 0;
860 static int rfs3call_misses = 0;
861 #endif
862 
863 int
864 rfs3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
865     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
866     nfsstat3 *statusp, int flags, failinfo_t *fi)
867 {
868 	int rpcerror;
869 	int user_informed;
870 
871 	user_informed = 0;
872 	do {
873 		rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
874 		    cr, douprintf, NULL, flags, fi);
875 		if (!rpcerror) {
876 			cred_t *crr;
877 			if (*statusp == NFS3ERR_JUKEBOX) {
878 				if (ttoproc(curthread) == &p0) {
879 					rpcerror = EAGAIN;
880 					break;
881 				}
882 				if (!user_informed) {
883 					user_informed = 1;
884 					uprintf(
885 		"file temporarily unavailable on the server, retrying...\n");
886 				}
887 				delay(nfs3_jukebox_delay);
888 			}
889 			/*
890 			 * See crnetadjust() for comments.
891 			 */
892 			else if (*statusp == NFS3ERR_ACCES &&
893 			    (crr = crnetadjust(cr)) != NULL) {
894 #ifdef DEBUG
895 				rfs3call_hits++;
896 #endif
897 				rpcerror = rfscall(mi, which, xdrargs, argsp,
898 				    xdrres, resp, crr, douprintf,
899 				    NULL, flags, fi);
900 
901 				crfree(crr);
902 #ifdef DEBUG
903 				if (*statusp == NFS3ERR_ACCES)
904 					rfs3call_misses++;
905 #endif
906 			}
907 		}
908 	} while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
909 
910 	return (rpcerror);
911 }
912 
/*
 * VALID_FH: true when the rnode behind fi->vp is associated with the
 * mount's current server (r_server == mi_curr_serv).
 *
 * INC_READERS / DEC_READERS: adjust mi_readers; DEC_READERS additionally
 * broadcasts on mi_failover_cv when the count drops to zero.  Every use
 * in rfscall() is made with mi->mi_lock held.
 *
 * Arguments are parenthesized and the statement macros are wrapped in
 * do { } while (0) so that they expand to exactly one statement and can be
 * used safely in unbraced if/else bodies.  Each macro evaluates its
 * argument more than once.
 */
#define	VALID_FH(fi)	\
	(VTOR((fi)->vp)->r_server == VTOMI((fi)->vp)->mi_curr_serv)
#define	INC_READERS(mi)		do { \
	(mi)->mi_readers++; \
	/*CONSTCOND*/ \
} while (0)
#define	DEC_READERS(mi)		do { \
	(mi)->mi_readers--; \
	if ((mi)->mi_readers == 0) \
		cv_broadcast(&(mi)->mi_failover_cv); \
	/*CONSTCOND*/ \
} while (0)
922 
923 static int
924 rfscall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
925     xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
926     enum clnt_stat *rpc_status, int flags, failinfo_t *fi)
927 {
928 	CLIENT *client;
929 	struct chtab *ch;
930 	cred_t *cr = icr;
931 	enum clnt_stat status;
932 	struct rpc_err rpcerr;
933 	struct timeval wait;
934 	int timeo;		/* in units of hz */
935 	int my_rsize, my_wsize;
936 	bool_t tryagain;
937 	bool_t cred_cloned = FALSE;
938 	k_sigset_t smask;
939 	servinfo_t *svp;
940 	struct nfs_clnt *nfscl;
941 	zoneid_t zoneid = getzoneid();
942 #ifdef DEBUG
943 	char *bufp;
944 #endif
945 
946 
947 	TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
948 	    "rfscall_start:which %d mi %p", which, mi);
949 
950 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
951 	ASSERT(nfscl != NULL);
952 
953 	nfscl->nfscl_stat.calls.value.ui64++;
954 	mi->mi_reqs[which].value.ui64++;
955 
956 	rpcerr.re_status = RPC_SUCCESS;
957 
958 	/*
959 	 * In case of forced unmount or zone shutdown, return EIO.
960 	 */
961 
962 	if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
963 		rpcerr.re_status = RPC_FAILED;
964 		rpcerr.re_errno = EIO;
965 		return (rpcerr.re_errno);
966 	}
967 
968 	/*
969 	 * Remember the transfer sizes in case
970 	 * nfs_feedback changes them underneath us.
971 	 */
972 	my_rsize = mi->mi_curread;
973 	my_wsize = mi->mi_curwrite;
974 
975 	/*
976 	 * NFS client failover support
977 	 *
978 	 * If this rnode is not in sync with the current server (VALID_FH),
979 	 * we'd like to do a remap to get in sync.  We can be interrupted
980 	 * in failover_remap(), and if so we'll bail.  Otherwise, we'll
981 	 * use the best info we have to try the RPC.  Part of that is
982 	 * unconditionally updating the filehandle copy kept for V3.
983 	 *
984 	 * Locking: INC_READERS/DEC_READERS is a poor man's interruptible
985 	 * rw_enter(); we're trying to keep the current server from being
986 	 * changed on us until we're done with the remapping and have a
987 	 * matching client handle.  We don't want to send a filehandle
988 	 * to the wrong host.
989 	 */
990 failoverretry:
991 	if (FAILOVER_MOUNT(mi)) {
992 		mutex_enter(&mi->mi_lock);
993 		if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
994 			if (failover_wait(mi)) {
995 				mutex_exit(&mi->mi_lock);
996 				return (EINTR);
997 			}
998 		}
999 		INC_READERS(mi);
1000 		mutex_exit(&mi->mi_lock);
1001 		if (fi) {
1002 			if (!VALID_FH(fi) &&
1003 			    !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1004 				int remaperr;
1005 
1006 				svp = mi->mi_curr_serv;
1007 				remaperr = failover_remap(fi);
1008 				if (remaperr != 0) {
1009 #ifdef DEBUG
1010 					if (remaperr != EINTR)
1011 						nfs_cmn_err(remaperr, CE_WARN,
1012 					    "rfscall couldn't failover: %m");
1013 #endif
1014 					mutex_enter(&mi->mi_lock);
1015 					DEC_READERS(mi);
1016 					mutex_exit(&mi->mi_lock);
1017 					/*
1018 					 * If failover_remap returns ETIMEDOUT
1019 					 * and the filesystem is hard mounted
1020 					 * we have to retry the call with a new
1021 					 * server.
1022 					 */
1023 					if ((mi->mi_flags & MI_HARD) &&
1024 					    IS_RECOVERABLE_ERROR(remaperr)) {
1025 						if (svp == mi->mi_curr_serv)
1026 							failover_newserver(mi);
1027 						rpcerr.re_status = RPC_SUCCESS;
1028 						goto failoverretry;
1029 					}
1030 					rpcerr.re_errno = remaperr;
1031 					return (remaperr);
1032 				}
1033 			}
1034 			if (fi->fhp && fi->copyproc)
1035 				(*fi->copyproc)(fi->fhp, fi->vp);
1036 		}
1037 	}
1038 
1039 	/* For TSOL, use a new cred which has net_mac_aware flag */
1040 	if (!cred_cloned && is_system_labeled()) {
1041 		cred_cloned = TRUE;
1042 		cr = crdup(icr);
1043 		(void) setpflags(NET_MAC_AWARE, 1, cr);
1044 	}
1045 
1046 	/*
1047 	 * clget() calls clnt_tli_kinit() which clears the xid, so we
1048 	 * are guaranteed to reprocess the retry as a new request.
1049 	 */
1050 	svp = mi->mi_curr_serv;
1051 	rpcerr.re_errno = nfs_clget(mi, svp, cr, &client, &ch, nfscl);
1052 
1053 	if (FAILOVER_MOUNT(mi)) {
1054 		mutex_enter(&mi->mi_lock);
1055 		DEC_READERS(mi);
1056 		mutex_exit(&mi->mi_lock);
1057 
1058 		if ((rpcerr.re_errno == ETIMEDOUT ||
1059 		    rpcerr.re_errno == ECONNRESET) &&
1060 		    failover_safe(fi)) {
1061 			if (svp == mi->mi_curr_serv)
1062 				failover_newserver(mi);
1063 			goto failoverretry;
1064 		}
1065 	}
1066 	if (rpcerr.re_errno != 0)
1067 		return (rpcerr.re_errno);
1068 
1069 	if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1070 	    svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1071 		timeo = (mi->mi_timeo * hz) / 10;
1072 	} else {
1073 		mutex_enter(&mi->mi_lock);
1074 		timeo = CLNT_SETTIMERS(client,
1075 		    &(mi->mi_timers[mi->mi_timer_type[which]]),
1076 		    &(mi->mi_timers[NFS_CALLTYPES]),
1077 		    (minimum_timeo[mi->mi_call_type[which]]*hz)>>3,
1078 		    (void (*)())NULL, (caddr_t)mi, 0);
1079 		mutex_exit(&mi->mi_lock);
1080 	}
1081 
1082 	/*
1083 	 * If hard mounted fs, retry call forever unless hard error occurs.
1084 	 */
1085 	do {
1086 		tryagain = FALSE;
1087 
1088 		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1089 			status = RPC_FAILED;
1090 			rpcerr.re_status = RPC_FAILED;
1091 			rpcerr.re_errno = EIO;
1092 			break;
1093 		}
1094 
1095 		TICK_TO_TIMEVAL(timeo, &wait);
1096 
1097 		/*
1098 		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1099 		 * and SIGTERM. (Preserving the existing masks).
1100 		 * Mask out SIGINT if mount option nointr is specified.
1101 		 */
1102 		sigintr(&smask, (int)mi->mi_flags & MI_INT);
1103 		if (!(mi->mi_flags & MI_INT))
1104 			client->cl_nosignal = TRUE;
1105 
1106 		/*
1107 		 * If there is a current signal, then don't bother
1108 		 * even trying to send out the request because we
1109 		 * won't be able to block waiting for the response.
1110 		 * Simply assume RPC_INTR and get on with it.
1111 		 */
1112 		if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1113 			status = RPC_INTR;
1114 		else {
1115 			status = CLNT_CALL(client, which, xdrargs, argsp,
1116 			    xdrres, resp, wait);
1117 		}
1118 
1119 		if (!(mi->mi_flags & MI_INT))
1120 			client->cl_nosignal = FALSE;
1121 		/*
1122 		 * restore original signal mask
1123 		 */
1124 		sigunintr(&smask);
1125 
1126 		switch (status) {
1127 		case RPC_SUCCESS:
1128 			if ((mi->mi_flags & MI_DYNAMIC) &&
1129 			    mi->mi_timer_type[which] != 0 &&
1130 			    (mi->mi_curread != my_rsize ||
1131 			    mi->mi_curwrite != my_wsize))
1132 				(void) nfs_feedback(FEEDBACK_OK, which, mi);
1133 			break;
1134 
1135 		case RPC_INTR:
1136 			/*
1137 			 * There is no way to recover from this error,
1138 			 * even if mount option nointr is specified.
1139 			 * SIGKILL, for example, cannot be blocked.
1140 			 */
1141 			rpcerr.re_status = RPC_INTR;
1142 			rpcerr.re_errno = EINTR;
1143 			break;
1144 
1145 		case RPC_UDERROR:
1146 			/*
1147 			 * If the NFS server is local (vold) and
1148 			 * it goes away then we get RPC_UDERROR.
1149 			 * This is a retryable error, so we would
1150 			 * loop, so check to see if the specific
1151 			 * error was ECONNRESET, indicating that
1152 			 * target did not exist at all.  If so,
1153 			 * return with RPC_PROGUNAVAIL and
1154 			 * ECONNRESET to indicate why.
1155 			 */
1156 			CLNT_GETERR(client, &rpcerr);
1157 			if (rpcerr.re_errno == ECONNRESET) {
1158 				rpcerr.re_status = RPC_PROGUNAVAIL;
1159 				rpcerr.re_errno = ECONNRESET;
1160 				break;
1161 			}
1162 			/*FALLTHROUGH*/
1163 
1164 		default:		/* probably RPC_TIMEDOUT */
1165 			if (IS_UNRECOVERABLE_RPC(status))
1166 				break;
1167 
1168 			/*
1169 			 * increment server not responding count
1170 			 */
1171 			mutex_enter(&mi->mi_lock);
1172 			mi->mi_noresponse++;
1173 			mutex_exit(&mi->mi_lock);
1174 #ifdef DEBUG
1175 			nfscl->nfscl_stat.noresponse.value.ui64++;
1176 #endif
1177 
1178 			if (!(mi->mi_flags & MI_HARD)) {
1179 				if (!(mi->mi_flags & MI_SEMISOFT) ||
1180 				    (mi->mi_ss_call_type[which] == 0))
1181 					break;
1182 			}
1183 
1184 			/*
1185 			 * The call is in progress (over COTS).
1186 			 * Try the CLNT_CALL again, but don't
1187 			 * print a noisy error message.
1188 			 */
1189 			if (status == RPC_INPROGRESS) {
1190 				tryagain = TRUE;
1191 				break;
1192 			}
1193 
1194 			if (flags & RFSCALL_SOFT)
1195 				break;
1196 
1197 			/*
1198 			 * On zone shutdown, just move on.
1199 			 */
1200 			if (zone_status_get(curproc->p_zone) >=
1201 			    ZONE_IS_SHUTTING_DOWN) {
1202 				rpcerr.re_status = RPC_FAILED;
1203 				rpcerr.re_errno = EIO;
1204 				break;
1205 			}
1206 
1207 			/*
1208 			 * NFS client failover support
1209 			 *
1210 			 * If the current server just failed us, we'll
1211 			 * start the process of finding a new server.
1212 			 * After that, we can just retry.
1213 			 */
1214 			if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1215 				if (svp == mi->mi_curr_serv)
1216 					failover_newserver(mi);
1217 				clfree_impl(client, ch, nfscl);
1218 				goto failoverretry;
1219 			}
1220 
1221 			tryagain = TRUE;
1222 			timeo = backoff(timeo);
1223 			mutex_enter(&mi->mi_lock);
1224 			if (!(mi->mi_flags & MI_PRINTED)) {
1225 				mi->mi_flags |= MI_PRINTED;
1226 				mutex_exit(&mi->mi_lock);
1227 #ifdef DEBUG
1228 				zprintf(zoneid,
1229 			"NFS%d server %s not responding still trying\n",
1230 				    mi->mi_vers, svp->sv_hostname);
1231 #else
1232 				zprintf(zoneid,
1233 			"NFS server %s not responding still trying\n",
1234 				    svp->sv_hostname);
1235 #endif
1236 			} else
1237 				mutex_exit(&mi->mi_lock);
1238 			if (*douprintf && nfs_has_ctty()) {
1239 				*douprintf = 0;
1240 				if (!(mi->mi_flags & MI_NOPRINT))
1241 #ifdef DEBUG
1242 					uprintf(
1243 			    "NFS%d server %s not responding still trying\n",
1244 					    mi->mi_vers, svp->sv_hostname);
1245 #else
1246 					uprintf(
1247 			    "NFS server %s not responding still trying\n",
1248 					    svp->sv_hostname);
1249 #endif
1250 			}
1251 
1252 			/*
1253 			 * If doing dynamic adjustment of transfer
1254 			 * size and if it's a read or write call
1255 			 * and if the transfer size changed while
1256 			 * retransmitting or if the feedback routine
1257 			 * changed the transfer size,
1258 			 * then exit rfscall so that the transfer
1259 			 * size can be adjusted at the vnops level.
1260 			 */
1261 			if ((mi->mi_flags & MI_DYNAMIC) &&
1262 			    mi->mi_timer_type[which] != 0 &&
1263 			    (mi->mi_curread != my_rsize ||
1264 			    mi->mi_curwrite != my_wsize ||
1265 			    nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1266 				/*
1267 				 * On read or write calls, return
1268 				 * back to the vnode ops level if
1269 				 * the transfer size changed.
1270 				 */
1271 				clfree_impl(client, ch, nfscl);
1272 				if (cred_cloned)
1273 					crfree(cr);
1274 				return (ENFS_TRYAGAIN);
1275 			}
1276 		}
1277 	} while (tryagain);
1278 
1279 	if (status != RPC_SUCCESS) {
1280 		/*
1281 		 * Let soft mounts use the timed out message.
1282 		 */
1283 		if (status == RPC_INPROGRESS)
1284 			status = RPC_TIMEDOUT;
1285 		nfscl->nfscl_stat.badcalls.value.ui64++;
1286 		if (status != RPC_INTR) {
1287 			mutex_enter(&mi->mi_lock);
1288 			mi->mi_flags |= MI_DOWN;
1289 			mutex_exit(&mi->mi_lock);
1290 			CLNT_GETERR(client, &rpcerr);
1291 #ifdef DEBUG
1292 			bufp = clnt_sperror(client, svp->sv_hostname);
1293 			zprintf(zoneid, "NFS%d %s failed for %s\n",
1294 			    mi->mi_vers, mi->mi_rfsnames[which], bufp);
1295 			if (nfs_has_ctty()) {
1296 				if (!(mi->mi_flags & MI_NOPRINT)) {
1297 					uprintf("NFS%d %s failed for %s\n",
1298 					    mi->mi_vers, mi->mi_rfsnames[which],
1299 					    bufp);
1300 				}
1301 			}
1302 			kmem_free(bufp, MAXPATHLEN);
1303 #else
1304 			zprintf(zoneid,
1305 			    "NFS %s failed for server %s: error %d (%s)\n",
1306 			    mi->mi_rfsnames[which], svp->sv_hostname,
1307 			    status, clnt_sperrno(status));
1308 			if (nfs_has_ctty()) {
1309 				if (!(mi->mi_flags & MI_NOPRINT)) {
1310 					uprintf(
1311 				"NFS %s failed for server %s: error %d (%s)\n",
1312 					    mi->mi_rfsnames[which],
1313 					    svp->sv_hostname, status,
1314 					    clnt_sperrno(status));
1315 				}
1316 			}
1317 #endif
1318 			/*
1319 			 * when CLNT_CALL() fails with RPC_AUTHERROR,
1320 			 * re_errno is set appropriately depending on
1321 			 * the authentication error
1322 			 */
1323 			if (status == RPC_VERSMISMATCH ||
1324 			    status == RPC_PROGVERSMISMATCH)
1325 				rpcerr.re_errno = EIO;
1326 		}
1327 	} else {
1328 		/*
1329 		 * Test the value of mi_down and mi_printed without
1330 		 * holding the mi_lock mutex.  If they are both zero,
1331 		 * then it is okay to skip the down and printed
1332 		 * processing.  This saves on a mutex_enter and
1333 		 * mutex_exit pair for a normal, successful RPC.
1334 		 * This was just complete overhead.
1335 		 */
1336 		if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1337 			mutex_enter(&mi->mi_lock);
1338 			mi->mi_flags &= ~MI_DOWN;
1339 			if (mi->mi_flags & MI_PRINTED) {
1340 				mi->mi_flags &= ~MI_PRINTED;
1341 				mutex_exit(&mi->mi_lock);
1342 #ifdef DEBUG
1343 			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1344 				zprintf(zoneid, "NFS%d server %s ok\n",
1345 				    mi->mi_vers, svp->sv_hostname);
1346 #else
1347 			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1348 				zprintf(zoneid, "NFS server %s ok\n",
1349 				    svp->sv_hostname);
1350 #endif
1351 			} else
1352 				mutex_exit(&mi->mi_lock);
1353 		}
1354 
1355 		if (*douprintf == 0) {
1356 			if (!(mi->mi_flags & MI_NOPRINT))
1357 #ifdef DEBUG
1358 				if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1359 					uprintf("NFS%d server %s ok\n",
1360 					    mi->mi_vers, svp->sv_hostname);
1361 #else
1362 			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1363 				uprintf("NFS server %s ok\n", svp->sv_hostname);
1364 #endif
1365 			*douprintf = 1;
1366 		}
1367 	}
1368 
1369 	clfree_impl(client, ch, nfscl);
1370 	if (cred_cloned)
1371 		crfree(cr);
1372 
1373 	ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1374 
1375 	if (rpc_status != NULL)
1376 		*rpc_status = rpcerr.re_status;
1377 
1378 	TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1379 	    rpcerr.re_errno);
1380 
1381 	return (rpcerr.re_errno);
1382 }
1383 
1384 #ifdef DEBUG
1385 static int acl2call_hits = 0;
1386 static int acl2call_misses = 0;
1387 #endif
1388 
1389 int
1390 acl2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1391     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1392     enum nfsstat *statusp, int flags, failinfo_t *fi)
1393 {
1394 	int rpcerror;
1395 
1396 	rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1397 	    cr, douprintf, flags, fi);
1398 	if (!rpcerror) {
1399 		/*
1400 		 * See comments with crnetadjust().
1401 		 */
1402 		if (*statusp == NFSERR_ACCES &&
1403 		    (cr = crnetadjust(cr)) != NULL) {
1404 #ifdef DEBUG
1405 			acl2call_hits++;
1406 #endif
1407 			rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres,
1408 			    resp, cr, douprintf, flags, fi);
1409 			crfree(cr);
1410 #ifdef DEBUG
1411 			if (*statusp == NFSERR_ACCES)
1412 				acl2call_misses++;
1413 #endif
1414 		}
1415 	}
1416 
1417 	return (rpcerror);
1418 }
1419 
1420 #ifdef DEBUG
1421 static int acl3call_hits = 0;
1422 static int acl3call_misses = 0;
1423 #endif
1424 
1425 int
1426 acl3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1427     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1428     nfsstat3 *statusp, int flags, failinfo_t *fi)
1429 {
1430 	int rpcerror;
1431 	int user_informed;
1432 
1433 	user_informed = 0;
1434 
1435 	do {
1436 		rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1437 		    cr, douprintf, flags, fi);
1438 		if (!rpcerror) {
1439 			cred_t *crr;
1440 			if (*statusp == NFS3ERR_JUKEBOX) {
1441 				if (!user_informed) {
1442 					user_informed = 1;
1443 					uprintf(
1444 		"file temporarily unavailable on the server, retrying...\n");
1445 				}
1446 				delay(nfs3_jukebox_delay);
1447 			}
1448 			/*
1449 			 * See crnetadjust() for comments.
1450 			 */
1451 			else if (*statusp == NFS3ERR_ACCES &&
1452 			    (crr = crnetadjust(cr)) != NULL) {
1453 #ifdef DEBUG
1454 				acl3call_hits++;
1455 #endif
1456 				rpcerror = aclcall(mi, which, xdrargs, argsp,
1457 				    xdrres, resp, crr, douprintf, flags, fi);
1458 
1459 				crfree(crr);
1460 #ifdef DEBUG
1461 				if (*statusp == NFS3ERR_ACCES)
1462 					acl3call_misses++;
1463 #endif
1464 			}
1465 		}
1466 	} while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
1467 
1468 	return (rpcerror);
1469 }
1470 
1471 static int
1472 aclcall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1473     xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
1474     int flags, failinfo_t *fi)
1475 {
1476 	CLIENT *client;
1477 	struct chtab *ch;
1478 	cred_t *cr = icr;
1479 	bool_t cred_cloned = FALSE;
1480 	enum clnt_stat status;
1481 	struct rpc_err rpcerr;
1482 	struct timeval wait;
1483 	int timeo;		/* in units of hz */
1484 #if 0 /* notyet */
1485 	int my_rsize, my_wsize;
1486 #endif
1487 	bool_t tryagain;
1488 	k_sigset_t smask;
1489 	servinfo_t *svp;
1490 	struct nfs_clnt *nfscl;
1491 	zoneid_t zoneid = getzoneid();
1492 #ifdef DEBUG
1493 	char *bufp;
1494 #endif
1495 
1496 #if 0 /* notyet */
1497 	TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
1498 	    "rfscall_start:which %d mi %p", which, mi);
1499 #endif
1500 
1501 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
1502 	ASSERT(nfscl != NULL);
1503 
1504 	nfscl->nfscl_stat.calls.value.ui64++;
1505 	mi->mi_aclreqs[which].value.ui64++;
1506 
1507 	rpcerr.re_status = RPC_SUCCESS;
1508 
1509 	if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1510 		rpcerr.re_status = RPC_FAILED;
1511 		rpcerr.re_errno = EIO;
1512 		return (rpcerr.re_errno);
1513 	}
1514 
1515 #if 0 /* notyet */
1516 	/*
1517 	 * Remember the transfer sizes in case
1518 	 * nfs_feedback changes them underneath us.
1519 	 */
1520 	my_rsize = mi->mi_curread;
1521 	my_wsize = mi->mi_curwrite;
1522 #endif
1523 
1524 	/*
1525 	 * NFS client failover support
1526 	 *
1527 	 * If this rnode is not in sync with the current server (VALID_FH),
1528 	 * we'd like to do a remap to get in sync.  We can be interrupted
1529 	 * in failover_remap(), and if so we'll bail.  Otherwise, we'll
1530 	 * use the best info we have to try the RPC.  Part of that is
1531 	 * unconditionally updating the filehandle copy kept for V3.
1532 	 *
1533 	 * Locking: INC_READERS/DEC_READERS is a poor man's interrruptible
1534 	 * rw_enter(); we're trying to keep the current server from being
1535 	 * changed on us until we're done with the remapping and have a
1536 	 * matching client handle.  We don't want to sending a filehandle
1537 	 * to the wrong host.
1538 	 */
1539 failoverretry:
1540 	if (FAILOVER_MOUNT(mi)) {
1541 		mutex_enter(&mi->mi_lock);
1542 		if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1543 			if (failover_wait(mi)) {
1544 				mutex_exit(&mi->mi_lock);
1545 				return (EINTR);
1546 			}
1547 		}
1548 		INC_READERS(mi);
1549 		mutex_exit(&mi->mi_lock);
1550 		if (fi) {
1551 			if (!VALID_FH(fi) &&
1552 			    !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1553 				int remaperr;
1554 
1555 				svp = mi->mi_curr_serv;
1556 				remaperr = failover_remap(fi);
1557 				if (remaperr != 0) {
1558 #ifdef DEBUG
1559 					if (remaperr != EINTR)
1560 						nfs_cmn_err(remaperr, CE_WARN,
1561 					    "aclcall couldn't failover: %m");
1562 #endif
1563 					mutex_enter(&mi->mi_lock);
1564 					DEC_READERS(mi);
1565 					mutex_exit(&mi->mi_lock);
1566 
1567 					/*
1568 					 * If failover_remap returns ETIMEDOUT
1569 					 * and the filesystem is hard mounted
1570 					 * we have to retry the call with a new
1571 					 * server.
1572 					 */
1573 					if ((mi->mi_flags & MI_HARD) &&
1574 					    IS_RECOVERABLE_ERROR(remaperr)) {
1575 						if (svp == mi->mi_curr_serv)
1576 							failover_newserver(mi);
1577 						rpcerr.re_status = RPC_SUCCESS;
1578 						goto failoverretry;
1579 					}
1580 					return (remaperr);
1581 				}
1582 			}
1583 			if (fi->fhp && fi->copyproc)
1584 				(*fi->copyproc)(fi->fhp, fi->vp);
1585 		}
1586 	}
1587 
1588 	/* For TSOL, use a new cred which has net_mac_aware flag */
1589 	if (!cred_cloned && is_system_labeled()) {
1590 		cred_cloned = TRUE;
1591 		cr = crdup(icr);
1592 		(void) setpflags(NET_MAC_AWARE, 1, cr);
1593 	}
1594 
1595 	/*
1596 	 * acl_clget() calls clnt_tli_kinit() which clears the xid, so we
1597 	 * are guaranteed to reprocess the retry as a new request.
1598 	 */
1599 	svp = mi->mi_curr_serv;
1600 	rpcerr.re_errno = acl_clget(mi, svp, cr, &client, &ch, nfscl);
1601 	if (FAILOVER_MOUNT(mi)) {
1602 		mutex_enter(&mi->mi_lock);
1603 		DEC_READERS(mi);
1604 		mutex_exit(&mi->mi_lock);
1605 
1606 		if ((rpcerr.re_errno == ETIMEDOUT ||
1607 		    rpcerr.re_errno == ECONNRESET) &&
1608 		    failover_safe(fi)) {
1609 			if (svp == mi->mi_curr_serv)
1610 				failover_newserver(mi);
1611 			goto failoverretry;
1612 		}
1613 	}
1614 	if (rpcerr.re_errno != 0) {
1615 		if (cred_cloned)
1616 			crfree(cr);
1617 		return (rpcerr.re_errno);
1618 	}
1619 
1620 	if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1621 	    svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1622 		timeo = (mi->mi_timeo * hz) / 10;
1623 	} else {
1624 		mutex_enter(&mi->mi_lock);
1625 		timeo = CLNT_SETTIMERS(client,
1626 		    &(mi->mi_timers[mi->mi_acl_timer_type[which]]),
1627 		    &(mi->mi_timers[NFS_CALLTYPES]),
1628 		    (minimum_timeo[mi->mi_acl_call_type[which]]*hz)>>3,
1629 		    (void (*)()) 0, (caddr_t)mi, 0);
1630 		mutex_exit(&mi->mi_lock);
1631 	}
1632 
1633 	/*
1634 	 * If hard mounted fs, retry call forever unless hard error occurs.
1635 	 */
1636 	do {
1637 		tryagain = FALSE;
1638 
1639 		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1640 			status = RPC_FAILED;
1641 			rpcerr.re_status = RPC_FAILED;
1642 			rpcerr.re_errno = EIO;
1643 			break;
1644 		}
1645 
1646 		TICK_TO_TIMEVAL(timeo, &wait);
1647 
1648 		/*
1649 		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1650 		 * and SIGTERM. (Preserving the existing masks).
1651 		 * Mask out SIGINT if mount option nointr is specified.
1652 		 */
1653 		sigintr(&smask, (int)mi->mi_flags & MI_INT);
1654 		if (!(mi->mi_flags & MI_INT))
1655 			client->cl_nosignal = TRUE;
1656 
1657 		/*
1658 		 * If there is a current signal, then don't bother
1659 		 * even trying to send out the request because we
1660 		 * won't be able to block waiting for the response.
1661 		 * Simply assume RPC_INTR and get on with it.
1662 		 */
1663 		if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1664 			status = RPC_INTR;
1665 		else {
1666 			status = CLNT_CALL(client, which, xdrargs, argsp,
1667 			    xdrres, resp, wait);
1668 		}
1669 
1670 		if (!(mi->mi_flags & MI_INT))
1671 			client->cl_nosignal = FALSE;
1672 		/*
1673 		 * restore original signal mask
1674 		 */
1675 		sigunintr(&smask);
1676 
1677 		switch (status) {
1678 		case RPC_SUCCESS:
1679 #if 0 /* notyet */
1680 			if ((mi->mi_flags & MI_DYNAMIC) &&
1681 			    mi->mi_timer_type[which] != 0 &&
1682 			    (mi->mi_curread != my_rsize ||
1683 			    mi->mi_curwrite != my_wsize))
1684 				(void) nfs_feedback(FEEDBACK_OK, which, mi);
1685 #endif
1686 			break;
1687 
1688 		/*
1689 		 * Unfortunately, there are servers in the world which
1690 		 * are not coded correctly.  They are not prepared to
1691 		 * handle RPC requests to the NFS port which are not
1692 		 * NFS requests.  Thus, they may try to process the
1693 		 * NFS_ACL request as if it were an NFS request.  This
1694 		 * does not work.  Generally, an error will be generated
1695 		 * on the client because it will not be able to decode
1696 		 * the response from the server.  However, it seems
1697 		 * possible that the server may not be able to decode
1698 		 * the arguments.  Thus, the criteria for deciding
1699 		 * whether the server supports NFS_ACL or not is whether
1700 		 * the following RPC errors are returned from CLNT_CALL.
1701 		 */
1702 		case RPC_CANTDECODERES:
1703 		case RPC_PROGUNAVAIL:
1704 		case RPC_CANTDECODEARGS:
1705 		case RPC_PROGVERSMISMATCH:
1706 			mutex_enter(&mi->mi_lock);
1707 			mi->mi_flags &= ~(MI_ACL | MI_EXTATTR);
1708 			mutex_exit(&mi->mi_lock);
1709 			break;
1710 
1711 		/*
1712 		 * If the server supports NFS_ACL but not the new ops
1713 		 * for extended attributes, make sure we don't retry.
1714 		 */
1715 		case RPC_PROCUNAVAIL:
1716 			mutex_enter(&mi->mi_lock);
1717 			mi->mi_flags &= ~MI_EXTATTR;
1718 			mutex_exit(&mi->mi_lock);
1719 			break;
1720 
1721 		case RPC_INTR:
1722 			/*
1723 			 * There is no way to recover from this error,
1724 			 * even if mount option nointr is specified.
1725 			 * SIGKILL, for example, cannot be blocked.
1726 			 */
1727 			rpcerr.re_status = RPC_INTR;
1728 			rpcerr.re_errno = EINTR;
1729 			break;
1730 
1731 		case RPC_UDERROR:
1732 			/*
1733 			 * If the NFS server is local (vold) and
1734 			 * it goes away then we get RPC_UDERROR.
1735 			 * This is a retryable error, so we would
1736 			 * loop, so check to see if the specific
1737 			 * error was ECONNRESET, indicating that
1738 			 * target did not exist at all.  If so,
1739 			 * return with RPC_PROGUNAVAIL and
1740 			 * ECONNRESET to indicate why.
1741 			 */
1742 			CLNT_GETERR(client, &rpcerr);
1743 			if (rpcerr.re_errno == ECONNRESET) {
1744 				rpcerr.re_status = RPC_PROGUNAVAIL;
1745 				rpcerr.re_errno = ECONNRESET;
1746 				break;
1747 			}
1748 			/*FALLTHROUGH*/
1749 
1750 		default:		/* probably RPC_TIMEDOUT */
1751 			if (IS_UNRECOVERABLE_RPC(status))
1752 				break;
1753 
1754 			/*
1755 			 * increment server not responding count
1756 			 */
1757 			mutex_enter(&mi->mi_lock);
1758 			mi->mi_noresponse++;
1759 			mutex_exit(&mi->mi_lock);
1760 #ifdef DEBUG
1761 			nfscl->nfscl_stat.noresponse.value.ui64++;
1762 #endif
1763 
1764 			if (!(mi->mi_flags & MI_HARD)) {
1765 				if (!(mi->mi_flags & MI_SEMISOFT) ||
1766 				    (mi->mi_acl_ss_call_type[which] == 0))
1767 					break;
1768 			}
1769 
1770 			/*
1771 			 * The call is in progress (over COTS).
1772 			 * Try the CLNT_CALL again, but don't
1773 			 * print a noisy error message.
1774 			 */
1775 			if (status == RPC_INPROGRESS) {
1776 				tryagain = TRUE;
1777 				break;
1778 			}
1779 
1780 			if (flags & RFSCALL_SOFT)
1781 				break;
1782 
1783 			/*
1784 			 * On zone shutdown, just move on.
1785 			 */
1786 			if (zone_status_get(curproc->p_zone) >=
1787 			    ZONE_IS_SHUTTING_DOWN) {
1788 				rpcerr.re_status = RPC_FAILED;
1789 				rpcerr.re_errno = EIO;
1790 				break;
1791 			}
1792 
1793 			/*
1794 			 * NFS client failover support
1795 			 *
1796 			 * If the current server just failed us, we'll
1797 			 * start the process of finding a new server.
1798 			 * After that, we can just retry.
1799 			 */
1800 			if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1801 				if (svp == mi->mi_curr_serv)
1802 					failover_newserver(mi);
1803 				clfree_impl(client, ch, nfscl);
1804 				goto failoverretry;
1805 			}
1806 
1807 			tryagain = TRUE;
1808 			timeo = backoff(timeo);
1809 			mutex_enter(&mi->mi_lock);
1810 			if (!(mi->mi_flags & MI_PRINTED)) {
1811 				mi->mi_flags |= MI_PRINTED;
1812 				mutex_exit(&mi->mi_lock);
1813 #ifdef DEBUG
1814 				zprintf(zoneid,
1815 			"NFS_ACL%d server %s not responding still trying\n",
1816 				    mi->mi_vers, svp->sv_hostname);
1817 #else
1818 				zprintf(zoneid,
1819 			    "NFS server %s not responding still trying\n",
1820 				    svp->sv_hostname);
1821 #endif
1822 			} else
1823 				mutex_exit(&mi->mi_lock);
1824 			if (*douprintf && nfs_has_ctty()) {
1825 				*douprintf = 0;
1826 				if (!(mi->mi_flags & MI_NOPRINT))
1827 #ifdef DEBUG
1828 					uprintf(
1829 			"NFS_ACL%d server %s not responding still trying\n",
1830 					    mi->mi_vers, svp->sv_hostname);
1831 #else
1832 					uprintf(
1833 			    "NFS server %s not responding still trying\n",
1834 					    svp->sv_hostname);
1835 #endif
1836 			}
1837 
1838 #if 0 /* notyet */
1839 			/*
1840 			 * If doing dynamic adjustment of transfer
1841 			 * size and if it's a read or write call
1842 			 * and if the transfer size changed while
1843 			 * retransmitting or if the feedback routine
1844 			 * changed the transfer size,
1845 			 * then exit rfscall so that the transfer
1846 			 * size can be adjusted at the vnops level.
1847 			 */
1848 			if ((mi->mi_flags & MI_DYNAMIC) &&
1849 			    mi->mi_acl_timer_type[which] != 0 &&
1850 			    (mi->mi_curread != my_rsize ||
1851 			    mi->mi_curwrite != my_wsize ||
1852 			    nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1853 				/*
1854 				 * On read or write calls, return
1855 				 * back to the vnode ops level if
1856 				 * the transfer size changed.
1857 				 */
1858 				clfree_impl(client, ch, nfscl);
1859 				if (cred_cloned)
1860 					crfree(cr);
1861 				return (ENFS_TRYAGAIN);
1862 			}
1863 #endif
1864 		}
1865 	} while (tryagain);
1866 
1867 	if (status != RPC_SUCCESS) {
1868 		/*
1869 		 * Let soft mounts use the timed out message.
1870 		 */
1871 		if (status == RPC_INPROGRESS)
1872 			status = RPC_TIMEDOUT;
1873 		nfscl->nfscl_stat.badcalls.value.ui64++;
1874 		if (status == RPC_CANTDECODERES ||
1875 		    status == RPC_PROGUNAVAIL ||
1876 		    status == RPC_PROCUNAVAIL ||
1877 		    status == RPC_CANTDECODEARGS ||
1878 		    status == RPC_PROGVERSMISMATCH)
1879 			CLNT_GETERR(client, &rpcerr);
1880 		else if (status != RPC_INTR) {
1881 			mutex_enter(&mi->mi_lock);
1882 			mi->mi_flags |= MI_DOWN;
1883 			mutex_exit(&mi->mi_lock);
1884 			CLNT_GETERR(client, &rpcerr);
1885 #ifdef DEBUG
1886 			bufp = clnt_sperror(client, svp->sv_hostname);
1887 			zprintf(zoneid, "NFS_ACL%d %s failed for %s\n",
1888 			    mi->mi_vers, mi->mi_aclnames[which], bufp);
1889 			if (nfs_has_ctty()) {
1890 				if (!(mi->mi_flags & MI_NOPRINT)) {
1891 					uprintf("NFS_ACL%d %s failed for %s\n",
1892 					    mi->mi_vers, mi->mi_aclnames[which],
1893 					    bufp);
1894 				}
1895 			}
1896 			kmem_free(bufp, MAXPATHLEN);
1897 #else
1898 			zprintf(zoneid,
1899 			    "NFS %s failed for server %s: error %d (%s)\n",
1900 			    mi->mi_aclnames[which], svp->sv_hostname,
1901 			    status, clnt_sperrno(status));
1902 			if (nfs_has_ctty()) {
1903 				if (!(mi->mi_flags & MI_NOPRINT))
1904 					uprintf(
1905 				"NFS %s failed for server %s: error %d (%s)\n",
1906 					    mi->mi_aclnames[which],
1907 					    svp->sv_hostname, status,
1908 					    clnt_sperrno(status));
1909 			}
1910 #endif
1911 			/*
1912 			 * when CLNT_CALL() fails with RPC_AUTHERROR,
1913 			 * re_errno is set appropriately depending on
1914 			 * the authentication error
1915 			 */
1916 			if (status == RPC_VERSMISMATCH ||
1917 			    status == RPC_PROGVERSMISMATCH)
1918 				rpcerr.re_errno = EIO;
1919 		}
1920 	} else {
1921 		/*
1922 		 * Test the value of mi_down and mi_printed without
1923 		 * holding the mi_lock mutex.  If they are both zero,
1924 		 * then it is okay to skip the down and printed
1925 		 * processing.  This saves on a mutex_enter and
1926 		 * mutex_exit pair for a normal, successful RPC.
1927 		 * This was just complete overhead.
1928 		 */
1929 		if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1930 			mutex_enter(&mi->mi_lock);
1931 			mi->mi_flags &= ~MI_DOWN;
1932 			if (mi->mi_flags & MI_PRINTED) {
1933 				mi->mi_flags &= ~MI_PRINTED;
1934 				mutex_exit(&mi->mi_lock);
1935 #ifdef DEBUG
1936 				zprintf(zoneid, "NFS_ACL%d server %s ok\n",
1937 				    mi->mi_vers, svp->sv_hostname);
1938 #else
1939 				zprintf(zoneid, "NFS server %s ok\n",
1940 				    svp->sv_hostname);
1941 #endif
1942 			} else
1943 				mutex_exit(&mi->mi_lock);
1944 		}
1945 
1946 		if (*douprintf == 0) {
1947 			if (!(mi->mi_flags & MI_NOPRINT))
1948 #ifdef DEBUG
1949 				uprintf("NFS_ACL%d server %s ok\n",
1950 				    mi->mi_vers, svp->sv_hostname);
1951 #else
1952 				uprintf("NFS server %s ok\n", svp->sv_hostname);
1953 #endif
1954 			*douprintf = 1;
1955 		}
1956 	}
1957 
1958 	clfree_impl(client, ch, nfscl);
1959 	if (cred_cloned)
1960 		crfree(cr);
1961 
1962 	ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1963 
1964 #if 0 /* notyet */
1965 	TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1966 	    rpcerr.re_errno);
1967 #endif
1968 
1969 	return (rpcerr.re_errno);
1970 }
1971 
1972 int
1973 vattr_to_sattr(struct vattr *vap, struct nfssattr *sa)
1974 {
1975 	uint_t mask = vap->va_mask;
1976 
1977 	if (!(mask & AT_MODE))
1978 		sa->sa_mode = (uint32_t)-1;
1979 	else
1980 		sa->sa_mode = vap->va_mode;
1981 	if (!(mask & AT_UID))
1982 		sa->sa_uid = (uint32_t)-1;
1983 	else
1984 		sa->sa_uid = (uint32_t)vap->va_uid;
1985 	if (!(mask & AT_GID))
1986 		sa->sa_gid = (uint32_t)-1;
1987 	else
1988 		sa->sa_gid = (uint32_t)vap->va_gid;
1989 	if (!(mask & AT_SIZE))
1990 		sa->sa_size = (uint32_t)-1;
1991 	else
1992 		sa->sa_size = (uint32_t)vap->va_size;
1993 	if (!(mask & AT_ATIME))
1994 		sa->sa_atime.tv_sec = sa->sa_atime.tv_usec = (int32_t)-1;
1995 	else {
1996 		/* check time validity */
1997 		if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
1998 			return (EOVERFLOW);
1999 		}
2000 		sa->sa_atime.tv_sec = vap->va_atime.tv_sec;
2001 		sa->sa_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2002 	}
2003 	if (!(mask & AT_MTIME))
2004 		sa->sa_mtime.tv_sec = sa->sa_mtime.tv_usec = (int32_t)-1;
2005 	else {
2006 		/* check time validity */
2007 		if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2008 			return (EOVERFLOW);
2009 		}
2010 		sa->sa_mtime.tv_sec = vap->va_mtime.tv_sec;
2011 		sa->sa_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2012 	}
2013 	return (0);
2014 }
2015 
2016 int
2017 vattr_to_sattr3(struct vattr *vap, sattr3 *sa)
2018 {
2019 	uint_t mask = vap->va_mask;
2020 
2021 	if (!(mask & AT_MODE))
2022 		sa->mode.set_it = FALSE;
2023 	else {
2024 		sa->mode.set_it = TRUE;
2025 		sa->mode.mode = (mode3)vap->va_mode;
2026 	}
2027 	if (!(mask & AT_UID))
2028 		sa->uid.set_it = FALSE;
2029 	else {
2030 		sa->uid.set_it = TRUE;
2031 		sa->uid.uid = (uid3)vap->va_uid;
2032 	}
2033 	if (!(mask & AT_GID))
2034 		sa->gid.set_it = FALSE;
2035 	else {
2036 		sa->gid.set_it = TRUE;
2037 		sa->gid.gid = (gid3)vap->va_gid;
2038 	}
2039 	if (!(mask & AT_SIZE))
2040 		sa->size.set_it = FALSE;
2041 	else {
2042 		sa->size.set_it = TRUE;
2043 		sa->size.size = (size3)vap->va_size;
2044 	}
2045 	if (!(mask & AT_ATIME))
2046 		sa->atime.set_it = DONT_CHANGE;
2047 	else {
2048 		/* check time validity */
2049 		if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
2050 			return (EOVERFLOW);
2051 		}
2052 		sa->atime.set_it = SET_TO_CLIENT_TIME;
2053 		sa->atime.atime.seconds = (uint32)vap->va_atime.tv_sec;
2054 		sa->atime.atime.nseconds = (uint32)vap->va_atime.tv_nsec;
2055 	}
2056 	if (!(mask & AT_MTIME))
2057 		sa->mtime.set_it = DONT_CHANGE;
2058 	else {
2059 		/* check time validity */
2060 		if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2061 			return (EOVERFLOW);
2062 		}
2063 		sa->mtime.set_it = SET_TO_CLIENT_TIME;
2064 		sa->mtime.mtime.seconds = (uint32)vap->va_mtime.tv_sec;
2065 		sa->mtime.mtime.nseconds = (uint32)vap->va_mtime.tv_nsec;
2066 	}
2067 	return (0);
2068 }
2069 
2070 void
2071 setdiropargs(struct nfsdiropargs *da, char *nm, vnode_t *dvp)
2072 {
2073 
2074 	da->da_fhandle = VTOFH(dvp);
2075 	da->da_name = nm;
2076 	da->da_flags = 0;
2077 }
2078 
2079 void
2080 setdiropargs3(diropargs3 *da, char *nm, vnode_t *dvp)
2081 {
2082 
2083 	da->dirp = VTOFH3(dvp);
2084 	da->name = nm;
2085 }
2086 
2087 int
2088 setdirgid(vnode_t *dvp, gid_t *gidp, cred_t *cr)
2089 {
2090 	int error;
2091 	rnode_t *rp;
2092 	struct vattr va;
2093 
2094 	va.va_mask = AT_MODE | AT_GID;
2095 	error = VOP_GETATTR(dvp, &va, 0, cr);
2096 	if (error)
2097 		return (error);
2098 
2099 	/*
2100 	 * To determine the expected group-id of the created file:
2101 	 *  1)	If the filesystem was not mounted with the Old-BSD-compatible
2102 	 *	GRPID option, and the directory's set-gid bit is clear,
2103 	 *	then use the process's gid.
2104 	 *  2)	Otherwise, set the group-id to the gid of the parent directory.
2105 	 */
2106 	rp = VTOR(dvp);
2107 	mutex_enter(&rp->r_statelock);
2108 	if (!(VTOMI(dvp)->mi_flags & MI_GRPID) && !(va.va_mode & VSGID))
2109 		*gidp = crgetgid(cr);
2110 	else
2111 		*gidp = va.va_gid;
2112 	mutex_exit(&rp->r_statelock);
2113 	return (0);
2114 }
2115 
2116 int
2117 setdirmode(vnode_t *dvp, mode_t *omp, cred_t *cr)
2118 {
2119 	int error;
2120 	struct vattr va;
2121 
2122 	va.va_mask = AT_MODE;
2123 	error = VOP_GETATTR(dvp, &va, 0, cr);
2124 	if (error)
2125 		return (error);
2126 
2127 	/*
2128 	 * Modify the expected mode (om) so that the set-gid bit matches
2129 	 * that of the parent directory (dvp).
2130 	 */
2131 	if (va.va_mode & VSGID)
2132 		*omp |= VSGID;
2133 	else
2134 		*omp &= ~VSGID;
2135 	return (0);
2136 }
2137 
2138 void
2139 nfs_setswaplike(vnode_t *vp, vattr_t *vap)
2140 {
2141 
2142 	if (vp->v_type == VREG && (vap->va_mode & (VEXEC | VSVTX)) == VSVTX) {
2143 		if (!(vp->v_flag & VSWAPLIKE)) {
2144 			mutex_enter(&vp->v_lock);
2145 			vp->v_flag |= VSWAPLIKE;
2146 			mutex_exit(&vp->v_lock);
2147 		}
2148 	} else {
2149 		if (vp->v_flag & VSWAPLIKE) {
2150 			mutex_enter(&vp->v_lock);
2151 			vp->v_flag &= ~VSWAPLIKE;
2152 			mutex_exit(&vp->v_lock);
2153 		}
2154 	}
2155 }
2156 
2157 /*
2158  * Free the resources associated with an rnode.
2159  */
2160 static void
2161 rinactive(rnode_t *rp, cred_t *cr)
2162 {
2163 	vnode_t *vp;
2164 	cred_t *cred;
2165 	char *contents;
2166 	int size;
2167 	vsecattr_t *vsp;
2168 	int error;
2169 	nfs3_pathconf_info *info;
2170 
2171 	/*
2172 	 * Before freeing anything, wait until all asynchronous
2173 	 * activity is done on this rnode.  This will allow all
2174 	 * asynchronous read ahead and write behind i/o's to
2175 	 * finish.
2176 	 */
2177 	mutex_enter(&rp->r_statelock);
2178 	while (rp->r_count > 0)
2179 		cv_wait(&rp->r_cv, &rp->r_statelock);
2180 	mutex_exit(&rp->r_statelock);
2181 
2182 	/*
2183 	 * Flush and invalidate all pages associated with the vnode.
2184 	 */
2185 	vp = RTOV(rp);
2186 	if (vn_has_cached_data(vp)) {
2187 		ASSERT(vp->v_type != VCHR);
2188 		if ((rp->r_flags & RDIRTY) && !rp->r_error) {
2189 			error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr);
2190 			if (error && (error == ENOSPC || error == EDQUOT)) {
2191 				mutex_enter(&rp->r_statelock);
2192 				if (!rp->r_error)
2193 					rp->r_error = error;
2194 				mutex_exit(&rp->r_statelock);
2195 			}
2196 		}
2197 		nfs_invalidate_pages(vp, (u_offset_t)0, cr);
2198 	}
2199 
2200 	/*
2201 	 * Free any held credentials and caches which may be associated
2202 	 * with this rnode.
2203 	 */
2204 	mutex_enter(&rp->r_statelock);
2205 	cred = rp->r_cred;
2206 	rp->r_cred = NULL;
2207 	contents = rp->r_symlink.contents;
2208 	size = rp->r_symlink.size;
2209 	rp->r_symlink.contents = NULL;
2210 	vsp = rp->r_secattr;
2211 	rp->r_secattr = NULL;
2212 	info = rp->r_pathconf;
2213 	rp->r_pathconf = NULL;
2214 	mutex_exit(&rp->r_statelock);
2215 
2216 	/*
2217 	 * Free the held credential.
2218 	 */
2219 	if (cred != NULL)
2220 		crfree(cred);
2221 
2222 	/*
2223 	 * Free the access cache entries.
2224 	 */
2225 	(void) nfs_access_purge_rp(rp);
2226 
2227 	/*
2228 	 * Free the readdir cache entries.
2229 	 */
2230 	if (HAVE_RDDIR_CACHE(rp))
2231 		nfs_purge_rddir_cache(vp);
2232 
2233 	/*
2234 	 * Free the symbolic link cache.
2235 	 */
2236 	if (contents != NULL) {
2237 
2238 		kmem_free((void *)contents, size);
2239 	}
2240 
2241 	/*
2242 	 * Free any cached ACL.
2243 	 */
2244 	if (vsp != NULL)
2245 		nfs_acl_free(vsp);
2246 
2247 	/*
2248 	 * Free any cached pathconf information.
2249 	 */
2250 	if (info != NULL)
2251 		kmem_free(info, sizeof (*info));
2252 }
2253 
2254 /*
2255  * Return a vnode for the given NFS Version 2 file handle.
2256  * If no rnode exists for this fhandle, create one and put it
2257  * into the hash queues.  If the rnode for this fhandle
2258  * already exists, return it.
2259  *
2260  * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2261  */
2262 vnode_t *
2263 makenfsnode(fhandle_t *fh, struct nfsfattr *attr, struct vfs *vfsp,
2264     hrtime_t t, cred_t *cr, char *dnm, char *nm)
2265 {
2266 	int newnode;
2267 	int index;
2268 	vnode_t *vp;
2269 	nfs_fhandle nfh;
2270 	vattr_t va;
2271 
2272 	nfh.fh_len = NFS_FHSIZE;
2273 	bcopy(fh, nfh.fh_buf, NFS_FHSIZE);
2274 
2275 	index = rtablehash(&nfh);
2276 	rw_enter(&rtable[index].r_lock, RW_READER);
2277 
2278 	vp = make_rnode(&nfh, &rtable[index], vfsp, nfs_vnodeops,
2279 	    nfs_putapage, nfs_rddir_compar, &newnode, cr, dnm, nm);
2280 
2281 	if (attr != NULL) {
2282 		if (!newnode) {
2283 			rw_exit(&rtable[index].r_lock);
2284 			(void) nfs_cache_fattr(vp, attr, &va, t, cr);
2285 		} else {
2286 			if (attr->na_type < NFNON || attr->na_type > NFSOC)
2287 				vp->v_type = VBAD;
2288 			else
2289 				vp->v_type = n2v_type(attr);
2290 			/*
2291 			 * A translation here seems to be necessary
2292 			 * because this function can be called
2293 			 * with `attr' that has come from the wire,
2294 			 * and been operated on by vattr_to_nattr().
2295 			 * See nfsrootvp()->VOP_GETTATTR()->nfsgetattr()
2296 			 * ->nfs_getattr_otw()->rfscall()->vattr_to_nattr()
2297 			 * ->makenfsnode().
2298 			 */
2299 			if ((attr->na_rdev & 0xffff0000) == 0)
2300 				vp->v_rdev = nfsv2_expdev(attr->na_rdev);
2301 			else
2302 				vp->v_rdev = expldev(n2v_rdev(attr));
2303 			nfs_attrcache(vp, attr, t);
2304 			rw_exit(&rtable[index].r_lock);
2305 		}
2306 	} else {
2307 		if (newnode) {
2308 			PURGE_ATTRCACHE(vp);
2309 		}
2310 		rw_exit(&rtable[index].r_lock);
2311 	}
2312 
2313 	return (vp);
2314 }
2315 
2316 /*
2317  * Return a vnode for the given NFS Version 3 file handle.
2318  * If no rnode exists for this fhandle, create one and put it
2319  * into the hash queues.  If the rnode for this fhandle
2320  * already exists, return it.
2321  *
2322  * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2323  */
2324 vnode_t *
2325 makenfs3node_va(nfs_fh3 *fh, vattr_t *vap, struct vfs *vfsp, hrtime_t t,
2326     cred_t *cr, char *dnm, char *nm)
2327 {
2328 	int newnode;
2329 	int index;
2330 	vnode_t *vp;
2331 
2332 	index = rtablehash((nfs_fhandle *)fh);
2333 	rw_enter(&rtable[index].r_lock, RW_READER);
2334 
2335 	vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2336 	    nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2337 	    dnm, nm);
2338 
2339 	if (vap == NULL) {
2340 		if (newnode) {
2341 			PURGE_ATTRCACHE(vp);
2342 		}
2343 		rw_exit(&rtable[index].r_lock);
2344 		return (vp);
2345 	}
2346 
2347 	if (!newnode) {
2348 		rw_exit(&rtable[index].r_lock);
2349 		nfs_attr_cache(vp, vap, t, cr);
2350 	} else {
2351 		rnode_t *rp = VTOR(vp);
2352 
2353 		vp->v_type = vap->va_type;
2354 		vp->v_rdev = vap->va_rdev;
2355 
2356 		mutex_enter(&rp->r_statelock);
2357 		if (rp->r_mtime <= t)
2358 			nfs_attrcache_va(vp, vap);
2359 		mutex_exit(&rp->r_statelock);
2360 		rw_exit(&rtable[index].r_lock);
2361 	}
2362 
2363 	return (vp);
2364 }
2365 
2366 vnode_t *
2367 makenfs3node(nfs_fh3 *fh, fattr3 *attr, struct vfs *vfsp, hrtime_t t,
2368     cred_t *cr, char *dnm, char *nm)
2369 {
2370 	int newnode;
2371 	int index;
2372 	vnode_t *vp;
2373 	vattr_t va;
2374 
2375 	index = rtablehash((nfs_fhandle *)fh);
2376 	rw_enter(&rtable[index].r_lock, RW_READER);
2377 
2378 	vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2379 	    nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2380 	    dnm, nm);
2381 
2382 	if (attr == NULL) {
2383 		if (newnode) {
2384 			PURGE_ATTRCACHE(vp);
2385 		}
2386 		rw_exit(&rtable[index].r_lock);
2387 		return (vp);
2388 	}
2389 
2390 	if (!newnode) {
2391 		rw_exit(&rtable[index].r_lock);
2392 		(void) nfs3_cache_fattr3(vp, attr, &va, t, cr);
2393 	} else {
2394 		if (attr->type < NF3REG || attr->type > NF3FIFO)
2395 			vp->v_type = VBAD;
2396 		else
2397 			vp->v_type = nf3_to_vt[attr->type];
2398 		vp->v_rdev = makedevice(attr->rdev.specdata1,
2399 		    attr->rdev.specdata2);
2400 		nfs3_attrcache(vp, attr, t);
2401 		rw_exit(&rtable[index].r_lock);
2402 	}
2403 
2404 	return (vp);
2405 }
2406 
/*
 * Read this comment before making changes to rtablehash()!
 * This is a hash function in which seemingly obvious and harmless
 * changes can cause escalations costing millions of dollars!
 * Know what you are doing.
 *
 * rtablehash() implements Jenkins' one-at-a-time hash algorithm.  The
 * algorithm is currently detailed here:
 *
 *   http://burtleburtle.net/bob/hash/doobs.html
 *
 * Of course, the above link may not be valid by the time you are reading
 * this, but suffice it to say that the one-at-a-time algorithm works well in
 * almost all cases.  If you are changing the algorithm be sure to verify that
 * the hash algorithm still provides even distribution in all cases and with
 * any server returning filehandles in whatever order (sequential or random).
 *
 * The hash is computed over the first fh->fh_len bytes of fh->fh_buf and
 * the result is masked with rtablemask, so the return value is an index
 * usable with rtable[].
 *
 * NOTE(review): the key is read through a plain `char *', so whether a
 * byte with the top bit set is added as a negative value depends on the
 * signedness of char on the target.  This is part of the existing
 * behavior; do not "fix" it without re-verifying the distribution.
 */
static int
rtablehash(nfs_fhandle *fh)
{
	ulong_t hash, len, i;
	char *key;

	key = fh->fh_buf;
	len = (ulong_t)fh->fh_len;
	/* per-byte mixing step */
	for (hash = 0, i = 0; i < len; i++) {
		hash += key[i];
		hash += (hash << 10);
		hash ^= (hash >> 6);
	}
	/* final avalanche, then reduce to a table index */
	hash += (hash << 3);
	hash ^= (hash >> 11);
	hash += (hash << 15);
	return (hash & rtablemask);
}
2442 
/*
 * Find the rnode for file handle fh on hash bucket rhtp, or build a new
 * one and insert it, and return its vnode.
 *
 * fh		file handle to look up
 * rhtp		hash bucket that fh hashes to
 * vfsp		file system the rnode belongs to
 * vops		vnode operations installed on a newly built vnode
 * putapage	stored in r_putapage of a newly built rnode
 * compar	comparison routine for the rnode's r_dir AVL tree
 * newnode	set to 1 if a new rnode was inserted, 0 if an existing
 *		one was found
 * cr		credential passed to rinactive() / rp_addfree()
 * dnm, nm	directory and component names, used to build r_path on
 *		failover mounts (both NULL means the root)
 *
 * Locking: the caller must hold rhtp->r_lock as reader on entry.  The
 * lock is always held on return, but may have been dropped and retaken
 * in between:
 *  - existing rnode found:  held as reader;
 *  - new rnode inserted:    held as writer (*newnode == 1).
 * The caller is responsible for releasing it.
 *
 * The returned vnode carries a reference for the caller (see rfind()
 * for the existing-rnode case).
 */
static vnode_t *
make_rnode(nfs_fhandle *fh, rhashq_t *rhtp, struct vfs *vfsp,
    struct vnodeops *vops,
    int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *),
    int (*compar)(const void *, const void *),
    int *newnode, cred_t *cr, char *dnm, char *nm)
{
	rnode_t *rp;
	rnode_t *trp;
	vnode_t *vp;
	mntinfo_t *mi;

	ASSERT(RW_READ_HELD(&rhtp->r_lock));

	mi = VFTOMI(vfsp);
start:
	/* Fast path: the rnode already exists on this bucket. */
	if ((rp = rfind(rhtp, fh, vfsp)) != NULL) {
		vp = RTOV(rp);
		nfs_set_vroot(vp);
		*newnode = 0;
		return (vp);
	}
	/*
	 * Not found.  Drop the bucket lock while an rnode is obtained;
	 * it is retaken before returning or before retrying.
	 */
	rw_exit(&rhtp->r_lock);

	/*
	 * Recycle the rnode at the head of the freelist when there is
	 * one and the number of allocated rnodes (rnew) has reached
	 * nrnode; otherwise allocate a fresh rnode and vnode.
	 */
	mutex_enter(&rpfreelist_lock);
	if (rpfreelist != NULL && rnew >= nrnode) {
		rp = rpfreelist;
		rp_rmfree(rp);
		mutex_exit(&rpfreelist_lock);

		vp = RTOV(rp);

		if (rp->r_flags & RHASHED) {
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			/*
			 * Someone else picked up a reference in the
			 * meantime: give up ours and start over with
			 * the caller's bucket lock held as reader again.
			 */
			if (vp->v_count > 1) {
				vp->v_count--;
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				rw_enter(&rhtp->r_lock, RW_READER);
				goto start;
			}
			mutex_exit(&vp->v_lock);
			rp_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}

		rinactive(rp, cr);

		/*
		 * Recheck the reference count after rinactive(); if a
		 * new reference appeared, drop ours and start over.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			vp->v_count--;
			mutex_exit(&vp->v_lock);
			rw_enter(&rhtp->r_lock, RW_READER);
			goto start;
		}
		mutex_exit(&vp->v_lock);
		vn_invalid(vp);
		/*
		 * destroy old locks before bzero'ing and
		 * recreating the locks below.
		 */
		nfs_rw_destroy(&rp->r_rwlock);
		nfs_rw_destroy(&rp->r_lkserlock);
		mutex_destroy(&rp->r_statelock);
		cv_destroy(&rp->r_cv);
		cv_destroy(&rp->r_commit.c_cv);
		nfs_free_r_path(rp);
		avl_destroy(&rp->r_dir);
		/*
		 * Make sure that if rnode is recycled then
		 * VFS count is decremented properly before
		 * reuse.
		 */
		VFS_RELE(vp->v_vfsp);
		vn_reinit(vp);
	} else {
		vnode_t *new_vp;

		mutex_exit(&rpfreelist_lock);

		rp = kmem_cache_alloc(rnode_cache, KM_SLEEP);
		new_vp = vn_alloc(KM_SLEEP);

		/* account for the newly allocated rnode */
		atomic_add_long((ulong_t *)&rnew, 1);
#ifdef DEBUG
		clstat_debug.nrnode.value.ui64++;
#endif
		vp = new_vp;
	}

	/*
	 * Both paths arrive here with rp and vp to be initialized from
	 * scratch for fh.
	 */
	bzero(rp, sizeof (*rp));
	rp->r_vnode = vp;
	nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
	nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
	mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
	rp->r_fh.fh_len = fh->fh_len;
	bcopy(fh->fh_buf, rp->r_fh.fh_buf, fh->fh_len);
	rp->r_server = mi->mi_curr_serv;
	if (FAILOVER_MOUNT(mi)) {
		/*
		 * If replicated servers, stash pathnames
		 */
		if (dnm != NULL && nm != NULL) {
			char *s, *p;
			uint_t len;

			/* "<dnm>/<nm>" plus the terminating NUL */
			len = (uint_t)(strlen(dnm) + strlen(nm) + 2);
			rp->r_path = kmem_alloc(len, KM_SLEEP);
#ifdef DEBUG
			clstat_debug.rpath.value.ui64 += len;
#endif
			s = rp->r_path;
			for (p = dnm; *p; p++)
				*s++ = *p;
			*s++ = '/';
			for (p = nm; *p; p++)
				*s++ = *p;
			*s = '\0';
		} else {
			/* special case for root */
			rp->r_path = kmem_alloc(2, KM_SLEEP);
#ifdef DEBUG
			clstat_debug.rpath.value.ui64 += 2;
#endif
			*rp->r_path = '.';
			*(rp->r_path + 1) = '\0';
		}
	}
	/*
	 * The rnode holds its vfs; the hold is released by
	 * destroy_rnode() or by the recycle path above.
	 */
	VFS_HOLD(vfsp);
	rp->r_putapage = putapage;
	rp->r_hashq = rhtp;
	rp->r_flags = RREADDIRPLUS;
	avl_create(&rp->r_dir, compar, sizeof (rddir_cache),
	    offsetof(rddir_cache, tree));
	vn_setops(vp, vops);
	vp->v_data = (caddr_t)rp;
	vp->v_vfsp = vfsp;
	vp->v_type = VNON;
	nfs_set_vroot(vp);

	/*
	 * There is a race condition if someone else
	 * alloc's the rnode while no locks are held, so we
	 * check again and recover if found.
	 */
	rw_enter(&rhtp->r_lock, RW_WRITER);
	if ((trp = rfind(rhtp, fh, vfsp)) != NULL) {
		/*
		 * Lost the race: hand back the other thread's rnode and
		 * dispose of the one built above via rp_addfree().  The
		 * bucket lock is released around that call and retaken
		 * as reader for the caller.
		 */
		vp = RTOV(trp);
		nfs_set_vroot(vp);
		*newnode = 0;
		rw_exit(&rhtp->r_lock);
		rp_addfree(rp, cr);
		rw_enter(&rhtp->r_lock, RW_READER);
		return (vp);
	}
	/* Insert the new rnode; return with the bucket lock held as writer. */
	rp_addhash(rp);
	*newnode = 1;
	return (vp);
}
2605 
2606 static void
2607 nfs_set_vroot(vnode_t *vp)
2608 {
2609 	rnode_t *rp;
2610 	nfs_fhandle *rootfh;
2611 
2612 	rp = VTOR(vp);
2613 	rootfh = &rp->r_server->sv_fhandle;
2614 	if (rootfh->fh_len == rp->r_fh.fh_len &&
2615 	    bcmp(rootfh->fh_buf, rp->r_fh.fh_buf, rp->r_fh.fh_len) == 0) {
2616 		if (!(vp->v_flag & VROOT)) {
2617 			mutex_enter(&vp->v_lock);
2618 			vp->v_flag |= VROOT;
2619 			mutex_exit(&vp->v_lock);
2620 		}
2621 	}
2622 }
2623 
2624 static void
2625 nfs_free_r_path(rnode_t *rp)
2626 {
2627 	char *path;
2628 	size_t len;
2629 
2630 	path = rp->r_path;
2631 	if (path) {
2632 		rp->r_path = NULL;
2633 		len = strlen(path) + 1;
2634 		kmem_free(path, len);
2635 #ifdef DEBUG
2636 		clstat_debug.rpath.value.ui64 -= len;
2637 #endif
2638 	}
2639 }
2640 
/*
 * Put an rnode on the free list.
 *
 * Rnodes which were allocated above and beyond the normal limit
 * are immediately freed.
 *
 * The caller passes in a reference on the rnode's vnode
 * (v_count >= 1) and the rnode must not already be on the freelist.
 * On return that reference has been consumed in one of three ways:
 * it was simply dropped because another reference exists, the rnode
 * was destroyed, or the rnode went onto the freelist still carrying
 * the reference.
 */
void
rp_addfree(rnode_t *rp, cred_t *cr)
{
	vnode_t *vp;
	struct vfs *vfsp;

	vp = RTOV(rp);
	ASSERT(vp->v_count >= 1);
	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);

	/*
	 * If there is no outstanding activity on this rnode
	 * (r_count == 0) and any one of the following holds, then
	 * just free it instead of putting it on the rnode freelist:
	 *  - we have too many rnodes allocated,
	 *  - the rnode is no longer accessible because it does not
	 *    reside in the hash queues,
	 *  - an i/o error occurred while writing to the file,
	 *  - the file system has been unmounted.
	 */
	vfsp = vp->v_vfsp;
	if (((rnew > nrnode || !(rp->r_flags & RHASHED) || rp->r_error ||
	    (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
		if (rp->r_flags & RHASHED) {
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			/*
			 * Another reference exists; drop ours and let
			 * its holder deal with the rnode.
			 */
			if (vp->v_count > 1) {
				vp->v_count--;
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				return;
			}
			mutex_exit(&vp->v_lock);
			rp_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}

		rinactive(rp, cr);

		/*
		 * Recheck the vnode reference count.  We need to
		 * make sure that another reference has not been
		 * acquired while we were not holding v_lock.  The
		 * rnode is not in the rnode hash queues, so the
		 * only way for a reference to have been acquired
		 * is for a VOP_PUTPAGE because the rnode was marked
		 * with RDIRTY or for a modified page.  This
		 * reference may have been acquired before our call
		 * to rinactive.  The i/o may have been completed,
		 * thus allowing rinactive to complete, but the
		 * reference to the vnode may not have been released
		 * yet.  In any case, the rnode can not be destroyed
		 * until the other references to this vnode have been
		 * released.  The other references will take care of
		 * either destroying the rnode or placing it on the
		 * rnode freelist.  If there are no other references,
		 * then the rnode may be safely destroyed.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			vp->v_count--;
			mutex_exit(&vp->v_lock);
			return;
		}
		mutex_exit(&vp->v_lock);

		destroy_rnode(rp);
		return;
	}

	/*
	 * Lock the hash queue and then recheck the reference count
	 * to ensure that no other threads have acquired a reference
	 * to indicate that the rnode should not be placed on the
	 * freelist.  If another reference has been acquired, then
	 * just release this one and let the other thread complete
	 * the processing of adding this rnode to the freelist.
	 */
	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);

	mutex_enter(&vp->v_lock);
	if (vp->v_count > 1) {
		vp->v_count--;
		mutex_exit(&vp->v_lock);
		rw_exit(&rp->r_hashq->r_lock);
		return;
	}
	mutex_exit(&vp->v_lock);

	/*
	 * The freelist is a circular doubly linked list headed by
	 * rpfreelist.  The rnode is always linked in just before the
	 * current head, i.e. at the back.  If there is no cached data
	 * or metadata for this file, the head pointer is then moved to
	 * the rnode, which puts it on the front of the freelist so that
	 * it will be reused before other rnodes which may have cached
	 * data or metadata associated with them.
	 */
	mutex_enter(&rpfreelist_lock);
	if (rpfreelist == NULL) {
		/* empty list: the rnode becomes its only element */
		rp->r_freef = rp;
		rp->r_freeb = rp;
		rpfreelist = rp;
	} else {
		rp->r_freef = rpfreelist;
		rp->r_freeb = rpfreelist->r_freeb;
		rpfreelist->r_freeb->r_freef = rp;
		rpfreelist->r_freeb = rp;
		if (!vn_has_cached_data(vp) &&
		    !HAVE_RDDIR_CACHE(rp) &&
		    rp->r_symlink.contents == NULL &&
		    rp->r_secattr == NULL &&
		    rp->r_pathconf == NULL)
			rpfreelist = rp;
	}
	mutex_exit(&rpfreelist_lock);

	rw_exit(&rp->r_hashq->r_lock);
}
2761 
2762 /*
2763  * Remove an rnode from the free list.
2764  *
2765  * The caller must be holding rpfreelist_lock and the rnode
2766  * must be on the freelist.
2767  */
2768 static void
2769 rp_rmfree(rnode_t *rp)
2770 {
2771 
2772 	ASSERT(MUTEX_HELD(&rpfreelist_lock));
2773 	ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);
2774 
2775 	if (rp == rpfreelist) {
2776 		rpfreelist = rp->r_freef;
2777 		if (rp == rpfreelist)
2778 			rpfreelist = NULL;
2779 	}
2780 
2781 	rp->r_freeb->r_freef = rp->r_freef;
2782 	rp->r_freef->r_freeb = rp->r_freeb;
2783 
2784 	rp->r_freef = rp->r_freeb = NULL;
2785 }
2786 
2787 /*
2788  * Put a rnode in the hash table.
2789  *
2790  * The caller must be holding the exclusive hash queue lock.
2791  */
2792 static void
2793 rp_addhash(rnode_t *rp)
2794 {
2795 
2796 	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2797 	ASSERT(!(rp->r_flags & RHASHED));
2798 
2799 	rp->r_hashf = rp->r_hashq->r_hashf;
2800 	rp->r_hashq->r_hashf = rp;
2801 	rp->r_hashb = (rnode_t *)rp->r_hashq;
2802 	rp->r_hashf->r_hashb = rp;
2803 
2804 	mutex_enter(&rp->r_statelock);
2805 	rp->r_flags |= RHASHED;
2806 	mutex_exit(&rp->r_statelock);
2807 }
2808 
2809 /*
2810  * Remove a rnode from the hash table.
2811  *
2812  * The caller must be holding the hash queue lock.
2813  */
2814 static void
2815 rp_rmhash_locked(rnode_t *rp)
2816 {
2817 
2818 	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2819 	ASSERT(rp->r_flags & RHASHED);
2820 
2821 	rp->r_hashb->r_hashf = rp->r_hashf;
2822 	rp->r_hashf->r_hashb = rp->r_hashb;
2823 
2824 	mutex_enter(&rp->r_statelock);
2825 	rp->r_flags &= ~RHASHED;
2826 	mutex_exit(&rp->r_statelock);
2827 }
2828 
2829 /*
2830  * Remove a rnode from the hash table.
2831  *
2832  * The caller must not be holding the hash queue lock.
2833  */
2834 void
2835 rp_rmhash(rnode_t *rp)
2836 {
2837 
2838 	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2839 	rp_rmhash_locked(rp);
2840 	rw_exit(&rp->r_hashq->r_lock);
2841 }
2842 
2843 /*
2844  * Lookup a rnode by fhandle.
2845  *
2846  * The caller must be holding the hash queue lock, either shared or exclusive.
2847  */
2848 static rnode_t *
2849 rfind(rhashq_t *rhtp, nfs_fhandle *fh, struct vfs *vfsp)
2850 {
2851 	rnode_t *rp;
2852 	vnode_t *vp;
2853 
2854 	ASSERT(RW_LOCK_HELD(&rhtp->r_lock));
2855 
2856 	for (rp = rhtp->r_hashf; rp != (rnode_t *)rhtp; rp = rp->r_hashf) {
2857 		vp = RTOV(rp);
2858 		if (vp->v_vfsp == vfsp &&
2859 		    rp->r_fh.fh_len == fh->fh_len &&
2860 		    bcmp(rp->r_fh.fh_buf, fh->fh_buf, fh->fh_len) == 0) {
2861 			/*
2862 			 * remove rnode from free list, if necessary.
2863 			 */
2864 			if (rp->r_freef != NULL) {
2865 				mutex_enter(&rpfreelist_lock);
2866 				/*
2867 				 * If the rnode is on the freelist,
2868 				 * then remove it and use that reference
2869 				 * as the new reference.  Otherwise,
2870 				 * need to increment the reference count.
2871 				 */
2872 				if (rp->r_freef != NULL) {
2873 					rp_rmfree(rp);
2874 					mutex_exit(&rpfreelist_lock);
2875 				} else {
2876 					mutex_exit(&rpfreelist_lock);
2877 					VN_HOLD(vp);
2878 				}
2879 			} else
2880 				VN_HOLD(vp);
2881 			return (rp);
2882 		}
2883 	}
2884 	return (NULL);
2885 }
2886 
2887 /*
2888  * Return 1 if there is a active vnode belonging to this vfs in the
2889  * rtable cache.
2890  *
2891  * Several of these checks are done without holding the usual
2892  * locks.  This is safe because destroy_rtable(), rp_addfree(),
2893  * etc. will redo the necessary checks before actually destroying
2894  * any rnodes.
2895  */
2896 int
2897 check_rtable(struct vfs *vfsp)
2898 {
2899 	int index;
2900 	rnode_t *rp;
2901 	vnode_t *vp;
2902 
2903 	for (index = 0; index < rtablesize; index++) {
2904 		rw_enter(&rtable[index].r_lock, RW_READER);
2905 		for (rp = rtable[index].r_hashf;
2906 		    rp != (rnode_t *)(&rtable[index]);
2907 		    rp = rp->r_hashf) {
2908 			vp = RTOV(rp);
2909 			if (vp->v_vfsp == vfsp) {
2910 				if (rp->r_freef == NULL ||
2911 				    (vn_has_cached_data(vp) &&
2912 				    (rp->r_flags & RDIRTY)) ||
2913 				    rp->r_count > 0) {
2914 					rw_exit(&rtable[index].r_lock);
2915 					return (1);
2916 				}
2917 			}
2918 		}
2919 		rw_exit(&rtable[index].r_lock);
2920 	}
2921 	return (0);
2922 }
2923 
2924 /*
2925  * Destroy inactive vnodes from the hash queues which belong to this
2926  * vfs.  It is essential that we destroy all inactive vnodes during a
2927  * forced unmount as well as during a normal unmount.
2928  */
2929 void
2930 destroy_rtable(struct vfs *vfsp, cred_t *cr)
2931 {
2932 	int index;
2933 	rnode_t *rp;
2934 	rnode_t *rlist;
2935 	rnode_t *r_hashf;
2936 	vnode_t *vp;
2937 
2938 	rlist = NULL;
2939 
2940 	for (index = 0; index < rtablesize; index++) {
2941 		rw_enter(&rtable[index].r_lock, RW_WRITER);
2942 		for (rp = rtable[index].r_hashf;
2943 		    rp != (rnode_t *)(&rtable[index]);
2944 		    rp = r_hashf) {
2945 			/* save the hash pointer before destroying */
2946 			r_hashf = rp->r_hashf;
2947 			vp = RTOV(rp);
2948 			if (vp->v_vfsp == vfsp) {
2949 				mutex_enter(&rpfreelist_lock);
2950 				if (rp->r_freef != NULL) {
2951 					rp_rmfree(rp);
2952 					mutex_exit(&rpfreelist_lock);
2953 					rp_rmhash_locked(rp);
2954 					rp->r_hashf = rlist;
2955 					rlist = rp;
2956 				} else
2957 					mutex_exit(&rpfreelist_lock);
2958 			}
2959 		}
2960 		rw_exit(&rtable[index].r_lock);
2961 	}
2962 
2963 	for (rp = rlist; rp != NULL; rp = rlist) {
2964 		rlist = rp->r_hashf;
2965 		/*
2966 		 * This call to rp_addfree will end up destroying the
2967 		 * rnode, but in a safe way with the appropriate set
2968 		 * of checks done.
2969 		 */
2970 		rp_addfree(rp, cr);
2971 	}
2972 
2973 }
2974 
2975 /*
2976  * This routine destroys all the resources associated with the rnode
2977  * and then the rnode itself.
2978  */
2979 static void
2980 destroy_rnode(rnode_t *rp)
2981 {
2982 	vnode_t *vp;
2983 	vfs_t *vfsp;
2984 
2985 	vp = RTOV(rp);
2986 	vfsp = vp->v_vfsp;
2987 
2988 	ASSERT(vp->v_count == 1);
2989 	ASSERT(rp->r_count == 0);
2990 	ASSERT(rp->r_lmpl == NULL);
2991 	ASSERT(rp->r_mapcnt == 0);
2992 	ASSERT(!(rp->r_flags & RHASHED));
2993 	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
2994 	atomic_add_long((ulong_t *)&rnew, -1);
2995 #ifdef DEBUG
2996 	clstat_debug.nrnode.value.ui64--;
2997 #endif
2998 	nfs_rw_destroy(&rp->r_rwlock);
2999 	nfs_rw_destroy(&rp->r_lkserlock);
3000 	mutex_destroy(&rp->r_statelock);
3001 	cv_destroy(&rp->r_cv);
3002 	cv_destroy(&rp->r_commit.c_cv);
3003 	if (rp->r_flags & RDELMAPLIST)
3004 		list_destroy(&rp->r_indelmap);
3005 	nfs_free_r_path(rp);
3006 	avl_destroy(&rp->r_dir);
3007 	vn_invalid(vp);
3008 	vn_free(vp);
3009 	kmem_cache_free(rnode_cache, rp);
3010 	VFS_RELE(vfsp);
3011 }
3012 
3013 /*
3014  * Flush all vnodes in this (or every) vfs.
3015  * Used by nfs_sync and by nfs_unmount.
3016  */
3017 void
3018 rflush(struct vfs *vfsp, cred_t *cr)
3019 {
3020 	int index;
3021 	rnode_t *rp;
3022 	vnode_t *vp, **vplist;
3023 	long num, cnt;
3024 
3025 	/*
3026 	 * Check to see whether there is anything to do.
3027 	 */
3028 	num = rnew;
3029 	if (num == 0)
3030 		return;
3031 
3032 	/*
3033 	 * Allocate a slot for all currently active rnodes on the
3034 	 * supposition that they all may need flushing.
3035 	 */
3036 	vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
3037 	cnt = 0;
3038 
3039 	/*
3040 	 * Walk the hash queues looking for rnodes with page
3041 	 * lists associated with them.  Make a list of these
3042 	 * files.
3043 	 */
3044 	for (index = 0; index < rtablesize; index++) {
3045 		rw_enter(&rtable[index].r_lock, RW_READER);
3046 		for (rp = rtable[index].r_hashf;
3047 		    rp != (rnode_t *)(&rtable[index]);
3048 		    rp = rp->r_hashf) {
3049 			vp = RTOV(rp);
3050 			/*
3051 			 * Don't bother sync'ing a vp if it
3052 			 * is part of virtual swap device or
3053 			 * if VFS is read-only
3054 			 */
3055 			if (IS_SWAPVP(vp) || vn_is_readonly(vp))
3056 				continue;
3057 			/*
3058 			 * If flushing all mounted file systems or
3059 			 * the vnode belongs to this vfs, has pages
3060 			 * and is marked as either dirty or mmap'd,
3061 			 * hold and add this vnode to the list of
3062 			 * vnodes to flush.
3063 			 */
3064 			if ((vfsp == NULL || vp->v_vfsp == vfsp) &&
3065 			    vn_has_cached_data(vp) &&
3066 			    ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) {
3067 				VN_HOLD(vp);
3068 				vplist[cnt++] = vp;
3069 				if (cnt == num) {
3070 					rw_exit(&rtable[index].r_lock);
3071 					goto toomany;
3072 				}
3073 			}
3074 		}
3075 		rw_exit(&rtable[index].r_lock);
3076 	}
3077 toomany:
3078 
3079 	/*
3080 	 * Flush and release all of the files on the list.
3081 	 */
3082 	while (cnt-- > 0) {
3083 		vp = vplist[cnt];
3084 		(void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr);
3085 		VN_RELE(vp);
3086 	}
3087 
3088 	/*
3089 	 * Free the space allocated to hold the list.
3090 	 */
3091 	kmem_free(vplist, num * sizeof (*vplist));
3092 }
3093 
3094 /*
3095  * This probably needs to be larger than or equal to
3096  * log2(sizeof (struct rnode)) due to the way that rnodes are
3097  * allocated.
3098  */
3099 #define	ACACHE_SHIFT_BITS	9
3100 
3101 static int
3102 acachehash(rnode_t *rp, cred_t *cr)
3103 {
3104 
3105 	return ((((intptr_t)rp >> ACACHE_SHIFT_BITS) + crgetuid(cr)) &
3106 	    acachemask);
3107 }
3108 
3109 #ifdef DEBUG
3110 static long nfs_access_cache_hits = 0;
3111 static long nfs_access_cache_misses = 0;
3112 #endif
3113 
3114 nfs_access_type_t
3115 nfs_access_check(rnode_t *rp, uint32_t acc, cred_t *cr)
3116 {
3117 	vnode_t *vp;
3118 	acache_t *ap;
3119 	acache_hash_t *hp;
3120 	nfs_access_type_t all;
3121 
3122 	vp = RTOV(rp);
3123 	if (!ATTRCACHE_VALID(vp) || nfs_waitfor_purge_complete(vp))
3124 		return (NFS_ACCESS_UNKNOWN);
3125 
3126 	if (rp->r_acache != NULL) {
3127 		hp = &acache[acachehash(rp, cr)];
3128 		rw_enter(&hp->lock, RW_READER);
3129 		ap = hp->next;
3130 		while (ap != (acache_t *)hp) {
3131 			if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3132 				if ((ap->known & acc) == acc) {
3133 #ifdef DEBUG
3134 					nfs_access_cache_hits++;
3135 #endif
3136 					if ((ap->allowed & acc) == acc)
3137 						all = NFS_ACCESS_ALLOWED;
3138 					else
3139 						all = NFS_ACCESS_DENIED;
3140 				} else {
3141 #ifdef DEBUG
3142 					nfs_access_cache_misses++;
3143 #endif
3144 					all = NFS_ACCESS_UNKNOWN;
3145 				}
3146 				rw_exit(&hp->lock);
3147 				return (all);
3148 			}
3149 			ap = ap->next;
3150 		}
3151 		rw_exit(&hp->lock);
3152 	}
3153 
3154 #ifdef DEBUG
3155 	nfs_access_cache_misses++;
3156 #endif
3157 	return (NFS_ACCESS_UNKNOWN);
3158 }
3159 
3160 void
3161 nfs_access_cache(rnode_t *rp, uint32_t acc, uint32_t resacc, cred_t *cr)
3162 {
3163 	acache_t *ap;
3164 	acache_t *nap;
3165 	acache_hash_t *hp;
3166 
3167 	hp = &acache[acachehash(rp, cr)];
3168 
3169 	/*
3170 	 * Allocate now assuming that mostly an allocation will be
3171 	 * required.  This allows the allocation to happen without
3172 	 * holding the hash bucket locked.
3173 	 */
3174 	nap = kmem_cache_alloc(acache_cache, KM_NOSLEEP);
3175 	if (nap != NULL) {
3176 		nap->known = acc;
3177 		nap->allowed = resacc;
3178 		nap->rnode = rp;
3179 		crhold(cr);
3180 		nap->cred = cr;
3181 		nap->hashq = hp;
3182 	}
3183 
3184 	rw_enter(&hp->lock, RW_WRITER);
3185 
3186 	if (rp->r_acache != NULL) {
3187 		ap = hp->next;
3188 		while (ap != (acache_t *)hp) {
3189 			if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3190 				ap->known |= acc;
3191 				ap->allowed &= ~acc;
3192 				ap->allowed |= resacc;
3193 				rw_exit(&hp->lock);
3194 				if (nap != NULL) {
3195 					crfree(nap->cred);
3196 					kmem_cache_free(acache_cache, nap);
3197 				}
3198 				return;
3199 			}
3200 			ap = ap->next;
3201 		}
3202 	}
3203 
3204 	if (nap != NULL) {
3205 #ifdef DEBUG
3206 		clstat_debug.access.value.ui64++;
3207 #endif
3208 		nap->next = hp->next;
3209 		hp->next = nap;
3210 		nap->next->prev = nap;
3211 		nap->prev = (acache_t *)hp;
3212 
3213 		mutex_enter(&rp->r_statelock);
3214 		nap->list = rp->r_acache;
3215 		rp->r_acache = nap;
3216 		mutex_exit(&rp->r_statelock);
3217 	}
3218 
3219 	rw_exit(&hp->lock);
3220 }
3221 
3222 int
3223 nfs_access_purge_rp(rnode_t *rp)
3224 {
3225 	acache_t *ap;
3226 	acache_t *tmpap;
3227 	acache_t *rplist;
3228 
3229 	/*
3230 	 * If there aren't any cached entries, then there is nothing
3231 	 * to free.
3232 	 */
3233 	if (rp->r_acache == NULL)
3234 		return (0);
3235 
3236 	mutex_enter(&rp->r_statelock);
3237 	rplist = rp->r_acache;
3238 	rp->r_acache = NULL;
3239 	mutex_exit(&rp->r_statelock);
3240 
3241 	/*
3242 	 * Loop through each entry in the list pointed to in the
3243 	 * rnode.  Remove each of these entries from the hash
3244 	 * queue that it is on and remove it from the list in
3245 	 * the rnode.
3246 	 */
3247 	for (ap = rplist; ap != NULL; ap = tmpap) {
3248 		rw_enter(&ap->hashq->lock, RW_WRITER);
3249 		ap->prev->next = ap->next;
3250 		ap->next->prev = ap->prev;
3251 		rw_exit(&ap->hashq->lock);
3252 
3253 		tmpap = ap->list;
3254 		crfree(ap->cred);
3255 		kmem_cache_free(acache_cache, ap);
3256 #ifdef DEBUG
3257 		clstat_debug.access.value.ui64--;
3258 #endif
3259 	}
3260 
3261 	return (1);
3262 }
3263 
/* Leading characters of every name produced by newname(). */
static const char prefix[] = ".nfs";

/* Protects the counter kept inside newnum(). */
static kmutex_t newnum_lock;
3267 
3268 int
3269 newnum(void)
3270 {
3271 	static uint_t newnum = 0;
3272 	uint_t id;
3273 
3274 	mutex_enter(&newnum_lock);
3275 	if (newnum == 0)
3276 		newnum = gethrestime_sec() & 0xffff;
3277 	id = newnum++;
3278 	mutex_exit(&newnum_lock);
3279 	return (id);
3280 }
3281 
3282 char *
3283 newname(void)
3284 {
3285 	char *news;
3286 	char *s;
3287 	const char *p;
3288 	uint_t id;
3289 
3290 	id = newnum();
3291 	news = kmem_alloc(MAXNAMELEN, KM_SLEEP);
3292 	s = news;
3293 	p = prefix;
3294 	while (*p != '\0')
3295 		*s++ = *p++;
3296 	while (id != 0) {
3297 		*s++ = "0123456789ABCDEF"[id & 0x0f];
3298 		id >>= 4;
3299 	}
3300 	*s = '\0';
3301 	return (news);
3302 }
3303 
/*
 * Convert the leading run of decimal digits in cp to an int.
 *
 * Conversion stops at the first character that is not '0'..'9' (or at
 * the terminating NUL), so trailing separators or other text can no
 * longer be folded into the result as garbage.  A string that does not
 * start with a digit, including the empty string, yields 0.  No sign
 * or leading whitespace is accepted and no overflow check is made.
 */
int
nfs_atoi(char *cp)
{
	int n = 0;

	for (; *cp >= '0' && *cp <= '9'; cp++)
		n = n * 10 + (*cp - '0');

	return (n);
}
3317 
3318 /*
3319  * Snapshot callback for nfs:0:nfs_client as registered with the kstat
3320  * framework.
3321  */
3322 static int
3323 cl_snapshot(kstat_t *ksp, void *buf, int rw)
3324 {
3325 	ksp->ks_snaptime = gethrtime();
3326 	if (rw == KSTAT_WRITE) {
3327 		bcopy(buf, ksp->ks_private, sizeof (clstat_tmpl));
3328 #ifdef DEBUG
3329 		/*
3330 		 * Currently only the global zone can write to kstats, but we
3331 		 * add the check just for paranoia.
3332 		 */
3333 		if (INGLOBALZONE(curproc))
3334 			bcopy((char *)buf + sizeof (clstat_tmpl), &clstat_debug,
3335 			    sizeof (clstat_debug));
3336 #endif
3337 	} else {
3338 		bcopy(ksp->ks_private, buf, sizeof (clstat_tmpl));
3339 #ifdef DEBUG
3340 		/*
3341 		 * If we're displaying the "global" debug kstat values, we
3342 		 * display them as-is to all zones since in fact they apply to
3343 		 * the system as a whole.
3344 		 */
3345 		bcopy(&clstat_debug, (char *)buf + sizeof (clstat_tmpl),
3346 		    sizeof (clstat_debug));
3347 #endif
3348 	}
3349 	return (0);
3350 }
3351 
3352 static void *
3353 clinit_zone(zoneid_t zoneid)
3354 {
3355 	kstat_t *nfs_client_kstat;
3356 	struct nfs_clnt *nfscl;
3357 	uint_t ndata;
3358 
3359 	nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP);
3360 	mutex_init(&nfscl->nfscl_chtable_lock, NULL, MUTEX_DEFAULT, NULL);
3361 	nfscl->nfscl_chtable = NULL;
3362 	nfscl->nfscl_zoneid = zoneid;
3363 
3364 	bcopy(&clstat_tmpl, &nfscl->nfscl_stat, sizeof (clstat_tmpl));
3365 	ndata = sizeof (clstat_tmpl) / sizeof (kstat_named_t);
3366 #ifdef DEBUG
3367 	ndata += sizeof (clstat_debug) / sizeof (kstat_named_t);
3368 #endif
3369 	if ((nfs_client_kstat = kstat_create_zone("nfs", 0, "nfs_client",
3370 	    "misc", KSTAT_TYPE_NAMED, ndata,
3371 	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) {
3372 		nfs_client_kstat->ks_private = &nfscl->nfscl_stat;
3373 		nfs_client_kstat->ks_snapshot = cl_snapshot;
3374 		kstat_install(nfs_client_kstat);
3375 	}
3376 	mutex_enter(&nfs_clnt_list_lock);
3377 	list_insert_head(&nfs_clnt_list, nfscl);
3378 	mutex_exit(&nfs_clnt_list_lock);
3379 	return (nfscl);
3380 }
3381 
3382 /*ARGSUSED*/
3383 static void
3384 clfini_zone(zoneid_t zoneid, void *arg)
3385 {
3386 	struct nfs_clnt *nfscl = arg;
3387 	chhead_t *chp, *next;
3388 
3389 	if (nfscl == NULL)
3390 		return;
3391 	mutex_enter(&nfs_clnt_list_lock);
3392 	list_remove(&nfs_clnt_list, nfscl);
3393 	mutex_exit(&nfs_clnt_list_lock);
3394 	clreclaim_zone(nfscl, 0);
3395 	for (chp = nfscl->nfscl_chtable; chp != NULL; chp = next) {
3396 		ASSERT(chp->ch_list == NULL);
3397 		kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1);
3398 		next = chp->ch_next;
3399 		kmem_free(chp, sizeof (*chp));
3400 	}
3401 	kstat_delete_byname_zone("nfs", 0, "nfs_client", zoneid);
3402 	mutex_destroy(&nfscl->nfscl_chtable_lock);
3403 	kmem_free(nfscl, sizeof (*nfscl));
3404 }
3405 
3406 /*
3407  * Called by endpnt_destructor to make sure the client handles are
3408  * cleaned up before the RPC endpoints.  This becomes a no-op if
3409  * clfini_zone (above) is called first.  This function is needed
3410  * (rather than relying on clfini_zone to clean up) because the ZSD
3411  * callbacks have no ordering mechanism, so we have no way to ensure
3412  * that clfini_zone is called before endpnt_destructor.
3413  */
3414 void
3415 clcleanup_zone(zoneid_t zoneid)
3416 {
3417 	struct nfs_clnt *nfscl;
3418 
3419 	mutex_enter(&nfs_clnt_list_lock);
3420 	nfscl = list_head(&nfs_clnt_list);
3421 	for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) {
3422 		if (nfscl->nfscl_zoneid == zoneid) {
3423 			clreclaim_zone(nfscl, 0);
3424 			break;
3425 		}
3426 	}
3427 	mutex_exit(&nfs_clnt_list_lock);
3428 }
3429 
/*
 * One-time initialization of the state used by this file: the rnode
 * hash table and rnode kmem cache, the access cache hash table and its
 * kmem cache, the client handle kmem cache, the per-zone client list
 * and zone key, assorted mutexes, and the NFS major/minor device
 * numbers.  Always returns 0.
 */
int
nfs_subrinit(void)
{
	int i;
	ulong_t nrnode_max;

	/*
	 * Allocate and initialize the rnode hash queues
	 */
	/* A non-positive nrnode means "not tuned": fall back to ncsize. */
	if (nrnode <= 0)
		nrnode = ncsize;
	/*
	 * Cap nrnode at the number of rnodes that would fit in one
	 * quarter of what kmem_maxavail() reports.
	 */
	nrnode_max = (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode));
	if (nrnode > nrnode_max || (nrnode == 0 && ncsize == 0)) {
		zcmn_err(GLOBAL_ZONEID, CE_NOTE,
		    "setting nrnode to max value of %ld", nrnode_max);
		nrnode = nrnode_max;
	}

	/*
	 * The table size is a power of two so that rtablemask
	 * (size - 1) can be used as a bit mask.
	 */
	rtablesize = 1 << highbit(nrnode / hashlen);
	rtablemask = rtablesize - 1;
	rtable = kmem_alloc(rtablesize * sizeof (*rtable), KM_SLEEP);
	for (i = 0; i < rtablesize; i++) {
		/* an empty bucket points back at itself in both directions */
		rtable[i].r_hashf = (rnode_t *)(&rtable[i]);
		rtable[i].r_hashb = (rnode_t *)(&rtable[i]);
		rw_init(&rtable[i].r_lock, NULL, RW_DEFAULT, NULL);
	}
	/* nfs_reclaim is registered as the cache's reclaim callback */
	rnode_cache = kmem_cache_create("rnode_cache", sizeof (rnode_t),
	    0, NULL, NULL, nfs_reclaim, NULL, NULL, 0);

	/*
	 * Allocate and initialize the access cache
	 */

	/*
	 * Initial guess is one access cache entry per rnode unless
	 * nacache is set to a non-zero value and then it is used to
	 * indicate a guess at the number of access cache entries.
	 */
	if (nacache > 0)
		acachesize = 1 << highbit(nacache / hashlen);
	else
		acachesize = rtablesize;
	acachemask = acachesize - 1;
	acache = kmem_alloc(acachesize * sizeof (*acache), KM_SLEEP);
	for (i = 0; i < acachesize; i++) {
		/* empty bucket: circular list containing only the head */
		acache[i].next = (acache_t *)&acache[i];
		acache[i].prev = (acache_t *)&acache[i];
		rw_init(&acache[i].lock, NULL, RW_DEFAULT, NULL);
	}
	acache_cache = kmem_cache_create("nfs_access_cache",
	    sizeof (acache_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	/*
	 * Allocate and initialize the client handle cache
	 */
	chtab_cache = kmem_cache_create("client_handle_cache",
	    sizeof (struct chtab), 0, NULL, NULL, clreclaim, NULL, NULL, 0);
	/*
	 * Initialize the list of per-zone client handles (and associated data).
	 * This needs to be done before we call zone_key_create().
	 */
	list_create(&nfs_clnt_list, sizeof (struct nfs_clnt),
	    offsetof(struct nfs_clnt, nfscl_node));
	/*
	 * Initialize the zone_key for per-zone client handle lists.
	 */
	zone_key_create(&nfsclnt_zone_key, clinit_zone, NULL, clfini_zone);
	/*
	 * Initialize the various mutexes and reader/writer locks
	 */
	mutex_init(&rpfreelist_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&newnum_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&nfs_minor_lock, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * Assign unique major number for all nfs mounts
	 */
	if ((nfs_major = getudev()) == -1) {
		/* not fatal: warn and carry on with major number 0 */
		zcmn_err(GLOBAL_ZONEID, CE_WARN,
		    "nfs: init: can't get unique device number");
		nfs_major = 0;
	}
	nfs_minor = 0;

	/* apply the default jukebox delay unless one was already set */
	if (nfs3_jukebox_delay == 0)
		nfs3_jukebox_delay = NFS3_JUKEBOX_DELAY;

	return (0);
}
3518 
3519 void
3520 nfs_subrfini(void)
3521 {
3522 	int i;
3523 
3524 	/*
3525 	 * Deallocate the rnode hash queues
3526 	 */
3527 	kmem_cache_destroy(rnode_cache);
3528 
3529 	for (i = 0; i < rtablesize; i++)
3530 		rw_destroy(&rtable[i].r_lock);
3531 	kmem_free(rtable, rtablesize * sizeof (*rtable));
3532 
3533 	/*
3534 	 * Deallocated the access cache
3535 	 */
3536 	kmem_cache_destroy(acache_cache);
3537 
3538 	for (i = 0; i < acachesize; i++)
3539 		rw_destroy(&acache[i].lock);
3540 	kmem_free(acache, acachesize * sizeof (*acache));
3541 
3542 	/*
3543 	 * Deallocate the client handle cache
3544 	 */
3545 	kmem_cache_destroy(chtab_cache);
3546 
3547 	/*
3548 	 * Destroy the various mutexes and reader/writer locks
3549 	 */
3550 	mutex_destroy(&rpfreelist_lock);
3551 	mutex_destroy(&newnum_lock);
3552 	mutex_destroy(&nfs_minor_lock);
3553 	(void) zone_key_delete(nfsclnt_zone_key);
3554 }
3555 
3556 enum nfsstat
3557 puterrno(int error)
3558 {
3559 
3560 	switch (error) {
3561 	case EOPNOTSUPP:
3562 		return (NFSERR_OPNOTSUPP);
3563 	case ENAMETOOLONG:
3564 		return (NFSERR_NAMETOOLONG);
3565 	case ENOTEMPTY:
3566 		return (NFSERR_NOTEMPTY);
3567 	case EDQUOT:
3568 		return (NFSERR_DQUOT);
3569 	case ESTALE:
3570 		return (NFSERR_STALE);
3571 	case EREMOTE:
3572 		return (NFSERR_REMOTE);
3573 	case ENOSYS:
3574 		return (NFSERR_OPNOTSUPP);
3575 	case EOVERFLOW:
3576 		return (NFSERR_INVAL);
3577 	default:
3578 		return ((enum nfsstat)error);
3579 	}
3580 	/* NOTREACHED */
3581 }
3582 
3583 int
3584 geterrno(enum nfsstat status)
3585 {
3586 
3587 	switch (status) {
3588 	case NFSERR_OPNOTSUPP:
3589 		return (EOPNOTSUPP);
3590 	case NFSERR_NAMETOOLONG:
3591 		return (ENAMETOOLONG);
3592 	case NFSERR_NOTEMPTY:
3593 		return (ENOTEMPTY);
3594 	case NFSERR_DQUOT:
3595 		return (EDQUOT);
3596 	case NFSERR_STALE:
3597 		return (ESTALE);
3598 	case NFSERR_REMOTE:
3599 		return (EREMOTE);
3600 	case NFSERR_WFLUSH:
3601 		return (EIO);
3602 	default:
3603 		return ((int)status);
3604 	}
3605 	/* NOTREACHED */
3606 }
3607 
3608 enum nfsstat3
3609 puterrno3(int error)
3610 {
3611 
3612 #ifdef DEBUG
3613 	switch (error) {
3614 	case 0:
3615 		return (NFS3_OK);
3616 	case EPERM:
3617 		return (NFS3ERR_PERM);
3618 	case ENOENT:
3619 		return (NFS3ERR_NOENT);
3620 	case EIO:
3621 		return (NFS3ERR_IO);
3622 	case ENXIO:
3623 		return (NFS3ERR_NXIO);
3624 	case EACCES:
3625 		return (NFS3ERR_ACCES);
3626 	case EEXIST:
3627 		return (NFS3ERR_EXIST);
3628 	case EXDEV:
3629 		return (NFS3ERR_XDEV);
3630 	case ENODEV:
3631 		return (NFS3ERR_NODEV);
3632 	case ENOTDIR:
3633 		return (NFS3ERR_NOTDIR);
3634 	case EISDIR:
3635 		return (NFS3ERR_ISDIR);
3636 	case EINVAL:
3637 		return (NFS3ERR_INVAL);
3638 	case EFBIG:
3639 		return (NFS3ERR_FBIG);
3640 	case ENOSPC:
3641 		return (NFS3ERR_NOSPC);
3642 	case EROFS:
3643 		return (NFS3ERR_ROFS);
3644 	case EMLINK:
3645 		return (NFS3ERR_MLINK);
3646 	case ENAMETOOLONG:
3647 		return (NFS3ERR_NAMETOOLONG);
3648 	case ENOTEMPTY:
3649 		return (NFS3ERR_NOTEMPTY);
3650 	case EDQUOT:
3651 		return (NFS3ERR_DQUOT);
3652 	case ESTALE:
3653 		return (NFS3ERR_STALE);
3654 	case EREMOTE:
3655 		return (NFS3ERR_REMOTE);
3656 	case ENOSYS:
3657 	case EOPNOTSUPP:
3658 		return (NFS3ERR_NOTSUPP);
3659 	case EOVERFLOW:
3660 		return (NFS3ERR_INVAL);
3661 	default:
3662 		zcmn_err(getzoneid(), CE_WARN,
3663 		    "puterrno3: got error %d", error);
3664 		return ((enum nfsstat3)error);
3665 	}
3666 #else
3667 	switch (error) {
3668 	case ENAMETOOLONG:
3669 		return (NFS3ERR_NAMETOOLONG);
3670 	case ENOTEMPTY:
3671 		return (NFS3ERR_NOTEMPTY);
3672 	case EDQUOT:
3673 		return (NFS3ERR_DQUOT);
3674 	case ESTALE:
3675 		return (NFS3ERR_STALE);
3676 	case ENOSYS:
3677 	case EOPNOTSUPP:
3678 		return (NFS3ERR_NOTSUPP);
3679 	case EREMOTE:
3680 		return (NFS3ERR_REMOTE);
3681 	case EOVERFLOW:
3682 		return (NFS3ERR_INVAL);
3683 	default:
3684 		return ((enum nfsstat3)error);
3685 	}
3686 #endif
3687 }
3688 
3689 int
3690 geterrno3(enum nfsstat3 status)
3691 {
3692 
3693 #ifdef DEBUG
3694 	switch (status) {
3695 	case NFS3_OK:
3696 		return (0);
3697 	case NFS3ERR_PERM:
3698 		return (EPERM);
3699 	case NFS3ERR_NOENT:
3700 		return (ENOENT);
3701 	case NFS3ERR_IO:
3702 		return (EIO);
3703 	case NFS3ERR_NXIO:
3704 		return (ENXIO);
3705 	case NFS3ERR_ACCES:
3706 		return (EACCES);
3707 	case NFS3ERR_EXIST:
3708 		return (EEXIST);
3709 	case NFS3ERR_XDEV:
3710 		return (EXDEV);
3711 	case NFS3ERR_NODEV:
3712 		return (ENODEV);
3713 	case NFS3ERR_NOTDIR:
3714 		return (ENOTDIR);
3715 	case NFS3ERR_ISDIR:
3716 		return (EISDIR);
3717 	case NFS3ERR_INVAL:
3718 		return (EINVAL);
3719 	case NFS3ERR_FBIG:
3720 		return (EFBIG);
3721 	case NFS3ERR_NOSPC:
3722 		return (ENOSPC);
3723 	case NFS3ERR_ROFS:
3724 		return (EROFS);
3725 	case NFS3ERR_MLINK:
3726 		return (EMLINK);
3727 	case NFS3ERR_NAMETOOLONG:
3728 		return (ENAMETOOLONG);
3729 	case NFS3ERR_NOTEMPTY:
3730 		return (ENOTEMPTY);
3731 	case NFS3ERR_DQUOT:
3732 		return (EDQUOT);
3733 	case NFS3ERR_STALE:
3734 		return (ESTALE);
3735 	case NFS3ERR_REMOTE:
3736 		return (EREMOTE);
3737 	case NFS3ERR_BADHANDLE:
3738 		return (ESTALE);
3739 	case NFS3ERR_NOT_SYNC:
3740 		return (EINVAL);
3741 	case NFS3ERR_BAD_COOKIE:
3742 		return (ENOENT);
3743 	case NFS3ERR_NOTSUPP:
3744 		return (EOPNOTSUPP);
3745 	case NFS3ERR_TOOSMALL:
3746 		return (EINVAL);
3747 	case NFS3ERR_SERVERFAULT:
3748 		return (EIO);
3749 	case NFS3ERR_BADTYPE:
3750 		return (EINVAL);
3751 	case NFS3ERR_JUKEBOX:
3752 		return (ENXIO);
3753 	default:
3754 		zcmn_err(getzoneid(), CE_WARN,
3755 		    "geterrno3: got status %d", status);
3756 		return ((int)status);
3757 	}
3758 #else
3759 	switch (status) {
3760 	case NFS3ERR_NAMETOOLONG:
3761 		return (ENAMETOOLONG);
3762 	case NFS3ERR_NOTEMPTY:
3763 		return (ENOTEMPTY);
3764 	case NFS3ERR_DQUOT:
3765 		return (EDQUOT);
3766 	case NFS3ERR_STALE:
3767 	case NFS3ERR_BADHANDLE:
3768 		return (ESTALE);
3769 	case NFS3ERR_NOTSUPP:
3770 		return (EOPNOTSUPP);
3771 	case NFS3ERR_REMOTE:
3772 		return (EREMOTE);
3773 	case NFS3ERR_NOT_SYNC:
3774 	case NFS3ERR_TOOSMALL:
3775 	case NFS3ERR_BADTYPE:
3776 		return (EINVAL);
3777 	case NFS3ERR_BAD_COOKIE:
3778 		return (ENOENT);
3779 	case NFS3ERR_SERVERFAULT:
3780 		return (EIO);
3781 	case NFS3ERR_JUKEBOX:
3782 		return (ENXIO);
3783 	default:
3784 		return ((int)status);
3785 	}
3786 #endif
3787 }
3788 
3789 rddir_cache *
3790 rddir_cache_alloc(int flags)
3791 {
3792 	rddir_cache *rc;
3793 
3794 	rc = kmem_alloc(sizeof (*rc), flags);
3795 	if (rc != NULL) {
3796 		rc->entries = NULL;
3797 		rc->flags = RDDIR;
3798 		cv_init(&rc->cv, NULL, CV_DEFAULT, NULL);
3799 		mutex_init(&rc->lock, NULL, MUTEX_DEFAULT, NULL);
3800 		rc->count = 1;
3801 #ifdef DEBUG
3802 		atomic_add_64(&clstat_debug.dirent.value.ui64, 1);
3803 #endif
3804 	}
3805 	return (rc);
3806 }
3807 
3808 static void
3809 rddir_cache_free(rddir_cache *rc)
3810 {
3811 
3812 #ifdef DEBUG
3813 	atomic_add_64(&clstat_debug.dirent.value.ui64, -1);
3814 #endif
3815 	if (rc->entries != NULL) {
3816 #ifdef DEBUG
3817 		rddir_cache_buf_free(rc->entries, rc->buflen);
3818 #else
3819 		kmem_free(rc->entries, rc->buflen);
3820 #endif
3821 	}
3822 	cv_destroy(&rc->cv);
3823 	mutex_destroy(&rc->lock);
3824 	kmem_free(rc, sizeof (*rc));
3825 }
3826 
3827 void
3828 rddir_cache_hold(rddir_cache *rc)
3829 {
3830 
3831 	mutex_enter(&rc->lock);
3832 	rc->count++;
3833 	mutex_exit(&rc->lock);
3834 }
3835 
3836 void
3837 rddir_cache_rele(rddir_cache *rc)
3838 {
3839 
3840 	mutex_enter(&rc->lock);
3841 	ASSERT(rc->count > 0);
3842 	if (--rc->count == 0) {
3843 		mutex_exit(&rc->lock);
3844 		rddir_cache_free(rc);
3845 	} else
3846 		mutex_exit(&rc->lock);
3847 }
3848 
#ifdef DEBUG
/*
 * DEBUG-only wrapper around kmem_alloc() for readdir cache buffers.
 * A successful allocation adds its size to the dirents statistic.
 */
char *
rddir_cache_buf_alloc(size_t size, int flags)
{
	char *buf;

	buf = kmem_alloc(size, flags);
	if (buf == NULL)
		return (NULL);

	atomic_add_64(&clstat_debug.dirents.value.ui64, size);
	return (buf);
}

/*
 * DEBUG-only wrapper around kmem_free() for readdir cache buffers.
 * The size is subtracted from the dirents statistic before the
 * buffer is freed.
 */
void
rddir_cache_buf_free(void *addr, size_t size)
{
	int64_t delta = -(int64_t)size;

	atomic_add_64(&clstat_debug.dirents.value.ui64, delta);
	kmem_free(addr, size);
}
#endif
3869 
3870 static int
3871 nfs_free_data_reclaim(rnode_t *rp)
3872 {
3873 	char *contents;
3874 	int size;
3875 	vsecattr_t *vsp;
3876 	nfs3_pathconf_info *info;
3877 	int freed;
3878 	cred_t *cred;
3879 
3880 	/*
3881 	 * Free any held credentials and caches which
3882 	 * may be associated with this rnode.
3883 	 */
3884 	mutex_enter(&rp->r_statelock);
3885 	cred = rp->r_cred;
3886 	rp->r_cred = NULL;
3887 	contents = rp->r_symlink.contents;
3888 	size = rp->r_symlink.size;
3889 	rp->r_symlink.contents = NULL;
3890 	vsp = rp->r_secattr;
3891 	rp->r_secattr = NULL;
3892 	info = rp->r_pathconf;
3893 	rp->r_pathconf = NULL;
3894 	mutex_exit(&rp->r_statelock);
3895 
3896 	if (cred != NULL)
3897 		crfree(cred);
3898 
3899 	/*
3900 	 * Free the access cache entries.
3901 	 */
3902 	freed = nfs_access_purge_rp(rp);
3903 
3904 	if (!HAVE_RDDIR_CACHE(rp) &&
3905 	    contents == NULL &&
3906 	    vsp == NULL &&
3907 	    info == NULL)
3908 		return (freed);
3909 
3910 	/*
3911 	 * Free the readdir cache entries
3912 	 */
3913 	if (HAVE_RDDIR_CACHE(rp))
3914 		nfs_purge_rddir_cache(RTOV(rp));
3915 
3916 	/*
3917 	 * Free the symbolic link cache.
3918 	 */
3919 	if (contents != NULL) {
3920 
3921 		kmem_free((void *)contents, size);
3922 	}
3923 
3924 	/*
3925 	 * Free any cached ACL.
3926 	 */
3927 	if (vsp != NULL)
3928 		nfs_acl_free(vsp);
3929 
3930 	/*
3931 	 * Free any cached pathconf information.
3932 	 */
3933 	if (info != NULL)
3934 		kmem_free(info, sizeof (*info));
3935 
3936 	return (1);
3937 }
3938 
3939 static int
3940 nfs_active_data_reclaim(rnode_t *rp)
3941 {
3942 	char *contents;
3943 	int size;
3944 	vsecattr_t *vsp;
3945 	nfs3_pathconf_info *info;
3946 	int freed;
3947 
3948 	/*
3949 	 * Free any held credentials and caches which
3950 	 * may be associated with this rnode.
3951 	 */
3952 	if (!mutex_tryenter(&rp->r_statelock))
3953 		return (0);
3954 	contents = rp->r_symlink.contents;
3955 	size = rp->r_symlink.size;
3956 	rp->r_symlink.contents = NULL;
3957 	vsp = rp->r_secattr;
3958 	rp->r_secattr = NULL;
3959 	info = rp->r_pathconf;
3960 	rp->r_pathconf = NULL;
3961 	mutex_exit(&rp->r_statelock);
3962 
3963 	/*
3964 	 * Free the access cache entries.
3965 	 */
3966 	freed = nfs_access_purge_rp(rp);
3967 
3968 	if (!HAVE_RDDIR_CACHE(rp) &&
3969 	    contents == NULL &&
3970 	    vsp == NULL &&
3971 	    info == NULL)
3972 		return (freed);
3973 
3974 	/*
3975 	 * Free the readdir cache entries
3976 	 */
3977 	if (HAVE_RDDIR_CACHE(rp))
3978 		nfs_purge_rddir_cache(RTOV(rp));
3979 
3980 	/*
3981 	 * Free the symbolic link cache.
3982 	 */
3983 	if (contents != NULL) {
3984 
3985 		kmem_free((void *)contents, size);
3986 	}
3987 
3988 	/*
3989 	 * Free any cached ACL.
3990 	 */
3991 	if (vsp != NULL)
3992 		nfs_acl_free(vsp);
3993 
3994 	/*
3995 	 * Free any cached pathconf information.
3996 	 */
3997 	if (info != NULL)
3998 		kmem_free(info, sizeof (*info));
3999 
4000 	return (1);
4001 }
4002 
4003 static int
4004 nfs_free_reclaim(void)
4005 {
4006 	int freed;
4007 	rnode_t *rp;
4008 
4009 #ifdef DEBUG
4010 	clstat_debug.f_reclaim.value.ui64++;
4011 #endif
4012 	freed = 0;
4013 	mutex_enter(&rpfreelist_lock);
4014 	rp = rpfreelist;
4015 	if (rp != NULL) {
4016 		do {
4017 			if (nfs_free_data_reclaim(rp))
4018 				freed = 1;
4019 		} while ((rp = rp->r_freef) != rpfreelist);
4020 	}
4021 	mutex_exit(&rpfreelist_lock);
4022 	return (freed);
4023 }
4024 
4025 static int
4026 nfs_active_reclaim(void)
4027 {
4028 	int freed;
4029 	int index;
4030 	rnode_t *rp;
4031 
4032 #ifdef DEBUG
4033 	clstat_debug.a_reclaim.value.ui64++;
4034 #endif
4035 	freed = 0;
4036 	for (index = 0; index < rtablesize; index++) {
4037 		rw_enter(&rtable[index].r_lock, RW_READER);
4038 		for (rp = rtable[index].r_hashf;
4039 		    rp != (rnode_t *)(&rtable[index]);
4040 		    rp = rp->r_hashf) {
4041 			if (nfs_active_data_reclaim(rp))
4042 				freed = 1;
4043 		}
4044 		rw_exit(&rtable[index].r_lock);
4045 	}
4046 	return (freed);
4047 }
4048 
/*
 * Drain the rnode freelist: every rnode on it is taken off the list,
 * removed from the rnode hash table if it is hashed and not otherwise
 * referenced, and handed to rp_addfree().
 *
 * NOTE(review): "freed" is initialized to 0 and never assigned again,
 * so as written this function always returns 0.  The only caller
 * visible here (nfs_reclaim) discards the result.
 */
static int
nfs_rnode_reclaim(void)
{
	int freed;
	rnode_t *rp;
	vnode_t *vp;

#ifdef DEBUG
	clstat_debug.r_reclaim.value.ui64++;
#endif
	freed = 0;
	mutex_enter(&rpfreelist_lock);
	while ((rp = rpfreelist) != NULL) {
		rp_rmfree(rp);
		/*
		 * rpfreelist_lock is dropped while this rnode is being
		 * worked on and retaken before the next iteration.
		 */
		mutex_exit(&rpfreelist_lock);
		if (rp->r_flags & RHASHED) {
			vp = RTOV(rp);
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			if (vp->v_count > 1) {
				/*
				 * Someone else holds a reference to the
				 * vnode: just drop one hold and leave the
				 * rnode hashed.
				 */
				vp->v_count--;
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				mutex_enter(&rpfreelist_lock);
				continue;
			}
			mutex_exit(&vp->v_lock);
			rp_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}
		/*
		 * This call to rp_addfree will end up destroying the
		 * rnode, but in a safe way with the appropriate set
		 * of checks done.
		 */
		rp_addfree(rp, CRED());
		mutex_enter(&rpfreelist_lock);
	}
	mutex_exit(&rpfreelist_lock);
	return (freed);
}
4090 
/*
 * Reclaim callback registered for rnode_cache by nfs_subrinit().
 *
 * Three increasingly aggressive steps are tried in order, stopping at
 * the first one that reports having freed something: cached data of
 * rnodes on the freelist, cached data of rnodes in the hash table,
 * and finally the freelist rnodes themselves.
 */
/*ARGSUSED*/
static void
nfs_reclaim(void *cdrarg)
{

#ifdef DEBUG
	clstat_debug.reclaim.value.ui64++;
#endif
	if (!nfs_free_reclaim() && !nfs_active_reclaim())
		(void) nfs_rnode_reclaim();
}
4107 
4108 /*
4109  * NFS client failover support
4110  *
4111  * Routines to copy filehandles
4112  */
4113 void
4114 nfscopyfh(caddr_t fhp, vnode_t *vp)
4115 {
4116 	fhandle_t *dest = (fhandle_t *)fhp;
4117 
4118 	if (dest != NULL)
4119 		*dest = *VTOFH(vp);
4120 }
4121 
4122 void
4123 nfs3copyfh(caddr_t fhp, vnode_t *vp)
4124 {
4125 	nfs_fh3 *dest = (nfs_fh3 *)fhp;
4126 
4127 	if (dest != NULL)
4128 		*dest = *VTOFH3(vp);
4129 }
4130 
4131 /*
4132  * NFS client failover support
4133  *
4134  * failover_safe() will test various conditions to ensure that
4135  * failover is permitted for this vnode.  It will be denied
4136  * if:
4137  *	1) the operation in progress does not support failover (NULL fi)
4138  *	2) there are no available replicas (NULL mi_servers->sv_next)
4139  *	3) any locks are outstanding on this file
4140  */
4141 static int
4142 failover_safe(failinfo_t *fi)
4143 {
4144 
4145 	/*
4146 	 * Does this op permit failover?
4147 	 */
4148 	if (fi == NULL || fi->vp == NULL)
4149 		return (0);
4150 
4151 	/*
4152 	 * Are there any alternates to failover to?
4153 	 */
4154 	if (VTOMI(fi->vp)->mi_servers->sv_next == NULL)
4155 		return (0);
4156 
4157 	/*
4158 	 * Disable check; we've forced local locking
4159 	 *
4160 	 * if (flk_has_remote_locks(fi->vp))
4161 	 *	return (0);
4162 	 */
4163 
4164 	/*
4165 	 * If we have no partial path, we can't do anything
4166 	 */
4167 	if (VTOR(fi->vp)->r_path == NULL)
4168 		return (0);
4169 
4170 	return (1);
4171 }
4172 
4173 #include <sys/thread.h>
4174 
4175 /*
4176  * NFS client failover support
4177  *
4178  * failover_newserver() will start a search for a new server,
4179  * preferably by starting an async thread to do the work.  If
4180  * someone is already doing this (recognizable by MI_BINDINPROG
4181  * being set), it will simply return and the calling thread
4182  * will queue on the mi_failover_cv condition variable.
4183  */
4184 static void
4185 failover_newserver(mntinfo_t *mi)
4186 {
4187 	/*
4188 	 * Check if someone else is doing this already
4189 	 */
4190 	mutex_enter(&mi->mi_lock);
4191 	if (mi->mi_flags & MI_BINDINPROG) {
4192 		mutex_exit(&mi->mi_lock);
4193 		return;
4194 	}
4195 	mi->mi_flags |= MI_BINDINPROG;
4196 
4197 	/*
4198 	 * Need to hold the vfs struct so that it can't be released
4199 	 * while the failover thread is selecting a new server.
4200 	 */
4201 	VFS_HOLD(mi->mi_vfsp);
4202 
4203 	/*
4204 	 * Start a thread to do the real searching.
4205 	 */
4206 	(void) zthread_create(NULL, 0, failover_thread, mi, 0, minclsyspri);
4207 
4208 	mutex_exit(&mi->mi_lock);
4209 }
4210 
/*
 * NFS client failover support
 *
 * failover_thread() will find a new server to replace the one
 * currently in use, wake up other threads waiting on this mount
 * point, and die.  It will start at the head of the server list
 * and poll servers until it finds one with an NFS server which is
 * registered and responds to a NULL procedure ping.
 *
 * Outline:
 *	1) wait until mi_readers drops to zero
 *	2) ping each server in mi_servers (2 second timeout) until one
 *	   answers, sleeping one second between passes over the list;
 *	   give up if the file system or zone goes away
 *	3) if a different server was chosen, purge the DNLC for the
 *	   mount and rehash the rnode found under the old server's
 *	   filehandle using the new server's filehandle
 *	4) clear MI_BINDINPROG, record the new server, wake up the
 *	   waiters on mi_failover_cv, drop the vfs hold and exit
 *
 * XXX failover_thread is unsafe within the scope of the
 * present model defined for cpr to suspend the system.
 * Specifically, over-the-wire calls made by the thread
 * are unsafe. The thread needs to be reevaluated in case of
 * future updates to the cpr suspend model.
 */
static void
failover_thread(mntinfo_t *mi)
{
	servinfo_t *svp = NULL;
	CLIENT *cl;
	enum clnt_stat status;
	struct timeval tv;
	int error;
	int oncethru = 0;
	callb_cpr_t cprinfo;
	rnode_t *rp;
	int index;
	char *srvnames;
	size_t srvnames_len;
	struct nfs_clnt *nfscl = NULL;
	zoneid_t zoneid = getzoneid();

#ifdef DEBUG
	/*
	 * This is currently only needed to access counters which exist on
	 * DEBUG kernels, hence we don't want to pay the penalty of the lookup
	 * on non-DEBUG kernels.
	 */
	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
	ASSERT(nfscl != NULL);
#endif

	/*
	 * Its safe to piggyback on the mi_lock since failover_newserver()
	 * code guarantees that there will be only one failover thread
	 * per mountinfo at any instance.
	 */
	CALLB_CPR_INIT(&cprinfo, &mi->mi_lock, callb_generic_cpr,
	    "failover_thread");

	/*
	 * Do not start until the mount's reader count has dropped to
	 * zero.
	 */
	mutex_enter(&mi->mi_lock);
	while (mi->mi_readers) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(&mi->mi_failover_cv, &mi->mi_lock);
		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
	}
	mutex_exit(&mi->mi_lock);

	/* timeout handed to each NULL procedure ping */
	tv.tv_sec = 2;
	tv.tv_usec = 0;

	/*
	 * Ping the null NFS procedure of every server in
	 * the list until one responds.  We always start
	 * at the head of the list and always skip the one
	 * that is current, since it's caused us a problem.
	 */
	while (svp == NULL) {
		for (svp = mi->mi_servers; svp; svp = svp->sv_next) {
			/*
			 * The current server is only skipped on the
			 * first pass; once oncethru is set it is
			 * pinged like any other.
			 */
			if (!oncethru && svp == mi->mi_curr_serv)
				continue;

			/*
			 * If the file system was forcibly umounted
			 * while trying to do a failover, then just
			 * give up on the failover.  It won't matter
			 * what the server is.
			 */
			if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
				svp = NULL;
				goto done;
			}

			/* a server we cannot build a handle for is skipped */
			error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
			    NFS_PROGRAM, NFS_VERSION, 0, 1, CRED(), &cl);
			if (error)
				continue;

			if (!(mi->mi_flags & MI_INT))
				cl->cl_nosignal = TRUE;
			status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
			    xdr_void, NULL, tv);
			if (!(mi->mi_flags & MI_INT))
				cl->cl_nosignal = FALSE;
			AUTH_DESTROY(cl->cl_auth);
			CLNT_DESTROY(cl);
			if (status == RPC_SUCCESS) {
				if (svp == mi->mi_curr_serv) {
#ifdef DEBUG
					zcmn_err(zoneid, CE_NOTE,
			"NFS%d: failing over: selecting original server %s",
					    mi->mi_vers, svp->sv_hostname);
#else
					zcmn_err(zoneid, CE_NOTE,
			"NFS: failing over: selecting original server %s",
					    svp->sv_hostname);
#endif
				} else {
#ifdef DEBUG
					zcmn_err(zoneid, CE_NOTE,
				    "NFS%d: failing over from %s to %s",
					    mi->mi_vers,
					    mi->mi_curr_serv->sv_hostname,
					    svp->sv_hostname);
#else
					zcmn_err(zoneid, CE_NOTE,
				    "NFS: failing over from %s to %s",
					    mi->mi_curr_serv->sv_hostname,
					    svp->sv_hostname);
#endif
				}
				/* svp now names the server that answered */
				break;
			}
		}

		/*
		 * A NULL svp here means the whole list was walked
		 * without any server answering.
		 */
		if (svp == NULL) {
			if (!oncethru) {
				/*
				 * First unsuccessful pass: say so once.
				 * srvnames is kept for the "ok" message
				 * and freed at "done".
				 */
				srvnames = nfs_getsrvnames(mi, &srvnames_len);
#ifdef DEBUG
				zprintf(zoneid,
				    "NFS%d servers %s not responding "
				    "still trying\n", mi->mi_vers, srvnames);
#else
				zprintf(zoneid, "NFS servers %s not responding "
				    "still trying\n", srvnames);
#endif
				oncethru = 1;
			}
			/*
			 * Sleep for a second before the next pass,
			 * marked CPR-safe for the duration of the delay.
			 */
			mutex_enter(&mi->mi_lock);
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			mutex_exit(&mi->mi_lock);
			delay(hz);
			mutex_enter(&mi->mi_lock);
			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
			mutex_exit(&mi->mi_lock);
		}
	}

	if (oncethru) {
#ifdef DEBUG
		zprintf(zoneid, "NFS%d servers %s ok\n", mi->mi_vers, srvnames);
#else
		zprintf(zoneid, "NFS servers %s ok\n", srvnames);
#endif
	}

	/*
	 * When the server actually changed, purge the DNLC for this
	 * mount and move the rnode that is hashed under the old
	 * server's filehandle over to the new server's filehandle.
	 */
	if (svp != mi->mi_curr_serv) {
		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
		index = rtablehash(&mi->mi_curr_serv->sv_fhandle);
		rw_enter(&rtable[index].r_lock, RW_WRITER);
		rp = rfind(&rtable[index], &mi->mi_curr_serv->sv_fhandle,
		    mi->mi_vfsp);
		if (rp != NULL) {
			if (rp->r_flags & RHASHED)
				rp_rmhash_locked(rp);
			rw_exit(&rtable[index].r_lock);
			rp->r_server = svp;
			rp->r_fh = svp->sv_fhandle;
			/* data cached from the old server is discarded */
			(void) nfs_free_data_reclaim(rp);
			index = rtablehash(&rp->r_fh);
			rp->r_hashq = &rtable[index];
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			vn_exists(RTOV(rp));
			rp_addhash(rp);
			rw_exit(&rp->r_hashq->r_lock);
			/* presumably drops a hold taken by rfind() - TODO confirm */
			VN_RELE(RTOV(rp));
		} else
			rw_exit(&rtable[index].r_lock);
	}

done:
	/* srvnames was only allocated if oncethru got set */
	if (oncethru)
		kmem_free(srvnames, srvnames_len);
	mutex_enter(&mi->mi_lock);
	mi->mi_flags &= ~MI_BINDINPROG;
	/* svp is NULL when the search was abandoned */
	if (svp != NULL) {
		mi->mi_curr_serv = svp;
		mi->mi_failover++;
#ifdef DEBUG
	nfscl->nfscl_stat.failover.value.ui64++;
#endif
	}
	cv_broadcast(&mi->mi_failover_cv);
	/* NOTE(review): mi_lock is expected to be released by CALLB_CPR_EXIT - confirm */
	CALLB_CPR_EXIT(&cprinfo);
	/* matches the VFS_HOLD done in failover_newserver() */
	VFS_RELE(mi->mi_vfsp);
	zthread_exit();
	/* NOTREACHED */
}
4409 
4410 /*
4411  * NFS client failover support
4412  *
4413  * failover_wait() will put the thread to sleep until MI_BINDINPROG
4414  * is cleared, meaning that failover is complete.  Called with
4415  * mi_lock mutex held.
4416  */
4417 static int
4418 failover_wait(mntinfo_t *mi)
4419 {
4420 	k_sigset_t smask;
4421 
4422 	/*
4423 	 * If someone else is hunting for a living server,
4424 	 * sleep until it's done.  After our sleep, we may
4425 	 * be bound to the right server and get off cheaply.
4426 	 */
4427 	while (mi->mi_flags & MI_BINDINPROG) {
4428 		/*
4429 		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
4430 		 * and SIGTERM. (Preserving the existing masks).
4431 		 * Mask out SIGINT if mount option nointr is specified.
4432 		 */
4433 		sigintr(&smask, (int)mi->mi_flags & MI_INT);
4434 		if (!cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock)) {
4435 			/*
4436 			 * restore original signal mask
4437 			 */
4438 			sigunintr(&smask);
4439 			return (EINTR);
4440 		}
4441 		/*
4442 		 * restore original signal mask
4443 		 */
4444 		sigunintr(&smask);
4445 	}
4446 	return (0);
4447 }
4448 
4449 /*
4450  * NFS client failover support
4451  *
4452  * failover_remap() will do a partial pathname lookup and find the
4453  * desired vnode on the current server.  The interim vnode will be
4454  * discarded after we pilfer the new filehandle.
4455  *
4456  * Side effects:
4457  * - This routine will also update the filehandle in the args structure
4458  *    pointed to by the fi->fhp pointer if it is non-NULL.
4459  */
4460 
4461 static int
4462 failover_remap(failinfo_t *fi)
4463 {
4464 	vnode_t *vp, *nvp, *rootvp;
4465 	rnode_t *rp, *nrp;
4466 	mntinfo_t *mi;
4467 	int error;
4468 #ifdef DEBUG
4469 	struct nfs_clnt *nfscl;
4470 
4471 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
4472 	ASSERT(nfscl != NULL);
4473 #endif
4474 	/*
4475 	 * Sanity check
4476 	 */
4477 	if (fi == NULL || fi->vp == NULL || fi->lookupproc == NULL)
4478 		return (EINVAL);
4479 	vp = fi->vp;
4480 	rp = VTOR(vp);
4481 	mi = VTOMI(vp);
4482 
4483 	if (!(vp->v_flag & VROOT)) {
4484 		/*
4485 		 * Given the root fh, use the path stored in
4486 		 * the rnode to find the fh for the new server.
4487 		 */
4488 		error = VFS_ROOT(mi->mi_vfsp, &rootvp);
4489 		if (error)
4490 			return (error);
4491 
4492 		error = failover_lookup(rp->r_path, rootvp,
4493 		    fi->lookupproc, fi->xattrdirproc, &nvp);
4494 
4495 		VN_RELE(rootvp);
4496 
4497 		if (error)
4498 			return (error);
4499 
4500 		/*
4501 		 * If we found the same rnode, we're done now
4502 		 */
4503 		if (nvp == vp) {
4504 			/*
4505 			 * Failed and the new server may physically be same
4506 			 * OR may share a same disk subsystem. In this case
4507 			 * file handle for a particular file path is not going
4508 			 * to change, given the same filehandle lookup will
4509 			 * always locate the same rnode as the existing one.
4510 			 * All we might need to do is to update the r_server
4511 			 * with the current servinfo.
4512 			 */
4513 			if (!VALID_FH(fi)) {
4514 				rp->r_server = mi->mi_curr_serv;
4515 			}
4516 			VN_RELE(nvp);
4517 			return (0);
4518 		}
4519 
4520 		/*
4521 		 * Try to make it so that no one else will find this
4522 		 * vnode because it is just a temporary to hold the
4523 		 * new file handle until that file handle can be
4524 		 * copied to the original vnode/rnode.
4525 		 */
4526 		nrp = VTOR(nvp);
4527 		mutex_enter(&mi->mi_remap_lock);
4528 		/*
4529 		 * Some other thread could have raced in here and could
4530 		 * have done the remap for this particular rnode before
4531 		 * this thread here. Check for rp->r_server and
4532 		 * mi->mi_curr_serv and return if they are same.
4533 		 */
4534 		if (VALID_FH(fi)) {
4535 			mutex_exit(&mi->mi_remap_lock);
4536 			VN_RELE(nvp);
4537 			return (0);
4538 		}
4539 
4540 		if (nrp->r_flags & RHASHED)
4541 			rp_rmhash(nrp);
4542 
4543 		/*
4544 		 * As a heuristic check on the validity of the new
4545 		 * file, check that the size and type match against
4546 		 * that we remember from the old version.
4547 		 */
4548 		if (rp->r_size != nrp->r_size || vp->v_type != nvp->v_type) {
4549 			mutex_exit(&mi->mi_remap_lock);
4550 			zcmn_err(mi->mi_zone->zone_id, CE_WARN,
4551 			    "NFS replicas %s and %s: file %s not same.",
4552 			    rp->r_server->sv_hostname,
4553 			    nrp->r_server->sv_hostname, rp->r_path);
4554 			VN_RELE(nvp);
4555 			return (EINVAL);
4556 		}
4557 
4558 		/*
4559 		 * snarf the filehandle from the new rnode
4560 		 * then release it, again while updating the
4561 		 * hash queues for the rnode.
4562 		 */
4563 		if (rp->r_flags & RHASHED)
4564 			rp_rmhash(rp);
4565 		rp->r_server = mi->mi_curr_serv;
4566 		rp->r_fh = nrp->r_fh;
4567 		rp->r_hashq = nrp->r_hashq;
4568 		/*
4569 		 * Copy the attributes from the new rnode to the old
4570 		 * rnode.  This will help to reduce unnecessary page
4571 		 * cache flushes.
4572 		 */
4573 		rp->r_attr = nrp->r_attr;
4574 		rp->r_attrtime = nrp->r_attrtime;
4575 		rp->r_mtime = nrp->r_mtime;
4576 		(void) nfs_free_data_reclaim(rp);
4577 		nfs_setswaplike(vp, &rp->r_attr);
4578 		rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4579 		rp_addhash(rp);
4580 		rw_exit(&rp->r_hashq->r_lock);
4581 		mutex_exit(&mi->mi_remap_lock);
4582 		VN_RELE(nvp);
4583 	}
4584 
4585 	/*
4586 	 * Update successful failover remap count
4587 	 */
4588 	mutex_enter(&mi->mi_lock);
4589 	mi->mi_remap++;
4590 	mutex_exit(&mi->mi_lock);
4591 #ifdef DEBUG
4592 	nfscl->nfscl_stat.remap.value.ui64++;
4593 #endif
4594 
4595 	/*
4596 	 * If we have a copied filehandle to update, do it now.
4597 	 */
4598 	if (fi->fhp != NULL && fi->copyproc != NULL)
4599 		(*fi->copyproc)(fi->fhp, vp);
4600 
4601 	return (0);
4602 }
4603 
4604 /*
4605  * NFS client failover support
4606  *
4607  * We want a simple pathname lookup routine to parse the pieces
4608  * of path in rp->r_path.  We know that the path was a created
4609  * as rnodes were made, so we know we have only to deal with
4610  * paths that look like:
4611  *	dir1/dir2/dir3/file
4612  * Any evidence of anything like .., symlinks, and ENOTDIR
4613  * are hard errors, because they mean something in this filesystem
4614  * is different from the one we came from, or has changed under
4615  * us in some way.  If this is true, we want the failure.
4616  *
4617  * Extended attributes: if the filesystem is mounted with extended
4618  * attributes enabled (-o xattr), the attribute directory will be
4619  * represented in the r_path as the magic name XATTR_RPATH. So if
4620  * we see that name in the pathname, is must be because this node
4621  * is an extended attribute.  Therefore, look it up that way.
4622  */
4623 static int
4624 failover_lookup(char *path, vnode_t *root,
4625     int (*lookupproc)(vnode_t *, char *, vnode_t **, struct pathname *, int,
4626 	vnode_t *, cred_t *, int),
4627     int (*xattrdirproc)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
4628     vnode_t **new)
4629 {
4630 	vnode_t *dvp, *nvp;
4631 	int error = EINVAL;
4632 	char *s, *p, *tmppath;
4633 	size_t len;
4634 	mntinfo_t *mi;
4635 	bool_t xattr;
4636 
4637 	/* Make local copy of path */
4638 	len = strlen(path) + 1;
4639 	tmppath = kmem_alloc(len, KM_SLEEP);
4640 	(void) strcpy(tmppath, path);
4641 	s = tmppath;
4642 
4643 	dvp = root;
4644 	VN_HOLD(dvp);
4645 	mi = VTOMI(root);
4646 	xattr = mi->mi_flags & MI_EXTATTR;
4647 
4648 	do {
4649 		p = strchr(s, '/');
4650 		if (p != NULL)
4651 			*p = '\0';
4652 		if (xattr && strcmp(s, XATTR_RPATH) == 0) {
4653 			error = (*xattrdirproc)(dvp, &nvp, FALSE, CRED(),
4654 			    RFSCALL_SOFT);
4655 		} else {
4656 			error = (*lookupproc)(dvp, s, &nvp, NULL, 0, NULL,
4657 			    CRED(), RFSCALL_SOFT);
4658 		}
4659 		if (p != NULL)
4660 			*p++ = '/';
4661 		if (error) {
4662 			VN_RELE(dvp);
4663 			kmem_free(tmppath, len);
4664 			return (error);
4665 		}
4666 		s = p;
4667 		VN_RELE(dvp);
4668 		dvp = nvp;
4669 	} while (p != NULL);
4670 
4671 	if (nvp != NULL && new != NULL)
4672 		*new = nvp;
4673 	kmem_free(tmppath, len);
4674 	return (0);
4675 }
4676 
4677 /*
4678  * NFS client failover support
4679  *
4680  * sv_free() frees the malloc'd portion of a "servinfo_t".
4681  */
4682 void
4683 sv_free(servinfo_t *svp)
4684 {
4685 	servinfo_t *next;
4686 	struct knetconfig *knconf;
4687 
4688 	while (svp != NULL) {
4689 		next = svp->sv_next;
4690 		if (svp->sv_secdata)
4691 			sec_clnt_freeinfo(svp->sv_secdata);
4692 		if (svp->sv_hostname && svp->sv_hostnamelen > 0)
4693 			kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
4694 		knconf = svp->sv_knconf;
4695 		if (knconf != NULL) {
4696 			if (knconf->knc_protofmly != NULL)
4697 				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4698 			if (knconf->knc_proto != NULL)
4699 				kmem_free(knconf->knc_proto, KNC_STRSIZE);
4700 			kmem_free(knconf, sizeof (*knconf));
4701 		}
4702 		knconf = svp->sv_origknconf;
4703 		if (knconf != NULL) {
4704 			if (knconf->knc_protofmly != NULL)
4705 				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4706 			if (knconf->knc_proto != NULL)
4707 				kmem_free(knconf->knc_proto, KNC_STRSIZE);
4708 			kmem_free(knconf, sizeof (*knconf));
4709 		}
4710 		if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0)
4711 			kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen);
4712 		mutex_destroy(&svp->sv_lock);
4713 		kmem_free(svp, sizeof (*svp));
4714 		svp = next;
4715 	}
4716 }
4717 
4718 /*
4719  * Only can return non-zero if intr != 0.
4720  */
4721 int
4722 nfs_rw_enter_sig(nfs_rwlock_t *l, krw_t rw, int intr)
4723 {
4724 
4725 	mutex_enter(&l->lock);
4726 
4727 	/*
4728 	 * If this is a nested enter, then allow it.  There
4729 	 * must be as many exits as enters through.
4730 	 */
4731 	if (l->owner == curthread) {
4732 		/* lock is held for writing by current thread */
4733 		ASSERT(rw == RW_READER || rw == RW_WRITER);
4734 		l->count--;
4735 	} else if (rw == RW_READER) {
4736 		/*
4737 		 * While there is a writer active or writers waiting,
4738 		 * then wait for them to finish up and move on.  Then,
4739 		 * increment the count to indicate that a reader is
4740 		 * active.
4741 		 */
4742 		while (l->count < 0 || l->waiters > 0) {
4743 			if (intr) {
4744 				klwp_t *lwp = ttolwp(curthread);
4745 
4746 				if (lwp != NULL)
4747 					lwp->lwp_nostop++;
4748 				if (!cv_wait_sig(&l->cv, &l->lock)) {
4749 					if (lwp != NULL)
4750 						lwp->lwp_nostop--;
4751 					mutex_exit(&l->lock);
4752 					return (EINTR);
4753 				}
4754 				if (lwp != NULL)
4755 					lwp->lwp_nostop--;
4756 			} else
4757 				cv_wait(&l->cv, &l->lock);
4758 		}
4759 		ASSERT(l->count < INT_MAX);
4760 #ifdef	DEBUG
4761 		if ((l->count % 10000) == 9999)
4762 			cmn_err(CE_WARN, "nfs_rw_enter_sig: count %d on"
4763 			    "rwlock @ %p\n", l->count, (void *)&l);
4764 #endif
4765 		l->count++;
4766 	} else {
4767 		ASSERT(rw == RW_WRITER);
4768 		/*
4769 		 * While there are readers active or a writer
4770 		 * active, then wait for all of the readers
4771 		 * to finish or for the writer to finish.
4772 		 * Then, set the owner field to curthread and
4773 		 * decrement count to indicate that a writer
4774 		 * is active.
4775 		 */
4776 		while (l->count > 0 || l->owner != NULL) {
4777 			l->waiters++;
4778 			if (intr) {
4779 				klwp_t *lwp = ttolwp(curthread);
4780 
4781 				if (lwp != NULL)
4782 					lwp->lwp_nostop++;
4783 				if (!cv_wait_sig(&l->cv, &l->lock)) {
4784 					if (lwp != NULL)
4785 						lwp->lwp_nostop--;
4786 					l->waiters--;
4787 					cv_broadcast(&l->cv);
4788 					mutex_exit(&l->lock);
4789 					return (EINTR);
4790 				}
4791 				if (lwp != NULL)
4792 					lwp->lwp_nostop--;
4793 			} else
4794 				cv_wait(&l->cv, &l->lock);
4795 			l->waiters--;
4796 		}
4797 		l->owner = curthread;
4798 		l->count--;
4799 	}
4800 
4801 	mutex_exit(&l->lock);
4802 
4803 	return (0);
4804 }
4805 
4806 /*
4807  * If the lock is available, obtain it and return non-zero.  If there is
4808  * already a conflicting lock, return 0 immediately.
4809  */
4810 
4811 int
4812 nfs_rw_tryenter(nfs_rwlock_t *l, krw_t rw)
4813 {
4814 	mutex_enter(&l->lock);
4815 
4816 	/*
4817 	 * If this is a nested enter, then allow it.  There
4818 	 * must be as many exits as enters through.
4819 	 */
4820 	if (l->owner == curthread) {
4821 		/* lock is held for writing by current thread */
4822 		ASSERT(rw == RW_READER || rw == RW_WRITER);
4823 		l->count--;
4824 	} else if (rw == RW_READER) {
4825 		/*
4826 		 * If there is a writer active or writers waiting, deny the
4827 		 * lock.  Otherwise, bump the count of readers.
4828 		 */
4829 		if (l->count < 0 || l->waiters > 0) {
4830 			mutex_exit(&l->lock);
4831 			return (0);
4832 		}
4833 		l->count++;
4834 	} else {
4835 		ASSERT(rw == RW_WRITER);
4836 		/*
4837 		 * If there are readers active or a writer active, deny the
4838 		 * lock.  Otherwise, set the owner field to curthread and
4839 		 * decrement count to indicate that a writer is active.
4840 		 */
4841 		if (l->count > 0 || l->owner != NULL) {
4842 			mutex_exit(&l->lock);
4843 			return (0);
4844 		}
4845 		l->owner = curthread;
4846 		l->count--;
4847 	}
4848 
4849 	mutex_exit(&l->lock);
4850 
4851 	return (1);
4852 }
4853 
4854 void
4855 nfs_rw_exit(nfs_rwlock_t *l)
4856 {
4857 
4858 	mutex_enter(&l->lock);
4859 	/*
4860 	 * If this is releasing a writer lock, then increment count to
4861 	 * indicate that there is one less writer active.  If this was
4862 	 * the last of possibly nested writer locks, then clear the owner
4863 	 * field as well to indicate that there is no writer active
4864 	 * and wakeup any possible waiting writers or readers.
4865 	 *
4866 	 * If releasing a reader lock, then just decrement count to
4867 	 * indicate that there is one less reader active.  If this was
4868 	 * the last active reader and there are writer(s) waiting,
4869 	 * then wake up the first.
4870 	 */
4871 	if (l->owner != NULL) {
4872 		ASSERT(l->owner == curthread);
4873 		l->count++;
4874 		if (l->count == 0) {
4875 			l->owner = NULL;
4876 			cv_broadcast(&l->cv);
4877 		}
4878 	} else {
4879 		ASSERT(l->count > 0);
4880 		l->count--;
4881 		if (l->count == 0 && l->waiters > 0)
4882 			cv_broadcast(&l->cv);
4883 	}
4884 	mutex_exit(&l->lock);
4885 }
4886 
4887 int
4888 nfs_rw_lock_held(nfs_rwlock_t *l, krw_t rw)
4889 {
4890 
4891 	if (rw == RW_READER)
4892 		return (l->count > 0);
4893 	ASSERT(rw == RW_WRITER);
4894 	return (l->count < 0);
4895 }
4896 
4897 /* ARGSUSED */
4898 void
4899 nfs_rw_init(nfs_rwlock_t *l, char *name, krw_type_t type, void *arg)
4900 {
4901 
4902 	l->count = 0;
4903 	l->waiters = 0;
4904 	l->owner = NULL;
4905 	mutex_init(&l->lock, NULL, MUTEX_DEFAULT, NULL);
4906 	cv_init(&l->cv, NULL, CV_DEFAULT, NULL);
4907 }
4908 
4909 void
4910 nfs_rw_destroy(nfs_rwlock_t *l)
4911 {
4912 
4913 	mutex_destroy(&l->lock);
4914 	cv_destroy(&l->cv);
4915 }
4916 
4917 int
4918 nfs3_rddir_compar(const void *x, const void *y)
4919 {
4920 	rddir_cache *a = (rddir_cache *)x;
4921 	rddir_cache *b = (rddir_cache *)y;
4922 
4923 	if (a->nfs3_cookie == b->nfs3_cookie) {
4924 		if (a->buflen == b->buflen)
4925 			return (0);
4926 		if (a->buflen < b->buflen)
4927 			return (-1);
4928 		return (1);
4929 	}
4930 
4931 	if (a->nfs3_cookie < b->nfs3_cookie)
4932 		return (-1);
4933 
4934 	return (1);
4935 }
4936 
4937 int
4938 nfs_rddir_compar(const void *x, const void *y)
4939 {
4940 	rddir_cache *a = (rddir_cache *)x;
4941 	rddir_cache *b = (rddir_cache *)y;
4942 
4943 	if (a->nfs_cookie == b->nfs_cookie) {
4944 		if (a->buflen == b->buflen)
4945 			return (0);
4946 		if (a->buflen < b->buflen)
4947 			return (-1);
4948 		return (1);
4949 	}
4950 
4951 	if (a->nfs_cookie < b->nfs_cookie)
4952 		return (-1);
4953 
4954 	return (1);
4955 }
4956 
4957 static char *
4958 nfs_getsrvnames(mntinfo_t *mi, size_t *len)
4959 {
4960 	servinfo_t *s;
4961 	char *srvnames;
4962 	char *namep;
4963 	size_t length;
4964 
4965 	/*
4966 	 * Calculate the length of the string required to hold all
4967 	 * of the server names plus either a comma or a null
4968 	 * character following each individual one.
4969 	 */
4970 	length = 0;
4971 	for (s = mi->mi_servers; s != NULL; s = s->sv_next)
4972 		length += s->sv_hostnamelen;
4973 
4974 	srvnames = kmem_alloc(length, KM_SLEEP);
4975 
4976 	namep = srvnames;
4977 	for (s = mi->mi_servers; s != NULL; s = s->sv_next) {
4978 		(void) strcpy(namep, s->sv_hostname);
4979 		namep += s->sv_hostnamelen - 1;
4980 		*namep++ = ',';
4981 	}
4982 	*--namep = '\0';
4983 
4984 	*len = length;
4985 
4986 	return (srvnames);
4987 }
4988 
4989 /*
4990  * These two functions are temporary and designed for the upgrade-workaround
4991  * only.  They cannot be used for general zone-crossing NFS client support, and
4992  * will be removed shortly.
4993  *
4994  * When the workaround is enabled, all NFS traffic is forced into the global
4995  * zone.  These functions are called when the code needs to refer to the state
4996  * of the underlying network connection.  They're not called when the function
4997  * needs to refer to the state of the process that invoked the system call.
4998  * (E.g., when checking whether the zone is shutting down during the mount()
4999  * call.)
5000  */
5001 
5002 struct zone *
5003 nfs_zone(void)
5004 {
5005 	return (nfs_global_client_only != 0 ? global_zone : curproc->p_zone);
5006 }
5007 
5008 zoneid_t
5009 nfs_zoneid(void)
5010 {
5011 	return (nfs_global_client_only != 0 ? GLOBAL_ZONEID : getzoneid());
5012 }
5013 
5014 /*
5015  * nfs_mount_label_policy:
5016  *	Determine whether the mount is allowed according to MAC check,
5017  *	by comparing (where appropriate) label of the remote server
5018  *	against the label of the zone being mounted into.
5019  *
5020  *	Returns:
5021  *		 0 :	access allowed
5022  *		-1 :	read-only access allowed (i.e., read-down)
5023  *		>0 :	error code, such as EACCES
5024  */
5025 int
5026 nfs_mount_label_policy(vfs_t *vfsp, struct netbuf *addr,
5027     struct knetconfig *knconf, cred_t *cr)
5028 {
5029 	int		addr_type;
5030 	void		*ipaddr;
5031 	bslabel_t	*server_sl, *mntlabel;
5032 	zone_t		*mntzone = NULL;
5033 	ts_label_t	*zlabel;
5034 	tsol_tpc_t	*tp;
5035 	ts_label_t	*tsl = NULL;
5036 	int		retv;
5037 
5038 	/*
5039 	 * Get the zone's label.  Each zone on a labeled system has a label.
5040 	 */
5041 	mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
5042 	zlabel = mntzone->zone_slabel;
5043 	ASSERT(zlabel != NULL);
5044 	label_hold(zlabel);
5045 
5046 	if (strcmp(knconf->knc_protofmly, NC_INET) == 0) {
5047 		addr_type = IPV4_VERSION;
5048 		ipaddr = &((struct sockaddr_in *)addr->buf)->sin_addr;
5049 	} else if (strcmp(knconf->knc_protofmly, NC_INET6) == 0) {
5050 		addr_type = IPV6_VERSION;
5051 		ipaddr = &((struct sockaddr_in6 *)addr->buf)->sin6_addr;
5052 	} else {
5053 		retv = 0;
5054 		goto out;
5055 	}
5056 
5057 	retv = EACCES;				/* assume the worst */
5058 
5059 	/*
5060 	 * Next, get the assigned label of the remote server.
5061 	 */
5062 	tp = find_tpc(ipaddr, addr_type, B_FALSE);
5063 	if (tp == NULL)
5064 		goto out;			/* error getting host entry */
5065 
5066 	if (tp->tpc_tp.tp_doi != zlabel->tsl_doi)
5067 		goto rel_tpc;			/* invalid domain */
5068 	if ((tp->tpc_tp.host_type != SUN_CIPSO) &&
5069 	    (tp->tpc_tp.host_type != UNLABELED))
5070 		goto rel_tpc;			/* invalid hosttype */
5071 
5072 	if (tp->tpc_tp.host_type == SUN_CIPSO) {
5073 		tsl = getflabel_cipso(vfsp);
5074 		if (tsl == NULL)
5075 			goto rel_tpc;		/* error getting server lbl */
5076 
5077 		server_sl = label2bslabel(tsl);
5078 	} else {	/* UNLABELED */
5079 		server_sl = &tp->tpc_tp.tp_def_label;
5080 	}
5081 
5082 	mntlabel = label2bslabel(zlabel);
5083 
5084 	/*
5085 	 * Now compare labels to complete the MAC check.  If the labels
5086 	 * are equal or if the requestor is in the global zone and has
5087 	 * NET_MAC_AWARE, then allow read-write access.   (Except for
5088 	 * mounts into the global zone itself; restrict these to
5089 	 * read-only.)
5090 	 *
5091 	 * If the requestor is in some other zone, but his label
5092 	 * dominates the server, then allow read-down.
5093 	 *
5094 	 * Otherwise, access is denied.
5095 	 */
5096 	if (blequal(mntlabel, server_sl) ||
5097 	    (crgetzoneid(cr) == GLOBAL_ZONEID &&
5098 	    getpflags(NET_MAC_AWARE, cr) != 0)) {
5099 		if ((mntzone == global_zone) ||
5100 		    !blequal(mntlabel, server_sl))
5101 			retv = -1;		/* read-only */
5102 		else
5103 			retv = 0;		/* access OK */
5104 	} else if (bldominates(mntlabel, server_sl)) {
5105 		retv = -1;			/* read-only */
5106 	} else {
5107 		retv = EACCES;
5108 	}
5109 
5110 	if (tsl != NULL)
5111 		label_rele(tsl);
5112 
5113 rel_tpc:
5114 	TPC_RELE(tp);
5115 out:
5116 	if (mntzone)
5117 		zone_rele(mntzone);
5118 	label_rele(zlabel);
5119 	return (retv);
5120 }
5121 
5122 boolean_t
5123 nfs_has_ctty(void)
5124 {
5125 	boolean_t rv;
5126 	mutex_enter(&curproc->p_splock);
5127 	rv = (curproc->p_sessp->s_vp != NULL);
5128 	mutex_exit(&curproc->p_splock);
5129 	return (rv);
5130 }
5131