xref: /titanic_51/usr/src/uts/common/fs/nfs/nfs_subr.c (revision 87c5f7b3eef6309c168257f261ac6ace4581d234)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/param.h>
29 #include <sys/types.h>
30 #include <sys/systm.h>
31 #include <sys/cred_impl.h>
32 #include <sys/proc.h>
33 #include <sys/user.h>
34 #include <sys/time.h>
35 #include <sys/buf.h>
36 #include <sys/vfs.h>
37 #include <sys/vnode.h>
38 #include <sys/socket.h>
39 #include <sys/uio.h>
40 #include <sys/tiuser.h>
41 #include <sys/swap.h>
42 #include <sys/errno.h>
43 #include <sys/debug.h>
44 #include <sys/kmem.h>
45 #include <sys/kstat.h>
46 #include <sys/cmn_err.h>
47 #include <sys/vtrace.h>
48 #include <sys/session.h>
49 #include <sys/dnlc.h>
50 #include <sys/bitmap.h>
51 #include <sys/acl.h>
52 #include <sys/ddi.h>
53 #include <sys/pathname.h>
54 #include <sys/flock.h>
55 #include <sys/dirent.h>
56 #include <sys/flock.h>
57 #include <sys/callb.h>
58 #include <sys/atomic.h>
59 #include <sys/list.h>
60 #include <sys/tsol/tnet.h>
61 #include <sys/priv.h>
62 #include <sys/sdt.h>
63 #include <sys/attr.h>
64 
65 #include <inet/ip6.h>
66 
67 #include <rpc/types.h>
68 #include <rpc/xdr.h>
69 #include <rpc/auth.h>
70 #include <rpc/clnt.h>
71 
72 #include <nfs/nfs.h>
73 #include <nfs/nfs4.h>
74 #include <nfs/nfs_clnt.h>
75 #include <nfs/rnode.h>
76 #include <nfs/nfs_acl.h>
77 
78 #include <sys/tsol/label.h>
79 
80 /*
81  * The hash queues for the access to active and cached rnodes
82  * are organized as doubly linked lists.  A reader/writer lock
83  * for each hash bucket is used to control access and to synchronize
84  * lookups, additions, and deletions from the hash queue.
85  *
86  * The rnode freelist is organized as a doubly linked list with
87  * a head pointer.  Additions and deletions are synchronized via
88  * a single mutex.
89  *
90  * In order to add an rnode to the free list, it must be hashed into
91  * a hash queue and the exclusive lock to the hash queue be held.
92  * If an rnode is not hashed into a hash queue, then it is destroyed
93  * because it represents no valuable information that can be reused
94  * about the file.  The exclusive lock to the hash queue must be
95  * held in order to prevent a lookup in the hash queue from finding
96  * the rnode and using it and assuming that the rnode is not on the
97  * freelist.  The lookup in the hash queue will have the hash queue
98  * locked, either exclusive or shared.
99  *
100  * The vnode reference count for each rnode is not allowed to drop
101  * below 1.  This prevents external entities, such as the VM
102  * subsystem, from acquiring references to vnodes already on the
103  * freelist and then trying to place them back on the freelist
104  * when their reference is released.  This means that the when an
105  * rnode is looked up in the hash queues, then either the rnode
106  * is removed from the freelist and that reference is transferred to
107  * the new reference or the vnode reference count must be incremented
108  * accordingly.  The mutex for the freelist must be held in order to
109  * accurately test to see if the rnode is on the freelist or not.
110  * The hash queue lock might be held shared and it is possible that
111  * two different threads may race to remove the rnode from the
112  * freelist.  This race can be resolved by holding the mutex for the
113  * freelist.  Please note that the mutex for the freelist does not
114  * need to held if the rnode is not on the freelist.  It can not be
115  * placed on the freelist due to the requirement that the thread
116  * putting the rnode on the freelist must hold the exclusive lock
117  * to the hash queue and the thread doing the lookup in the hash
118  * queue is holding either a shared or exclusive lock to the hash
119  * queue.
120  *
121  * The lock ordering is:
122  *
123  *	hash bucket lock -> vnode lock
124  *	hash bucket lock -> freelist lock
125  */
126 static rhashq_t *rtable;
127 
128 static kmutex_t rpfreelist_lock;
129 static rnode_t *rpfreelist = NULL;
130 static long rnew = 0;
131 long nrnode = 0;
132 
133 static int rtablesize;
134 static int rtablemask;
135 
136 static int hashlen = 4;
137 
138 static struct kmem_cache *rnode_cache;
139 
140 /*
141  * Mutex to protect the following variables:
142  *	nfs_major
143  *	nfs_minor
144  */
145 kmutex_t nfs_minor_lock;
146 int nfs_major;
147 int nfs_minor;
148 
149 /* Do we allow preepoch (negative) time values otw? */
150 bool_t nfs_allow_preepoch_time = FALSE;	/* default: do not allow preepoch */
151 
152 /*
153  * Access cache
154  */
155 static acache_hash_t *acache;
156 static long nacache;	/* used strictly to size the number of hash queues */
157 
158 static int acachesize;
159 static int acachemask;
160 static struct kmem_cache *acache_cache;
161 
162 /*
163  * Client side utilities
164  */
165 
166 /*
167  * client side statistics
168  */
169 static const struct clstat clstat_tmpl = {
170 	{ "calls",	KSTAT_DATA_UINT64 },
171 	{ "badcalls",	KSTAT_DATA_UINT64 },
172 	{ "clgets",	KSTAT_DATA_UINT64 },
173 	{ "cltoomany",	KSTAT_DATA_UINT64 },
174 #ifdef DEBUG
175 	{ "clalloc",	KSTAT_DATA_UINT64 },
176 	{ "noresponse",	KSTAT_DATA_UINT64 },
177 	{ "failover",	KSTAT_DATA_UINT64 },
178 	{ "remap",	KSTAT_DATA_UINT64 },
179 #endif
180 };
181 
182 /*
183  * The following are statistics that describe behavior of the system as a whole
184  * and doesn't correspond to any one particular zone.
185  */
186 #ifdef DEBUG
187 static struct clstat_debug {
188 	kstat_named_t	nrnode;			/* number of allocated rnodes */
189 	kstat_named_t	access;			/* size of access cache */
190 	kstat_named_t	dirent;			/* size of readdir cache */
191 	kstat_named_t	dirents;		/* size of readdir buf cache */
192 	kstat_named_t	reclaim;		/* number of reclaims */
193 	kstat_named_t	clreclaim;		/* number of cl reclaims */
194 	kstat_named_t	f_reclaim;		/* number of free reclaims */
195 	kstat_named_t	a_reclaim;		/* number of active reclaims */
196 	kstat_named_t	r_reclaim;		/* number of rnode reclaims */
197 	kstat_named_t	rpath;			/* bytes used to store rpaths */
198 } clstat_debug = {
199 	{ "nrnode",	KSTAT_DATA_UINT64 },
200 	{ "access",	KSTAT_DATA_UINT64 },
201 	{ "dirent",	KSTAT_DATA_UINT64 },
202 	{ "dirents",	KSTAT_DATA_UINT64 },
203 	{ "reclaim",	KSTAT_DATA_UINT64 },
204 	{ "clreclaim",	KSTAT_DATA_UINT64 },
205 	{ "f_reclaim",	KSTAT_DATA_UINT64 },
206 	{ "a_reclaim",	KSTAT_DATA_UINT64 },
207 	{ "r_reclaim",	KSTAT_DATA_UINT64 },
208 	{ "r_path",	KSTAT_DATA_UINT64 },
209 };
210 #endif	/* DEBUG */
211 
212 /*
213  * We keep a global list of per-zone client data, so we can clean up all zones
214  * if we get low on memory.
215  */
216 static list_t nfs_clnt_list;
217 static kmutex_t nfs_clnt_list_lock;
218 static zone_key_t nfsclnt_zone_key;
219 
220 static struct kmem_cache *chtab_cache;
221 
222 /*
223  * Some servers do not properly update the attributes of the
224  * directory when changes are made.  To allow interoperability
225  * with these broken servers, the nfs_disable_rddir_cache
226  * parameter must be set in /etc/system
227  */
228 int nfs_disable_rddir_cache = 0;
229 
230 int		clget(clinfo_t *, servinfo_t *, cred_t *, CLIENT **,
231 		    struct chtab **);
232 void		clfree(CLIENT *, struct chtab *);
233 static int	acl_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
234 		    struct chtab **, struct nfs_clnt *);
235 static int	nfs_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
236 		    struct chtab **, struct nfs_clnt *);
237 static void	clreclaim(void *);
238 static int	nfs_feedback(int, int, mntinfo_t *);
239 static int	rfscall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
240 		    caddr_t, cred_t *, int *, enum clnt_stat *, int,
241 		    failinfo_t *);
242 static int	aclcall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
243 		    caddr_t, cred_t *, int *, int, failinfo_t *);
244 static void	rinactive(rnode_t *, cred_t *);
245 static int	rtablehash(nfs_fhandle *);
246 static vnode_t	*make_rnode(nfs_fhandle *, rhashq_t *, struct vfs *,
247 		    struct vnodeops *,
248 		    int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
249 			cred_t *),
250 		    int (*)(const void *, const void *), int *, cred_t *,
251 		    char *, char *);
252 static void	rp_rmfree(rnode_t *);
253 static void	rp_addhash(rnode_t *);
254 static void	rp_rmhash_locked(rnode_t *);
255 static rnode_t	*rfind(rhashq_t *, nfs_fhandle *, struct vfs *);
256 static void	destroy_rnode(rnode_t *);
257 static void	rddir_cache_free(rddir_cache *);
258 static int	nfs_free_data_reclaim(rnode_t *);
259 static int	nfs_active_data_reclaim(rnode_t *);
260 static int	nfs_free_reclaim(void);
261 static int	nfs_active_reclaim(void);
262 static int	nfs_rnode_reclaim(void);
263 static void	nfs_reclaim(void *);
264 static int	failover_safe(failinfo_t *);
265 static void	failover_newserver(mntinfo_t *mi);
266 static void	failover_thread(mntinfo_t *mi);
267 static int	failover_wait(mntinfo_t *);
268 static int	failover_remap(failinfo_t *);
269 static int	failover_lookup(char *, vnode_t *,
270 		    int (*)(vnode_t *, char *, vnode_t **,
271 			struct pathname *, int, vnode_t *, cred_t *, int),
272 		    int (*)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
273 		    vnode_t **);
274 static void	nfs_free_r_path(rnode_t *);
275 static void	nfs_set_vroot(vnode_t *);
276 static char	*nfs_getsrvnames(mntinfo_t *, size_t *);
277 
278 /*
279  * from rpcsec module (common/rpcsec)
280  */
281 extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **);
282 extern void sec_clnt_freeh(AUTH *);
283 extern void sec_clnt_freeinfo(struct sec_data *);
284 
285 /*
286  * used in mount policy
287  */
288 extern ts_label_t *getflabel_cipso(vfs_t *);
289 
290 /*
291  * EIO or EINTR are not recoverable errors.
292  */
293 #define	IS_RECOVERABLE_ERROR(error)	!((error == EINTR) || (error == EIO))
294 
295 /*
296  * Common handle get program for NFS, NFS ACL, and NFS AUTH client.
297  */
298 static int
299 clget_impl(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
300     struct chtab **chp, struct nfs_clnt *nfscl)
301 {
302 	struct chhead *ch, *newch;
303 	struct chhead **plistp;
304 	struct chtab *cp;
305 	int error;
306 	k_sigset_t smask;
307 
308 	if (newcl == NULL || chp == NULL || ci == NULL)
309 		return (EINVAL);
310 
311 	*newcl = NULL;
312 	*chp = NULL;
313 
314 	/*
315 	 * Find an unused handle or create one
316 	 */
317 	newch = NULL;
318 	nfscl->nfscl_stat.clgets.value.ui64++;
319 top:
320 	/*
321 	 * Find the correct entry in the cache to check for free
322 	 * client handles.  The search is based on the RPC program
323 	 * number, program version number, dev_t for the transport
324 	 * device, and the protocol family.
325 	 */
326 	mutex_enter(&nfscl->nfscl_chtable_lock);
327 	plistp = &nfscl->nfscl_chtable;
328 	for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
329 		if (ch->ch_prog == ci->cl_prog &&
330 		    ch->ch_vers == ci->cl_vers &&
331 		    ch->ch_dev == svp->sv_knconf->knc_rdev &&
332 		    (strcmp(ch->ch_protofmly,
333 		    svp->sv_knconf->knc_protofmly) == 0))
334 			break;
335 		plistp = &ch->ch_next;
336 	}
337 
338 	/*
339 	 * If we didn't find a cache entry for this quadruple, then
340 	 * create one.  If we don't have one already preallocated,
341 	 * then drop the cache lock, create one, and then start over.
342 	 * If we did have a preallocated entry, then just add it to
343 	 * the front of the list.
344 	 */
345 	if (ch == NULL) {
346 		if (newch == NULL) {
347 			mutex_exit(&nfscl->nfscl_chtable_lock);
348 			newch = kmem_alloc(sizeof (*newch), KM_SLEEP);
349 			newch->ch_timesused = 0;
350 			newch->ch_prog = ci->cl_prog;
351 			newch->ch_vers = ci->cl_vers;
352 			newch->ch_dev = svp->sv_knconf->knc_rdev;
353 			newch->ch_protofmly = kmem_alloc(
354 			    strlen(svp->sv_knconf->knc_protofmly) + 1,
355 			    KM_SLEEP);
356 			(void) strcpy(newch->ch_protofmly,
357 			    svp->sv_knconf->knc_protofmly);
358 			newch->ch_list = NULL;
359 			goto top;
360 		}
361 		ch = newch;
362 		newch = NULL;
363 		ch->ch_next = nfscl->nfscl_chtable;
364 		nfscl->nfscl_chtable = ch;
365 	/*
366 	 * We found a cache entry, but if it isn't on the front of the
367 	 * list, then move it to the front of the list to try to take
368 	 * advantage of locality of operations.
369 	 */
370 	} else if (ch != nfscl->nfscl_chtable) {
371 		*plistp = ch->ch_next;
372 		ch->ch_next = nfscl->nfscl_chtable;
373 		nfscl->nfscl_chtable = ch;
374 	}
375 
376 	/*
377 	 * If there was a free client handle cached, then remove it
378 	 * from the list, init it, and use it.
379 	 */
380 	if (ch->ch_list != NULL) {
381 		cp = ch->ch_list;
382 		ch->ch_list = cp->ch_list;
383 		mutex_exit(&nfscl->nfscl_chtable_lock);
384 		if (newch != NULL) {
385 			kmem_free(newch->ch_protofmly,
386 			    strlen(newch->ch_protofmly) + 1);
387 			kmem_free(newch, sizeof (*newch));
388 		}
389 		(void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf,
390 		    &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr);
391 		error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
392 		    &cp->ch_client->cl_auth);
393 		if (error || cp->ch_client->cl_auth == NULL) {
394 			CLNT_DESTROY(cp->ch_client);
395 			kmem_cache_free(chtab_cache, cp);
396 			return ((error != 0) ? error : EINTR);
397 		}
398 		ch->ch_timesused++;
399 		*newcl = cp->ch_client;
400 		*chp = cp;
401 		return (0);
402 	}
403 
404 	/*
405 	 * There weren't any free client handles which fit, so allocate
406 	 * a new one and use that.
407 	 */
408 #ifdef DEBUG
409 	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, 1);
410 #endif
411 	mutex_exit(&nfscl->nfscl_chtable_lock);
412 
413 	nfscl->nfscl_stat.cltoomany.value.ui64++;
414 	if (newch != NULL) {
415 		kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1);
416 		kmem_free(newch, sizeof (*newch));
417 	}
418 
419 	cp = kmem_cache_alloc(chtab_cache, KM_SLEEP);
420 	cp->ch_head = ch;
421 
422 	sigintr(&smask, (int)ci->cl_flags & MI_INT);
423 	error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog,
424 	    ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client);
425 	sigunintr(&smask);
426 
427 	if (error != 0) {
428 		kmem_cache_free(chtab_cache, cp);
429 #ifdef DEBUG
430 		atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
431 #endif
432 		/*
433 		 * Warning is unnecessary if error is EINTR.
434 		 */
435 		if (error != EINTR) {
436 			nfs_cmn_err(error, CE_WARN,
437 			    "clget: couldn't create handle: %m\n");
438 		}
439 		return (error);
440 	}
441 	(void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL);
442 	auth_destroy(cp->ch_client->cl_auth);
443 	error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
444 	    &cp->ch_client->cl_auth);
445 	if (error || cp->ch_client->cl_auth == NULL) {
446 		CLNT_DESTROY(cp->ch_client);
447 		kmem_cache_free(chtab_cache, cp);
448 #ifdef DEBUG
449 		atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
450 #endif
451 		return ((error != 0) ? error : EINTR);
452 	}
453 	ch->ch_timesused++;
454 	*newcl = cp->ch_client;
455 	ASSERT(cp->ch_client->cl_nosignal == FALSE);
456 	*chp = cp;
457 	return (0);
458 }
459 
460 int
461 clget(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
462     struct chtab **chp)
463 {
464 	struct nfs_clnt *nfscl;
465 
466 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
467 	ASSERT(nfscl != NULL);
468 
469 	return (clget_impl(ci, svp, cr, newcl, chp, nfscl));
470 }
471 
472 static int
473 acl_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
474     struct chtab **chp, struct nfs_clnt *nfscl)
475 {
476 	clinfo_t ci;
477 	int error;
478 
479 	/*
480 	 * Set read buffer size to rsize
481 	 * and add room for RPC headers.
482 	 */
483 	ci.cl_readsize = mi->mi_tsize;
484 	if (ci.cl_readsize != 0)
485 		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
486 
487 	/*
488 	 * If soft mount and server is down just try once.
489 	 * meaning: do not retransmit.
490 	 */
491 	if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
492 		ci.cl_retrans = 0;
493 	else
494 		ci.cl_retrans = mi->mi_retrans;
495 
496 	ci.cl_prog = NFS_ACL_PROGRAM;
497 	ci.cl_vers = mi->mi_vers;
498 	ci.cl_flags = mi->mi_flags;
499 
500 	/*
501 	 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
502 	 * security flavor, the client tries to establish a security context
503 	 * by contacting the server. If the connection is timed out or reset,
504 	 * e.g. server reboot, we will try again.
505 	 */
506 	do {
507 		error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
508 
509 		if (error == 0)
510 			break;
511 
512 		/*
513 		 * For forced unmount or zone shutdown, bail out, no retry.
514 		 */
515 		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
516 			error = EIO;
517 			break;
518 		}
519 
520 		/* do not retry for softmount */
521 		if (!(mi->mi_flags & MI_HARD))
522 			break;
523 
524 		/* let the caller deal with the failover case */
525 		if (FAILOVER_MOUNT(mi))
526 			break;
527 
528 	} while (error == ETIMEDOUT || error == ECONNRESET);
529 
530 	return (error);
531 }
532 
533 static int
534 nfs_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
535     struct chtab **chp, struct nfs_clnt *nfscl)
536 {
537 	clinfo_t ci;
538 	int error;
539 
540 	/*
541 	 * Set read buffer size to rsize
542 	 * and add room for RPC headers.
543 	 */
544 	ci.cl_readsize = mi->mi_tsize;
545 	if (ci.cl_readsize != 0)
546 		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
547 
548 	/*
549 	 * If soft mount and server is down just try once.
550 	 * meaning: do not retransmit.
551 	 */
552 	if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
553 		ci.cl_retrans = 0;
554 	else
555 		ci.cl_retrans = mi->mi_retrans;
556 
557 	ci.cl_prog = mi->mi_prog;
558 	ci.cl_vers = mi->mi_vers;
559 	ci.cl_flags = mi->mi_flags;
560 
561 	/*
562 	 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
563 	 * security flavor, the client tries to establish a security context
564 	 * by contacting the server. If the connection is timed out or reset,
565 	 * e.g. server reboot, we will try again.
566 	 */
567 	do {
568 		error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
569 
570 		if (error == 0)
571 			break;
572 
573 		/*
574 		 * For forced unmount or zone shutdown, bail out, no retry.
575 		 */
576 		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
577 			error = EIO;
578 			break;
579 		}
580 
581 		/* do not retry for softmount */
582 		if (!(mi->mi_flags & MI_HARD))
583 			break;
584 
585 		/* let the caller deal with the failover case */
586 		if (FAILOVER_MOUNT(mi))
587 			break;
588 
589 	} while (error == ETIMEDOUT || error == ECONNRESET);
590 
591 	return (error);
592 }
593 
594 static void
595 clfree_impl(CLIENT *cl, struct chtab *cp, struct nfs_clnt *nfscl)
596 {
597 	if (cl->cl_auth != NULL) {
598 		sec_clnt_freeh(cl->cl_auth);
599 		cl->cl_auth = NULL;
600 	}
601 
602 	/*
603 	 * Timestamp this cache entry so that we know when it was last
604 	 * used.
605 	 */
606 	cp->ch_freed = gethrestime_sec();
607 
608 	/*
609 	 * Add the free client handle to the front of the list.
610 	 * This way, the list will be sorted in youngest to oldest
611 	 * order.
612 	 */
613 	mutex_enter(&nfscl->nfscl_chtable_lock);
614 	cp->ch_list = cp->ch_head->ch_list;
615 	cp->ch_head->ch_list = cp;
616 	mutex_exit(&nfscl->nfscl_chtable_lock);
617 }
618 
619 void
620 clfree(CLIENT *cl, struct chtab *cp)
621 {
622 	struct nfs_clnt *nfscl;
623 
624 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
625 	ASSERT(nfscl != NULL);
626 
627 	clfree_impl(cl, cp, nfscl);
628 }
629 
630 #define	CL_HOLDTIME	60	/* time to hold client handles */
631 
632 static void
633 clreclaim_zone(struct nfs_clnt *nfscl, uint_t cl_holdtime)
634 {
635 	struct chhead *ch;
636 	struct chtab *cp;	/* list of objects that can be reclaimed */
637 	struct chtab *cpe;
638 	struct chtab *cpl;
639 	struct chtab **cpp;
640 #ifdef DEBUG
641 	int n = 0;
642 #endif
643 
644 	/*
645 	 * Need to reclaim some memory, so step through the cache
646 	 * looking through the lists for entries which can be freed.
647 	 */
648 	cp = NULL;
649 
650 	mutex_enter(&nfscl->nfscl_chtable_lock);
651 
652 	/*
653 	 * Here we step through each non-NULL quadruple and start to
654 	 * construct the reclaim list pointed to by cp.  Note that
655 	 * cp will contain all eligible chtab entries.  When this traversal
656 	 * completes, chtab entries from the last quadruple will be at the
657 	 * front of cp and entries from previously inspected quadruples have
658 	 * been appended to the rear of cp.
659 	 */
660 	for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
661 		if (ch->ch_list == NULL)
662 			continue;
663 		/*
664 		 * Search each list for entries older then
665 		 * cl_holdtime seconds.  The lists are maintained
666 		 * in youngest to oldest order so that when the
667 		 * first entry is found which is old enough, then
668 		 * all of the rest of the entries on the list will
669 		 * be old enough as well.
670 		 */
671 		cpl = ch->ch_list;
672 		cpp = &ch->ch_list;
673 		while (cpl != NULL &&
674 		    cpl->ch_freed + cl_holdtime > gethrestime_sec()) {
675 			cpp = &cpl->ch_list;
676 			cpl = cpl->ch_list;
677 		}
678 		if (cpl != NULL) {
679 			*cpp = NULL;
680 			if (cp != NULL) {
681 				cpe = cpl;
682 				while (cpe->ch_list != NULL)
683 					cpe = cpe->ch_list;
684 				cpe->ch_list = cp;
685 			}
686 			cp = cpl;
687 		}
688 	}
689 
690 	mutex_exit(&nfscl->nfscl_chtable_lock);
691 
692 	/*
693 	 * If cp is empty, then there is nothing to reclaim here.
694 	 */
695 	if (cp == NULL)
696 		return;
697 
698 	/*
699 	 * Step through the list of entries to free, destroying each client
700 	 * handle and kmem_free'ing the memory for each entry.
701 	 */
702 	while (cp != NULL) {
703 #ifdef DEBUG
704 		n++;
705 #endif
706 		CLNT_DESTROY(cp->ch_client);
707 		cpl = cp->ch_list;
708 		kmem_cache_free(chtab_cache, cp);
709 		cp = cpl;
710 	}
711 
712 #ifdef DEBUG
713 	/*
714 	 * Update clalloc so that nfsstat shows the current number
715 	 * of allocated client handles.
716 	 */
717 	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n);
718 #endif
719 }
720 
721 /* ARGSUSED */
722 static void
723 clreclaim(void *all)
724 {
725 	struct nfs_clnt *nfscl;
726 
727 #ifdef DEBUG
728 	clstat_debug.clreclaim.value.ui64++;
729 #endif
730 	/*
731 	 * The system is low on memory; go through and try to reclaim some from
732 	 * every zone on the system.
733 	 */
734 	mutex_enter(&nfs_clnt_list_lock);
735 	nfscl = list_head(&nfs_clnt_list);
736 	for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl))
737 		clreclaim_zone(nfscl, CL_HOLDTIME);
738 	mutex_exit(&nfs_clnt_list_lock);
739 }
740 
741 /*
742  * Minimum time-out values indexed by call type
743  * These units are in "eights" of a second to avoid multiplies
744  */
745 static unsigned int minimum_timeo[] = {
746 	6, 7, 10
747 };
748 
749 /*
750  * Back off for retransmission timeout, MAXTIMO is in hz of a sec
751  */
752 #define	MAXTIMO	(20*hz)
753 #define	backoff(tim)	(((tim) < MAXTIMO) ? dobackoff(tim) : (tim))
754 #define	dobackoff(tim)	((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))
755 
756 #define	MIN_NFS_TSIZE 512	/* minimum "chunk" of NFS IO */
757 #define	REDUCE_NFS_TIME (hz/2)	/* rtxcur we try to keep under */
758 #define	INCREASE_NFS_TIME (hz/3*8) /* srtt we try to keep under (scaled*8) */
759 
760 /*
761  * Function called when rfscall notices that we have been
762  * re-transmitting, or when we get a response without retransmissions.
763  * Return 1 if the transfer size was adjusted down - 0 if no change.
764  */
765 static int
766 nfs_feedback(int flag, int which, mntinfo_t *mi)
767 {
768 	int kind;
769 	int r = 0;
770 
771 	mutex_enter(&mi->mi_lock);
772 	if (flag == FEEDBACK_REXMIT1) {
773 		if (mi->mi_timers[NFS_CALLTYPES].rt_rtxcur != 0 &&
774 		    mi->mi_timers[NFS_CALLTYPES].rt_rtxcur < REDUCE_NFS_TIME)
775 			goto done;
776 		if (mi->mi_curread > MIN_NFS_TSIZE) {
777 			mi->mi_curread /= 2;
778 			if (mi->mi_curread < MIN_NFS_TSIZE)
779 				mi->mi_curread = MIN_NFS_TSIZE;
780 			r = 1;
781 		}
782 
783 		if (mi->mi_curwrite > MIN_NFS_TSIZE) {
784 			mi->mi_curwrite /= 2;
785 			if (mi->mi_curwrite < MIN_NFS_TSIZE)
786 				mi->mi_curwrite = MIN_NFS_TSIZE;
787 			r = 1;
788 		}
789 	} else if (flag == FEEDBACK_OK) {
790 		kind = mi->mi_timer_type[which];
791 		if (kind == 0 ||
792 		    mi->mi_timers[kind].rt_srtt >= INCREASE_NFS_TIME)
793 			goto done;
794 		if (kind == 1) {
795 			if (mi->mi_curread >= mi->mi_tsize)
796 				goto done;
797 			mi->mi_curread +=  MIN_NFS_TSIZE;
798 			if (mi->mi_curread > mi->mi_tsize/2)
799 				mi->mi_curread = mi->mi_tsize;
800 		} else if (kind == 2) {
801 			if (mi->mi_curwrite >= mi->mi_stsize)
802 				goto done;
803 			mi->mi_curwrite += MIN_NFS_TSIZE;
804 			if (mi->mi_curwrite > mi->mi_stsize/2)
805 				mi->mi_curwrite = mi->mi_stsize;
806 		}
807 	}
808 done:
809 	mutex_exit(&mi->mi_lock);
810 	return (r);
811 }
812 
813 #ifdef DEBUG
814 static int rfs2call_hits = 0;
815 static int rfs2call_misses = 0;
816 #endif
817 
818 int
819 rfs2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
820     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
821     enum nfsstat *statusp, int flags, failinfo_t *fi)
822 {
823 	int rpcerror;
824 	enum clnt_stat rpc_status;
825 
826 	ASSERT(statusp != NULL);
827 
828 	rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
829 	    cr, douprintf, &rpc_status, flags, fi);
830 	if (!rpcerror) {
831 		/*
832 		 * See crnetadjust() for comments.
833 		 */
834 		if (*statusp == NFSERR_ACCES &&
835 		    (cr = crnetadjust(cr)) != NULL) {
836 #ifdef DEBUG
837 			rfs2call_hits++;
838 #endif
839 			rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres,
840 			    resp, cr, douprintf, NULL, flags, fi);
841 			crfree(cr);
842 #ifdef DEBUG
843 			if (*statusp == NFSERR_ACCES)
844 				rfs2call_misses++;
845 #endif
846 		}
847 	} else if (rpc_status == RPC_PROCUNAVAIL) {
848 		*statusp = NFSERR_OPNOTSUPP;
849 		rpcerror = 0;
850 	}
851 
852 	return (rpcerror);
853 }
854 
855 #define	NFS3_JUKEBOX_DELAY	10 * hz
856 
857 static clock_t nfs3_jukebox_delay = 0;
858 
859 #ifdef DEBUG
860 static int rfs3call_hits = 0;
861 static int rfs3call_misses = 0;
862 #endif
863 
864 int
865 rfs3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
866     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
867     nfsstat3 *statusp, int flags, failinfo_t *fi)
868 {
869 	int rpcerror;
870 	int user_informed;
871 
872 	user_informed = 0;
873 	do {
874 		rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
875 		    cr, douprintf, NULL, flags, fi);
876 		if (!rpcerror) {
877 			cred_t *crr;
878 			if (*statusp == NFS3ERR_JUKEBOX) {
879 				if (ttoproc(curthread) == &p0) {
880 					rpcerror = EAGAIN;
881 					break;
882 				}
883 				if (!user_informed) {
884 					user_informed = 1;
885 					uprintf(
886 		"file temporarily unavailable on the server, retrying...\n");
887 				}
888 				delay(nfs3_jukebox_delay);
889 			}
890 			/*
891 			 * See crnetadjust() for comments.
892 			 */
893 			else if (*statusp == NFS3ERR_ACCES &&
894 			    (crr = crnetadjust(cr)) != NULL) {
895 #ifdef DEBUG
896 				rfs3call_hits++;
897 #endif
898 				rpcerror = rfscall(mi, which, xdrargs, argsp,
899 				    xdrres, resp, crr, douprintf,
900 				    NULL, flags, fi);
901 
902 				crfree(crr);
903 #ifdef DEBUG
904 				if (*statusp == NFS3ERR_ACCES)
905 					rfs3call_misses++;
906 #endif
907 			}
908 		}
909 	} while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
910 
911 	return (rpcerror);
912 }
913 
914 #define	VALID_FH(fi)	(VTOR(fi->vp)->r_server == VTOMI(fi->vp)->mi_curr_serv)
915 #define	INC_READERS(mi)		{ \
916 	mi->mi_readers++; \
917 }
918 #define	DEC_READERS(mi)		{ \
919 	mi->mi_readers--; \
920 	if (mi->mi_readers == 0) \
921 		cv_broadcast(&mi->mi_failover_cv); \
922 }
923 
924 static int
925 rfscall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
926     xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
927     enum clnt_stat *rpc_status, int flags, failinfo_t *fi)
928 {
929 	CLIENT *client;
930 	struct chtab *ch;
931 	cred_t *cr = icr;
932 	enum clnt_stat status;
933 	struct rpc_err rpcerr;
934 	struct timeval wait;
935 	int timeo;		/* in units of hz */
936 	int my_rsize, my_wsize;
937 	bool_t tryagain;
938 	bool_t cred_cloned = FALSE;
939 	k_sigset_t smask;
940 	servinfo_t *svp;
941 	struct nfs_clnt *nfscl;
942 	zoneid_t zoneid = getzoneid();
943 #ifdef DEBUG
944 	char *bufp;
945 #endif
946 
947 
948 	TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
949 	    "rfscall_start:which %d mi %p", which, mi);
950 
951 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
952 	ASSERT(nfscl != NULL);
953 
954 	nfscl->nfscl_stat.calls.value.ui64++;
955 	mi->mi_reqs[which].value.ui64++;
956 
957 	rpcerr.re_status = RPC_SUCCESS;
958 
959 	/*
960 	 * In case of forced unmount or zone shutdown, return EIO.
961 	 */
962 
963 	if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
964 		rpcerr.re_status = RPC_FAILED;
965 		rpcerr.re_errno = EIO;
966 		return (rpcerr.re_errno);
967 	}
968 
969 	/*
970 	 * Remember the transfer sizes in case
971 	 * nfs_feedback changes them underneath us.
972 	 */
973 	my_rsize = mi->mi_curread;
974 	my_wsize = mi->mi_curwrite;
975 
976 	/*
977 	 * NFS client failover support
978 	 *
979 	 * If this rnode is not in sync with the current server (VALID_FH),
980 	 * we'd like to do a remap to get in sync.  We can be interrupted
981 	 * in failover_remap(), and if so we'll bail.  Otherwise, we'll
982 	 * use the best info we have to try the RPC.  Part of that is
983 	 * unconditionally updating the filehandle copy kept for V3.
984 	 *
985 	 * Locking: INC_READERS/DEC_READERS is a poor man's interrruptible
986 	 * rw_enter(); we're trying to keep the current server from being
987 	 * changed on us until we're done with the remapping and have a
988 	 * matching client handle.  We don't want to sending a filehandle
989 	 * to the wrong host.
990 	 */
991 failoverretry:
992 	if (FAILOVER_MOUNT(mi)) {
993 		mutex_enter(&mi->mi_lock);
994 		if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
995 			if (failover_wait(mi)) {
996 				mutex_exit(&mi->mi_lock);
997 				return (EINTR);
998 			}
999 		}
1000 		INC_READERS(mi);
1001 		mutex_exit(&mi->mi_lock);
1002 		if (fi) {
1003 			if (!VALID_FH(fi) &&
1004 			    !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1005 				int remaperr;
1006 
1007 				svp = mi->mi_curr_serv;
1008 				remaperr = failover_remap(fi);
1009 				if (remaperr != 0) {
1010 #ifdef DEBUG
1011 					if (remaperr != EINTR)
1012 						nfs_cmn_err(remaperr, CE_WARN,
1013 					    "rfscall couldn't failover: %m");
1014 #endif
1015 					mutex_enter(&mi->mi_lock);
1016 					DEC_READERS(mi);
1017 					mutex_exit(&mi->mi_lock);
1018 					/*
1019 					 * If failover_remap returns ETIMEDOUT
1020 					 * and the filesystem is hard mounted
1021 					 * we have to retry the call with a new
1022 					 * server.
1023 					 */
1024 					if ((mi->mi_flags & MI_HARD) &&
1025 					    IS_RECOVERABLE_ERROR(remaperr)) {
1026 						if (svp == mi->mi_curr_serv)
1027 							failover_newserver(mi);
1028 						rpcerr.re_status = RPC_SUCCESS;
1029 						goto failoverretry;
1030 					}
1031 					rpcerr.re_errno = remaperr;
1032 					return (remaperr);
1033 				}
1034 			}
1035 			if (fi->fhp && fi->copyproc)
1036 				(*fi->copyproc)(fi->fhp, fi->vp);
1037 		}
1038 	}
1039 
1040 	/* For TSOL, use a new cred which has net_mac_aware flag */
1041 	if (!cred_cloned && is_system_labeled()) {
1042 		cred_cloned = TRUE;
1043 		cr = crdup(icr);
1044 		(void) setpflags(NET_MAC_AWARE, 1, cr);
1045 	}
1046 
1047 	/*
1048 	 * clget() calls clnt_tli_kinit() which clears the xid, so we
1049 	 * are guaranteed to reprocess the retry as a new request.
1050 	 */
1051 	svp = mi->mi_curr_serv;
1052 	rpcerr.re_errno = nfs_clget(mi, svp, cr, &client, &ch, nfscl);
1053 
1054 	if (FAILOVER_MOUNT(mi)) {
1055 		mutex_enter(&mi->mi_lock);
1056 		DEC_READERS(mi);
1057 		mutex_exit(&mi->mi_lock);
1058 
1059 		if ((rpcerr.re_errno == ETIMEDOUT ||
1060 		    rpcerr.re_errno == ECONNRESET) &&
1061 		    failover_safe(fi)) {
1062 			if (svp == mi->mi_curr_serv)
1063 				failover_newserver(mi);
1064 			goto failoverretry;
1065 		}
1066 	}
1067 	if (rpcerr.re_errno != 0)
1068 		return (rpcerr.re_errno);
1069 
1070 	if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1071 	    svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1072 		timeo = (mi->mi_timeo * hz) / 10;
1073 	} else {
1074 		mutex_enter(&mi->mi_lock);
1075 		timeo = CLNT_SETTIMERS(client,
1076 		    &(mi->mi_timers[mi->mi_timer_type[which]]),
1077 		    &(mi->mi_timers[NFS_CALLTYPES]),
1078 		    (minimum_timeo[mi->mi_call_type[which]]*hz)>>3,
1079 		    (void (*)())NULL, (caddr_t)mi, 0);
1080 		mutex_exit(&mi->mi_lock);
1081 	}
1082 
1083 	/*
1084 	 * If hard mounted fs, retry call forever unless hard error occurs.
1085 	 */
1086 	do {
1087 		tryagain = FALSE;
1088 
1089 		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1090 			status = RPC_FAILED;
1091 			rpcerr.re_status = RPC_FAILED;
1092 			rpcerr.re_errno = EIO;
1093 			break;
1094 		}
1095 
1096 		TICK_TO_TIMEVAL(timeo, &wait);
1097 
1098 		/*
1099 		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1100 		 * and SIGTERM. (Preserving the existing masks).
1101 		 * Mask out SIGINT if mount option nointr is specified.
1102 		 */
1103 		sigintr(&smask, (int)mi->mi_flags & MI_INT);
1104 		if (!(mi->mi_flags & MI_INT))
1105 			client->cl_nosignal = TRUE;
1106 
1107 		/*
1108 		 * If there is a current signal, then don't bother
1109 		 * even trying to send out the request because we
1110 		 * won't be able to block waiting for the response.
1111 		 * Simply assume RPC_INTR and get on with it.
1112 		 */
1113 		if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1114 			status = RPC_INTR;
1115 		else {
1116 			status = CLNT_CALL(client, which, xdrargs, argsp,
1117 			    xdrres, resp, wait);
1118 		}
1119 
1120 		if (!(mi->mi_flags & MI_INT))
1121 			client->cl_nosignal = FALSE;
1122 		/*
1123 		 * restore original signal mask
1124 		 */
1125 		sigunintr(&smask);
1126 
1127 		switch (status) {
1128 		case RPC_SUCCESS:
1129 			if ((mi->mi_flags & MI_DYNAMIC) &&
1130 			    mi->mi_timer_type[which] != 0 &&
1131 			    (mi->mi_curread != my_rsize ||
1132 			    mi->mi_curwrite != my_wsize))
1133 				(void) nfs_feedback(FEEDBACK_OK, which, mi);
1134 			break;
1135 
1136 		case RPC_INTR:
1137 			/*
1138 			 * There is no way to recover from this error,
1139 			 * even if mount option nointr is specified.
1140 			 * SIGKILL, for example, cannot be blocked.
1141 			 */
1142 			rpcerr.re_status = RPC_INTR;
1143 			rpcerr.re_errno = EINTR;
1144 			break;
1145 
1146 		case RPC_UDERROR:
1147 			/*
1148 			 * If the NFS server is local (vold) and
1149 			 * it goes away then we get RPC_UDERROR.
1150 			 * This is a retryable error, so we would
1151 			 * loop, so check to see if the specific
1152 			 * error was ECONNRESET, indicating that
1153 			 * target did not exist at all.  If so,
1154 			 * return with RPC_PROGUNAVAIL and
1155 			 * ECONNRESET to indicate why.
1156 			 */
1157 			CLNT_GETERR(client, &rpcerr);
1158 			if (rpcerr.re_errno == ECONNRESET) {
1159 				rpcerr.re_status = RPC_PROGUNAVAIL;
1160 				rpcerr.re_errno = ECONNRESET;
1161 				break;
1162 			}
1163 			/*FALLTHROUGH*/
1164 
1165 		default:		/* probably RPC_TIMEDOUT */
1166 			if (IS_UNRECOVERABLE_RPC(status))
1167 				break;
1168 
1169 			/*
1170 			 * increment server not responding count
1171 			 */
1172 			mutex_enter(&mi->mi_lock);
1173 			mi->mi_noresponse++;
1174 			mutex_exit(&mi->mi_lock);
1175 #ifdef DEBUG
1176 			nfscl->nfscl_stat.noresponse.value.ui64++;
1177 #endif
1178 
1179 			if (!(mi->mi_flags & MI_HARD)) {
1180 				if (!(mi->mi_flags & MI_SEMISOFT) ||
1181 				    (mi->mi_ss_call_type[which] == 0))
1182 					break;
1183 			}
1184 
1185 			/*
1186 			 * The call is in progress (over COTS).
1187 			 * Try the CLNT_CALL again, but don't
1188 			 * print a noisy error message.
1189 			 */
1190 			if (status == RPC_INPROGRESS) {
1191 				tryagain = TRUE;
1192 				break;
1193 			}
1194 
1195 			if (flags & RFSCALL_SOFT)
1196 				break;
1197 
1198 			/*
1199 			 * On zone shutdown, just move on.
1200 			 */
1201 			if (zone_status_get(curproc->p_zone) >=
1202 			    ZONE_IS_SHUTTING_DOWN) {
1203 				rpcerr.re_status = RPC_FAILED;
1204 				rpcerr.re_errno = EIO;
1205 				break;
1206 			}
1207 
1208 			/*
1209 			 * NFS client failover support
1210 			 *
1211 			 * If the current server just failed us, we'll
1212 			 * start the process of finding a new server.
1213 			 * After that, we can just retry.
1214 			 */
1215 			if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1216 				if (svp == mi->mi_curr_serv)
1217 					failover_newserver(mi);
1218 				clfree_impl(client, ch, nfscl);
1219 				goto failoverretry;
1220 			}
1221 
1222 			tryagain = TRUE;
1223 			timeo = backoff(timeo);
1224 			mutex_enter(&mi->mi_lock);
1225 			if (!(mi->mi_flags & MI_PRINTED)) {
1226 				mi->mi_flags |= MI_PRINTED;
1227 				mutex_exit(&mi->mi_lock);
1228 #ifdef DEBUG
1229 				zprintf(zoneid,
1230 			"NFS%d server %s not responding still trying\n",
1231 				    mi->mi_vers, svp->sv_hostname);
1232 #else
1233 				zprintf(zoneid,
1234 			"NFS server %s not responding still trying\n",
1235 				    svp->sv_hostname);
1236 #endif
1237 			} else
1238 				mutex_exit(&mi->mi_lock);
1239 			if (*douprintf && nfs_has_ctty()) {
1240 				*douprintf = 0;
1241 				if (!(mi->mi_flags & MI_NOPRINT))
1242 #ifdef DEBUG
1243 					uprintf(
1244 			    "NFS%d server %s not responding still trying\n",
1245 					    mi->mi_vers, svp->sv_hostname);
1246 #else
1247 					uprintf(
1248 			    "NFS server %s not responding still trying\n",
1249 					    svp->sv_hostname);
1250 #endif
1251 			}
1252 
1253 			/*
1254 			 * If doing dynamic adjustment of transfer
1255 			 * size and if it's a read or write call
1256 			 * and if the transfer size changed while
1257 			 * retransmitting or if the feedback routine
1258 			 * changed the transfer size,
1259 			 * then exit rfscall so that the transfer
1260 			 * size can be adjusted at the vnops level.
1261 			 */
1262 			if ((mi->mi_flags & MI_DYNAMIC) &&
1263 			    mi->mi_timer_type[which] != 0 &&
1264 			    (mi->mi_curread != my_rsize ||
1265 			    mi->mi_curwrite != my_wsize ||
1266 			    nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1267 				/*
1268 				 * On read or write calls, return
1269 				 * back to the vnode ops level if
1270 				 * the transfer size changed.
1271 				 */
1272 				clfree_impl(client, ch, nfscl);
1273 				if (cred_cloned)
1274 					crfree(cr);
1275 				return (ENFS_TRYAGAIN);
1276 			}
1277 		}
1278 	} while (tryagain);
1279 
1280 	if (status != RPC_SUCCESS) {
1281 		/*
1282 		 * Let soft mounts use the timed out message.
1283 		 */
1284 		if (status == RPC_INPROGRESS)
1285 			status = RPC_TIMEDOUT;
1286 		nfscl->nfscl_stat.badcalls.value.ui64++;
1287 		if (status != RPC_INTR) {
1288 			mutex_enter(&mi->mi_lock);
1289 			mi->mi_flags |= MI_DOWN;
1290 			mutex_exit(&mi->mi_lock);
1291 			CLNT_GETERR(client, &rpcerr);
1292 #ifdef DEBUG
1293 			bufp = clnt_sperror(client, svp->sv_hostname);
1294 			zprintf(zoneid, "NFS%d %s failed for %s\n",
1295 			    mi->mi_vers, mi->mi_rfsnames[which], bufp);
1296 			if (nfs_has_ctty()) {
1297 				if (!(mi->mi_flags & MI_NOPRINT)) {
1298 					uprintf("NFS%d %s failed for %s\n",
1299 					    mi->mi_vers, mi->mi_rfsnames[which],
1300 					    bufp);
1301 				}
1302 			}
1303 			kmem_free(bufp, MAXPATHLEN);
1304 #else
1305 			zprintf(zoneid,
1306 			    "NFS %s failed for server %s: error %d (%s)\n",
1307 			    mi->mi_rfsnames[which], svp->sv_hostname,
1308 			    status, clnt_sperrno(status));
1309 			if (nfs_has_ctty()) {
1310 				if (!(mi->mi_flags & MI_NOPRINT)) {
1311 					uprintf(
1312 				"NFS %s failed for server %s: error %d (%s)\n",
1313 					    mi->mi_rfsnames[which],
1314 					    svp->sv_hostname, status,
1315 					    clnt_sperrno(status));
1316 				}
1317 			}
1318 #endif
1319 			/*
1320 			 * when CLNT_CALL() fails with RPC_AUTHERROR,
1321 			 * re_errno is set appropriately depending on
1322 			 * the authentication error
1323 			 */
1324 			if (status == RPC_VERSMISMATCH ||
1325 			    status == RPC_PROGVERSMISMATCH)
1326 				rpcerr.re_errno = EIO;
1327 		}
1328 	} else {
1329 		/*
1330 		 * Test the value of mi_down and mi_printed without
1331 		 * holding the mi_lock mutex.  If they are both zero,
1332 		 * then it is okay to skip the down and printed
1333 		 * processing.  This saves on a mutex_enter and
1334 		 * mutex_exit pair for a normal, successful RPC.
1335 		 * This was just complete overhead.
1336 		 */
1337 		if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1338 			mutex_enter(&mi->mi_lock);
1339 			mi->mi_flags &= ~MI_DOWN;
1340 			if (mi->mi_flags & MI_PRINTED) {
1341 				mi->mi_flags &= ~MI_PRINTED;
1342 				mutex_exit(&mi->mi_lock);
1343 #ifdef DEBUG
1344 			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1345 				zprintf(zoneid, "NFS%d server %s ok\n",
1346 				    mi->mi_vers, svp->sv_hostname);
1347 #else
1348 			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1349 				zprintf(zoneid, "NFS server %s ok\n",
1350 				    svp->sv_hostname);
1351 #endif
1352 			} else
1353 				mutex_exit(&mi->mi_lock);
1354 		}
1355 
1356 		if (*douprintf == 0) {
1357 			if (!(mi->mi_flags & MI_NOPRINT))
1358 #ifdef DEBUG
1359 				if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1360 					uprintf("NFS%d server %s ok\n",
1361 					    mi->mi_vers, svp->sv_hostname);
1362 #else
1363 			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1364 				uprintf("NFS server %s ok\n", svp->sv_hostname);
1365 #endif
1366 			*douprintf = 1;
1367 		}
1368 	}
1369 
1370 	clfree_impl(client, ch, nfscl);
1371 	if (cred_cloned)
1372 		crfree(cr);
1373 
1374 	ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1375 
1376 	if (rpc_status != NULL)
1377 		*rpc_status = rpcerr.re_status;
1378 
1379 	TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1380 	    rpcerr.re_errno);
1381 
1382 	return (rpcerr.re_errno);
1383 }
1384 
1385 #ifdef DEBUG
1386 static int acl2call_hits = 0;
1387 static int acl2call_misses = 0;
1388 #endif
1389 
1390 int
1391 acl2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1392     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1393     enum nfsstat *statusp, int flags, failinfo_t *fi)
1394 {
1395 	int rpcerror;
1396 
1397 	rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1398 	    cr, douprintf, flags, fi);
1399 	if (!rpcerror) {
1400 		/*
1401 		 * See comments with crnetadjust().
1402 		 */
1403 		if (*statusp == NFSERR_ACCES &&
1404 		    (cr = crnetadjust(cr)) != NULL) {
1405 #ifdef DEBUG
1406 			acl2call_hits++;
1407 #endif
1408 			rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres,
1409 			    resp, cr, douprintf, flags, fi);
1410 			crfree(cr);
1411 #ifdef DEBUG
1412 			if (*statusp == NFSERR_ACCES)
1413 				acl2call_misses++;
1414 #endif
1415 		}
1416 	}
1417 
1418 	return (rpcerror);
1419 }
1420 
1421 #ifdef DEBUG
1422 static int acl3call_hits = 0;
1423 static int acl3call_misses = 0;
1424 #endif
1425 
1426 int
1427 acl3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1428     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1429     nfsstat3 *statusp, int flags, failinfo_t *fi)
1430 {
1431 	int rpcerror;
1432 	int user_informed;
1433 
1434 	user_informed = 0;
1435 
1436 	do {
1437 		rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1438 		    cr, douprintf, flags, fi);
1439 		if (!rpcerror) {
1440 			cred_t *crr;
1441 			if (*statusp == NFS3ERR_JUKEBOX) {
1442 				if (!user_informed) {
1443 					user_informed = 1;
1444 					uprintf(
1445 		"file temporarily unavailable on the server, retrying...\n");
1446 				}
1447 				delay(nfs3_jukebox_delay);
1448 			}
1449 			/*
1450 			 * See crnetadjust() for comments.
1451 			 */
1452 			else if (*statusp == NFS3ERR_ACCES &&
1453 			    (crr = crnetadjust(cr)) != NULL) {
1454 #ifdef DEBUG
1455 				acl3call_hits++;
1456 #endif
1457 				rpcerror = aclcall(mi, which, xdrargs, argsp,
1458 				    xdrres, resp, crr, douprintf, flags, fi);
1459 
1460 				crfree(crr);
1461 #ifdef DEBUG
1462 				if (*statusp == NFS3ERR_ACCES)
1463 					acl3call_misses++;
1464 #endif
1465 			}
1466 		}
1467 	} while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
1468 
1469 	return (rpcerror);
1470 }
1471 
1472 static int
1473 aclcall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1474     xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
1475     int flags, failinfo_t *fi)
1476 {
1477 	CLIENT *client;
1478 	struct chtab *ch;
1479 	cred_t *cr = icr;
1480 	bool_t cred_cloned = FALSE;
1481 	enum clnt_stat status;
1482 	struct rpc_err rpcerr;
1483 	struct timeval wait;
1484 	int timeo;		/* in units of hz */
1485 #if 0 /* notyet */
1486 	int my_rsize, my_wsize;
1487 #endif
1488 	bool_t tryagain;
1489 	k_sigset_t smask;
1490 	servinfo_t *svp;
1491 	struct nfs_clnt *nfscl;
1492 	zoneid_t zoneid = getzoneid();
1493 #ifdef DEBUG
1494 	char *bufp;
1495 #endif
1496 
1497 #if 0 /* notyet */
1498 	TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
1499 	    "rfscall_start:which %d mi %p", which, mi);
1500 #endif
1501 
1502 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
1503 	ASSERT(nfscl != NULL);
1504 
1505 	nfscl->nfscl_stat.calls.value.ui64++;
1506 	mi->mi_aclreqs[which].value.ui64++;
1507 
1508 	rpcerr.re_status = RPC_SUCCESS;
1509 
1510 	if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1511 		rpcerr.re_status = RPC_FAILED;
1512 		rpcerr.re_errno = EIO;
1513 		return (rpcerr.re_errno);
1514 	}
1515 
1516 #if 0 /* notyet */
1517 	/*
1518 	 * Remember the transfer sizes in case
1519 	 * nfs_feedback changes them underneath us.
1520 	 */
1521 	my_rsize = mi->mi_curread;
1522 	my_wsize = mi->mi_curwrite;
1523 #endif
1524 
1525 	/*
1526 	 * NFS client failover support
1527 	 *
1528 	 * If this rnode is not in sync with the current server (VALID_FH),
1529 	 * we'd like to do a remap to get in sync.  We can be interrupted
1530 	 * in failover_remap(), and if so we'll bail.  Otherwise, we'll
1531 	 * use the best info we have to try the RPC.  Part of that is
1532 	 * unconditionally updating the filehandle copy kept for V3.
1533 	 *
1534 	 * Locking: INC_READERS/DEC_READERS is a poor man's interrruptible
1535 	 * rw_enter(); we're trying to keep the current server from being
1536 	 * changed on us until we're done with the remapping and have a
1537 	 * matching client handle.  We don't want to sending a filehandle
1538 	 * to the wrong host.
1539 	 */
1540 failoverretry:
1541 	if (FAILOVER_MOUNT(mi)) {
1542 		mutex_enter(&mi->mi_lock);
1543 		if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1544 			if (failover_wait(mi)) {
1545 				mutex_exit(&mi->mi_lock);
1546 				return (EINTR);
1547 			}
1548 		}
1549 		INC_READERS(mi);
1550 		mutex_exit(&mi->mi_lock);
1551 		if (fi) {
1552 			if (!VALID_FH(fi) &&
1553 			    !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1554 				int remaperr;
1555 
1556 				svp = mi->mi_curr_serv;
1557 				remaperr = failover_remap(fi);
1558 				if (remaperr != 0) {
1559 #ifdef DEBUG
1560 					if (remaperr != EINTR)
1561 						nfs_cmn_err(remaperr, CE_WARN,
1562 					    "aclcall couldn't failover: %m");
1563 #endif
1564 					mutex_enter(&mi->mi_lock);
1565 					DEC_READERS(mi);
1566 					mutex_exit(&mi->mi_lock);
1567 
1568 					/*
1569 					 * If failover_remap returns ETIMEDOUT
1570 					 * and the filesystem is hard mounted
1571 					 * we have to retry the call with a new
1572 					 * server.
1573 					 */
1574 					if ((mi->mi_flags & MI_HARD) &&
1575 					    IS_RECOVERABLE_ERROR(remaperr)) {
1576 						if (svp == mi->mi_curr_serv)
1577 							failover_newserver(mi);
1578 						rpcerr.re_status = RPC_SUCCESS;
1579 						goto failoverretry;
1580 					}
1581 					return (remaperr);
1582 				}
1583 			}
1584 			if (fi->fhp && fi->copyproc)
1585 				(*fi->copyproc)(fi->fhp, fi->vp);
1586 		}
1587 	}
1588 
1589 	/* For TSOL, use a new cred which has net_mac_aware flag */
1590 	if (!cred_cloned && is_system_labeled()) {
1591 		cred_cloned = TRUE;
1592 		cr = crdup(icr);
1593 		(void) setpflags(NET_MAC_AWARE, 1, cr);
1594 	}
1595 
1596 	/*
1597 	 * acl_clget() calls clnt_tli_kinit() which clears the xid, so we
1598 	 * are guaranteed to reprocess the retry as a new request.
1599 	 */
1600 	svp = mi->mi_curr_serv;
1601 	rpcerr.re_errno = acl_clget(mi, svp, cr, &client, &ch, nfscl);
1602 	if (FAILOVER_MOUNT(mi)) {
1603 		mutex_enter(&mi->mi_lock);
1604 		DEC_READERS(mi);
1605 		mutex_exit(&mi->mi_lock);
1606 
1607 		if ((rpcerr.re_errno == ETIMEDOUT ||
1608 		    rpcerr.re_errno == ECONNRESET) &&
1609 		    failover_safe(fi)) {
1610 			if (svp == mi->mi_curr_serv)
1611 				failover_newserver(mi);
1612 			goto failoverretry;
1613 		}
1614 	}
1615 	if (rpcerr.re_errno != 0) {
1616 		if (cred_cloned)
1617 			crfree(cr);
1618 		return (rpcerr.re_errno);
1619 	}
1620 
1621 	if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1622 	    svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1623 		timeo = (mi->mi_timeo * hz) / 10;
1624 	} else {
1625 		mutex_enter(&mi->mi_lock);
1626 		timeo = CLNT_SETTIMERS(client,
1627 		    &(mi->mi_timers[mi->mi_acl_timer_type[which]]),
1628 		    &(mi->mi_timers[NFS_CALLTYPES]),
1629 		    (minimum_timeo[mi->mi_acl_call_type[which]]*hz)>>3,
1630 		    (void (*)()) 0, (caddr_t)mi, 0);
1631 		mutex_exit(&mi->mi_lock);
1632 	}
1633 
1634 	/*
1635 	 * If hard mounted fs, retry call forever unless hard error occurs.
1636 	 */
1637 	do {
1638 		tryagain = FALSE;
1639 
1640 		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1641 			status = RPC_FAILED;
1642 			rpcerr.re_status = RPC_FAILED;
1643 			rpcerr.re_errno = EIO;
1644 			break;
1645 		}
1646 
1647 		TICK_TO_TIMEVAL(timeo, &wait);
1648 
1649 		/*
1650 		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1651 		 * and SIGTERM. (Preserving the existing masks).
1652 		 * Mask out SIGINT if mount option nointr is specified.
1653 		 */
1654 		sigintr(&smask, (int)mi->mi_flags & MI_INT);
1655 		if (!(mi->mi_flags & MI_INT))
1656 			client->cl_nosignal = TRUE;
1657 
1658 		/*
1659 		 * If there is a current signal, then don't bother
1660 		 * even trying to send out the request because we
1661 		 * won't be able to block waiting for the response.
1662 		 * Simply assume RPC_INTR and get on with it.
1663 		 */
1664 		if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1665 			status = RPC_INTR;
1666 		else {
1667 			status = CLNT_CALL(client, which, xdrargs, argsp,
1668 			    xdrres, resp, wait);
1669 		}
1670 
1671 		if (!(mi->mi_flags & MI_INT))
1672 			client->cl_nosignal = FALSE;
1673 		/*
1674 		 * restore original signal mask
1675 		 */
1676 		sigunintr(&smask);
1677 
1678 		switch (status) {
1679 		case RPC_SUCCESS:
1680 #if 0 /* notyet */
1681 			if ((mi->mi_flags & MI_DYNAMIC) &&
1682 			    mi->mi_timer_type[which] != 0 &&
1683 			    (mi->mi_curread != my_rsize ||
1684 			    mi->mi_curwrite != my_wsize))
1685 				(void) nfs_feedback(FEEDBACK_OK, which, mi);
1686 #endif
1687 			break;
1688 
1689 		/*
1690 		 * Unfortunately, there are servers in the world which
1691 		 * are not coded correctly.  They are not prepared to
1692 		 * handle RPC requests to the NFS port which are not
1693 		 * NFS requests.  Thus, they may try to process the
1694 		 * NFS_ACL request as if it were an NFS request.  This
1695 		 * does not work.  Generally, an error will be generated
1696 		 * on the client because it will not be able to decode
1697 		 * the response from the server.  However, it seems
1698 		 * possible that the server may not be able to decode
1699 		 * the arguments.  Thus, the criteria for deciding
1700 		 * whether the server supports NFS_ACL or not is whether
1701 		 * the following RPC errors are returned from CLNT_CALL.
1702 		 */
1703 		case RPC_CANTDECODERES:
1704 		case RPC_PROGUNAVAIL:
1705 		case RPC_CANTDECODEARGS:
1706 		case RPC_PROGVERSMISMATCH:
1707 			mutex_enter(&mi->mi_lock);
1708 			mi->mi_flags &= ~(MI_ACL | MI_EXTATTR);
1709 			mutex_exit(&mi->mi_lock);
1710 			break;
1711 
1712 		/*
1713 		 * If the server supports NFS_ACL but not the new ops
1714 		 * for extended attributes, make sure we don't retry.
1715 		 */
1716 		case RPC_PROCUNAVAIL:
1717 			mutex_enter(&mi->mi_lock);
1718 			mi->mi_flags &= ~MI_EXTATTR;
1719 			mutex_exit(&mi->mi_lock);
1720 			break;
1721 
1722 		case RPC_INTR:
1723 			/*
1724 			 * There is no way to recover from this error,
1725 			 * even if mount option nointr is specified.
1726 			 * SIGKILL, for example, cannot be blocked.
1727 			 */
1728 			rpcerr.re_status = RPC_INTR;
1729 			rpcerr.re_errno = EINTR;
1730 			break;
1731 
1732 		case RPC_UDERROR:
1733 			/*
1734 			 * If the NFS server is local (vold) and
1735 			 * it goes away then we get RPC_UDERROR.
1736 			 * This is a retryable error, so we would
1737 			 * loop, so check to see if the specific
1738 			 * error was ECONNRESET, indicating that
1739 			 * target did not exist at all.  If so,
1740 			 * return with RPC_PROGUNAVAIL and
1741 			 * ECONNRESET to indicate why.
1742 			 */
1743 			CLNT_GETERR(client, &rpcerr);
1744 			if (rpcerr.re_errno == ECONNRESET) {
1745 				rpcerr.re_status = RPC_PROGUNAVAIL;
1746 				rpcerr.re_errno = ECONNRESET;
1747 				break;
1748 			}
1749 			/*FALLTHROUGH*/
1750 
1751 		default:		/* probably RPC_TIMEDOUT */
1752 			if (IS_UNRECOVERABLE_RPC(status))
1753 				break;
1754 
1755 			/*
1756 			 * increment server not responding count
1757 			 */
1758 			mutex_enter(&mi->mi_lock);
1759 			mi->mi_noresponse++;
1760 			mutex_exit(&mi->mi_lock);
1761 #ifdef DEBUG
1762 			nfscl->nfscl_stat.noresponse.value.ui64++;
1763 #endif
1764 
1765 			if (!(mi->mi_flags & MI_HARD)) {
1766 				if (!(mi->mi_flags & MI_SEMISOFT) ||
1767 				    (mi->mi_acl_ss_call_type[which] == 0))
1768 					break;
1769 			}
1770 
1771 			/*
1772 			 * The call is in progress (over COTS).
1773 			 * Try the CLNT_CALL again, but don't
1774 			 * print a noisy error message.
1775 			 */
1776 			if (status == RPC_INPROGRESS) {
1777 				tryagain = TRUE;
1778 				break;
1779 			}
1780 
1781 			if (flags & RFSCALL_SOFT)
1782 				break;
1783 
1784 			/*
1785 			 * On zone shutdown, just move on.
1786 			 */
1787 			if (zone_status_get(curproc->p_zone) >=
1788 			    ZONE_IS_SHUTTING_DOWN) {
1789 				rpcerr.re_status = RPC_FAILED;
1790 				rpcerr.re_errno = EIO;
1791 				break;
1792 			}
1793 
1794 			/*
1795 			 * NFS client failover support
1796 			 *
1797 			 * If the current server just failed us, we'll
1798 			 * start the process of finding a new server.
1799 			 * After that, we can just retry.
1800 			 */
1801 			if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1802 				if (svp == mi->mi_curr_serv)
1803 					failover_newserver(mi);
1804 				clfree_impl(client, ch, nfscl);
1805 				goto failoverretry;
1806 			}
1807 
1808 			tryagain = TRUE;
1809 			timeo = backoff(timeo);
1810 			mutex_enter(&mi->mi_lock);
1811 			if (!(mi->mi_flags & MI_PRINTED)) {
1812 				mi->mi_flags |= MI_PRINTED;
1813 				mutex_exit(&mi->mi_lock);
1814 #ifdef DEBUG
1815 				zprintf(zoneid,
1816 			"NFS_ACL%d server %s not responding still trying\n",
1817 				    mi->mi_vers, svp->sv_hostname);
1818 #else
1819 				zprintf(zoneid,
1820 			    "NFS server %s not responding still trying\n",
1821 				    svp->sv_hostname);
1822 #endif
1823 			} else
1824 				mutex_exit(&mi->mi_lock);
1825 			if (*douprintf && nfs_has_ctty()) {
1826 				*douprintf = 0;
1827 				if (!(mi->mi_flags & MI_NOPRINT))
1828 #ifdef DEBUG
1829 					uprintf(
1830 			"NFS_ACL%d server %s not responding still trying\n",
1831 					    mi->mi_vers, svp->sv_hostname);
1832 #else
1833 					uprintf(
1834 			    "NFS server %s not responding still trying\n",
1835 					    svp->sv_hostname);
1836 #endif
1837 			}
1838 
1839 #if 0 /* notyet */
1840 			/*
1841 			 * If doing dynamic adjustment of transfer
1842 			 * size and if it's a read or write call
1843 			 * and if the transfer size changed while
1844 			 * retransmitting or if the feedback routine
1845 			 * changed the transfer size,
1846 			 * then exit rfscall so that the transfer
1847 			 * size can be adjusted at the vnops level.
1848 			 */
1849 			if ((mi->mi_flags & MI_DYNAMIC) &&
1850 			    mi->mi_acl_timer_type[which] != 0 &&
1851 			    (mi->mi_curread != my_rsize ||
1852 			    mi->mi_curwrite != my_wsize ||
1853 			    nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1854 				/*
1855 				 * On read or write calls, return
1856 				 * back to the vnode ops level if
1857 				 * the transfer size changed.
1858 				 */
1859 				clfree_impl(client, ch, nfscl);
1860 				if (cred_cloned)
1861 					crfree(cr);
1862 				return (ENFS_TRYAGAIN);
1863 			}
1864 #endif
1865 		}
1866 	} while (tryagain);
1867 
1868 	if (status != RPC_SUCCESS) {
1869 		/*
1870 		 * Let soft mounts use the timed out message.
1871 		 */
1872 		if (status == RPC_INPROGRESS)
1873 			status = RPC_TIMEDOUT;
1874 		nfscl->nfscl_stat.badcalls.value.ui64++;
1875 		if (status == RPC_CANTDECODERES ||
1876 		    status == RPC_PROGUNAVAIL ||
1877 		    status == RPC_PROCUNAVAIL ||
1878 		    status == RPC_CANTDECODEARGS ||
1879 		    status == RPC_PROGVERSMISMATCH)
1880 			CLNT_GETERR(client, &rpcerr);
1881 		else if (status != RPC_INTR) {
1882 			mutex_enter(&mi->mi_lock);
1883 			mi->mi_flags |= MI_DOWN;
1884 			mutex_exit(&mi->mi_lock);
1885 			CLNT_GETERR(client, &rpcerr);
1886 #ifdef DEBUG
1887 			bufp = clnt_sperror(client, svp->sv_hostname);
1888 			zprintf(zoneid, "NFS_ACL%d %s failed for %s\n",
1889 			    mi->mi_vers, mi->mi_aclnames[which], bufp);
1890 			if (nfs_has_ctty()) {
1891 				if (!(mi->mi_flags & MI_NOPRINT)) {
1892 					uprintf("NFS_ACL%d %s failed for %s\n",
1893 					    mi->mi_vers, mi->mi_aclnames[which],
1894 					    bufp);
1895 				}
1896 			}
1897 			kmem_free(bufp, MAXPATHLEN);
1898 #else
1899 			zprintf(zoneid,
1900 			    "NFS %s failed for server %s: error %d (%s)\n",
1901 			    mi->mi_aclnames[which], svp->sv_hostname,
1902 			    status, clnt_sperrno(status));
1903 			if (nfs_has_ctty()) {
1904 				if (!(mi->mi_flags & MI_NOPRINT))
1905 					uprintf(
1906 				"NFS %s failed for server %s: error %d (%s)\n",
1907 					    mi->mi_aclnames[which],
1908 					    svp->sv_hostname, status,
1909 					    clnt_sperrno(status));
1910 			}
1911 #endif
1912 			/*
1913 			 * when CLNT_CALL() fails with RPC_AUTHERROR,
1914 			 * re_errno is set appropriately depending on
1915 			 * the authentication error
1916 			 */
1917 			if (status == RPC_VERSMISMATCH ||
1918 			    status == RPC_PROGVERSMISMATCH)
1919 				rpcerr.re_errno = EIO;
1920 		}
1921 	} else {
1922 		/*
1923 		 * Test the value of mi_down and mi_printed without
1924 		 * holding the mi_lock mutex.  If they are both zero,
1925 		 * then it is okay to skip the down and printed
1926 		 * processing.  This saves on a mutex_enter and
1927 		 * mutex_exit pair for a normal, successful RPC.
1928 		 * This was just complete overhead.
1929 		 */
1930 		if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1931 			mutex_enter(&mi->mi_lock);
1932 			mi->mi_flags &= ~MI_DOWN;
1933 			if (mi->mi_flags & MI_PRINTED) {
1934 				mi->mi_flags &= ~MI_PRINTED;
1935 				mutex_exit(&mi->mi_lock);
1936 #ifdef DEBUG
1937 				zprintf(zoneid, "NFS_ACL%d server %s ok\n",
1938 				    mi->mi_vers, svp->sv_hostname);
1939 #else
1940 				zprintf(zoneid, "NFS server %s ok\n",
1941 				    svp->sv_hostname);
1942 #endif
1943 			} else
1944 				mutex_exit(&mi->mi_lock);
1945 		}
1946 
1947 		if (*douprintf == 0) {
1948 			if (!(mi->mi_flags & MI_NOPRINT))
1949 #ifdef DEBUG
1950 				uprintf("NFS_ACL%d server %s ok\n",
1951 				    mi->mi_vers, svp->sv_hostname);
1952 #else
1953 				uprintf("NFS server %s ok\n", svp->sv_hostname);
1954 #endif
1955 			*douprintf = 1;
1956 		}
1957 	}
1958 
1959 	clfree_impl(client, ch, nfscl);
1960 	if (cred_cloned)
1961 		crfree(cr);
1962 
1963 	ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1964 
1965 #if 0 /* notyet */
1966 	TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1967 	    rpcerr.re_errno);
1968 #endif
1969 
1970 	return (rpcerr.re_errno);
1971 }
1972 
1973 int
1974 vattr_to_sattr(struct vattr *vap, struct nfssattr *sa)
1975 {
1976 	uint_t mask = vap->va_mask;
1977 
1978 	if (!(mask & AT_MODE))
1979 		sa->sa_mode = (uint32_t)-1;
1980 	else
1981 		sa->sa_mode = vap->va_mode;
1982 	if (!(mask & AT_UID))
1983 		sa->sa_uid = (uint32_t)-1;
1984 	else
1985 		sa->sa_uid = (uint32_t)vap->va_uid;
1986 	if (!(mask & AT_GID))
1987 		sa->sa_gid = (uint32_t)-1;
1988 	else
1989 		sa->sa_gid = (uint32_t)vap->va_gid;
1990 	if (!(mask & AT_SIZE))
1991 		sa->sa_size = (uint32_t)-1;
1992 	else
1993 		sa->sa_size = (uint32_t)vap->va_size;
1994 	if (!(mask & AT_ATIME))
1995 		sa->sa_atime.tv_sec = sa->sa_atime.tv_usec = (int32_t)-1;
1996 	else {
1997 		/* check time validity */
1998 		if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
1999 			return (EOVERFLOW);
2000 		}
2001 		sa->sa_atime.tv_sec = vap->va_atime.tv_sec;
2002 		sa->sa_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2003 	}
2004 	if (!(mask & AT_MTIME))
2005 		sa->sa_mtime.tv_sec = sa->sa_mtime.tv_usec = (int32_t)-1;
2006 	else {
2007 		/* check time validity */
2008 		if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2009 			return (EOVERFLOW);
2010 		}
2011 		sa->sa_mtime.tv_sec = vap->va_mtime.tv_sec;
2012 		sa->sa_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2013 	}
2014 	return (0);
2015 }
2016 
2017 int
2018 vattr_to_sattr3(struct vattr *vap, sattr3 *sa)
2019 {
2020 	uint_t mask = vap->va_mask;
2021 
2022 	if (!(mask & AT_MODE))
2023 		sa->mode.set_it = FALSE;
2024 	else {
2025 		sa->mode.set_it = TRUE;
2026 		sa->mode.mode = (mode3)vap->va_mode;
2027 	}
2028 	if (!(mask & AT_UID))
2029 		sa->uid.set_it = FALSE;
2030 	else {
2031 		sa->uid.set_it = TRUE;
2032 		sa->uid.uid = (uid3)vap->va_uid;
2033 	}
2034 	if (!(mask & AT_GID))
2035 		sa->gid.set_it = FALSE;
2036 	else {
2037 		sa->gid.set_it = TRUE;
2038 		sa->gid.gid = (gid3)vap->va_gid;
2039 	}
2040 	if (!(mask & AT_SIZE))
2041 		sa->size.set_it = FALSE;
2042 	else {
2043 		sa->size.set_it = TRUE;
2044 		sa->size.size = (size3)vap->va_size;
2045 	}
2046 	if (!(mask & AT_ATIME))
2047 		sa->atime.set_it = DONT_CHANGE;
2048 	else {
2049 		/* check time validity */
2050 		if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
2051 			return (EOVERFLOW);
2052 		}
2053 		sa->atime.set_it = SET_TO_CLIENT_TIME;
2054 		sa->atime.atime.seconds = (uint32)vap->va_atime.tv_sec;
2055 		sa->atime.atime.nseconds = (uint32)vap->va_atime.tv_nsec;
2056 	}
2057 	if (!(mask & AT_MTIME))
2058 		sa->mtime.set_it = DONT_CHANGE;
2059 	else {
2060 		/* check time validity */
2061 		if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2062 			return (EOVERFLOW);
2063 		}
2064 		sa->mtime.set_it = SET_TO_CLIENT_TIME;
2065 		sa->mtime.mtime.seconds = (uint32)vap->va_mtime.tv_sec;
2066 		sa->mtime.mtime.nseconds = (uint32)vap->va_mtime.tv_nsec;
2067 	}
2068 	return (0);
2069 }
2070 
2071 void
2072 setdiropargs(struct nfsdiropargs *da, char *nm, vnode_t *dvp)
2073 {
2074 
2075 	da->da_fhandle = VTOFH(dvp);
2076 	da->da_name = nm;
2077 	da->da_flags = 0;
2078 }
2079 
2080 void
2081 setdiropargs3(diropargs3 *da, char *nm, vnode_t *dvp)
2082 {
2083 
2084 	da->dirp = VTOFH3(dvp);
2085 	da->name = nm;
2086 }
2087 
2088 int
2089 setdirgid(vnode_t *dvp, gid_t *gidp, cred_t *cr)
2090 {
2091 	int error;
2092 	rnode_t *rp;
2093 	struct vattr va;
2094 
2095 	va.va_mask = AT_MODE | AT_GID;
2096 	error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2097 	if (error)
2098 		return (error);
2099 
2100 	/*
2101 	 * To determine the expected group-id of the created file:
2102 	 *  1)	If the filesystem was not mounted with the Old-BSD-compatible
2103 	 *	GRPID option, and the directory's set-gid bit is clear,
2104 	 *	then use the process's gid.
2105 	 *  2)	Otherwise, set the group-id to the gid of the parent directory.
2106 	 */
2107 	rp = VTOR(dvp);
2108 	mutex_enter(&rp->r_statelock);
2109 	if (!(VTOMI(dvp)->mi_flags & MI_GRPID) && !(va.va_mode & VSGID))
2110 		*gidp = crgetgid(cr);
2111 	else
2112 		*gidp = va.va_gid;
2113 	mutex_exit(&rp->r_statelock);
2114 	return (0);
2115 }
2116 
2117 int
2118 setdirmode(vnode_t *dvp, mode_t *omp, cred_t *cr)
2119 {
2120 	int error;
2121 	struct vattr va;
2122 
2123 	va.va_mask = AT_MODE;
2124 	error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2125 	if (error)
2126 		return (error);
2127 
2128 	/*
2129 	 * Modify the expected mode (om) so that the set-gid bit matches
2130 	 * that of the parent directory (dvp).
2131 	 */
2132 	if (va.va_mode & VSGID)
2133 		*omp |= VSGID;
2134 	else
2135 		*omp &= ~VSGID;
2136 	return (0);
2137 }
2138 
2139 void
2140 nfs_setswaplike(vnode_t *vp, vattr_t *vap)
2141 {
2142 
2143 	if (vp->v_type == VREG && (vap->va_mode & (VEXEC | VSVTX)) == VSVTX) {
2144 		if (!(vp->v_flag & VSWAPLIKE)) {
2145 			mutex_enter(&vp->v_lock);
2146 			vp->v_flag |= VSWAPLIKE;
2147 			mutex_exit(&vp->v_lock);
2148 		}
2149 	} else {
2150 		if (vp->v_flag & VSWAPLIKE) {
2151 			mutex_enter(&vp->v_lock);
2152 			vp->v_flag &= ~VSWAPLIKE;
2153 			mutex_exit(&vp->v_lock);
2154 		}
2155 	}
2156 }
2157 
2158 /*
2159  * Free the resources associated with an rnode.
2160  */
2161 static void
2162 rinactive(rnode_t *rp, cred_t *cr)
2163 {
2164 	vnode_t *vp;
2165 	cred_t *cred;
2166 	char *contents;
2167 	int size;
2168 	vsecattr_t *vsp;
2169 	int error;
2170 	nfs3_pathconf_info *info;
2171 
2172 	/*
2173 	 * Before freeing anything, wait until all asynchronous
2174 	 * activity is done on this rnode.  This will allow all
2175 	 * asynchronous read ahead and write behind i/o's to
2176 	 * finish.
2177 	 */
2178 	mutex_enter(&rp->r_statelock);
2179 	while (rp->r_count > 0)
2180 		cv_wait(&rp->r_cv, &rp->r_statelock);
2181 	mutex_exit(&rp->r_statelock);
2182 
2183 	/*
2184 	 * Flush and invalidate all pages associated with the vnode.
2185 	 */
2186 	vp = RTOV(rp);
2187 	if (vn_has_cached_data(vp)) {
2188 		ASSERT(vp->v_type != VCHR);
2189 		if ((rp->r_flags & RDIRTY) && !rp->r_error) {
2190 			error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL);
2191 			if (error && (error == ENOSPC || error == EDQUOT)) {
2192 				mutex_enter(&rp->r_statelock);
2193 				if (!rp->r_error)
2194 					rp->r_error = error;
2195 				mutex_exit(&rp->r_statelock);
2196 			}
2197 		}
2198 		nfs_invalidate_pages(vp, (u_offset_t)0, cr);
2199 	}
2200 
2201 	/*
2202 	 * Free any held credentials and caches which may be associated
2203 	 * with this rnode.
2204 	 */
2205 	mutex_enter(&rp->r_statelock);
2206 	cred = rp->r_cred;
2207 	rp->r_cred = NULL;
2208 	contents = rp->r_symlink.contents;
2209 	size = rp->r_symlink.size;
2210 	rp->r_symlink.contents = NULL;
2211 	vsp = rp->r_secattr;
2212 	rp->r_secattr = NULL;
2213 	info = rp->r_pathconf;
2214 	rp->r_pathconf = NULL;
2215 	mutex_exit(&rp->r_statelock);
2216 
2217 	/*
2218 	 * Free the held credential.
2219 	 */
2220 	if (cred != NULL)
2221 		crfree(cred);
2222 
2223 	/*
2224 	 * Free the access cache entries.
2225 	 */
2226 	(void) nfs_access_purge_rp(rp);
2227 
2228 	/*
2229 	 * Free the readdir cache entries.
2230 	 */
2231 	if (HAVE_RDDIR_CACHE(rp))
2232 		nfs_purge_rddir_cache(vp);
2233 
2234 	/*
2235 	 * Free the symbolic link cache.
2236 	 */
2237 	if (contents != NULL) {
2238 
2239 		kmem_free((void *)contents, size);
2240 	}
2241 
2242 	/*
2243 	 * Free any cached ACL.
2244 	 */
2245 	if (vsp != NULL)
2246 		nfs_acl_free(vsp);
2247 
2248 	/*
2249 	 * Free any cached pathconf information.
2250 	 */
2251 	if (info != NULL)
2252 		kmem_free(info, sizeof (*info));
2253 }
2254 
2255 /*
2256  * Return a vnode for the given NFS Version 2 file handle.
2257  * If no rnode exists for this fhandle, create one and put it
2258  * into the hash queues.  If the rnode for this fhandle
2259  * already exists, return it.
2260  *
2261  * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2262  */
2263 vnode_t *
2264 makenfsnode(fhandle_t *fh, struct nfsfattr *attr, struct vfs *vfsp,
2265     hrtime_t t, cred_t *cr, char *dnm, char *nm)
2266 {
2267 	int newnode;
2268 	int index;
2269 	vnode_t *vp;
2270 	nfs_fhandle nfh;
2271 	vattr_t va;
2272 
2273 	nfh.fh_len = NFS_FHSIZE;
2274 	bcopy(fh, nfh.fh_buf, NFS_FHSIZE);
2275 
2276 	index = rtablehash(&nfh);
2277 	rw_enter(&rtable[index].r_lock, RW_READER);
2278 
2279 	vp = make_rnode(&nfh, &rtable[index], vfsp, nfs_vnodeops,
2280 	    nfs_putapage, nfs_rddir_compar, &newnode, cr, dnm, nm);
2281 
2282 	if (attr != NULL) {
2283 		if (!newnode) {
2284 			rw_exit(&rtable[index].r_lock);
2285 			(void) nfs_cache_fattr(vp, attr, &va, t, cr);
2286 		} else {
2287 			if (attr->na_type < NFNON || attr->na_type > NFSOC)
2288 				vp->v_type = VBAD;
2289 			else
2290 				vp->v_type = n2v_type(attr);
2291 			/*
2292 			 * A translation here seems to be necessary
2293 			 * because this function can be called
2294 			 * with `attr' that has come from the wire,
2295 			 * and been operated on by vattr_to_nattr().
2296 			 * See nfsrootvp()->VOP_GETTATTR()->nfsgetattr()
2297 			 * ->nfs_getattr_otw()->rfscall()->vattr_to_nattr()
2298 			 * ->makenfsnode().
2299 			 */
2300 			if ((attr->na_rdev & 0xffff0000) == 0)
2301 				vp->v_rdev = nfsv2_expdev(attr->na_rdev);
2302 			else
2303 				vp->v_rdev = expldev(n2v_rdev(attr));
2304 			nfs_attrcache(vp, attr, t);
2305 			rw_exit(&rtable[index].r_lock);
2306 		}
2307 	} else {
2308 		if (newnode) {
2309 			PURGE_ATTRCACHE(vp);
2310 		}
2311 		rw_exit(&rtable[index].r_lock);
2312 	}
2313 
2314 	return (vp);
2315 }
2316 
2317 /*
2318  * Return a vnode for the given NFS Version 3 file handle.
2319  * If no rnode exists for this fhandle, create one and put it
2320  * into the hash queues.  If the rnode for this fhandle
2321  * already exists, return it.
2322  *
2323  * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2324  */
2325 vnode_t *
2326 makenfs3node_va(nfs_fh3 *fh, vattr_t *vap, struct vfs *vfsp, hrtime_t t,
2327     cred_t *cr, char *dnm, char *nm)
2328 {
2329 	int newnode;
2330 	int index;
2331 	vnode_t *vp;
2332 
2333 	index = rtablehash((nfs_fhandle *)fh);
2334 	rw_enter(&rtable[index].r_lock, RW_READER);
2335 
2336 	vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2337 	    nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2338 	    dnm, nm);
2339 
2340 	if (vap == NULL) {
2341 		if (newnode) {
2342 			PURGE_ATTRCACHE(vp);
2343 		}
2344 		rw_exit(&rtable[index].r_lock);
2345 		return (vp);
2346 	}
2347 
2348 	if (!newnode) {
2349 		rw_exit(&rtable[index].r_lock);
2350 		nfs_attr_cache(vp, vap, t, cr);
2351 	} else {
2352 		rnode_t *rp = VTOR(vp);
2353 
2354 		vp->v_type = vap->va_type;
2355 		vp->v_rdev = vap->va_rdev;
2356 
2357 		mutex_enter(&rp->r_statelock);
2358 		if (rp->r_mtime <= t)
2359 			nfs_attrcache_va(vp, vap);
2360 		mutex_exit(&rp->r_statelock);
2361 		rw_exit(&rtable[index].r_lock);
2362 	}
2363 
2364 	return (vp);
2365 }
2366 
2367 vnode_t *
2368 makenfs3node(nfs_fh3 *fh, fattr3 *attr, struct vfs *vfsp, hrtime_t t,
2369     cred_t *cr, char *dnm, char *nm)
2370 {
2371 	int newnode;
2372 	int index;
2373 	vnode_t *vp;
2374 	vattr_t va;
2375 
2376 	index = rtablehash((nfs_fhandle *)fh);
2377 	rw_enter(&rtable[index].r_lock, RW_READER);
2378 
2379 	vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2380 	    nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2381 	    dnm, nm);
2382 
2383 	if (attr == NULL) {
2384 		if (newnode) {
2385 			PURGE_ATTRCACHE(vp);
2386 		}
2387 		rw_exit(&rtable[index].r_lock);
2388 		return (vp);
2389 	}
2390 
2391 	if (!newnode) {
2392 		rw_exit(&rtable[index].r_lock);
2393 		(void) nfs3_cache_fattr3(vp, attr, &va, t, cr);
2394 	} else {
2395 		if (attr->type < NF3REG || attr->type > NF3FIFO)
2396 			vp->v_type = VBAD;
2397 		else
2398 			vp->v_type = nf3_to_vt[attr->type];
2399 		vp->v_rdev = makedevice(attr->rdev.specdata1,
2400 		    attr->rdev.specdata2);
2401 		nfs3_attrcache(vp, attr, t);
2402 		rw_exit(&rtable[index].r_lock);
2403 	}
2404 
2405 	return (vp);
2406 }
2407 
2408 /*
2409  * Read this comment before making changes to rtablehash()!
2410  * This is a hash function in which seemingly obvious and harmless
2411  * changes can cause escalations costing million dollars!
2412  * Know what you are doing.
2413  *
2414  * rtablehash() implements Jenkins' one-at-a-time hash algorithm.  The
2415  * algorithm is currently detailed here:
2416  *
2417  *   http://burtleburtle.net/bob/hash/doobs.html
2418  *
2419  * Of course, the above link may not be valid by the time you are reading
2420  * this, but suffice it to say that the one-at-a-time algorithm works well in
2421  * almost all cases.  If you are changing the algorithm be sure to verify that
2422  * the hash algorithm still provides even distribution in all cases and with
2423  * any server returning filehandles in whatever order (sequential or random).
2424  */
2425 static int
2426 rtablehash(nfs_fhandle *fh)
2427 {
2428 	ulong_t hash, len, i;
2429 	char *key;
2430 
2431 	key = fh->fh_buf;
2432 	len = (ulong_t)fh->fh_len;
2433 	for (hash = 0, i = 0; i < len; i++) {
2434 		hash += key[i];
2435 		hash += (hash << 10);
2436 		hash ^= (hash >> 6);
2437 	}
2438 	hash += (hash << 3);
2439 	hash ^= (hash >> 11);
2440 	hash += (hash << 15);
2441 	return (hash & rtablemask);
2442 }
2443 
2444 static vnode_t *
2445 make_rnode(nfs_fhandle *fh, rhashq_t *rhtp, struct vfs *vfsp,
2446     struct vnodeops *vops,
2447     int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *),
2448     int (*compar)(const void *, const void *),
2449     int *newnode, cred_t *cr, char *dnm, char *nm)
2450 {
2451 	rnode_t *rp;
2452 	rnode_t *trp;
2453 	vnode_t *vp;
2454 	mntinfo_t *mi;
2455 
2456 	ASSERT(RW_READ_HELD(&rhtp->r_lock));
2457 
2458 	mi = VFTOMI(vfsp);
2459 start:
2460 	if ((rp = rfind(rhtp, fh, vfsp)) != NULL) {
2461 		vp = RTOV(rp);
2462 		nfs_set_vroot(vp);
2463 		*newnode = 0;
2464 		return (vp);
2465 	}
2466 	rw_exit(&rhtp->r_lock);
2467 
2468 	mutex_enter(&rpfreelist_lock);
2469 	if (rpfreelist != NULL && rnew >= nrnode) {
2470 		rp = rpfreelist;
2471 		rp_rmfree(rp);
2472 		mutex_exit(&rpfreelist_lock);
2473 
2474 		vp = RTOV(rp);
2475 
2476 		if (rp->r_flags & RHASHED) {
2477 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2478 			mutex_enter(&vp->v_lock);
2479 			if (vp->v_count > 1) {
2480 				vp->v_count--;
2481 				mutex_exit(&vp->v_lock);
2482 				rw_exit(&rp->r_hashq->r_lock);
2483 				rw_enter(&rhtp->r_lock, RW_READER);
2484 				goto start;
2485 			}
2486 			mutex_exit(&vp->v_lock);
2487 			rp_rmhash_locked(rp);
2488 			rw_exit(&rp->r_hashq->r_lock);
2489 		}
2490 
2491 		rinactive(rp, cr);
2492 
2493 		mutex_enter(&vp->v_lock);
2494 		if (vp->v_count > 1) {
2495 			vp->v_count--;
2496 			mutex_exit(&vp->v_lock);
2497 			rw_enter(&rhtp->r_lock, RW_READER);
2498 			goto start;
2499 		}
2500 		mutex_exit(&vp->v_lock);
2501 		vn_invalid(vp);
2502 		/*
2503 		 * destroy old locks before bzero'ing and
2504 		 * recreating the locks below.
2505 		 */
2506 		nfs_rw_destroy(&rp->r_rwlock);
2507 		nfs_rw_destroy(&rp->r_lkserlock);
2508 		mutex_destroy(&rp->r_statelock);
2509 		cv_destroy(&rp->r_cv);
2510 		cv_destroy(&rp->r_commit.c_cv);
2511 		nfs_free_r_path(rp);
2512 		avl_destroy(&rp->r_dir);
2513 		/*
2514 		 * Make sure that if rnode is recycled then
2515 		 * VFS count is decremented properly before
2516 		 * reuse.
2517 		 */
2518 		VFS_RELE(vp->v_vfsp);
2519 		vn_reinit(vp);
2520 	} else {
2521 		vnode_t *new_vp;
2522 
2523 		mutex_exit(&rpfreelist_lock);
2524 
2525 		rp = kmem_cache_alloc(rnode_cache, KM_SLEEP);
2526 		new_vp = vn_alloc(KM_SLEEP);
2527 
2528 		atomic_add_long((ulong_t *)&rnew, 1);
2529 #ifdef DEBUG
2530 		clstat_debug.nrnode.value.ui64++;
2531 #endif
2532 		vp = new_vp;
2533 	}
2534 
2535 	bzero(rp, sizeof (*rp));
2536 	rp->r_vnode = vp;
2537 	nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
2538 	nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
2539 	mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
2540 	cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
2541 	cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
2542 	rp->r_fh.fh_len = fh->fh_len;
2543 	bcopy(fh->fh_buf, rp->r_fh.fh_buf, fh->fh_len);
2544 	rp->r_server = mi->mi_curr_serv;
2545 	if (FAILOVER_MOUNT(mi)) {
2546 		/*
2547 		 * If replicated servers, stash pathnames
2548 		 */
2549 		if (dnm != NULL && nm != NULL) {
2550 			char *s, *p;
2551 			uint_t len;
2552 
2553 			len = (uint_t)(strlen(dnm) + strlen(nm) + 2);
2554 			rp->r_path = kmem_alloc(len, KM_SLEEP);
2555 #ifdef DEBUG
2556 			clstat_debug.rpath.value.ui64 += len;
2557 #endif
2558 			s = rp->r_path;
2559 			for (p = dnm; *p; p++)
2560 				*s++ = *p;
2561 			*s++ = '/';
2562 			for (p = nm; *p; p++)
2563 				*s++ = *p;
2564 			*s = '\0';
2565 		} else {
2566 			/* special case for root */
2567 			rp->r_path = kmem_alloc(2, KM_SLEEP);
2568 #ifdef DEBUG
2569 			clstat_debug.rpath.value.ui64 += 2;
2570 #endif
2571 			*rp->r_path = '.';
2572 			*(rp->r_path + 1) = '\0';
2573 		}
2574 	}
2575 	VFS_HOLD(vfsp);
2576 	rp->r_putapage = putapage;
2577 	rp->r_hashq = rhtp;
2578 	rp->r_flags = RREADDIRPLUS;
2579 	avl_create(&rp->r_dir, compar, sizeof (rddir_cache),
2580 	    offsetof(rddir_cache, tree));
2581 	vn_setops(vp, vops);
2582 	vp->v_data = (caddr_t)rp;
2583 	vp->v_vfsp = vfsp;
2584 	vp->v_type = VNON;
2585 	nfs_set_vroot(vp);
2586 
2587 	/*
2588 	 * There is a race condition if someone else
2589 	 * alloc's the rnode while no locks are held, so we
2590 	 * check again and recover if found.
2591 	 */
2592 	rw_enter(&rhtp->r_lock, RW_WRITER);
2593 	if ((trp = rfind(rhtp, fh, vfsp)) != NULL) {
2594 		vp = RTOV(trp);
2595 		nfs_set_vroot(vp);
2596 		*newnode = 0;
2597 		rw_exit(&rhtp->r_lock);
2598 		rp_addfree(rp, cr);
2599 		rw_enter(&rhtp->r_lock, RW_READER);
2600 		return (vp);
2601 	}
2602 	rp_addhash(rp);
2603 	*newnode = 1;
2604 	return (vp);
2605 }
2606 
2607 static void
2608 nfs_set_vroot(vnode_t *vp)
2609 {
2610 	rnode_t *rp;
2611 	nfs_fhandle *rootfh;
2612 
2613 	rp = VTOR(vp);
2614 	rootfh = &rp->r_server->sv_fhandle;
2615 	if (rootfh->fh_len == rp->r_fh.fh_len &&
2616 	    bcmp(rootfh->fh_buf, rp->r_fh.fh_buf, rp->r_fh.fh_len) == 0) {
2617 		if (!(vp->v_flag & VROOT)) {
2618 			mutex_enter(&vp->v_lock);
2619 			vp->v_flag |= VROOT;
2620 			mutex_exit(&vp->v_lock);
2621 		}
2622 	}
2623 }
2624 
2625 static void
2626 nfs_free_r_path(rnode_t *rp)
2627 {
2628 	char *path;
2629 	size_t len;
2630 
2631 	path = rp->r_path;
2632 	if (path) {
2633 		rp->r_path = NULL;
2634 		len = strlen(path) + 1;
2635 		kmem_free(path, len);
2636 #ifdef DEBUG
2637 		clstat_debug.rpath.value.ui64 -= len;
2638 #endif
2639 	}
2640 }
2641 
2642 /*
2643  * Put an rnode on the free list.
2644  *
2645  * Rnodes which were allocated above and beyond the normal limit
2646  * are immediately freed.
2647  */
2648 void
2649 rp_addfree(rnode_t *rp, cred_t *cr)
2650 {
2651 	vnode_t *vp;
2652 	struct vfs *vfsp;
2653 
2654 	vp = RTOV(rp);
2655 	ASSERT(vp->v_count >= 1);
2656 	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
2657 
2658 	/*
2659 	 * If we have too many rnodes allocated and there are no
2660 	 * references to this rnode, or if the rnode is no longer
2661 	 * accessible by it does not reside in the hash queues,
2662 	 * or if an i/o error occurred while writing to the file,
2663 	 * then just free it instead of putting it on the rnode
2664 	 * freelist.
2665 	 */
2666 	vfsp = vp->v_vfsp;
2667 	if (((rnew > nrnode || !(rp->r_flags & RHASHED) || rp->r_error ||
2668 	    (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
2669 		if (rp->r_flags & RHASHED) {
2670 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2671 			mutex_enter(&vp->v_lock);
2672 			if (vp->v_count > 1) {
2673 				vp->v_count--;
2674 				mutex_exit(&vp->v_lock);
2675 				rw_exit(&rp->r_hashq->r_lock);
2676 				return;
2677 			}
2678 			mutex_exit(&vp->v_lock);
2679 			rp_rmhash_locked(rp);
2680 			rw_exit(&rp->r_hashq->r_lock);
2681 		}
2682 
2683 		rinactive(rp, cr);
2684 
2685 		/*
2686 		 * Recheck the vnode reference count.  We need to
2687 		 * make sure that another reference has not been
2688 		 * acquired while we were not holding v_lock.  The
2689 		 * rnode is not in the rnode hash queues, so the
2690 		 * only way for a reference to have been acquired
2691 		 * is for a VOP_PUTPAGE because the rnode was marked
2692 		 * with RDIRTY or for a modified page.  This
2693 		 * reference may have been acquired before our call
2694 		 * to rinactive.  The i/o may have been completed,
2695 		 * thus allowing rinactive to complete, but the
2696 		 * reference to the vnode may not have been released
2697 		 * yet.  In any case, the rnode can not be destroyed
2698 		 * until the other references to this vnode have been
2699 		 * released.  The other references will take care of
2700 		 * either destroying the rnode or placing it on the
2701 		 * rnode freelist.  If there are no other references,
2702 		 * then the rnode may be safely destroyed.
2703 		 */
2704 		mutex_enter(&vp->v_lock);
2705 		if (vp->v_count > 1) {
2706 			vp->v_count--;
2707 			mutex_exit(&vp->v_lock);
2708 			return;
2709 		}
2710 		mutex_exit(&vp->v_lock);
2711 
2712 		destroy_rnode(rp);
2713 		return;
2714 	}
2715 
2716 	/*
2717 	 * Lock the hash queue and then recheck the reference count
2718 	 * to ensure that no other threads have acquired a reference
2719 	 * to indicate that the rnode should not be placed on the
2720 	 * freelist.  If another reference has been acquired, then
2721 	 * just release this one and let the other thread complete
2722 	 * the processing of adding this rnode to the freelist.
2723 	 */
2724 	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2725 
2726 	mutex_enter(&vp->v_lock);
2727 	if (vp->v_count > 1) {
2728 		vp->v_count--;
2729 		mutex_exit(&vp->v_lock);
2730 		rw_exit(&rp->r_hashq->r_lock);
2731 		return;
2732 	}
2733 	mutex_exit(&vp->v_lock);
2734 
2735 	/*
2736 	 * If there is no cached data or metadata for this file, then
2737 	 * put the rnode on the front of the freelist so that it will
2738 	 * be reused before other rnodes which may have cached data or
2739 	 * metadata associated with them.
2740 	 */
2741 	mutex_enter(&rpfreelist_lock);
2742 	if (rpfreelist == NULL) {
2743 		rp->r_freef = rp;
2744 		rp->r_freeb = rp;
2745 		rpfreelist = rp;
2746 	} else {
2747 		rp->r_freef = rpfreelist;
2748 		rp->r_freeb = rpfreelist->r_freeb;
2749 		rpfreelist->r_freeb->r_freef = rp;
2750 		rpfreelist->r_freeb = rp;
2751 		if (!vn_has_cached_data(vp) &&
2752 		    !HAVE_RDDIR_CACHE(rp) &&
2753 		    rp->r_symlink.contents == NULL &&
2754 		    rp->r_secattr == NULL &&
2755 		    rp->r_pathconf == NULL)
2756 			rpfreelist = rp;
2757 	}
2758 	mutex_exit(&rpfreelist_lock);
2759 
2760 	rw_exit(&rp->r_hashq->r_lock);
2761 }
2762 
2763 /*
2764  * Remove an rnode from the free list.
2765  *
2766  * The caller must be holding rpfreelist_lock and the rnode
2767  * must be on the freelist.
2768  */
2769 static void
2770 rp_rmfree(rnode_t *rp)
2771 {
2772 
2773 	ASSERT(MUTEX_HELD(&rpfreelist_lock));
2774 	ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);
2775 
2776 	if (rp == rpfreelist) {
2777 		rpfreelist = rp->r_freef;
2778 		if (rp == rpfreelist)
2779 			rpfreelist = NULL;
2780 	}
2781 
2782 	rp->r_freeb->r_freef = rp->r_freef;
2783 	rp->r_freef->r_freeb = rp->r_freeb;
2784 
2785 	rp->r_freef = rp->r_freeb = NULL;
2786 }
2787 
2788 /*
2789  * Put a rnode in the hash table.
2790  *
2791  * The caller must be holding the exclusive hash queue lock.
2792  */
2793 static void
2794 rp_addhash(rnode_t *rp)
2795 {
2796 
2797 	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2798 	ASSERT(!(rp->r_flags & RHASHED));
2799 
2800 	rp->r_hashf = rp->r_hashq->r_hashf;
2801 	rp->r_hashq->r_hashf = rp;
2802 	rp->r_hashb = (rnode_t *)rp->r_hashq;
2803 	rp->r_hashf->r_hashb = rp;
2804 
2805 	mutex_enter(&rp->r_statelock);
2806 	rp->r_flags |= RHASHED;
2807 	mutex_exit(&rp->r_statelock);
2808 }
2809 
2810 /*
2811  * Remove a rnode from the hash table.
2812  *
2813  * The caller must be holding the hash queue lock.
2814  */
2815 static void
2816 rp_rmhash_locked(rnode_t *rp)
2817 {
2818 
2819 	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2820 	ASSERT(rp->r_flags & RHASHED);
2821 
2822 	rp->r_hashb->r_hashf = rp->r_hashf;
2823 	rp->r_hashf->r_hashb = rp->r_hashb;
2824 
2825 	mutex_enter(&rp->r_statelock);
2826 	rp->r_flags &= ~RHASHED;
2827 	mutex_exit(&rp->r_statelock);
2828 }
2829 
2830 /*
2831  * Remove a rnode from the hash table.
2832  *
2833  * The caller must not be holding the hash queue lock.
2834  */
2835 void
2836 rp_rmhash(rnode_t *rp)
2837 {
2838 
2839 	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2840 	rp_rmhash_locked(rp);
2841 	rw_exit(&rp->r_hashq->r_lock);
2842 }
2843 
2844 /*
2845  * Lookup a rnode by fhandle.
2846  *
2847  * The caller must be holding the hash queue lock, either shared or exclusive.
2848  */
2849 static rnode_t *
2850 rfind(rhashq_t *rhtp, nfs_fhandle *fh, struct vfs *vfsp)
2851 {
2852 	rnode_t *rp;
2853 	vnode_t *vp;
2854 
2855 	ASSERT(RW_LOCK_HELD(&rhtp->r_lock));
2856 
2857 	for (rp = rhtp->r_hashf; rp != (rnode_t *)rhtp; rp = rp->r_hashf) {
2858 		vp = RTOV(rp);
2859 		if (vp->v_vfsp == vfsp &&
2860 		    rp->r_fh.fh_len == fh->fh_len &&
2861 		    bcmp(rp->r_fh.fh_buf, fh->fh_buf, fh->fh_len) == 0) {
2862 			/*
2863 			 * remove rnode from free list, if necessary.
2864 			 */
2865 			if (rp->r_freef != NULL) {
2866 				mutex_enter(&rpfreelist_lock);
2867 				/*
2868 				 * If the rnode is on the freelist,
2869 				 * then remove it and use that reference
2870 				 * as the new reference.  Otherwise,
2871 				 * need to increment the reference count.
2872 				 */
2873 				if (rp->r_freef != NULL) {
2874 					rp_rmfree(rp);
2875 					mutex_exit(&rpfreelist_lock);
2876 				} else {
2877 					mutex_exit(&rpfreelist_lock);
2878 					VN_HOLD(vp);
2879 				}
2880 			} else
2881 				VN_HOLD(vp);
2882 			return (rp);
2883 		}
2884 	}
2885 	return (NULL);
2886 }
2887 
2888 /*
2889  * Return 1 if there is a active vnode belonging to this vfs in the
2890  * rtable cache.
2891  *
2892  * Several of these checks are done without holding the usual
2893  * locks.  This is safe because destroy_rtable(), rp_addfree(),
2894  * etc. will redo the necessary checks before actually destroying
2895  * any rnodes.
2896  */
2897 int
2898 check_rtable(struct vfs *vfsp)
2899 {
2900 	int index;
2901 	rnode_t *rp;
2902 	vnode_t *vp;
2903 
2904 	for (index = 0; index < rtablesize; index++) {
2905 		rw_enter(&rtable[index].r_lock, RW_READER);
2906 		for (rp = rtable[index].r_hashf;
2907 		    rp != (rnode_t *)(&rtable[index]);
2908 		    rp = rp->r_hashf) {
2909 			vp = RTOV(rp);
2910 			if (vp->v_vfsp == vfsp) {
2911 				if (rp->r_freef == NULL ||
2912 				    (vn_has_cached_data(vp) &&
2913 				    (rp->r_flags & RDIRTY)) ||
2914 				    rp->r_count > 0) {
2915 					rw_exit(&rtable[index].r_lock);
2916 					return (1);
2917 				}
2918 			}
2919 		}
2920 		rw_exit(&rtable[index].r_lock);
2921 	}
2922 	return (0);
2923 }
2924 
2925 /*
2926  * Destroy inactive vnodes from the hash queues which belong to this
2927  * vfs.  It is essential that we destroy all inactive vnodes during a
2928  * forced unmount as well as during a normal unmount.
2929  */
2930 void
2931 destroy_rtable(struct vfs *vfsp, cred_t *cr)
2932 {
2933 	int index;
2934 	rnode_t *rp;
2935 	rnode_t *rlist;
2936 	rnode_t *r_hashf;
2937 	vnode_t *vp;
2938 
2939 	rlist = NULL;
2940 
2941 	for (index = 0; index < rtablesize; index++) {
2942 		rw_enter(&rtable[index].r_lock, RW_WRITER);
2943 		for (rp = rtable[index].r_hashf;
2944 		    rp != (rnode_t *)(&rtable[index]);
2945 		    rp = r_hashf) {
2946 			/* save the hash pointer before destroying */
2947 			r_hashf = rp->r_hashf;
2948 			vp = RTOV(rp);
2949 			if (vp->v_vfsp == vfsp) {
2950 				mutex_enter(&rpfreelist_lock);
2951 				if (rp->r_freef != NULL) {
2952 					rp_rmfree(rp);
2953 					mutex_exit(&rpfreelist_lock);
2954 					rp_rmhash_locked(rp);
2955 					rp->r_hashf = rlist;
2956 					rlist = rp;
2957 				} else
2958 					mutex_exit(&rpfreelist_lock);
2959 			}
2960 		}
2961 		rw_exit(&rtable[index].r_lock);
2962 	}
2963 
2964 	for (rp = rlist; rp != NULL; rp = rlist) {
2965 		rlist = rp->r_hashf;
2966 		/*
2967 		 * This call to rp_addfree will end up destroying the
2968 		 * rnode, but in a safe way with the appropriate set
2969 		 * of checks done.
2970 		 */
2971 		rp_addfree(rp, cr);
2972 	}
2973 
2974 }
2975 
2976 /*
2977  * This routine destroys all the resources associated with the rnode
2978  * and then the rnode itself.
2979  */
2980 static void
2981 destroy_rnode(rnode_t *rp)
2982 {
2983 	vnode_t *vp;
2984 	vfs_t *vfsp;
2985 
2986 	vp = RTOV(rp);
2987 	vfsp = vp->v_vfsp;
2988 
2989 	ASSERT(vp->v_count == 1);
2990 	ASSERT(rp->r_count == 0);
2991 	ASSERT(rp->r_lmpl == NULL);
2992 	ASSERT(rp->r_mapcnt == 0);
2993 	ASSERT(!(rp->r_flags & RHASHED));
2994 	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
2995 	atomic_add_long((ulong_t *)&rnew, -1);
2996 #ifdef DEBUG
2997 	clstat_debug.nrnode.value.ui64--;
2998 #endif
2999 	nfs_rw_destroy(&rp->r_rwlock);
3000 	nfs_rw_destroy(&rp->r_lkserlock);
3001 	mutex_destroy(&rp->r_statelock);
3002 	cv_destroy(&rp->r_cv);
3003 	cv_destroy(&rp->r_commit.c_cv);
3004 	if (rp->r_flags & RDELMAPLIST)
3005 		list_destroy(&rp->r_indelmap);
3006 	nfs_free_r_path(rp);
3007 	avl_destroy(&rp->r_dir);
3008 	vn_invalid(vp);
3009 	vn_free(vp);
3010 	kmem_cache_free(rnode_cache, rp);
3011 	VFS_RELE(vfsp);
3012 }
3013 
3014 /*
3015  * Flush all vnodes in this (or every) vfs.
3016  * Used by nfs_sync and by nfs_unmount.
3017  */
3018 void
3019 rflush(struct vfs *vfsp, cred_t *cr)
3020 {
3021 	int index;
3022 	rnode_t *rp;
3023 	vnode_t *vp, **vplist;
3024 	long num, cnt;
3025 
3026 	/*
3027 	 * Check to see whether there is anything to do.
3028 	 */
3029 	num = rnew;
3030 	if (num == 0)
3031 		return;
3032 
3033 	/*
3034 	 * Allocate a slot for all currently active rnodes on the
3035 	 * supposition that they all may need flushing.
3036 	 */
3037 	vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
3038 	cnt = 0;
3039 
3040 	/*
3041 	 * Walk the hash queues looking for rnodes with page
3042 	 * lists associated with them.  Make a list of these
3043 	 * files.
3044 	 */
3045 	for (index = 0; index < rtablesize; index++) {
3046 		rw_enter(&rtable[index].r_lock, RW_READER);
3047 		for (rp = rtable[index].r_hashf;
3048 		    rp != (rnode_t *)(&rtable[index]);
3049 		    rp = rp->r_hashf) {
3050 			vp = RTOV(rp);
3051 			/*
3052 			 * Don't bother sync'ing a vp if it
3053 			 * is part of virtual swap device or
3054 			 * if VFS is read-only
3055 			 */
3056 			if (IS_SWAPVP(vp) || vn_is_readonly(vp))
3057 				continue;
3058 			/*
3059 			 * If flushing all mounted file systems or
3060 			 * the vnode belongs to this vfs, has pages
3061 			 * and is marked as either dirty or mmap'd,
3062 			 * hold and add this vnode to the list of
3063 			 * vnodes to flush.
3064 			 */
3065 			if ((vfsp == NULL || vp->v_vfsp == vfsp) &&
3066 			    vn_has_cached_data(vp) &&
3067 			    ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) {
3068 				VN_HOLD(vp);
3069 				vplist[cnt++] = vp;
3070 				if (cnt == num) {
3071 					rw_exit(&rtable[index].r_lock);
3072 					goto toomany;
3073 				}
3074 			}
3075 		}
3076 		rw_exit(&rtable[index].r_lock);
3077 	}
3078 toomany:
3079 
3080 	/*
3081 	 * Flush and release all of the files on the list.
3082 	 */
3083 	while (cnt-- > 0) {
3084 		vp = vplist[cnt];
3085 		(void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL);
3086 		VN_RELE(vp);
3087 	}
3088 
3089 	/*
3090 	 * Free the space allocated to hold the list.
3091 	 */
3092 	kmem_free(vplist, num * sizeof (*vplist));
3093 }
3094 
3095 /*
3096  * This probably needs to be larger than or equal to
3097  * log2(sizeof (struct rnode)) due to the way that rnodes are
3098  * allocated.
3099  */
3100 #define	ACACHE_SHIFT_BITS	9
3101 
3102 static int
3103 acachehash(rnode_t *rp, cred_t *cr)
3104 {
3105 
3106 	return ((((intptr_t)rp >> ACACHE_SHIFT_BITS) + crgetuid(cr)) &
3107 	    acachemask);
3108 }
3109 
3110 #ifdef DEBUG
3111 static long nfs_access_cache_hits = 0;
3112 static long nfs_access_cache_misses = 0;
3113 #endif
3114 
3115 nfs_access_type_t
3116 nfs_access_check(rnode_t *rp, uint32_t acc, cred_t *cr)
3117 {
3118 	vnode_t *vp;
3119 	acache_t *ap;
3120 	acache_hash_t *hp;
3121 	nfs_access_type_t all;
3122 
3123 	vp = RTOV(rp);
3124 	if (!ATTRCACHE_VALID(vp) || nfs_waitfor_purge_complete(vp))
3125 		return (NFS_ACCESS_UNKNOWN);
3126 
3127 	if (rp->r_acache != NULL) {
3128 		hp = &acache[acachehash(rp, cr)];
3129 		rw_enter(&hp->lock, RW_READER);
3130 		ap = hp->next;
3131 		while (ap != (acache_t *)hp) {
3132 			if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3133 				if ((ap->known & acc) == acc) {
3134 #ifdef DEBUG
3135 					nfs_access_cache_hits++;
3136 #endif
3137 					if ((ap->allowed & acc) == acc)
3138 						all = NFS_ACCESS_ALLOWED;
3139 					else
3140 						all = NFS_ACCESS_DENIED;
3141 				} else {
3142 #ifdef DEBUG
3143 					nfs_access_cache_misses++;
3144 #endif
3145 					all = NFS_ACCESS_UNKNOWN;
3146 				}
3147 				rw_exit(&hp->lock);
3148 				return (all);
3149 			}
3150 			ap = ap->next;
3151 		}
3152 		rw_exit(&hp->lock);
3153 	}
3154 
3155 #ifdef DEBUG
3156 	nfs_access_cache_misses++;
3157 #endif
3158 	return (NFS_ACCESS_UNKNOWN);
3159 }
3160 
3161 void
3162 nfs_access_cache(rnode_t *rp, uint32_t acc, uint32_t resacc, cred_t *cr)
3163 {
3164 	acache_t *ap;
3165 	acache_t *nap;
3166 	acache_hash_t *hp;
3167 
3168 	hp = &acache[acachehash(rp, cr)];
3169 
3170 	/*
3171 	 * Allocate now assuming that mostly an allocation will be
3172 	 * required.  This allows the allocation to happen without
3173 	 * holding the hash bucket locked.
3174 	 */
3175 	nap = kmem_cache_alloc(acache_cache, KM_NOSLEEP);
3176 	if (nap != NULL) {
3177 		nap->known = acc;
3178 		nap->allowed = resacc;
3179 		nap->rnode = rp;
3180 		crhold(cr);
3181 		nap->cred = cr;
3182 		nap->hashq = hp;
3183 	}
3184 
3185 	rw_enter(&hp->lock, RW_WRITER);
3186 
3187 	if (rp->r_acache != NULL) {
3188 		ap = hp->next;
3189 		while (ap != (acache_t *)hp) {
3190 			if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3191 				ap->known |= acc;
3192 				ap->allowed &= ~acc;
3193 				ap->allowed |= resacc;
3194 				rw_exit(&hp->lock);
3195 				if (nap != NULL) {
3196 					crfree(nap->cred);
3197 					kmem_cache_free(acache_cache, nap);
3198 				}
3199 				return;
3200 			}
3201 			ap = ap->next;
3202 		}
3203 	}
3204 
3205 	if (nap != NULL) {
3206 #ifdef DEBUG
3207 		clstat_debug.access.value.ui64++;
3208 #endif
3209 		nap->next = hp->next;
3210 		hp->next = nap;
3211 		nap->next->prev = nap;
3212 		nap->prev = (acache_t *)hp;
3213 
3214 		mutex_enter(&rp->r_statelock);
3215 		nap->list = rp->r_acache;
3216 		rp->r_acache = nap;
3217 		mutex_exit(&rp->r_statelock);
3218 	}
3219 
3220 	rw_exit(&hp->lock);
3221 }
3222 
3223 int
3224 nfs_access_purge_rp(rnode_t *rp)
3225 {
3226 	acache_t *ap;
3227 	acache_t *tmpap;
3228 	acache_t *rplist;
3229 
3230 	/*
3231 	 * If there aren't any cached entries, then there is nothing
3232 	 * to free.
3233 	 */
3234 	if (rp->r_acache == NULL)
3235 		return (0);
3236 
3237 	mutex_enter(&rp->r_statelock);
3238 	rplist = rp->r_acache;
3239 	rp->r_acache = NULL;
3240 	mutex_exit(&rp->r_statelock);
3241 
3242 	/*
3243 	 * Loop through each entry in the list pointed to in the
3244 	 * rnode.  Remove each of these entries from the hash
3245 	 * queue that it is on and remove it from the list in
3246 	 * the rnode.
3247 	 */
3248 	for (ap = rplist; ap != NULL; ap = tmpap) {
3249 		rw_enter(&ap->hashq->lock, RW_WRITER);
3250 		ap->prev->next = ap->next;
3251 		ap->next->prev = ap->prev;
3252 		rw_exit(&ap->hashq->lock);
3253 
3254 		tmpap = ap->list;
3255 		crfree(ap->cred);
3256 		kmem_cache_free(acache_cache, ap);
3257 #ifdef DEBUG
3258 		clstat_debug.access.value.ui64--;
3259 #endif
3260 	}
3261 
3262 	return (1);
3263 }
3264 
3265 static const char prefix[] = ".nfs";
3266 
3267 static kmutex_t newnum_lock;
3268 
3269 int
3270 newnum(void)
3271 {
3272 	static uint_t newnum = 0;
3273 	uint_t id;
3274 
3275 	mutex_enter(&newnum_lock);
3276 	if (newnum == 0)
3277 		newnum = gethrestime_sec() & 0xffff;
3278 	id = newnum++;
3279 	mutex_exit(&newnum_lock);
3280 	return (id);
3281 }
3282 
3283 char *
3284 newname(void)
3285 {
3286 	char *news;
3287 	char *s;
3288 	const char *p;
3289 	uint_t id;
3290 
3291 	id = newnum();
3292 	news = kmem_alloc(MAXNAMELEN, KM_SLEEP);
3293 	s = news;
3294 	p = prefix;
3295 	while (*p != '\0')
3296 		*s++ = *p++;
3297 	while (id != 0) {
3298 		*s++ = "0123456789ABCDEF"[id & 0x0f];
3299 		id >>= 4;
3300 	}
3301 	*s = '\0';
3302 	return (news);
3303 }
3304 
3305 int
3306 nfs_atoi(char *cp)
3307 {
3308 	int n;
3309 
3310 	n = 0;
3311 	while (*cp != '\0') {
3312 		n = n * 10 + (*cp - '0');
3313 		cp++;
3314 	}
3315 
3316 	return (n);
3317 }
3318 
3319 /*
3320  * Snapshot callback for nfs:0:nfs_client as registered with the kstat
3321  * framework.
3322  */
3323 static int
3324 cl_snapshot(kstat_t *ksp, void *buf, int rw)
3325 {
3326 	ksp->ks_snaptime = gethrtime();
3327 	if (rw == KSTAT_WRITE) {
3328 		bcopy(buf, ksp->ks_private, sizeof (clstat_tmpl));
3329 #ifdef DEBUG
3330 		/*
3331 		 * Currently only the global zone can write to kstats, but we
3332 		 * add the check just for paranoia.
3333 		 */
3334 		if (INGLOBALZONE(curproc))
3335 			bcopy((char *)buf + sizeof (clstat_tmpl), &clstat_debug,
3336 			    sizeof (clstat_debug));
3337 #endif
3338 	} else {
3339 		bcopy(ksp->ks_private, buf, sizeof (clstat_tmpl));
3340 #ifdef DEBUG
3341 		/*
3342 		 * If we're displaying the "global" debug kstat values, we
3343 		 * display them as-is to all zones since in fact they apply to
3344 		 * the system as a whole.
3345 		 */
3346 		bcopy(&clstat_debug, (char *)buf + sizeof (clstat_tmpl),
3347 		    sizeof (clstat_debug));
3348 #endif
3349 	}
3350 	return (0);
3351 }
3352 
3353 static void *
3354 clinit_zone(zoneid_t zoneid)
3355 {
3356 	kstat_t *nfs_client_kstat;
3357 	struct nfs_clnt *nfscl;
3358 	uint_t ndata;
3359 
3360 	nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP);
3361 	mutex_init(&nfscl->nfscl_chtable_lock, NULL, MUTEX_DEFAULT, NULL);
3362 	nfscl->nfscl_chtable = NULL;
3363 	nfscl->nfscl_zoneid = zoneid;
3364 
3365 	bcopy(&clstat_tmpl, &nfscl->nfscl_stat, sizeof (clstat_tmpl));
3366 	ndata = sizeof (clstat_tmpl) / sizeof (kstat_named_t);
3367 #ifdef DEBUG
3368 	ndata += sizeof (clstat_debug) / sizeof (kstat_named_t);
3369 #endif
3370 	if ((nfs_client_kstat = kstat_create_zone("nfs", 0, "nfs_client",
3371 	    "misc", KSTAT_TYPE_NAMED, ndata,
3372 	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) {
3373 		nfs_client_kstat->ks_private = &nfscl->nfscl_stat;
3374 		nfs_client_kstat->ks_snapshot = cl_snapshot;
3375 		kstat_install(nfs_client_kstat);
3376 	}
3377 	mutex_enter(&nfs_clnt_list_lock);
3378 	list_insert_head(&nfs_clnt_list, nfscl);
3379 	mutex_exit(&nfs_clnt_list_lock);
3380 	return (nfscl);
3381 }
3382 
3383 /*ARGSUSED*/
3384 static void
3385 clfini_zone(zoneid_t zoneid, void *arg)
3386 {
3387 	struct nfs_clnt *nfscl = arg;
3388 	chhead_t *chp, *next;
3389 
3390 	if (nfscl == NULL)
3391 		return;
3392 	mutex_enter(&nfs_clnt_list_lock);
3393 	list_remove(&nfs_clnt_list, nfscl);
3394 	mutex_exit(&nfs_clnt_list_lock);
3395 	clreclaim_zone(nfscl, 0);
3396 	for (chp = nfscl->nfscl_chtable; chp != NULL; chp = next) {
3397 		ASSERT(chp->ch_list == NULL);
3398 		kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1);
3399 		next = chp->ch_next;
3400 		kmem_free(chp, sizeof (*chp));
3401 	}
3402 	kstat_delete_byname_zone("nfs", 0, "nfs_client", zoneid);
3403 	mutex_destroy(&nfscl->nfscl_chtable_lock);
3404 	kmem_free(nfscl, sizeof (*nfscl));
3405 }
3406 
3407 /*
3408  * Called by endpnt_destructor to make sure the client handles are
3409  * cleaned up before the RPC endpoints.  This becomes a no-op if
3410  * clfini_zone (above) is called first.  This function is needed
3411  * (rather than relying on clfini_zone to clean up) because the ZSD
3412  * callbacks have no ordering mechanism, so we have no way to ensure
3413  * that clfini_zone is called before endpnt_destructor.
3414  */
3415 void
3416 clcleanup_zone(zoneid_t zoneid)
3417 {
3418 	struct nfs_clnt *nfscl;
3419 
3420 	mutex_enter(&nfs_clnt_list_lock);
3421 	nfscl = list_head(&nfs_clnt_list);
3422 	for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) {
3423 		if (nfscl->nfscl_zoneid == zoneid) {
3424 			clreclaim_zone(nfscl, 0);
3425 			break;
3426 		}
3427 	}
3428 	mutex_exit(&nfs_clnt_list_lock);
3429 }
3430 
3431 int
3432 nfs_subrinit(void)
3433 {
3434 	int i;
3435 	ulong_t nrnode_max;
3436 
3437 	/*
3438 	 * Allocate and initialize the rnode hash queues
3439 	 */
3440 	if (nrnode <= 0)
3441 		nrnode = ncsize;
3442 	nrnode_max = (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode));
3443 	if (nrnode > nrnode_max || (nrnode == 0 && ncsize == 0)) {
3444 		zcmn_err(GLOBAL_ZONEID, CE_NOTE,
3445 		    "setting nrnode to max value of %ld", nrnode_max);
3446 		nrnode = nrnode_max;
3447 	}
3448 
3449 	rtablesize = 1 << highbit(nrnode / hashlen);
3450 	rtablemask = rtablesize - 1;
3451 	rtable = kmem_alloc(rtablesize * sizeof (*rtable), KM_SLEEP);
3452 	for (i = 0; i < rtablesize; i++) {
3453 		rtable[i].r_hashf = (rnode_t *)(&rtable[i]);
3454 		rtable[i].r_hashb = (rnode_t *)(&rtable[i]);
3455 		rw_init(&rtable[i].r_lock, NULL, RW_DEFAULT, NULL);
3456 	}
3457 	rnode_cache = kmem_cache_create("rnode_cache", sizeof (rnode_t),
3458 	    0, NULL, NULL, nfs_reclaim, NULL, NULL, 0);
3459 
3460 	/*
3461 	 * Allocate and initialize the access cache
3462 	 */
3463 
3464 	/*
3465 	 * Initial guess is one access cache entry per rnode unless
3466 	 * nacache is set to a non-zero value and then it is used to
3467 	 * indicate a guess at the number of access cache entries.
3468 	 */
3469 	if (nacache > 0)
3470 		acachesize = 1 << highbit(nacache / hashlen);
3471 	else
3472 		acachesize = rtablesize;
3473 	acachemask = acachesize - 1;
3474 	acache = kmem_alloc(acachesize * sizeof (*acache), KM_SLEEP);
3475 	for (i = 0; i < acachesize; i++) {
3476 		acache[i].next = (acache_t *)&acache[i];
3477 		acache[i].prev = (acache_t *)&acache[i];
3478 		rw_init(&acache[i].lock, NULL, RW_DEFAULT, NULL);
3479 	}
3480 	acache_cache = kmem_cache_create("nfs_access_cache",
3481 	    sizeof (acache_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
3482 	/*
3483 	 * Allocate and initialize the client handle cache
3484 	 */
3485 	chtab_cache = kmem_cache_create("client_handle_cache",
3486 	    sizeof (struct chtab), 0, NULL, NULL, clreclaim, NULL, NULL, 0);
3487 	/*
3488 	 * Initialize the list of per-zone client handles (and associated data).
3489 	 * This needs to be done before we call zone_key_create().
3490 	 */
3491 	list_create(&nfs_clnt_list, sizeof (struct nfs_clnt),
3492 	    offsetof(struct nfs_clnt, nfscl_node));
3493 	/*
3494 	 * Initialize the zone_key for per-zone client handle lists.
3495 	 */
3496 	zone_key_create(&nfsclnt_zone_key, clinit_zone, NULL, clfini_zone);
3497 	/*
3498 	 * Initialize the various mutexes and reader/writer locks
3499 	 */
3500 	mutex_init(&rpfreelist_lock, NULL, MUTEX_DEFAULT, NULL);
3501 	mutex_init(&newnum_lock, NULL, MUTEX_DEFAULT, NULL);
3502 	mutex_init(&nfs_minor_lock, NULL, MUTEX_DEFAULT, NULL);
3503 
3504 	/*
3505 	 * Assign unique major number for all nfs mounts
3506 	 */
3507 	if ((nfs_major = getudev()) == -1) {
3508 		zcmn_err(GLOBAL_ZONEID, CE_WARN,
3509 		    "nfs: init: can't get unique device number");
3510 		nfs_major = 0;
3511 	}
3512 	nfs_minor = 0;
3513 
3514 	if (nfs3_jukebox_delay == 0)
3515 		nfs3_jukebox_delay = NFS3_JUKEBOX_DELAY;
3516 
3517 	return (0);
3518 }
3519 
3520 void
3521 nfs_subrfini(void)
3522 {
3523 	int i;
3524 
3525 	/*
3526 	 * Deallocate the rnode hash queues
3527 	 */
3528 	kmem_cache_destroy(rnode_cache);
3529 
3530 	for (i = 0; i < rtablesize; i++)
3531 		rw_destroy(&rtable[i].r_lock);
3532 	kmem_free(rtable, rtablesize * sizeof (*rtable));
3533 
3534 	/*
3535 	 * Deallocated the access cache
3536 	 */
3537 	kmem_cache_destroy(acache_cache);
3538 
3539 	for (i = 0; i < acachesize; i++)
3540 		rw_destroy(&acache[i].lock);
3541 	kmem_free(acache, acachesize * sizeof (*acache));
3542 
3543 	/*
3544 	 * Deallocate the client handle cache
3545 	 */
3546 	kmem_cache_destroy(chtab_cache);
3547 
3548 	/*
3549 	 * Destroy the various mutexes and reader/writer locks
3550 	 */
3551 	mutex_destroy(&rpfreelist_lock);
3552 	mutex_destroy(&newnum_lock);
3553 	mutex_destroy(&nfs_minor_lock);
3554 	(void) zone_key_delete(nfsclnt_zone_key);
3555 }
3556 
3557 enum nfsstat
3558 puterrno(int error)
3559 {
3560 
3561 	switch (error) {
3562 	case EOPNOTSUPP:
3563 		return (NFSERR_OPNOTSUPP);
3564 	case ENAMETOOLONG:
3565 		return (NFSERR_NAMETOOLONG);
3566 	case ENOTEMPTY:
3567 		return (NFSERR_NOTEMPTY);
3568 	case EDQUOT:
3569 		return (NFSERR_DQUOT);
3570 	case ESTALE:
3571 		return (NFSERR_STALE);
3572 	case EREMOTE:
3573 		return (NFSERR_REMOTE);
3574 	case ENOSYS:
3575 		return (NFSERR_OPNOTSUPP);
3576 	case EOVERFLOW:
3577 		return (NFSERR_INVAL);
3578 	default:
3579 		return ((enum nfsstat)error);
3580 	}
3581 	/* NOTREACHED */
3582 }
3583 
3584 int
3585 geterrno(enum nfsstat status)
3586 {
3587 
3588 	switch (status) {
3589 	case NFSERR_OPNOTSUPP:
3590 		return (EOPNOTSUPP);
3591 	case NFSERR_NAMETOOLONG:
3592 		return (ENAMETOOLONG);
3593 	case NFSERR_NOTEMPTY:
3594 		return (ENOTEMPTY);
3595 	case NFSERR_DQUOT:
3596 		return (EDQUOT);
3597 	case NFSERR_STALE:
3598 		return (ESTALE);
3599 	case NFSERR_REMOTE:
3600 		return (EREMOTE);
3601 	case NFSERR_WFLUSH:
3602 		return (EIO);
3603 	default:
3604 		return ((int)status);
3605 	}
3606 	/* NOTREACHED */
3607 }
3608 
3609 enum nfsstat3
3610 puterrno3(int error)
3611 {
3612 
3613 #ifdef DEBUG
3614 	switch (error) {
3615 	case 0:
3616 		return (NFS3_OK);
3617 	case EPERM:
3618 		return (NFS3ERR_PERM);
3619 	case ENOENT:
3620 		return (NFS3ERR_NOENT);
3621 	case EIO:
3622 		return (NFS3ERR_IO);
3623 	case ENXIO:
3624 		return (NFS3ERR_NXIO);
3625 	case EACCES:
3626 		return (NFS3ERR_ACCES);
3627 	case EEXIST:
3628 		return (NFS3ERR_EXIST);
3629 	case EXDEV:
3630 		return (NFS3ERR_XDEV);
3631 	case ENODEV:
3632 		return (NFS3ERR_NODEV);
3633 	case ENOTDIR:
3634 		return (NFS3ERR_NOTDIR);
3635 	case EISDIR:
3636 		return (NFS3ERR_ISDIR);
3637 	case EINVAL:
3638 		return (NFS3ERR_INVAL);
3639 	case EFBIG:
3640 		return (NFS3ERR_FBIG);
3641 	case ENOSPC:
3642 		return (NFS3ERR_NOSPC);
3643 	case EROFS:
3644 		return (NFS3ERR_ROFS);
3645 	case EMLINK:
3646 		return (NFS3ERR_MLINK);
3647 	case ENAMETOOLONG:
3648 		return (NFS3ERR_NAMETOOLONG);
3649 	case ENOTEMPTY:
3650 		return (NFS3ERR_NOTEMPTY);
3651 	case EDQUOT:
3652 		return (NFS3ERR_DQUOT);
3653 	case ESTALE:
3654 		return (NFS3ERR_STALE);
3655 	case EREMOTE:
3656 		return (NFS3ERR_REMOTE);
3657 	case ENOSYS:
3658 	case EOPNOTSUPP:
3659 		return (NFS3ERR_NOTSUPP);
3660 	case EOVERFLOW:
3661 		return (NFS3ERR_INVAL);
3662 	default:
3663 		zcmn_err(getzoneid(), CE_WARN,
3664 		    "puterrno3: got error %d", error);
3665 		return ((enum nfsstat3)error);
3666 	}
3667 #else
3668 	switch (error) {
3669 	case ENAMETOOLONG:
3670 		return (NFS3ERR_NAMETOOLONG);
3671 	case ENOTEMPTY:
3672 		return (NFS3ERR_NOTEMPTY);
3673 	case EDQUOT:
3674 		return (NFS3ERR_DQUOT);
3675 	case ESTALE:
3676 		return (NFS3ERR_STALE);
3677 	case ENOSYS:
3678 	case EOPNOTSUPP:
3679 		return (NFS3ERR_NOTSUPP);
3680 	case EREMOTE:
3681 		return (NFS3ERR_REMOTE);
3682 	case EOVERFLOW:
3683 		return (NFS3ERR_INVAL);
3684 	default:
3685 		return ((enum nfsstat3)error);
3686 	}
3687 #endif
3688 }
3689 
3690 int
3691 geterrno3(enum nfsstat3 status)
3692 {
3693 
3694 #ifdef DEBUG
3695 	switch (status) {
3696 	case NFS3_OK:
3697 		return (0);
3698 	case NFS3ERR_PERM:
3699 		return (EPERM);
3700 	case NFS3ERR_NOENT:
3701 		return (ENOENT);
3702 	case NFS3ERR_IO:
3703 		return (EIO);
3704 	case NFS3ERR_NXIO:
3705 		return (ENXIO);
3706 	case NFS3ERR_ACCES:
3707 		return (EACCES);
3708 	case NFS3ERR_EXIST:
3709 		return (EEXIST);
3710 	case NFS3ERR_XDEV:
3711 		return (EXDEV);
3712 	case NFS3ERR_NODEV:
3713 		return (ENODEV);
3714 	case NFS3ERR_NOTDIR:
3715 		return (ENOTDIR);
3716 	case NFS3ERR_ISDIR:
3717 		return (EISDIR);
3718 	case NFS3ERR_INVAL:
3719 		return (EINVAL);
3720 	case NFS3ERR_FBIG:
3721 		return (EFBIG);
3722 	case NFS3ERR_NOSPC:
3723 		return (ENOSPC);
3724 	case NFS3ERR_ROFS:
3725 		return (EROFS);
3726 	case NFS3ERR_MLINK:
3727 		return (EMLINK);
3728 	case NFS3ERR_NAMETOOLONG:
3729 		return (ENAMETOOLONG);
3730 	case NFS3ERR_NOTEMPTY:
3731 		return (ENOTEMPTY);
3732 	case NFS3ERR_DQUOT:
3733 		return (EDQUOT);
3734 	case NFS3ERR_STALE:
3735 		return (ESTALE);
3736 	case NFS3ERR_REMOTE:
3737 		return (EREMOTE);
3738 	case NFS3ERR_BADHANDLE:
3739 		return (ESTALE);
3740 	case NFS3ERR_NOT_SYNC:
3741 		return (EINVAL);
3742 	case NFS3ERR_BAD_COOKIE:
3743 		return (ENOENT);
3744 	case NFS3ERR_NOTSUPP:
3745 		return (EOPNOTSUPP);
3746 	case NFS3ERR_TOOSMALL:
3747 		return (EINVAL);
3748 	case NFS3ERR_SERVERFAULT:
3749 		return (EIO);
3750 	case NFS3ERR_BADTYPE:
3751 		return (EINVAL);
3752 	case NFS3ERR_JUKEBOX:
3753 		return (ENXIO);
3754 	default:
3755 		zcmn_err(getzoneid(), CE_WARN,
3756 		    "geterrno3: got status %d", status);
3757 		return ((int)status);
3758 	}
3759 #else
3760 	switch (status) {
3761 	case NFS3ERR_NAMETOOLONG:
3762 		return (ENAMETOOLONG);
3763 	case NFS3ERR_NOTEMPTY:
3764 		return (ENOTEMPTY);
3765 	case NFS3ERR_DQUOT:
3766 		return (EDQUOT);
3767 	case NFS3ERR_STALE:
3768 	case NFS3ERR_BADHANDLE:
3769 		return (ESTALE);
3770 	case NFS3ERR_NOTSUPP:
3771 		return (EOPNOTSUPP);
3772 	case NFS3ERR_REMOTE:
3773 		return (EREMOTE);
3774 	case NFS3ERR_NOT_SYNC:
3775 	case NFS3ERR_TOOSMALL:
3776 	case NFS3ERR_BADTYPE:
3777 		return (EINVAL);
3778 	case NFS3ERR_BAD_COOKIE:
3779 		return (ENOENT);
3780 	case NFS3ERR_SERVERFAULT:
3781 		return (EIO);
3782 	case NFS3ERR_JUKEBOX:
3783 		return (ENXIO);
3784 	default:
3785 		return ((int)status);
3786 	}
3787 #endif
3788 }
3789 
3790 rddir_cache *
3791 rddir_cache_alloc(int flags)
3792 {
3793 	rddir_cache *rc;
3794 
3795 	rc = kmem_alloc(sizeof (*rc), flags);
3796 	if (rc != NULL) {
3797 		rc->entries = NULL;
3798 		rc->flags = RDDIR;
3799 		cv_init(&rc->cv, NULL, CV_DEFAULT, NULL);
3800 		mutex_init(&rc->lock, NULL, MUTEX_DEFAULT, NULL);
3801 		rc->count = 1;
3802 #ifdef DEBUG
3803 		atomic_add_64(&clstat_debug.dirent.value.ui64, 1);
3804 #endif
3805 	}
3806 	return (rc);
3807 }
3808 
3809 static void
3810 rddir_cache_free(rddir_cache *rc)
3811 {
3812 
3813 #ifdef DEBUG
3814 	atomic_add_64(&clstat_debug.dirent.value.ui64, -1);
3815 #endif
3816 	if (rc->entries != NULL) {
3817 #ifdef DEBUG
3818 		rddir_cache_buf_free(rc->entries, rc->buflen);
3819 #else
3820 		kmem_free(rc->entries, rc->buflen);
3821 #endif
3822 	}
3823 	cv_destroy(&rc->cv);
3824 	mutex_destroy(&rc->lock);
3825 	kmem_free(rc, sizeof (*rc));
3826 }
3827 
3828 void
3829 rddir_cache_hold(rddir_cache *rc)
3830 {
3831 
3832 	mutex_enter(&rc->lock);
3833 	rc->count++;
3834 	mutex_exit(&rc->lock);
3835 }
3836 
3837 void
3838 rddir_cache_rele(rddir_cache *rc)
3839 {
3840 
3841 	mutex_enter(&rc->lock);
3842 	ASSERT(rc->count > 0);
3843 	if (--rc->count == 0) {
3844 		mutex_exit(&rc->lock);
3845 		rddir_cache_free(rc);
3846 	} else
3847 		mutex_exit(&rc->lock);
3848 }
3849 
3850 #ifdef DEBUG
3851 char *
3852 rddir_cache_buf_alloc(size_t size, int flags)
3853 {
3854 	char *rc;
3855 
3856 	rc = kmem_alloc(size, flags);
3857 	if (rc != NULL)
3858 		atomic_add_64(&clstat_debug.dirents.value.ui64, size);
3859 	return (rc);
3860 }
3861 
3862 void
3863 rddir_cache_buf_free(void *addr, size_t size)
3864 {
3865 
3866 	atomic_add_64(&clstat_debug.dirents.value.ui64, -(int64_t)size);
3867 	kmem_free(addr, size);
3868 }
3869 #endif
3870 
3871 static int
3872 nfs_free_data_reclaim(rnode_t *rp)
3873 {
3874 	char *contents;
3875 	int size;
3876 	vsecattr_t *vsp;
3877 	nfs3_pathconf_info *info;
3878 	int freed;
3879 	cred_t *cred;
3880 
3881 	/*
3882 	 * Free any held credentials and caches which
3883 	 * may be associated with this rnode.
3884 	 */
3885 	mutex_enter(&rp->r_statelock);
3886 	cred = rp->r_cred;
3887 	rp->r_cred = NULL;
3888 	contents = rp->r_symlink.contents;
3889 	size = rp->r_symlink.size;
3890 	rp->r_symlink.contents = NULL;
3891 	vsp = rp->r_secattr;
3892 	rp->r_secattr = NULL;
3893 	info = rp->r_pathconf;
3894 	rp->r_pathconf = NULL;
3895 	mutex_exit(&rp->r_statelock);
3896 
3897 	if (cred != NULL)
3898 		crfree(cred);
3899 
3900 	/*
3901 	 * Free the access cache entries.
3902 	 */
3903 	freed = nfs_access_purge_rp(rp);
3904 
3905 	if (!HAVE_RDDIR_CACHE(rp) &&
3906 	    contents == NULL &&
3907 	    vsp == NULL &&
3908 	    info == NULL)
3909 		return (freed);
3910 
3911 	/*
3912 	 * Free the readdir cache entries
3913 	 */
3914 	if (HAVE_RDDIR_CACHE(rp))
3915 		nfs_purge_rddir_cache(RTOV(rp));
3916 
3917 	/*
3918 	 * Free the symbolic link cache.
3919 	 */
3920 	if (contents != NULL) {
3921 
3922 		kmem_free((void *)contents, size);
3923 	}
3924 
3925 	/*
3926 	 * Free any cached ACL.
3927 	 */
3928 	if (vsp != NULL)
3929 		nfs_acl_free(vsp);
3930 
3931 	/*
3932 	 * Free any cached pathconf information.
3933 	 */
3934 	if (info != NULL)
3935 		kmem_free(info, sizeof (*info));
3936 
3937 	return (1);
3938 }
3939 
3940 static int
3941 nfs_active_data_reclaim(rnode_t *rp)
3942 {
3943 	char *contents;
3944 	int size;
3945 	vsecattr_t *vsp;
3946 	nfs3_pathconf_info *info;
3947 	int freed;
3948 
3949 	/*
3950 	 * Free any held credentials and caches which
3951 	 * may be associated with this rnode.
3952 	 */
3953 	if (!mutex_tryenter(&rp->r_statelock))
3954 		return (0);
3955 	contents = rp->r_symlink.contents;
3956 	size = rp->r_symlink.size;
3957 	rp->r_symlink.contents = NULL;
3958 	vsp = rp->r_secattr;
3959 	rp->r_secattr = NULL;
3960 	info = rp->r_pathconf;
3961 	rp->r_pathconf = NULL;
3962 	mutex_exit(&rp->r_statelock);
3963 
3964 	/*
3965 	 * Free the access cache entries.
3966 	 */
3967 	freed = nfs_access_purge_rp(rp);
3968 
3969 	if (!HAVE_RDDIR_CACHE(rp) &&
3970 	    contents == NULL &&
3971 	    vsp == NULL &&
3972 	    info == NULL)
3973 		return (freed);
3974 
3975 	/*
3976 	 * Free the readdir cache entries
3977 	 */
3978 	if (HAVE_RDDIR_CACHE(rp))
3979 		nfs_purge_rddir_cache(RTOV(rp));
3980 
3981 	/*
3982 	 * Free the symbolic link cache.
3983 	 */
3984 	if (contents != NULL) {
3985 
3986 		kmem_free((void *)contents, size);
3987 	}
3988 
3989 	/*
3990 	 * Free any cached ACL.
3991 	 */
3992 	if (vsp != NULL)
3993 		nfs_acl_free(vsp);
3994 
3995 	/*
3996 	 * Free any cached pathconf information.
3997 	 */
3998 	if (info != NULL)
3999 		kmem_free(info, sizeof (*info));
4000 
4001 	return (1);
4002 }
4003 
4004 static int
4005 nfs_free_reclaim(void)
4006 {
4007 	int freed;
4008 	rnode_t *rp;
4009 
4010 #ifdef DEBUG
4011 	clstat_debug.f_reclaim.value.ui64++;
4012 #endif
4013 	freed = 0;
4014 	mutex_enter(&rpfreelist_lock);
4015 	rp = rpfreelist;
4016 	if (rp != NULL) {
4017 		do {
4018 			if (nfs_free_data_reclaim(rp))
4019 				freed = 1;
4020 		} while ((rp = rp->r_freef) != rpfreelist);
4021 	}
4022 	mutex_exit(&rpfreelist_lock);
4023 	return (freed);
4024 }
4025 
4026 static int
4027 nfs_active_reclaim(void)
4028 {
4029 	int freed;
4030 	int index;
4031 	rnode_t *rp;
4032 
4033 #ifdef DEBUG
4034 	clstat_debug.a_reclaim.value.ui64++;
4035 #endif
4036 	freed = 0;
4037 	for (index = 0; index < rtablesize; index++) {
4038 		rw_enter(&rtable[index].r_lock, RW_READER);
4039 		for (rp = rtable[index].r_hashf;
4040 		    rp != (rnode_t *)(&rtable[index]);
4041 		    rp = rp->r_hashf) {
4042 			if (nfs_active_data_reclaim(rp))
4043 				freed = 1;
4044 		}
4045 		rw_exit(&rtable[index].r_lock);
4046 	}
4047 	return (freed);
4048 }
4049 
4050 static int
4051 nfs_rnode_reclaim(void)
4052 {
4053 	int freed;
4054 	rnode_t *rp;
4055 	vnode_t *vp;
4056 
4057 #ifdef DEBUG
4058 	clstat_debug.r_reclaim.value.ui64++;
4059 #endif
4060 	freed = 0;
4061 	mutex_enter(&rpfreelist_lock);
4062 	while ((rp = rpfreelist) != NULL) {
4063 		rp_rmfree(rp);
4064 		mutex_exit(&rpfreelist_lock);
4065 		if (rp->r_flags & RHASHED) {
4066 			vp = RTOV(rp);
4067 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4068 			mutex_enter(&vp->v_lock);
4069 			if (vp->v_count > 1) {
4070 				vp->v_count--;
4071 				mutex_exit(&vp->v_lock);
4072 				rw_exit(&rp->r_hashq->r_lock);
4073 				mutex_enter(&rpfreelist_lock);
4074 				continue;
4075 			}
4076 			mutex_exit(&vp->v_lock);
4077 			rp_rmhash_locked(rp);
4078 			rw_exit(&rp->r_hashq->r_lock);
4079 		}
4080 		/*
4081 		 * This call to rp_addfree will end up destroying the
4082 		 * rnode, but in a safe way with the appropriate set
4083 		 * of checks done.
4084 		 */
4085 		rp_addfree(rp, CRED());
4086 		mutex_enter(&rpfreelist_lock);
4087 	}
4088 	mutex_exit(&rpfreelist_lock);
4089 	return (freed);
4090 }
4091 
4092 /*ARGSUSED*/
4093 static void
4094 nfs_reclaim(void *cdrarg)
4095 {
4096 
4097 #ifdef DEBUG
4098 	clstat_debug.reclaim.value.ui64++;
4099 #endif
4100 	if (nfs_free_reclaim())
4101 		return;
4102 
4103 	if (nfs_active_reclaim())
4104 		return;
4105 
4106 	(void) nfs_rnode_reclaim();
4107 }
4108 
4109 /*
4110  * NFS client failover support
4111  *
4112  * Routines to copy filehandles
4113  */
4114 void
4115 nfscopyfh(caddr_t fhp, vnode_t *vp)
4116 {
4117 	fhandle_t *dest = (fhandle_t *)fhp;
4118 
4119 	if (dest != NULL)
4120 		*dest = *VTOFH(vp);
4121 }
4122 
4123 void
4124 nfs3copyfh(caddr_t fhp, vnode_t *vp)
4125 {
4126 	nfs_fh3 *dest = (nfs_fh3 *)fhp;
4127 
4128 	if (dest != NULL)
4129 		*dest = *VTOFH3(vp);
4130 }
4131 
4132 /*
4133  * NFS client failover support
4134  *
4135  * failover_safe() will test various conditions to ensure that
4136  * failover is permitted for this vnode.  It will be denied
4137  * if:
4138  *	1) the operation in progress does not support failover (NULL fi)
4139  *	2) there are no available replicas (NULL mi_servers->sv_next)
4140  *	3) any locks are outstanding on this file
4141  */
4142 static int
4143 failover_safe(failinfo_t *fi)
4144 {
4145 
4146 	/*
4147 	 * Does this op permit failover?
4148 	 */
4149 	if (fi == NULL || fi->vp == NULL)
4150 		return (0);
4151 
4152 	/*
4153 	 * Are there any alternates to failover to?
4154 	 */
4155 	if (VTOMI(fi->vp)->mi_servers->sv_next == NULL)
4156 		return (0);
4157 
4158 	/*
4159 	 * Disable check; we've forced local locking
4160 	 *
4161 	 * if (flk_has_remote_locks(fi->vp))
4162 	 *	return (0);
4163 	 */
4164 
4165 	/*
4166 	 * If we have no partial path, we can't do anything
4167 	 */
4168 	if (VTOR(fi->vp)->r_path == NULL)
4169 		return (0);
4170 
4171 	return (1);
4172 }
4173 
4174 #include <sys/thread.h>
4175 
4176 /*
4177  * NFS client failover support
4178  *
4179  * failover_newserver() will start a search for a new server,
4180  * preferably by starting an async thread to do the work.  If
4181  * someone is already doing this (recognizable by MI_BINDINPROG
4182  * being set), it will simply return and the calling thread
4183  * will queue on the mi_failover_cv condition variable.
4184  */
4185 static void
4186 failover_newserver(mntinfo_t *mi)
4187 {
4188 	/*
4189 	 * Check if someone else is doing this already
4190 	 */
4191 	mutex_enter(&mi->mi_lock);
4192 	if (mi->mi_flags & MI_BINDINPROG) {
4193 		mutex_exit(&mi->mi_lock);
4194 		return;
4195 	}
4196 	mi->mi_flags |= MI_BINDINPROG;
4197 
4198 	/*
4199 	 * Need to hold the vfs struct so that it can't be released
4200 	 * while the failover thread is selecting a new server.
4201 	 */
4202 	VFS_HOLD(mi->mi_vfsp);
4203 
4204 	/*
4205 	 * Start a thread to do the real searching.
4206 	 */
4207 	(void) zthread_create(NULL, 0, failover_thread, mi, 0, minclsyspri);
4208 
4209 	mutex_exit(&mi->mi_lock);
4210 }
4211 
4212 /*
4213  * NFS client failover support
4214  *
4215  * failover_thread() will find a new server to replace the one
4216  * currently in use, wake up other threads waiting on this mount
4217  * point, and die.  It will start at the head of the server list
4218  * and poll servers until it finds one with an NFS server which is
4219  * registered and responds to a NULL procedure ping.
4220  *
4221  * XXX failover_thread is unsafe within the scope of the
4222  * present model defined for cpr to suspend the system.
4223  * Specifically, over-the-wire calls made by the thread
4224  * are unsafe. The thread needs to be reevaluated in case of
4225  * future updates to the cpr suspend model.
4226  */
4227 static void
4228 failover_thread(mntinfo_t *mi)
4229 {
4230 	servinfo_t *svp = NULL;
4231 	CLIENT *cl;
4232 	enum clnt_stat status;
4233 	struct timeval tv;
4234 	int error;
4235 	int oncethru = 0;
4236 	callb_cpr_t cprinfo;
4237 	rnode_t *rp;
4238 	int index;
4239 	char *srvnames;
4240 	size_t srvnames_len;
4241 	struct nfs_clnt *nfscl = NULL;
4242 	zoneid_t zoneid = getzoneid();
4243 
4244 #ifdef DEBUG
4245 	/*
4246 	 * This is currently only needed to access counters which exist on
4247 	 * DEBUG kernels, hence we don't want to pay the penalty of the lookup
4248 	 * on non-DEBUG kernels.
4249 	 */
4250 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
4251 	ASSERT(nfscl != NULL);
4252 #endif
4253 
4254 	/*
4255 	 * Its safe to piggyback on the mi_lock since failover_newserver()
4256 	 * code guarantees that there will be only one failover thread
4257 	 * per mountinfo at any instance.
4258 	 */
4259 	CALLB_CPR_INIT(&cprinfo, &mi->mi_lock, callb_generic_cpr,
4260 	    "failover_thread");
4261 
4262 	mutex_enter(&mi->mi_lock);
4263 	while (mi->mi_readers) {
4264 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
4265 		cv_wait(&mi->mi_failover_cv, &mi->mi_lock);
4266 		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
4267 	}
4268 	mutex_exit(&mi->mi_lock);
4269 
4270 	tv.tv_sec = 2;
4271 	tv.tv_usec = 0;
4272 
4273 	/*
4274 	 * Ping the null NFS procedure of every server in
4275 	 * the list until one responds.  We always start
4276 	 * at the head of the list and always skip the one
4277 	 * that is current, since it's caused us a problem.
4278 	 */
4279 	while (svp == NULL) {
4280 		for (svp = mi->mi_servers; svp; svp = svp->sv_next) {
4281 			if (!oncethru && svp == mi->mi_curr_serv)
4282 				continue;
4283 
4284 			/*
4285 			 * If the file system was forcibly umounted
4286 			 * while trying to do a failover, then just
4287 			 * give up on the failover.  It won't matter
4288 			 * what the server is.
4289 			 */
4290 			if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
4291 				svp = NULL;
4292 				goto done;
4293 			}
4294 
4295 			error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
4296 			    NFS_PROGRAM, NFS_VERSION, 0, 1, CRED(), &cl);
4297 			if (error)
4298 				continue;
4299 
4300 			if (!(mi->mi_flags & MI_INT))
4301 				cl->cl_nosignal = TRUE;
4302 			status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
4303 			    xdr_void, NULL, tv);
4304 			if (!(mi->mi_flags & MI_INT))
4305 				cl->cl_nosignal = FALSE;
4306 			AUTH_DESTROY(cl->cl_auth);
4307 			CLNT_DESTROY(cl);
4308 			if (status == RPC_SUCCESS) {
4309 				if (svp == mi->mi_curr_serv) {
4310 #ifdef DEBUG
4311 					zcmn_err(zoneid, CE_NOTE,
4312 			"NFS%d: failing over: selecting original server %s",
4313 					    mi->mi_vers, svp->sv_hostname);
4314 #else
4315 					zcmn_err(zoneid, CE_NOTE,
4316 			"NFS: failing over: selecting original server %s",
4317 					    svp->sv_hostname);
4318 #endif
4319 				} else {
4320 #ifdef DEBUG
4321 					zcmn_err(zoneid, CE_NOTE,
4322 				    "NFS%d: failing over from %s to %s",
4323 					    mi->mi_vers,
4324 					    mi->mi_curr_serv->sv_hostname,
4325 					    svp->sv_hostname);
4326 #else
4327 					zcmn_err(zoneid, CE_NOTE,
4328 				    "NFS: failing over from %s to %s",
4329 					    mi->mi_curr_serv->sv_hostname,
4330 					    svp->sv_hostname);
4331 #endif
4332 				}
4333 				break;
4334 			}
4335 		}
4336 
4337 		if (svp == NULL) {
4338 			if (!oncethru) {
4339 				srvnames = nfs_getsrvnames(mi, &srvnames_len);
4340 #ifdef DEBUG
4341 				zprintf(zoneid,
4342 				    "NFS%d servers %s not responding "
4343 				    "still trying\n", mi->mi_vers, srvnames);
4344 #else
4345 				zprintf(zoneid, "NFS servers %s not responding "
4346 				    "still trying\n", srvnames);
4347 #endif
4348 				oncethru = 1;
4349 			}
4350 			mutex_enter(&mi->mi_lock);
4351 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
4352 			mutex_exit(&mi->mi_lock);
4353 			delay(hz);
4354 			mutex_enter(&mi->mi_lock);
4355 			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
4356 			mutex_exit(&mi->mi_lock);
4357 		}
4358 	}
4359 
4360 	if (oncethru) {
4361 #ifdef DEBUG
4362 		zprintf(zoneid, "NFS%d servers %s ok\n", mi->mi_vers, srvnames);
4363 #else
4364 		zprintf(zoneid, "NFS servers %s ok\n", srvnames);
4365 #endif
4366 	}
4367 
4368 	if (svp != mi->mi_curr_serv) {
4369 		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
4370 		index = rtablehash(&mi->mi_curr_serv->sv_fhandle);
4371 		rw_enter(&rtable[index].r_lock, RW_WRITER);
4372 		rp = rfind(&rtable[index], &mi->mi_curr_serv->sv_fhandle,
4373 		    mi->mi_vfsp);
4374 		if (rp != NULL) {
4375 			if (rp->r_flags & RHASHED)
4376 				rp_rmhash_locked(rp);
4377 			rw_exit(&rtable[index].r_lock);
4378 			rp->r_server = svp;
4379 			rp->r_fh = svp->sv_fhandle;
4380 			(void) nfs_free_data_reclaim(rp);
4381 			index = rtablehash(&rp->r_fh);
4382 			rp->r_hashq = &rtable[index];
4383 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4384 			vn_exists(RTOV(rp));
4385 			rp_addhash(rp);
4386 			rw_exit(&rp->r_hashq->r_lock);
4387 			VN_RELE(RTOV(rp));
4388 		} else
4389 			rw_exit(&rtable[index].r_lock);
4390 	}
4391 
4392 done:
4393 	if (oncethru)
4394 		kmem_free(srvnames, srvnames_len);
4395 	mutex_enter(&mi->mi_lock);
4396 	mi->mi_flags &= ~MI_BINDINPROG;
4397 	if (svp != NULL) {
4398 		mi->mi_curr_serv = svp;
4399 		mi->mi_failover++;
4400 #ifdef DEBUG
4401 	nfscl->nfscl_stat.failover.value.ui64++;
4402 #endif
4403 	}
4404 	cv_broadcast(&mi->mi_failover_cv);
4405 	CALLB_CPR_EXIT(&cprinfo);
4406 	VFS_RELE(mi->mi_vfsp);
4407 	zthread_exit();
4408 	/* NOTREACHED */
4409 }
4410 
4411 /*
4412  * NFS client failover support
4413  *
4414  * failover_wait() will put the thread to sleep until MI_BINDINPROG
4415  * is cleared, meaning that failover is complete.  Called with
4416  * mi_lock mutex held.
4417  */
4418 static int
4419 failover_wait(mntinfo_t *mi)
4420 {
4421 	k_sigset_t smask;
4422 
4423 	/*
4424 	 * If someone else is hunting for a living server,
4425 	 * sleep until it's done.  After our sleep, we may
4426 	 * be bound to the right server and get off cheaply.
4427 	 */
4428 	while (mi->mi_flags & MI_BINDINPROG) {
4429 		/*
4430 		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
4431 		 * and SIGTERM. (Preserving the existing masks).
4432 		 * Mask out SIGINT if mount option nointr is specified.
4433 		 */
4434 		sigintr(&smask, (int)mi->mi_flags & MI_INT);
4435 		if (!cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock)) {
4436 			/*
4437 			 * restore original signal mask
4438 			 */
4439 			sigunintr(&smask);
4440 			return (EINTR);
4441 		}
4442 		/*
4443 		 * restore original signal mask
4444 		 */
4445 		sigunintr(&smask);
4446 	}
4447 	return (0);
4448 }
4449 
4450 /*
4451  * NFS client failover support
4452  *
4453  * failover_remap() will do a partial pathname lookup and find the
4454  * desired vnode on the current server.  The interim vnode will be
4455  * discarded after we pilfer the new filehandle.
4456  *
4457  * Side effects:
4458  * - This routine will also update the filehandle in the args structure
4459  *    pointed to by the fi->fhp pointer if it is non-NULL.
4460  */
4461 
4462 static int
4463 failover_remap(failinfo_t *fi)
4464 {
4465 	vnode_t *vp, *nvp, *rootvp;
4466 	rnode_t *rp, *nrp;
4467 	mntinfo_t *mi;
4468 	int error;
4469 #ifdef DEBUG
4470 	struct nfs_clnt *nfscl;
4471 
4472 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
4473 	ASSERT(nfscl != NULL);
4474 #endif
4475 	/*
4476 	 * Sanity check
4477 	 */
4478 	if (fi == NULL || fi->vp == NULL || fi->lookupproc == NULL)
4479 		return (EINVAL);
4480 	vp = fi->vp;
4481 	rp = VTOR(vp);
4482 	mi = VTOMI(vp);
4483 
4484 	if (!(vp->v_flag & VROOT)) {
4485 		/*
4486 		 * Given the root fh, use the path stored in
4487 		 * the rnode to find the fh for the new server.
4488 		 */
4489 		error = VFS_ROOT(mi->mi_vfsp, &rootvp);
4490 		if (error)
4491 			return (error);
4492 
4493 		error = failover_lookup(rp->r_path, rootvp,
4494 		    fi->lookupproc, fi->xattrdirproc, &nvp);
4495 
4496 		VN_RELE(rootvp);
4497 
4498 		if (error)
4499 			return (error);
4500 
4501 		/*
4502 		 * If we found the same rnode, we're done now
4503 		 */
4504 		if (nvp == vp) {
4505 			/*
4506 			 * Failed and the new server may physically be same
4507 			 * OR may share a same disk subsystem. In this case
4508 			 * file handle for a particular file path is not going
4509 			 * to change, given the same filehandle lookup will
4510 			 * always locate the same rnode as the existing one.
4511 			 * All we might need to do is to update the r_server
4512 			 * with the current servinfo.
4513 			 */
4514 			if (!VALID_FH(fi)) {
4515 				rp->r_server = mi->mi_curr_serv;
4516 			}
4517 			VN_RELE(nvp);
4518 			return (0);
4519 		}
4520 
4521 		/*
4522 		 * Try to make it so that no one else will find this
4523 		 * vnode because it is just a temporary to hold the
4524 		 * new file handle until that file handle can be
4525 		 * copied to the original vnode/rnode.
4526 		 */
4527 		nrp = VTOR(nvp);
4528 		mutex_enter(&mi->mi_remap_lock);
4529 		/*
4530 		 * Some other thread could have raced in here and could
4531 		 * have done the remap for this particular rnode before
4532 		 * this thread here. Check for rp->r_server and
4533 		 * mi->mi_curr_serv and return if they are same.
4534 		 */
4535 		if (VALID_FH(fi)) {
4536 			mutex_exit(&mi->mi_remap_lock);
4537 			VN_RELE(nvp);
4538 			return (0);
4539 		}
4540 
4541 		if (nrp->r_flags & RHASHED)
4542 			rp_rmhash(nrp);
4543 
4544 		/*
4545 		 * As a heuristic check on the validity of the new
4546 		 * file, check that the size and type match against
4547 		 * that we remember from the old version.
4548 		 */
4549 		if (rp->r_size != nrp->r_size || vp->v_type != nvp->v_type) {
4550 			mutex_exit(&mi->mi_remap_lock);
4551 			zcmn_err(mi->mi_zone->zone_id, CE_WARN,
4552 			    "NFS replicas %s and %s: file %s not same.",
4553 			    rp->r_server->sv_hostname,
4554 			    nrp->r_server->sv_hostname, rp->r_path);
4555 			VN_RELE(nvp);
4556 			return (EINVAL);
4557 		}
4558 
4559 		/*
4560 		 * snarf the filehandle from the new rnode
4561 		 * then release it, again while updating the
4562 		 * hash queues for the rnode.
4563 		 */
4564 		if (rp->r_flags & RHASHED)
4565 			rp_rmhash(rp);
4566 		rp->r_server = mi->mi_curr_serv;
4567 		rp->r_fh = nrp->r_fh;
4568 		rp->r_hashq = nrp->r_hashq;
4569 		/*
4570 		 * Copy the attributes from the new rnode to the old
4571 		 * rnode.  This will help to reduce unnecessary page
4572 		 * cache flushes.
4573 		 */
4574 		rp->r_attr = nrp->r_attr;
4575 		rp->r_attrtime = nrp->r_attrtime;
4576 		rp->r_mtime = nrp->r_mtime;
4577 		(void) nfs_free_data_reclaim(rp);
4578 		nfs_setswaplike(vp, &rp->r_attr);
4579 		rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4580 		rp_addhash(rp);
4581 		rw_exit(&rp->r_hashq->r_lock);
4582 		mutex_exit(&mi->mi_remap_lock);
4583 		VN_RELE(nvp);
4584 	}
4585 
4586 	/*
4587 	 * Update successful failover remap count
4588 	 */
4589 	mutex_enter(&mi->mi_lock);
4590 	mi->mi_remap++;
4591 	mutex_exit(&mi->mi_lock);
4592 #ifdef DEBUG
4593 	nfscl->nfscl_stat.remap.value.ui64++;
4594 #endif
4595 
4596 	/*
4597 	 * If we have a copied filehandle to update, do it now.
4598 	 */
4599 	if (fi->fhp != NULL && fi->copyproc != NULL)
4600 		(*fi->copyproc)(fi->fhp, vp);
4601 
4602 	return (0);
4603 }
4604 
4605 /*
4606  * NFS client failover support
4607  *
4608  * We want a simple pathname lookup routine to parse the pieces
4609  * of path in rp->r_path.  We know that the path was a created
4610  * as rnodes were made, so we know we have only to deal with
4611  * paths that look like:
4612  *	dir1/dir2/dir3/file
4613  * Any evidence of anything like .., symlinks, and ENOTDIR
4614  * are hard errors, because they mean something in this filesystem
4615  * is different from the one we came from, or has changed under
4616  * us in some way.  If this is true, we want the failure.
4617  *
4618  * Extended attributes: if the filesystem is mounted with extended
4619  * attributes enabled (-o xattr), the attribute directory will be
4620  * represented in the r_path as the magic name XATTR_RPATH. So if
4621  * we see that name in the pathname, is must be because this node
4622  * is an extended attribute.  Therefore, look it up that way.
4623  */
4624 static int
4625 failover_lookup(char *path, vnode_t *root,
4626     int (*lookupproc)(vnode_t *, char *, vnode_t **, struct pathname *, int,
4627 	vnode_t *, cred_t *, int),
4628     int (*xattrdirproc)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
4629     vnode_t **new)
4630 {
4631 	vnode_t *dvp, *nvp;
4632 	int error = EINVAL;
4633 	char *s, *p, *tmppath;
4634 	size_t len;
4635 	mntinfo_t *mi;
4636 	bool_t xattr;
4637 
4638 	/* Make local copy of path */
4639 	len = strlen(path) + 1;
4640 	tmppath = kmem_alloc(len, KM_SLEEP);
4641 	(void) strcpy(tmppath, path);
4642 	s = tmppath;
4643 
4644 	dvp = root;
4645 	VN_HOLD(dvp);
4646 	mi = VTOMI(root);
4647 	xattr = mi->mi_flags & MI_EXTATTR;
4648 
4649 	do {
4650 		p = strchr(s, '/');
4651 		if (p != NULL)
4652 			*p = '\0';
4653 		if (xattr && strcmp(s, XATTR_RPATH) == 0) {
4654 			error = (*xattrdirproc)(dvp, &nvp, FALSE, CRED(),
4655 			    RFSCALL_SOFT);
4656 		} else {
4657 			error = (*lookupproc)(dvp, s, &nvp, NULL, 0, NULL,
4658 			    CRED(), RFSCALL_SOFT);
4659 		}
4660 		if (p != NULL)
4661 			*p++ = '/';
4662 		if (error) {
4663 			VN_RELE(dvp);
4664 			kmem_free(tmppath, len);
4665 			return (error);
4666 		}
4667 		s = p;
4668 		VN_RELE(dvp);
4669 		dvp = nvp;
4670 	} while (p != NULL);
4671 
4672 	if (nvp != NULL && new != NULL)
4673 		*new = nvp;
4674 	kmem_free(tmppath, len);
4675 	return (0);
4676 }
4677 
4678 /*
4679  * NFS client failover support
4680  *
4681  * sv_free() frees the malloc'd portion of a "servinfo_t".
4682  */
4683 void
4684 sv_free(servinfo_t *svp)
4685 {
4686 	servinfo_t *next;
4687 	struct knetconfig *knconf;
4688 
4689 	while (svp != NULL) {
4690 		next = svp->sv_next;
4691 		if (svp->sv_secdata)
4692 			sec_clnt_freeinfo(svp->sv_secdata);
4693 		if (svp->sv_hostname && svp->sv_hostnamelen > 0)
4694 			kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
4695 		knconf = svp->sv_knconf;
4696 		if (knconf != NULL) {
4697 			if (knconf->knc_protofmly != NULL)
4698 				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4699 			if (knconf->knc_proto != NULL)
4700 				kmem_free(knconf->knc_proto, KNC_STRSIZE);
4701 			kmem_free(knconf, sizeof (*knconf));
4702 		}
4703 		knconf = svp->sv_origknconf;
4704 		if (knconf != NULL) {
4705 			if (knconf->knc_protofmly != NULL)
4706 				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4707 			if (knconf->knc_proto != NULL)
4708 				kmem_free(knconf->knc_proto, KNC_STRSIZE);
4709 			kmem_free(knconf, sizeof (*knconf));
4710 		}
4711 		if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0)
4712 			kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen);
4713 		mutex_destroy(&svp->sv_lock);
4714 		kmem_free(svp, sizeof (*svp));
4715 		svp = next;
4716 	}
4717 }
4718 
4719 /*
4720  * Only can return non-zero if intr != 0.
4721  */
4722 int
4723 nfs_rw_enter_sig(nfs_rwlock_t *l, krw_t rw, int intr)
4724 {
4725 
4726 	mutex_enter(&l->lock);
4727 
4728 	/*
4729 	 * If this is a nested enter, then allow it.  There
4730 	 * must be as many exits as enters through.
4731 	 */
4732 	if (l->owner == curthread) {
4733 		/* lock is held for writing by current thread */
4734 		ASSERT(rw == RW_READER || rw == RW_WRITER);
4735 		l->count--;
4736 	} else if (rw == RW_READER) {
4737 		/*
4738 		 * While there is a writer active or writers waiting,
4739 		 * then wait for them to finish up and move on.  Then,
4740 		 * increment the count to indicate that a reader is
4741 		 * active.
4742 		 */
4743 		while (l->count < 0 || l->waiters > 0) {
4744 			if (intr) {
4745 				klwp_t *lwp = ttolwp(curthread);
4746 
4747 				if (lwp != NULL)
4748 					lwp->lwp_nostop++;
4749 				if (!cv_wait_sig(&l->cv, &l->lock)) {
4750 					if (lwp != NULL)
4751 						lwp->lwp_nostop--;
4752 					mutex_exit(&l->lock);
4753 					return (EINTR);
4754 				}
4755 				if (lwp != NULL)
4756 					lwp->lwp_nostop--;
4757 			} else
4758 				cv_wait(&l->cv, &l->lock);
4759 		}
4760 		ASSERT(l->count < INT_MAX);
4761 #ifdef	DEBUG
4762 		if ((l->count % 10000) == 9999)
4763 			cmn_err(CE_WARN, "nfs_rw_enter_sig: count %d on"
4764 			    "rwlock @ %p\n", l->count, (void *)&l);
4765 #endif
4766 		l->count++;
4767 	} else {
4768 		ASSERT(rw == RW_WRITER);
4769 		/*
4770 		 * While there are readers active or a writer
4771 		 * active, then wait for all of the readers
4772 		 * to finish or for the writer to finish.
4773 		 * Then, set the owner field to curthread and
4774 		 * decrement count to indicate that a writer
4775 		 * is active.
4776 		 */
4777 		while (l->count > 0 || l->owner != NULL) {
4778 			l->waiters++;
4779 			if (intr) {
4780 				klwp_t *lwp = ttolwp(curthread);
4781 
4782 				if (lwp != NULL)
4783 					lwp->lwp_nostop++;
4784 				if (!cv_wait_sig(&l->cv, &l->lock)) {
4785 					if (lwp != NULL)
4786 						lwp->lwp_nostop--;
4787 					l->waiters--;
4788 					cv_broadcast(&l->cv);
4789 					mutex_exit(&l->lock);
4790 					return (EINTR);
4791 				}
4792 				if (lwp != NULL)
4793 					lwp->lwp_nostop--;
4794 			} else
4795 				cv_wait(&l->cv, &l->lock);
4796 			l->waiters--;
4797 		}
4798 		l->owner = curthread;
4799 		l->count--;
4800 	}
4801 
4802 	mutex_exit(&l->lock);
4803 
4804 	return (0);
4805 }
4806 
4807 /*
4808  * If the lock is available, obtain it and return non-zero.  If there is
4809  * already a conflicting lock, return 0 immediately.
4810  */
4811 
4812 int
4813 nfs_rw_tryenter(nfs_rwlock_t *l, krw_t rw)
4814 {
4815 	mutex_enter(&l->lock);
4816 
4817 	/*
4818 	 * If this is a nested enter, then allow it.  There
4819 	 * must be as many exits as enters through.
4820 	 */
4821 	if (l->owner == curthread) {
4822 		/* lock is held for writing by current thread */
4823 		ASSERT(rw == RW_READER || rw == RW_WRITER);
4824 		l->count--;
4825 	} else if (rw == RW_READER) {
4826 		/*
4827 		 * If there is a writer active or writers waiting, deny the
4828 		 * lock.  Otherwise, bump the count of readers.
4829 		 */
4830 		if (l->count < 0 || l->waiters > 0) {
4831 			mutex_exit(&l->lock);
4832 			return (0);
4833 		}
4834 		l->count++;
4835 	} else {
4836 		ASSERT(rw == RW_WRITER);
4837 		/*
4838 		 * If there are readers active or a writer active, deny the
4839 		 * lock.  Otherwise, set the owner field to curthread and
4840 		 * decrement count to indicate that a writer is active.
4841 		 */
4842 		if (l->count > 0 || l->owner != NULL) {
4843 			mutex_exit(&l->lock);
4844 			return (0);
4845 		}
4846 		l->owner = curthread;
4847 		l->count--;
4848 	}
4849 
4850 	mutex_exit(&l->lock);
4851 
4852 	return (1);
4853 }
4854 
4855 void
4856 nfs_rw_exit(nfs_rwlock_t *l)
4857 {
4858 
4859 	mutex_enter(&l->lock);
4860 	/*
4861 	 * If this is releasing a writer lock, then increment count to
4862 	 * indicate that there is one less writer active.  If this was
4863 	 * the last of possibly nested writer locks, then clear the owner
4864 	 * field as well to indicate that there is no writer active
4865 	 * and wakeup any possible waiting writers or readers.
4866 	 *
4867 	 * If releasing a reader lock, then just decrement count to
4868 	 * indicate that there is one less reader active.  If this was
4869 	 * the last active reader and there are writer(s) waiting,
4870 	 * then wake up the first.
4871 	 */
4872 	if (l->owner != NULL) {
4873 		ASSERT(l->owner == curthread);
4874 		l->count++;
4875 		if (l->count == 0) {
4876 			l->owner = NULL;
4877 			cv_broadcast(&l->cv);
4878 		}
4879 	} else {
4880 		ASSERT(l->count > 0);
4881 		l->count--;
4882 		if (l->count == 0 && l->waiters > 0)
4883 			cv_broadcast(&l->cv);
4884 	}
4885 	mutex_exit(&l->lock);
4886 }
4887 
4888 int
4889 nfs_rw_lock_held(nfs_rwlock_t *l, krw_t rw)
4890 {
4891 
4892 	if (rw == RW_READER)
4893 		return (l->count > 0);
4894 	ASSERT(rw == RW_WRITER);
4895 	return (l->count < 0);
4896 }
4897 
4898 /* ARGSUSED */
4899 void
4900 nfs_rw_init(nfs_rwlock_t *l, char *name, krw_type_t type, void *arg)
4901 {
4902 
4903 	l->count = 0;
4904 	l->waiters = 0;
4905 	l->owner = NULL;
4906 	mutex_init(&l->lock, NULL, MUTEX_DEFAULT, NULL);
4907 	cv_init(&l->cv, NULL, CV_DEFAULT, NULL);
4908 }
4909 
4910 void
4911 nfs_rw_destroy(nfs_rwlock_t *l)
4912 {
4913 
4914 	mutex_destroy(&l->lock);
4915 	cv_destroy(&l->cv);
4916 }
4917 
4918 int
4919 nfs3_rddir_compar(const void *x, const void *y)
4920 {
4921 	rddir_cache *a = (rddir_cache *)x;
4922 	rddir_cache *b = (rddir_cache *)y;
4923 
4924 	if (a->nfs3_cookie == b->nfs3_cookie) {
4925 		if (a->buflen == b->buflen)
4926 			return (0);
4927 		if (a->buflen < b->buflen)
4928 			return (-1);
4929 		return (1);
4930 	}
4931 
4932 	if (a->nfs3_cookie < b->nfs3_cookie)
4933 		return (-1);
4934 
4935 	return (1);
4936 }
4937 
4938 int
4939 nfs_rddir_compar(const void *x, const void *y)
4940 {
4941 	rddir_cache *a = (rddir_cache *)x;
4942 	rddir_cache *b = (rddir_cache *)y;
4943 
4944 	if (a->nfs_cookie == b->nfs_cookie) {
4945 		if (a->buflen == b->buflen)
4946 			return (0);
4947 		if (a->buflen < b->buflen)
4948 			return (-1);
4949 		return (1);
4950 	}
4951 
4952 	if (a->nfs_cookie < b->nfs_cookie)
4953 		return (-1);
4954 
4955 	return (1);
4956 }
4957 
4958 static char *
4959 nfs_getsrvnames(mntinfo_t *mi, size_t *len)
4960 {
4961 	servinfo_t *s;
4962 	char *srvnames;
4963 	char *namep;
4964 	size_t length;
4965 
4966 	/*
4967 	 * Calculate the length of the string required to hold all
4968 	 * of the server names plus either a comma or a null
4969 	 * character following each individual one.
4970 	 */
4971 	length = 0;
4972 	for (s = mi->mi_servers; s != NULL; s = s->sv_next)
4973 		length += s->sv_hostnamelen;
4974 
4975 	srvnames = kmem_alloc(length, KM_SLEEP);
4976 
4977 	namep = srvnames;
4978 	for (s = mi->mi_servers; s != NULL; s = s->sv_next) {
4979 		(void) strcpy(namep, s->sv_hostname);
4980 		namep += s->sv_hostnamelen - 1;
4981 		*namep++ = ',';
4982 	}
4983 	*--namep = '\0';
4984 
4985 	*len = length;
4986 
4987 	return (srvnames);
4988 }
4989 
4990 /*
4991  * These two functions are temporary and designed for the upgrade-workaround
4992  * only.  They cannot be used for general zone-crossing NFS client support, and
4993  * will be removed shortly.
4994  *
4995  * When the workaround is enabled, all NFS traffic is forced into the global
4996  * zone.  These functions are called when the code needs to refer to the state
4997  * of the underlying network connection.  They're not called when the function
4998  * needs to refer to the state of the process that invoked the system call.
4999  * (E.g., when checking whether the zone is shutting down during the mount()
5000  * call.)
5001  */
5002 
5003 struct zone *
5004 nfs_zone(void)
5005 {
5006 	return (nfs_global_client_only != 0 ? global_zone : curproc->p_zone);
5007 }
5008 
5009 zoneid_t
5010 nfs_zoneid(void)
5011 {
5012 	return (nfs_global_client_only != 0 ? GLOBAL_ZONEID : getzoneid());
5013 }
5014 
5015 /*
5016  * nfs_mount_label_policy:
5017  *	Determine whether the mount is allowed according to MAC check,
5018  *	by comparing (where appropriate) label of the remote server
5019  *	against the label of the zone being mounted into.
5020  *
5021  *	Returns:
5022  *		 0 :	access allowed
5023  *		-1 :	read-only access allowed (i.e., read-down)
5024  *		>0 :	error code, such as EACCES
5025  */
5026 int
5027 nfs_mount_label_policy(vfs_t *vfsp, struct netbuf *addr,
5028     struct knetconfig *knconf, cred_t *cr)
5029 {
5030 	int		addr_type;
5031 	void		*ipaddr;
5032 	bslabel_t	*server_sl, *mntlabel;
5033 	zone_t		*mntzone = NULL;
5034 	ts_label_t	*zlabel;
5035 	tsol_tpc_t	*tp;
5036 	ts_label_t	*tsl = NULL;
5037 	int		retv;
5038 
5039 	/*
5040 	 * Get the zone's label.  Each zone on a labeled system has a label.
5041 	 */
5042 	mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
5043 	zlabel = mntzone->zone_slabel;
5044 	ASSERT(zlabel != NULL);
5045 	label_hold(zlabel);
5046 
5047 	if (strcmp(knconf->knc_protofmly, NC_INET) == 0) {
5048 		addr_type = IPV4_VERSION;
5049 		ipaddr = &((struct sockaddr_in *)addr->buf)->sin_addr;
5050 	} else if (strcmp(knconf->knc_protofmly, NC_INET6) == 0) {
5051 		addr_type = IPV6_VERSION;
5052 		ipaddr = &((struct sockaddr_in6 *)addr->buf)->sin6_addr;
5053 	} else {
5054 		retv = 0;
5055 		goto out;
5056 	}
5057 
5058 	retv = EACCES;				/* assume the worst */
5059 
5060 	/*
5061 	 * Next, get the assigned label of the remote server.
5062 	 */
5063 	tp = find_tpc(ipaddr, addr_type, B_FALSE);
5064 	if (tp == NULL)
5065 		goto out;			/* error getting host entry */
5066 
5067 	if (tp->tpc_tp.tp_doi != zlabel->tsl_doi)
5068 		goto rel_tpc;			/* invalid domain */
5069 	if ((tp->tpc_tp.host_type != SUN_CIPSO) &&
5070 	    (tp->tpc_tp.host_type != UNLABELED))
5071 		goto rel_tpc;			/* invalid hosttype */
5072 
5073 	if (tp->tpc_tp.host_type == SUN_CIPSO) {
5074 		tsl = getflabel_cipso(vfsp);
5075 		if (tsl == NULL)
5076 			goto rel_tpc;		/* error getting server lbl */
5077 
5078 		server_sl = label2bslabel(tsl);
5079 	} else {	/* UNLABELED */
5080 		server_sl = &tp->tpc_tp.tp_def_label;
5081 	}
5082 
5083 	mntlabel = label2bslabel(zlabel);
5084 
5085 	/*
5086 	 * Now compare labels to complete the MAC check.  If the labels
5087 	 * are equal or if the requestor is in the global zone and has
5088 	 * NET_MAC_AWARE, then allow read-write access.   (Except for
5089 	 * mounts into the global zone itself; restrict these to
5090 	 * read-only.)
5091 	 *
5092 	 * If the requestor is in some other zone, but his label
5093 	 * dominates the server, then allow read-down.
5094 	 *
5095 	 * Otherwise, access is denied.
5096 	 */
5097 	if (blequal(mntlabel, server_sl) ||
5098 	    (crgetzoneid(cr) == GLOBAL_ZONEID &&
5099 	    getpflags(NET_MAC_AWARE, cr) != 0)) {
5100 		if ((mntzone == global_zone) ||
5101 		    !blequal(mntlabel, server_sl))
5102 			retv = -1;		/* read-only */
5103 		else
5104 			retv = 0;		/* access OK */
5105 	} else if (bldominates(mntlabel, server_sl)) {
5106 		retv = -1;			/* read-only */
5107 	} else {
5108 		retv = EACCES;
5109 	}
5110 
5111 	if (tsl != NULL)
5112 		label_rele(tsl);
5113 
5114 rel_tpc:
5115 	TPC_RELE(tp);
5116 out:
5117 	if (mntzone)
5118 		zone_rele(mntzone);
5119 	label_rele(zlabel);
5120 	return (retv);
5121 }
5122 
5123 boolean_t
5124 nfs_has_ctty(void)
5125 {
5126 	boolean_t rv;
5127 	mutex_enter(&curproc->p_splock);
5128 	rv = (curproc->p_sessp->s_vp != NULL);
5129 	mutex_exit(&curproc->p_splock);
5130 	return (rv);
5131 }
5132 
5133 /*
5134  * TX NFS routine used by NFSv3 and NFSv4 to do label check
5135  * on client label and server's file object lable.
5136  */
5137 boolean_t
5138 do_rfs_label_check(bslabel_t *clabel, vnode_t *vp, int flag)
5139 {
5140 	bslabel_t *slabel;
5141 	ts_label_t *tslabel;
5142 	boolean_t result;
5143 
5144 	if ((tslabel = nfs_getflabel(vp)) == NULL) {
5145 		return (B_FALSE);
5146 	}
5147 	slabel = label2bslabel(tslabel);
5148 	DTRACE_PROBE4(tx__rfs__log__info__labelcheck, char *,
5149 	    "comparing server's file label(1) with client label(2) (vp(3))",
5150 	    bslabel_t *, slabel, bslabel_t *, clabel, vnode_t *, vp);
5151 
5152 	if (flag == EQUALITY_CHECK)
5153 		result = blequal(clabel, slabel);
5154 	else
5155 		result = bldominates(clabel, slabel);
5156 	label_rele(tslabel);
5157 	return (result);
5158 }
5159 
5160 /*
5161  * See if xattr directory to see if it has any generic user attributes
5162  */
5163 int
5164 do_xattr_exists_check(vnode_t *vp, ulong_t *valp, cred_t *cr)
5165 {
5166 	struct uio uio;
5167 	struct iovec iov;
5168 	char *dbuf;
5169 	struct dirent64 *dp;
5170 	size_t dlen = 8 * 1024;
5171 	size_t dbuflen;
5172 	int eof = 0;
5173 	int error;
5174 
5175 	*valp = 0;
5176 	dbuf = kmem_alloc(dlen, KM_SLEEP);
5177 	uio.uio_iov = &iov;
5178 	uio.uio_iovcnt = 1;
5179 	uio.uio_segflg = UIO_SYSSPACE;
5180 	uio.uio_fmode = 0;
5181 	uio.uio_extflg = UIO_COPY_CACHED;
5182 	uio.uio_loffset = 0;
5183 	uio.uio_resid = dlen;
5184 	iov.iov_base = dbuf;
5185 	iov.iov_len = dlen;
5186 	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
5187 	error = VOP_READDIR(vp, &uio, cr, &eof, NULL, 0);
5188 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
5189 
5190 	dbuflen = dlen - uio.uio_resid;
5191 
5192 	if (error || dbuflen == 0) {
5193 		kmem_free(dbuf, dlen);
5194 		return (error);
5195 	}
5196 
5197 	dp = (dirent64_t *)dbuf;
5198 
5199 	while ((intptr_t)dp < (intptr_t)dbuf + dbuflen) {
5200 		if (strcmp(dp->d_name, ".") == 0 ||
5201 		    strcmp(dp->d_name, "..") == 0 || strcmp(dp->d_name,
5202 		    VIEW_READWRITE) == 0 || strcmp(dp->d_name,
5203 		    VIEW_READONLY) == 0) {
5204 			dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen);
5205 			continue;
5206 		}
5207 
5208 		*valp = 1;
5209 		break;
5210 	}
5211 	kmem_free(dbuf, dlen);
5212 	return (0);
5213 }
5214