xref: /titanic_41/usr/src/uts/common/fs/nfs/nfs_subr.c (revision 269552cd714f2f1e308fdde0fc69b18a7f1142b2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/param.h>
27 #include <sys/types.h>
28 #include <sys/systm.h>
29 #include <sys/cred_impl.h>
30 #include <sys/proc.h>
31 #include <sys/user.h>
32 #include <sys/time.h>
33 #include <sys/buf.h>
34 #include <sys/vfs.h>
35 #include <sys/vnode.h>
36 #include <sys/socket.h>
37 #include <sys/uio.h>
38 #include <sys/tiuser.h>
39 #include <sys/swap.h>
40 #include <sys/errno.h>
41 #include <sys/debug.h>
42 #include <sys/kmem.h>
43 #include <sys/kstat.h>
44 #include <sys/cmn_err.h>
45 #include <sys/vtrace.h>
46 #include <sys/session.h>
47 #include <sys/dnlc.h>
48 #include <sys/bitmap.h>
49 #include <sys/acl.h>
50 #include <sys/ddi.h>
51 #include <sys/pathname.h>
52 #include <sys/flock.h>
53 #include <sys/dirent.h>
54 #include <sys/flock.h>
55 #include <sys/callb.h>
56 #include <sys/atomic.h>
57 #include <sys/list.h>
58 #include <sys/tsol/tnet.h>
59 #include <sys/priv.h>
60 #include <sys/sdt.h>
61 #include <sys/attr.h>
62 
63 #include <inet/ip6.h>
64 
65 #include <rpc/types.h>
66 #include <rpc/xdr.h>
67 #include <rpc/auth.h>
68 #include <rpc/clnt.h>
69 
70 #include <nfs/nfs.h>
71 #include <nfs/nfs4.h>
72 #include <nfs/nfs_clnt.h>
73 #include <nfs/rnode.h>
74 #include <nfs/nfs_acl.h>
75 
76 #include <sys/tsol/label.h>
77 
78 /*
79  * The hash queues for the access to active and cached rnodes
80  * are organized as doubly linked lists.  A reader/writer lock
81  * for each hash bucket is used to control access and to synchronize
82  * lookups, additions, and deletions from the hash queue.
83  *
84  * The rnode freelist is organized as a doubly linked list with
85  * a head pointer.  Additions and deletions are synchronized via
86  * a single mutex.
87  *
88  * In order to add an rnode to the free list, it must be hashed into
89  * a hash queue and the exclusive lock to the hash queue be held.
90  * If an rnode is not hashed into a hash queue, then it is destroyed
91  * because it represents no valuable information that can be reused
92  * about the file.  The exclusive lock to the hash queue must be
93  * held in order to prevent a lookup in the hash queue from finding
94  * the rnode and using it and assuming that the rnode is not on the
95  * freelist.  The lookup in the hash queue will have the hash queue
96  * locked, either exclusive or shared.
97  *
98  * The vnode reference count for each rnode is not allowed to drop
99  * below 1.  This prevents external entities, such as the VM
100  * subsystem, from acquiring references to vnodes already on the
101  * freelist and then trying to place them back on the freelist
 * when their reference is released.  This means that when an
103  * rnode is looked up in the hash queues, then either the rnode
104  * is removed from the freelist and that reference is transferred to
105  * the new reference or the vnode reference count must be incremented
106  * accordingly.  The mutex for the freelist must be held in order to
107  * accurately test to see if the rnode is on the freelist or not.
108  * The hash queue lock might be held shared and it is possible that
109  * two different threads may race to remove the rnode from the
110  * freelist.  This race can be resolved by holding the mutex for the
111  * freelist.  Please note that the mutex for the freelist does not
 * need to be held if the rnode is not on the freelist.  It cannot be
113  * placed on the freelist due to the requirement that the thread
114  * putting the rnode on the freelist must hold the exclusive lock
115  * to the hash queue and the thread doing the lookup in the hash
116  * queue is holding either a shared or exclusive lock to the hash
117  * queue.
118  *
119  * The lock ordering is:
120  *
121  *	hash bucket lock -> vnode lock
122  *	hash bucket lock -> freelist lock
123  */
/* The rnode hash queues described in the locking notes above. */
static rhashq_t *rtable;

/* Head of the rnode freelist and the single mutex that synchronizes it. */
static kmutex_t rpfreelist_lock;
static rnode_t *rpfreelist = NULL;
/* NOTE(review): presumably the count of rnodes currently allocated - confirm */
static long rnew = 0;
/* NOTE(review): presumably the limit on the number of rnodes - confirm */
long nrnode = 0;

/*
 * NOTE(review): presumably the number of buckets in rtable and the mask
 * used to reduce a hash value to a bucket index; both are assigned outside
 * the part of the file visible here - confirm.
 */
static int rtablesize;
static int rtablemask;

/* NOTE(review): looks like the target hash chain length - confirm */
static int hashlen = 4;

/* NOTE(review): presumably the kmem cache rnodes are allocated from - confirm */
static struct kmem_cache *rnode_cache;
137 
/*
 * Mutex to protect the following variables:
 *	nfs_major
 *	nfs_minor
 */
kmutex_t nfs_minor_lock;
int nfs_major;
int nfs_minor;

/*
 * Do we allow preepoch (negative) time values over the wire?
 * FALSE (the default) means they are not allowed.
 */
bool_t nfs_allow_preepoch_time = FALSE;	/* default: do not allow preepoch */
149 
/*
 * Access cache
 *
 * NOTE(review): acachesize/acachemask presumably mirror rtablesize/rtablemask
 * (bucket count and index mask) and acache_cache is presumably the kmem
 * cache for access cache entries; none of them is used in the part of the
 * file visible here - confirm.
 */
static acache_hash_t *acache;
static long nacache;	/* used strictly to size the number of hash queues */

static int acachesize;
static int acachemask;
static struct kmem_cache *acache_cache;
159 
160 /*
161  * Client side utilities
162  */
163 
/*
 * client side statistics
 *
 * Named kstat counters, all 64-bit unsigned.  The first four are always
 * present; the remaining four exist only in DEBUG kernels.  Counters with
 * these names are updated through nfscl->nfscl_stat (for example "clgets",
 * "cltoomany" and "clalloc" in clget_impl(), "calls" in rfscall()).
 * NOTE(review): presumably each zone's nfscl_stat is initialized from this
 * template - confirm against the zone init code.
 */
static const struct clstat clstat_tmpl = {
	{ "calls",	KSTAT_DATA_UINT64 },
	{ "badcalls",	KSTAT_DATA_UINT64 },
	{ "clgets",	KSTAT_DATA_UINT64 },
	{ "cltoomany",	KSTAT_DATA_UINT64 },
#ifdef DEBUG
	{ "clalloc",	KSTAT_DATA_UINT64 },
	{ "noresponse",	KSTAT_DATA_UINT64 },
	{ "failover",	KSTAT_DATA_UINT64 },
	{ "remap",	KSTAT_DATA_UINT64 },
#endif
};
179 
/*
 * The following statistics describe the behavior of the system as a whole
 * and don't correspond to any one particular zone.  They exist only in
 * DEBUG kernels.
 *
 * Note that the last counter is the member "rpath" but is published under
 * the kstat name "r_path".
 */
#ifdef DEBUG
static struct clstat_debug {
	kstat_named_t	nrnode;			/* number of allocated rnodes */
	kstat_named_t	access;			/* size of access cache */
	kstat_named_t	dirent;			/* size of readdir cache */
	kstat_named_t	dirents;		/* size of readdir buf cache */
	kstat_named_t	reclaim;		/* number of reclaims */
	kstat_named_t	clreclaim;		/* number of cl reclaims */
	kstat_named_t	f_reclaim;		/* number of free reclaims */
	kstat_named_t	a_reclaim;		/* number of active reclaims */
	kstat_named_t	r_reclaim;		/* number of rnode reclaims */
	kstat_named_t	rpath;			/* bytes used to store rpaths */
} clstat_debug = {
	{ "nrnode",	KSTAT_DATA_UINT64 },
	{ "access",	KSTAT_DATA_UINT64 },
	{ "dirent",	KSTAT_DATA_UINT64 },
	{ "dirents",	KSTAT_DATA_UINT64 },
	{ "reclaim",	KSTAT_DATA_UINT64 },
	{ "clreclaim",	KSTAT_DATA_UINT64 },
	{ "f_reclaim",	KSTAT_DATA_UINT64 },
	{ "a_reclaim",	KSTAT_DATA_UINT64 },
	{ "r_reclaim",	KSTAT_DATA_UINT64 },
	{ "r_path",	KSTAT_DATA_UINT64 },
};
#endif	/* DEBUG */
209 
/*
 * We keep a global list of per-zone client data, so we can clean up all zones
 * if we get low on memory.
 *
 * clreclaim() walks nfs_clnt_list while holding nfs_clnt_list_lock.
 * nfsclnt_zone_key is the key handed to zone_getspecific() to find the
 * calling zone's struct nfs_clnt.
 */
static list_t nfs_clnt_list;
static kmutex_t nfs_clnt_list_lock;
static zone_key_t nfsclnt_zone_key;

/* kmem cache for the struct chtab entries handed out by clget_impl(). */
static struct kmem_cache *chtab_cache;
219 
/*
 * Some servers do not properly update the attributes of the
 * directory when changes are made.  To allow interoperability
 * with these broken servers, the nfs_disable_rddir_cache
 * parameter must be set in /etc/system.  The default (0) leaves
 * the parameter unset.
 */
int nfs_disable_rddir_cache = 0;
227 
/*
 * Forward declarations for routines defined in this file.  clget() and
 * clfree() have external linkage; everything else is file-local.
 */
int		clget(clinfo_t *, servinfo_t *, cred_t *, CLIENT **,
		    struct chtab **);
void		clfree(CLIENT *, struct chtab *);
static int	acl_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
		    struct chtab **, struct nfs_clnt *);
static int	nfs_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
		    struct chtab **, struct nfs_clnt *);
static void	clreclaim(void *);
static int	nfs_feedback(int, int, mntinfo_t *);
static int	rfscall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
		    caddr_t, cred_t *, int *, enum clnt_stat *, int,
		    failinfo_t *);
static int	aclcall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
		    caddr_t, cred_t *, int *, int, failinfo_t *);
static void	rinactive(rnode_t *, cred_t *);
static int	rtablehash(nfs_fhandle *);
static vnode_t	*make_rnode(nfs_fhandle *, rhashq_t *, struct vfs *,
		    struct vnodeops *,
		    int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
			cred_t *),
		    int (*)(const void *, const void *), int *, cred_t *,
		    char *, char *);
static void	rp_rmfree(rnode_t *);
static void	rp_addhash(rnode_t *);
static void	rp_rmhash_locked(rnode_t *);
static rnode_t	*rfind(rhashq_t *, nfs_fhandle *, struct vfs *);
static void	destroy_rnode(rnode_t *);
static void	rddir_cache_free(rddir_cache *);
static int	nfs_free_data_reclaim(rnode_t *);
static int	nfs_active_data_reclaim(rnode_t *);
static int	nfs_free_reclaim(void);
static int	nfs_active_reclaim(void);
static int	nfs_rnode_reclaim(void);
static void	nfs_reclaim(void *);
static int	failover_safe(failinfo_t *);
static void	failover_newserver(mntinfo_t *mi);
static void	failover_thread(mntinfo_t *mi);
static int	failover_wait(mntinfo_t *);
static int	failover_remap(failinfo_t *);
static int	failover_lookup(char *, vnode_t *,
		    int (*)(vnode_t *, char *, vnode_t **,
			struct pathname *, int, vnode_t *, cred_t *, int),
		    int (*)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
		    vnode_t **);
static void	nfs_free_r_path(rnode_t *);
static void	nfs_set_vroot(vnode_t *);
static char	*nfs_getsrvnames(mntinfo_t *, size_t *);
275 
/*
 * from rpcsec module (common/rpcsec)
 *
 * sec_clnt_geth() is used by clget_impl() to attach an auth handle to a
 * client handle; sec_clnt_freeh() is used by clfree_impl() to release it.
 */
extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **);
extern void sec_clnt_freeh(AUTH *);
extern void sec_clnt_freeinfo(struct sec_data *);

/*
 * used in mount policy
 */
extern ts_label_t *getflabel_cipso(vfs_t *);
287 
/*
 * EIO or EINTR are not recoverable errors.
 *
 * Evaluates to nonzero when `error' is anything other than EINTR or EIO.
 * Both the argument and the whole expansion are parenthesized so that the
 * macro gives the expected answer no matter what expression is passed in
 * or what operators surround the use.  Note that `error' is evaluated
 * twice, so it must not have side effects.
 */
#define	IS_RECOVERABLE_ERROR(error)	\
	(!(((error) == EINTR) || ((error) == EIO)))
292 
293 /*
294  * Common handle get program for NFS, NFS ACL, and NFS AUTH client.
295  */
296 static int
297 clget_impl(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
298     struct chtab **chp, struct nfs_clnt *nfscl)
299 {
300 	struct chhead *ch, *newch;
301 	struct chhead **plistp;
302 	struct chtab *cp;
303 	int error;
304 	k_sigset_t smask;
305 
306 	if (newcl == NULL || chp == NULL || ci == NULL)
307 		return (EINVAL);
308 
309 	*newcl = NULL;
310 	*chp = NULL;
311 
312 	/*
313 	 * Find an unused handle or create one
314 	 */
315 	newch = NULL;
316 	nfscl->nfscl_stat.clgets.value.ui64++;
317 top:
318 	/*
319 	 * Find the correct entry in the cache to check for free
320 	 * client handles.  The search is based on the RPC program
321 	 * number, program version number, dev_t for the transport
322 	 * device, and the protocol family.
323 	 */
324 	mutex_enter(&nfscl->nfscl_chtable_lock);
325 	plistp = &nfscl->nfscl_chtable;
326 	for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
327 		if (ch->ch_prog == ci->cl_prog &&
328 		    ch->ch_vers == ci->cl_vers &&
329 		    ch->ch_dev == svp->sv_knconf->knc_rdev &&
330 		    (strcmp(ch->ch_protofmly,
331 		    svp->sv_knconf->knc_protofmly) == 0))
332 			break;
333 		plistp = &ch->ch_next;
334 	}
335 
336 	/*
337 	 * If we didn't find a cache entry for this quadruple, then
338 	 * create one.  If we don't have one already preallocated,
339 	 * then drop the cache lock, create one, and then start over.
340 	 * If we did have a preallocated entry, then just add it to
341 	 * the front of the list.
342 	 */
343 	if (ch == NULL) {
344 		if (newch == NULL) {
345 			mutex_exit(&nfscl->nfscl_chtable_lock);
346 			newch = kmem_alloc(sizeof (*newch), KM_SLEEP);
347 			newch->ch_timesused = 0;
348 			newch->ch_prog = ci->cl_prog;
349 			newch->ch_vers = ci->cl_vers;
350 			newch->ch_dev = svp->sv_knconf->knc_rdev;
351 			newch->ch_protofmly = kmem_alloc(
352 			    strlen(svp->sv_knconf->knc_protofmly) + 1,
353 			    KM_SLEEP);
354 			(void) strcpy(newch->ch_protofmly,
355 			    svp->sv_knconf->knc_protofmly);
356 			newch->ch_list = NULL;
357 			goto top;
358 		}
359 		ch = newch;
360 		newch = NULL;
361 		ch->ch_next = nfscl->nfscl_chtable;
362 		nfscl->nfscl_chtable = ch;
363 	/*
364 	 * We found a cache entry, but if it isn't on the front of the
365 	 * list, then move it to the front of the list to try to take
366 	 * advantage of locality of operations.
367 	 */
368 	} else if (ch != nfscl->nfscl_chtable) {
369 		*plistp = ch->ch_next;
370 		ch->ch_next = nfscl->nfscl_chtable;
371 		nfscl->nfscl_chtable = ch;
372 	}
373 
374 	/*
375 	 * If there was a free client handle cached, then remove it
376 	 * from the list, init it, and use it.
377 	 */
378 	if (ch->ch_list != NULL) {
379 		cp = ch->ch_list;
380 		ch->ch_list = cp->ch_list;
381 		mutex_exit(&nfscl->nfscl_chtable_lock);
382 		if (newch != NULL) {
383 			kmem_free(newch->ch_protofmly,
384 			    strlen(newch->ch_protofmly) + 1);
385 			kmem_free(newch, sizeof (*newch));
386 		}
387 		(void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf,
388 		    &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr);
389 		error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
390 		    &cp->ch_client->cl_auth);
391 		if (error || cp->ch_client->cl_auth == NULL) {
392 			CLNT_DESTROY(cp->ch_client);
393 			kmem_cache_free(chtab_cache, cp);
394 			return ((error != 0) ? error : EINTR);
395 		}
396 		ch->ch_timesused++;
397 		*newcl = cp->ch_client;
398 		*chp = cp;
399 		return (0);
400 	}
401 
402 	/*
403 	 * There weren't any free client handles which fit, so allocate
404 	 * a new one and use that.
405 	 */
406 #ifdef DEBUG
407 	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, 1);
408 #endif
409 	mutex_exit(&nfscl->nfscl_chtable_lock);
410 
411 	nfscl->nfscl_stat.cltoomany.value.ui64++;
412 	if (newch != NULL) {
413 		kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1);
414 		kmem_free(newch, sizeof (*newch));
415 	}
416 
417 	cp = kmem_cache_alloc(chtab_cache, KM_SLEEP);
418 	cp->ch_head = ch;
419 
420 	sigintr(&smask, (int)ci->cl_flags & MI_INT);
421 	error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog,
422 	    ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client);
423 	sigunintr(&smask);
424 
425 	if (error != 0) {
426 		kmem_cache_free(chtab_cache, cp);
427 #ifdef DEBUG
428 		atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
429 #endif
430 		/*
431 		 * Warning is unnecessary if error is EINTR.
432 		 */
433 		if (error != EINTR) {
434 			nfs_cmn_err(error, CE_WARN,
435 			    "clget: couldn't create handle: %m\n");
436 		}
437 		return (error);
438 	}
439 	(void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL);
440 	auth_destroy(cp->ch_client->cl_auth);
441 	error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
442 	    &cp->ch_client->cl_auth);
443 	if (error || cp->ch_client->cl_auth == NULL) {
444 		CLNT_DESTROY(cp->ch_client);
445 		kmem_cache_free(chtab_cache, cp);
446 #ifdef DEBUG
447 		atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
448 #endif
449 		return ((error != 0) ? error : EINTR);
450 	}
451 	ch->ch_timesused++;
452 	*newcl = cp->ch_client;
453 	ASSERT(cp->ch_client->cl_nosignal == FALSE);
454 	*chp = cp;
455 	return (0);
456 }
457 
458 int
459 clget(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
460     struct chtab **chp)
461 {
462 	struct nfs_clnt *nfscl;
463 
464 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
465 	ASSERT(nfscl != NULL);
466 
467 	return (clget_impl(ci, svp, cr, newcl, chp, nfscl));
468 }
469 
470 static int
471 acl_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
472     struct chtab **chp, struct nfs_clnt *nfscl)
473 {
474 	clinfo_t ci;
475 	int error;
476 
477 	/*
478 	 * Set read buffer size to rsize
479 	 * and add room for RPC headers.
480 	 */
481 	ci.cl_readsize = mi->mi_tsize;
482 	if (ci.cl_readsize != 0)
483 		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
484 
485 	/*
486 	 * If soft mount and server is down just try once.
487 	 * meaning: do not retransmit.
488 	 */
489 	if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
490 		ci.cl_retrans = 0;
491 	else
492 		ci.cl_retrans = mi->mi_retrans;
493 
494 	ci.cl_prog = NFS_ACL_PROGRAM;
495 	ci.cl_vers = mi->mi_vers;
496 	ci.cl_flags = mi->mi_flags;
497 
498 	/*
499 	 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
500 	 * security flavor, the client tries to establish a security context
501 	 * by contacting the server. If the connection is timed out or reset,
502 	 * e.g. server reboot, we will try again.
503 	 */
504 	do {
505 		error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
506 
507 		if (error == 0)
508 			break;
509 
510 		/*
511 		 * For forced unmount or zone shutdown, bail out, no retry.
512 		 */
513 		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
514 			error = EIO;
515 			break;
516 		}
517 
518 		/* do not retry for softmount */
519 		if (!(mi->mi_flags & MI_HARD))
520 			break;
521 
522 		/* let the caller deal with the failover case */
523 		if (FAILOVER_MOUNT(mi))
524 			break;
525 
526 	} while (error == ETIMEDOUT || error == ECONNRESET);
527 
528 	return (error);
529 }
530 
531 static int
532 nfs_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
533     struct chtab **chp, struct nfs_clnt *nfscl)
534 {
535 	clinfo_t ci;
536 	int error;
537 
538 	/*
539 	 * Set read buffer size to rsize
540 	 * and add room for RPC headers.
541 	 */
542 	ci.cl_readsize = mi->mi_tsize;
543 	if (ci.cl_readsize != 0)
544 		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
545 
546 	/*
547 	 * If soft mount and server is down just try once.
548 	 * meaning: do not retransmit.
549 	 */
550 	if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
551 		ci.cl_retrans = 0;
552 	else
553 		ci.cl_retrans = mi->mi_retrans;
554 
555 	ci.cl_prog = mi->mi_prog;
556 	ci.cl_vers = mi->mi_vers;
557 	ci.cl_flags = mi->mi_flags;
558 
559 	/*
560 	 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
561 	 * security flavor, the client tries to establish a security context
562 	 * by contacting the server. If the connection is timed out or reset,
563 	 * e.g. server reboot, we will try again.
564 	 */
565 	do {
566 		error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
567 
568 		if (error == 0)
569 			break;
570 
571 		/*
572 		 * For forced unmount or zone shutdown, bail out, no retry.
573 		 */
574 		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
575 			error = EIO;
576 			break;
577 		}
578 
579 		/* do not retry for softmount */
580 		if (!(mi->mi_flags & MI_HARD))
581 			break;
582 
583 		/* let the caller deal with the failover case */
584 		if (FAILOVER_MOUNT(mi))
585 			break;
586 
587 	} while (error == ETIMEDOUT || error == ECONNRESET);
588 
589 	return (error);
590 }
591 
592 static void
593 clfree_impl(CLIENT *cl, struct chtab *cp, struct nfs_clnt *nfscl)
594 {
595 	if (cl->cl_auth != NULL) {
596 		sec_clnt_freeh(cl->cl_auth);
597 		cl->cl_auth = NULL;
598 	}
599 
600 	/*
601 	 * Timestamp this cache entry so that we know when it was last
602 	 * used.
603 	 */
604 	cp->ch_freed = gethrestime_sec();
605 
606 	/*
607 	 * Add the free client handle to the front of the list.
608 	 * This way, the list will be sorted in youngest to oldest
609 	 * order.
610 	 */
611 	mutex_enter(&nfscl->nfscl_chtable_lock);
612 	cp->ch_list = cp->ch_head->ch_list;
613 	cp->ch_head->ch_list = cp;
614 	mutex_exit(&nfscl->nfscl_chtable_lock);
615 }
616 
617 void
618 clfree(CLIENT *cl, struct chtab *cp)
619 {
620 	struct nfs_clnt *nfscl;
621 
622 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
623 	ASSERT(nfscl != NULL);
624 
625 	clfree_impl(cl, cp, nfscl);
626 }
627 
628 #define	CL_HOLDTIME	60	/* time to hold client handles */
629 
630 static void
631 clreclaim_zone(struct nfs_clnt *nfscl, uint_t cl_holdtime)
632 {
633 	struct chhead *ch;
634 	struct chtab *cp;	/* list of objects that can be reclaimed */
635 	struct chtab *cpe;
636 	struct chtab *cpl;
637 	struct chtab **cpp;
638 #ifdef DEBUG
639 	int n = 0;
640 #endif
641 
642 	/*
643 	 * Need to reclaim some memory, so step through the cache
644 	 * looking through the lists for entries which can be freed.
645 	 */
646 	cp = NULL;
647 
648 	mutex_enter(&nfscl->nfscl_chtable_lock);
649 
650 	/*
651 	 * Here we step through each non-NULL quadruple and start to
652 	 * construct the reclaim list pointed to by cp.  Note that
653 	 * cp will contain all eligible chtab entries.  When this traversal
654 	 * completes, chtab entries from the last quadruple will be at the
655 	 * front of cp and entries from previously inspected quadruples have
656 	 * been appended to the rear of cp.
657 	 */
658 	for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
659 		if (ch->ch_list == NULL)
660 			continue;
661 		/*
662 		 * Search each list for entries older then
663 		 * cl_holdtime seconds.  The lists are maintained
664 		 * in youngest to oldest order so that when the
665 		 * first entry is found which is old enough, then
666 		 * all of the rest of the entries on the list will
667 		 * be old enough as well.
668 		 */
669 		cpl = ch->ch_list;
670 		cpp = &ch->ch_list;
671 		while (cpl != NULL &&
672 		    cpl->ch_freed + cl_holdtime > gethrestime_sec()) {
673 			cpp = &cpl->ch_list;
674 			cpl = cpl->ch_list;
675 		}
676 		if (cpl != NULL) {
677 			*cpp = NULL;
678 			if (cp != NULL) {
679 				cpe = cpl;
680 				while (cpe->ch_list != NULL)
681 					cpe = cpe->ch_list;
682 				cpe->ch_list = cp;
683 			}
684 			cp = cpl;
685 		}
686 	}
687 
688 	mutex_exit(&nfscl->nfscl_chtable_lock);
689 
690 	/*
691 	 * If cp is empty, then there is nothing to reclaim here.
692 	 */
693 	if (cp == NULL)
694 		return;
695 
696 	/*
697 	 * Step through the list of entries to free, destroying each client
698 	 * handle and kmem_free'ing the memory for each entry.
699 	 */
700 	while (cp != NULL) {
701 #ifdef DEBUG
702 		n++;
703 #endif
704 		CLNT_DESTROY(cp->ch_client);
705 		cpl = cp->ch_list;
706 		kmem_cache_free(chtab_cache, cp);
707 		cp = cpl;
708 	}
709 
710 #ifdef DEBUG
711 	/*
712 	 * Update clalloc so that nfsstat shows the current number
713 	 * of allocated client handles.
714 	 */
715 	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n);
716 #endif
717 }
718 
719 /* ARGSUSED */
720 static void
721 clreclaim(void *all)
722 {
723 	struct nfs_clnt *nfscl;
724 
725 #ifdef DEBUG
726 	clstat_debug.clreclaim.value.ui64++;
727 #endif
728 	/*
729 	 * The system is low on memory; go through and try to reclaim some from
730 	 * every zone on the system.
731 	 */
732 	mutex_enter(&nfs_clnt_list_lock);
733 	nfscl = list_head(&nfs_clnt_list);
734 	for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl))
735 		clreclaim_zone(nfscl, CL_HOLDTIME);
736 	mutex_exit(&nfs_clnt_list_lock);
737 }
738 
/*
 * Minimum time-out values indexed by call type.
 * The units are eighths of a second to avoid multiplies; rfscall()
 * converts an entry to the units of hz with (value * hz) >> 3.
 */
static unsigned int minimum_timeo[] = {
	6, 7, 10
};

/*
 * Back off for retransmission timeout.  MAXTIMO is expressed in the same
 * units as hz (20 * hz).  backoff() doubles a timeout that is still below
 * MAXTIMO, capping the result at MAXTIMO, and leaves larger values alone.
 */
#define	MAXTIMO	(20*hz)
#define	backoff(tim)	(((tim) < MAXTIMO) ? dobackoff(tim) : (tim))
#define	dobackoff(tim)	((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))

/* Limits consulted by nfs_feedback() when it adjusts transfer sizes. */
#define	MIN_NFS_TSIZE 512	/* minimum "chunk" of NFS IO */
#define	REDUCE_NFS_TIME (hz/2)	/* rtxcur we try to keep under */
#define	INCREASE_NFS_TIME (hz/3*8) /* srtt we try to keep under (scaled*8) */
757 
758 /*
759  * Function called when rfscall notices that we have been
760  * re-transmitting, or when we get a response without retransmissions.
761  * Return 1 if the transfer size was adjusted down - 0 if no change.
762  */
763 static int
764 nfs_feedback(int flag, int which, mntinfo_t *mi)
765 {
766 	int kind;
767 	int r = 0;
768 
769 	mutex_enter(&mi->mi_lock);
770 	if (flag == FEEDBACK_REXMIT1) {
771 		if (mi->mi_timers[NFS_CALLTYPES].rt_rtxcur != 0 &&
772 		    mi->mi_timers[NFS_CALLTYPES].rt_rtxcur < REDUCE_NFS_TIME)
773 			goto done;
774 		if (mi->mi_curread > MIN_NFS_TSIZE) {
775 			mi->mi_curread /= 2;
776 			if (mi->mi_curread < MIN_NFS_TSIZE)
777 				mi->mi_curread = MIN_NFS_TSIZE;
778 			r = 1;
779 		}
780 
781 		if (mi->mi_curwrite > MIN_NFS_TSIZE) {
782 			mi->mi_curwrite /= 2;
783 			if (mi->mi_curwrite < MIN_NFS_TSIZE)
784 				mi->mi_curwrite = MIN_NFS_TSIZE;
785 			r = 1;
786 		}
787 	} else if (flag == FEEDBACK_OK) {
788 		kind = mi->mi_timer_type[which];
789 		if (kind == 0 ||
790 		    mi->mi_timers[kind].rt_srtt >= INCREASE_NFS_TIME)
791 			goto done;
792 		if (kind == 1) {
793 			if (mi->mi_curread >= mi->mi_tsize)
794 				goto done;
795 			mi->mi_curread +=  MIN_NFS_TSIZE;
796 			if (mi->mi_curread > mi->mi_tsize/2)
797 				mi->mi_curread = mi->mi_tsize;
798 		} else if (kind == 2) {
799 			if (mi->mi_curwrite >= mi->mi_stsize)
800 				goto done;
801 			mi->mi_curwrite += MIN_NFS_TSIZE;
802 			if (mi->mi_curwrite > mi->mi_stsize/2)
803 				mi->mi_curwrite = mi->mi_stsize;
804 		}
805 	}
806 done:
807 	mutex_exit(&mi->mi_lock);
808 	return (r);
809 }
810 
811 #ifdef DEBUG
812 static int rfs2call_hits = 0;
813 static int rfs2call_misses = 0;
814 #endif
815 
816 int
817 rfs2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
818     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
819     enum nfsstat *statusp, int flags, failinfo_t *fi)
820 {
821 	int rpcerror;
822 	enum clnt_stat rpc_status;
823 
824 	ASSERT(statusp != NULL);
825 
826 	rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
827 	    cr, douprintf, &rpc_status, flags, fi);
828 	if (!rpcerror) {
829 		/*
830 		 * See crnetadjust() for comments.
831 		 */
832 		if (*statusp == NFSERR_ACCES &&
833 		    (cr = crnetadjust(cr)) != NULL) {
834 #ifdef DEBUG
835 			rfs2call_hits++;
836 #endif
837 			rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres,
838 			    resp, cr, douprintf, NULL, flags, fi);
839 			crfree(cr);
840 #ifdef DEBUG
841 			if (*statusp == NFSERR_ACCES)
842 				rfs2call_misses++;
843 #endif
844 		}
845 	} else if (rpc_status == RPC_PROCUNAVAIL) {
846 		*statusp = NFSERR_OPNOTSUPP;
847 		rpcerror = 0;
848 	}
849 
850 	return (rpcerror);
851 }
852 
/*
 * Default pause, in the units of hz, before retrying a request that the
 * server answered with NFS3ERR_JUKEBOX.  The expansion is parenthesized so
 * that it behaves as a single value inside larger expressions.
 */
#define	NFS3_JUKEBOX_DELAY	(10 * hz)

/*
 * The value rfs3call() actually passes to delay().
 * NOTE(review): presumably set from NFS3_JUKEBOX_DELAY during
 * initialization elsewhere in the file - confirm.
 */
static clock_t nfs3_jukebox_delay = 0;
856 
857 #ifdef DEBUG
858 static int rfs3call_hits = 0;
859 static int rfs3call_misses = 0;
860 #endif
861 
862 int
863 rfs3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
864     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
865     nfsstat3 *statusp, int flags, failinfo_t *fi)
866 {
867 	int rpcerror;
868 	int user_informed;
869 
870 	user_informed = 0;
871 	do {
872 		rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
873 		    cr, douprintf, NULL, flags, fi);
874 		if (!rpcerror) {
875 			cred_t *crr;
876 			if (*statusp == NFS3ERR_JUKEBOX) {
877 				if (ttoproc(curthread) == &p0) {
878 					rpcerror = EAGAIN;
879 					break;
880 				}
881 				if (!user_informed) {
882 					user_informed = 1;
883 					uprintf(
884 		"file temporarily unavailable on the server, retrying...\n");
885 				}
886 				delay(nfs3_jukebox_delay);
887 			}
888 			/*
889 			 * See crnetadjust() for comments.
890 			 */
891 			else if (*statusp == NFS3ERR_ACCES &&
892 			    (crr = crnetadjust(cr)) != NULL) {
893 #ifdef DEBUG
894 				rfs3call_hits++;
895 #endif
896 				rpcerror = rfscall(mi, which, xdrargs, argsp,
897 				    xdrres, resp, crr, douprintf,
898 				    NULL, flags, fi);
899 
900 				crfree(crr);
901 #ifdef DEBUG
902 				if (*statusp == NFS3ERR_ACCES)
903 					rfs3call_misses++;
904 #endif
905 			}
906 		}
907 	} while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
908 
909 	return (rpcerror);
910 }
911 
/*
 * VALID_FH(fi) is true when the server recorded in the rnode behind fi->vp
 * is the server the mount is currently using.
 *
 * INC_READERS(mi) and DEC_READERS(mi) adjust mi->mi_readers; DEC_READERS
 * additionally broadcasts on mi->mi_failover_cv when the count reaches
 * zero.  The callers in this file hold mi->mi_lock around both.
 *
 * The statement macros are wrapped in do { } while (0) so that each use,
 * followed by its semicolon, is exactly one statement (and therefore safe
 * in an unbraced if/else), and every use of the argument is parenthesized.
 */
#define	VALID_FH(fi)	\
	(VTOR((fi)->vp)->r_server == VTOMI((fi)->vp)->mi_curr_serv)
#define	INC_READERS(mi)		do { \
	(mi)->mi_readers++; \
} while (0)
#define	DEC_READERS(mi)		do { \
	(mi)->mi_readers--; \
	if ((mi)->mi_readers == 0) \
		cv_broadcast(&(mi)->mi_failover_cv); \
} while (0)
921 
922 static int
923 rfscall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
924     xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
925     enum clnt_stat *rpc_status, int flags, failinfo_t *fi)
926 {
927 	CLIENT *client;
928 	struct chtab *ch;
929 	cred_t *cr = icr;
930 	enum clnt_stat status;
931 	struct rpc_err rpcerr;
932 	struct timeval wait;
933 	int timeo;		/* in units of hz */
934 	int my_rsize, my_wsize;
935 	bool_t tryagain;
936 	bool_t cred_cloned = FALSE;
937 	k_sigset_t smask;
938 	servinfo_t *svp;
939 	struct nfs_clnt *nfscl;
940 	zoneid_t zoneid = getzoneid();
941 #ifdef DEBUG
942 	char *bufp;
943 #endif
944 
945 
946 	TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
947 	    "rfscall_start:which %d mi %p", which, mi);
948 
949 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
950 	ASSERT(nfscl != NULL);
951 
952 	nfscl->nfscl_stat.calls.value.ui64++;
953 	mi->mi_reqs[which].value.ui64++;
954 
955 	rpcerr.re_status = RPC_SUCCESS;
956 
957 	/*
958 	 * In case of forced unmount or zone shutdown, return EIO.
959 	 */
960 
961 	if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
962 		rpcerr.re_status = RPC_FAILED;
963 		rpcerr.re_errno = EIO;
964 		return (rpcerr.re_errno);
965 	}
966 
967 	/*
968 	 * Remember the transfer sizes in case
969 	 * nfs_feedback changes them underneath us.
970 	 */
971 	my_rsize = mi->mi_curread;
972 	my_wsize = mi->mi_curwrite;
973 
974 	/*
975 	 * NFS client failover support
976 	 *
977 	 * If this rnode is not in sync with the current server (VALID_FH),
978 	 * we'd like to do a remap to get in sync.  We can be interrupted
979 	 * in failover_remap(), and if so we'll bail.  Otherwise, we'll
980 	 * use the best info we have to try the RPC.  Part of that is
981 	 * unconditionally updating the filehandle copy kept for V3.
982 	 *
983 	 * Locking: INC_READERS/DEC_READERS is a poor man's interrruptible
984 	 * rw_enter(); we're trying to keep the current server from being
985 	 * changed on us until we're done with the remapping and have a
986 	 * matching client handle.  We don't want to sending a filehandle
987 	 * to the wrong host.
988 	 */
989 failoverretry:
990 	if (FAILOVER_MOUNT(mi)) {
991 		mutex_enter(&mi->mi_lock);
992 		if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
993 			if (failover_wait(mi)) {
994 				mutex_exit(&mi->mi_lock);
995 				return (EINTR);
996 			}
997 		}
998 		INC_READERS(mi);
999 		mutex_exit(&mi->mi_lock);
1000 		if (fi) {
1001 			if (!VALID_FH(fi) &&
1002 			    !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1003 				int remaperr;
1004 
1005 				svp = mi->mi_curr_serv;
1006 				remaperr = failover_remap(fi);
1007 				if (remaperr != 0) {
1008 #ifdef DEBUG
1009 					if (remaperr != EINTR)
1010 						nfs_cmn_err(remaperr, CE_WARN,
1011 					    "rfscall couldn't failover: %m");
1012 #endif
1013 					mutex_enter(&mi->mi_lock);
1014 					DEC_READERS(mi);
1015 					mutex_exit(&mi->mi_lock);
1016 					/*
1017 					 * If failover_remap returns ETIMEDOUT
1018 					 * and the filesystem is hard mounted
1019 					 * we have to retry the call with a new
1020 					 * server.
1021 					 */
1022 					if ((mi->mi_flags & MI_HARD) &&
1023 					    IS_RECOVERABLE_ERROR(remaperr)) {
1024 						if (svp == mi->mi_curr_serv)
1025 							failover_newserver(mi);
1026 						rpcerr.re_status = RPC_SUCCESS;
1027 						goto failoverretry;
1028 					}
1029 					rpcerr.re_errno = remaperr;
1030 					return (remaperr);
1031 				}
1032 			}
1033 			if (fi->fhp && fi->copyproc)
1034 				(*fi->copyproc)(fi->fhp, fi->vp);
1035 		}
1036 	}
1037 
1038 	/* For TSOL, use a new cred which has net_mac_aware flag */
1039 	if (!cred_cloned && is_system_labeled()) {
1040 		cred_cloned = TRUE;
1041 		cr = crdup(icr);
1042 		(void) setpflags(NET_MAC_AWARE, 1, cr);
1043 	}
1044 
1045 	/*
1046 	 * clget() calls clnt_tli_kinit() which clears the xid, so we
1047 	 * are guaranteed to reprocess the retry as a new request.
1048 	 */
1049 	svp = mi->mi_curr_serv;
1050 	rpcerr.re_errno = nfs_clget(mi, svp, cr, &client, &ch, nfscl);
1051 
1052 	if (FAILOVER_MOUNT(mi)) {
1053 		mutex_enter(&mi->mi_lock);
1054 		DEC_READERS(mi);
1055 		mutex_exit(&mi->mi_lock);
1056 
1057 		if ((rpcerr.re_errno == ETIMEDOUT ||
1058 		    rpcerr.re_errno == ECONNRESET) &&
1059 		    failover_safe(fi)) {
1060 			if (svp == mi->mi_curr_serv)
1061 				failover_newserver(mi);
1062 			goto failoverretry;
1063 		}
1064 	}
1065 	if (rpcerr.re_errno != 0)
1066 		return (rpcerr.re_errno);
1067 
1068 	if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1069 	    svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1070 		timeo = (mi->mi_timeo * hz) / 10;
1071 	} else {
1072 		mutex_enter(&mi->mi_lock);
1073 		timeo = CLNT_SETTIMERS(client,
1074 		    &(mi->mi_timers[mi->mi_timer_type[which]]),
1075 		    &(mi->mi_timers[NFS_CALLTYPES]),
1076 		    (minimum_timeo[mi->mi_call_type[which]]*hz)>>3,
1077 		    (void (*)())NULL, (caddr_t)mi, 0);
1078 		mutex_exit(&mi->mi_lock);
1079 	}
1080 
1081 	/*
1082 	 * If hard mounted fs, retry call forever unless hard error occurs.
1083 	 */
1084 	do {
1085 		tryagain = FALSE;
1086 
1087 		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1088 			status = RPC_FAILED;
1089 			rpcerr.re_status = RPC_FAILED;
1090 			rpcerr.re_errno = EIO;
1091 			break;
1092 		}
1093 
1094 		TICK_TO_TIMEVAL(timeo, &wait);
1095 
1096 		/*
1097 		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1098 		 * and SIGTERM. (Preserving the existing masks).
1099 		 * Mask out SIGINT if mount option nointr is specified.
1100 		 */
1101 		sigintr(&smask, (int)mi->mi_flags & MI_INT);
1102 		if (!(mi->mi_flags & MI_INT))
1103 			client->cl_nosignal = TRUE;
1104 
1105 		/*
1106 		 * If there is a current signal, then don't bother
1107 		 * even trying to send out the request because we
1108 		 * won't be able to block waiting for the response.
1109 		 * Simply assume RPC_INTR and get on with it.
1110 		 */
1111 		if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1112 			status = RPC_INTR;
1113 		else {
1114 			status = CLNT_CALL(client, which, xdrargs, argsp,
1115 			    xdrres, resp, wait);
1116 		}
1117 
1118 		if (!(mi->mi_flags & MI_INT))
1119 			client->cl_nosignal = FALSE;
1120 		/*
1121 		 * restore original signal mask
1122 		 */
1123 		sigunintr(&smask);
1124 
1125 		switch (status) {
1126 		case RPC_SUCCESS:
1127 			if ((mi->mi_flags & MI_DYNAMIC) &&
1128 			    mi->mi_timer_type[which] != 0 &&
1129 			    (mi->mi_curread != my_rsize ||
1130 			    mi->mi_curwrite != my_wsize))
1131 				(void) nfs_feedback(FEEDBACK_OK, which, mi);
1132 			break;
1133 
1134 		case RPC_INTR:
1135 			/*
1136 			 * There is no way to recover from this error,
1137 			 * even if mount option nointr is specified.
1138 			 * SIGKILL, for example, cannot be blocked.
1139 			 */
1140 			rpcerr.re_status = RPC_INTR;
1141 			rpcerr.re_errno = EINTR;
1142 			break;
1143 
1144 		case RPC_UDERROR:
1145 			/*
1146 			 * If the NFS server is local (vold) and
1147 			 * it goes away then we get RPC_UDERROR.
1148 			 * This is a retryable error, so we would
1149 			 * loop, so check to see if the specific
1150 			 * error was ECONNRESET, indicating that
1151 			 * target did not exist at all.  If so,
1152 			 * return with RPC_PROGUNAVAIL and
1153 			 * ECONNRESET to indicate why.
1154 			 */
1155 			CLNT_GETERR(client, &rpcerr);
1156 			if (rpcerr.re_errno == ECONNRESET) {
1157 				rpcerr.re_status = RPC_PROGUNAVAIL;
1158 				rpcerr.re_errno = ECONNRESET;
1159 				break;
1160 			}
1161 			/*FALLTHROUGH*/
1162 
1163 		default:		/* probably RPC_TIMEDOUT */
1164 			if (IS_UNRECOVERABLE_RPC(status))
1165 				break;
1166 
1167 			/*
1168 			 * increment server not responding count
1169 			 */
1170 			mutex_enter(&mi->mi_lock);
1171 			mi->mi_noresponse++;
1172 			mutex_exit(&mi->mi_lock);
1173 #ifdef DEBUG
1174 			nfscl->nfscl_stat.noresponse.value.ui64++;
1175 #endif
1176 
1177 			if (!(mi->mi_flags & MI_HARD)) {
1178 				if (!(mi->mi_flags & MI_SEMISOFT) ||
1179 				    (mi->mi_ss_call_type[which] == 0))
1180 					break;
1181 			}
1182 
1183 			/*
1184 			 * The call is in progress (over COTS).
1185 			 * Try the CLNT_CALL again, but don't
1186 			 * print a noisy error message.
1187 			 */
1188 			if (status == RPC_INPROGRESS) {
1189 				tryagain = TRUE;
1190 				break;
1191 			}
1192 
1193 			if (flags & RFSCALL_SOFT)
1194 				break;
1195 
1196 			/*
1197 			 * On zone shutdown, just move on.
1198 			 */
1199 			if (zone_status_get(curproc->p_zone) >=
1200 			    ZONE_IS_SHUTTING_DOWN) {
1201 				rpcerr.re_status = RPC_FAILED;
1202 				rpcerr.re_errno = EIO;
1203 				break;
1204 			}
1205 
1206 			/*
1207 			 * NFS client failover support
1208 			 *
1209 			 * If the current server just failed us, we'll
1210 			 * start the process of finding a new server.
1211 			 * After that, we can just retry.
1212 			 */
1213 			if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1214 				if (svp == mi->mi_curr_serv)
1215 					failover_newserver(mi);
1216 				clfree_impl(client, ch, nfscl);
1217 				goto failoverretry;
1218 			}
1219 
1220 			tryagain = TRUE;
1221 			timeo = backoff(timeo);
1222 			mutex_enter(&mi->mi_lock);
1223 			if (!(mi->mi_flags & MI_PRINTED)) {
1224 				mi->mi_flags |= MI_PRINTED;
1225 				mutex_exit(&mi->mi_lock);
1226 #ifdef DEBUG
1227 				zprintf(zoneid,
1228 			"NFS%d server %s not responding still trying\n",
1229 				    mi->mi_vers, svp->sv_hostname);
1230 #else
1231 				zprintf(zoneid,
1232 			"NFS server %s not responding still trying\n",
1233 				    svp->sv_hostname);
1234 #endif
1235 			} else
1236 				mutex_exit(&mi->mi_lock);
1237 			if (*douprintf && nfs_has_ctty()) {
1238 				*douprintf = 0;
1239 				if (!(mi->mi_flags & MI_NOPRINT))
1240 #ifdef DEBUG
1241 					uprintf(
1242 			    "NFS%d server %s not responding still trying\n",
1243 					    mi->mi_vers, svp->sv_hostname);
1244 #else
1245 					uprintf(
1246 			    "NFS server %s not responding still trying\n",
1247 					    svp->sv_hostname);
1248 #endif
1249 			}
1250 
1251 			/*
1252 			 * If doing dynamic adjustment of transfer
1253 			 * size and if it's a read or write call
1254 			 * and if the transfer size changed while
1255 			 * retransmitting or if the feedback routine
1256 			 * changed the transfer size,
1257 			 * then exit rfscall so that the transfer
1258 			 * size can be adjusted at the vnops level.
1259 			 */
1260 			if ((mi->mi_flags & MI_DYNAMIC) &&
1261 			    mi->mi_timer_type[which] != 0 &&
1262 			    (mi->mi_curread != my_rsize ||
1263 			    mi->mi_curwrite != my_wsize ||
1264 			    nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1265 				/*
1266 				 * On read or write calls, return
1267 				 * back to the vnode ops level if
1268 				 * the transfer size changed.
1269 				 */
1270 				clfree_impl(client, ch, nfscl);
1271 				if (cred_cloned)
1272 					crfree(cr);
1273 				return (ENFS_TRYAGAIN);
1274 			}
1275 		}
1276 	} while (tryagain);
1277 
1278 	if (status != RPC_SUCCESS) {
1279 		/*
1280 		 * Let soft mounts use the timed out message.
1281 		 */
1282 		if (status == RPC_INPROGRESS)
1283 			status = RPC_TIMEDOUT;
1284 		nfscl->nfscl_stat.badcalls.value.ui64++;
1285 		if (status != RPC_INTR) {
1286 			mutex_enter(&mi->mi_lock);
1287 			mi->mi_flags |= MI_DOWN;
1288 			mutex_exit(&mi->mi_lock);
1289 			CLNT_GETERR(client, &rpcerr);
1290 #ifdef DEBUG
1291 			bufp = clnt_sperror(client, svp->sv_hostname);
1292 			zprintf(zoneid, "NFS%d %s failed for %s\n",
1293 			    mi->mi_vers, mi->mi_rfsnames[which], bufp);
1294 			if (nfs_has_ctty()) {
1295 				if (!(mi->mi_flags & MI_NOPRINT)) {
1296 					uprintf("NFS%d %s failed for %s\n",
1297 					    mi->mi_vers, mi->mi_rfsnames[which],
1298 					    bufp);
1299 				}
1300 			}
1301 			kmem_free(bufp, MAXPATHLEN);
1302 #else
1303 			zprintf(zoneid,
1304 			    "NFS %s failed for server %s: error %d (%s)\n",
1305 			    mi->mi_rfsnames[which], svp->sv_hostname,
1306 			    status, clnt_sperrno(status));
1307 			if (nfs_has_ctty()) {
1308 				if (!(mi->mi_flags & MI_NOPRINT)) {
1309 					uprintf(
1310 				"NFS %s failed for server %s: error %d (%s)\n",
1311 					    mi->mi_rfsnames[which],
1312 					    svp->sv_hostname, status,
1313 					    clnt_sperrno(status));
1314 				}
1315 			}
1316 #endif
1317 			/*
1318 			 * when CLNT_CALL() fails with RPC_AUTHERROR,
1319 			 * re_errno is set appropriately depending on
1320 			 * the authentication error
1321 			 */
1322 			if (status == RPC_VERSMISMATCH ||
1323 			    status == RPC_PROGVERSMISMATCH)
1324 				rpcerr.re_errno = EIO;
1325 		}
1326 	} else {
1327 		/*
1328 		 * Test the value of mi_down and mi_printed without
1329 		 * holding the mi_lock mutex.  If they are both zero,
1330 		 * then it is okay to skip the down and printed
1331 		 * processing.  This saves on a mutex_enter and
1332 		 * mutex_exit pair for a normal, successful RPC.
1333 		 * This was just complete overhead.
1334 		 */
1335 		if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1336 			mutex_enter(&mi->mi_lock);
1337 			mi->mi_flags &= ~MI_DOWN;
1338 			if (mi->mi_flags & MI_PRINTED) {
1339 				mi->mi_flags &= ~MI_PRINTED;
1340 				mutex_exit(&mi->mi_lock);
1341 #ifdef DEBUG
1342 			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1343 				zprintf(zoneid, "NFS%d server %s ok\n",
1344 				    mi->mi_vers, svp->sv_hostname);
1345 #else
1346 			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1347 				zprintf(zoneid, "NFS server %s ok\n",
1348 				    svp->sv_hostname);
1349 #endif
1350 			} else
1351 				mutex_exit(&mi->mi_lock);
1352 		}
1353 
1354 		if (*douprintf == 0) {
1355 			if (!(mi->mi_flags & MI_NOPRINT))
1356 #ifdef DEBUG
1357 				if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1358 					uprintf("NFS%d server %s ok\n",
1359 					    mi->mi_vers, svp->sv_hostname);
1360 #else
1361 			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1362 				uprintf("NFS server %s ok\n", svp->sv_hostname);
1363 #endif
1364 			*douprintf = 1;
1365 		}
1366 	}
1367 
1368 	clfree_impl(client, ch, nfscl);
1369 	if (cred_cloned)
1370 		crfree(cr);
1371 
1372 	ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1373 
1374 	if (rpc_status != NULL)
1375 		*rpc_status = rpcerr.re_status;
1376 
1377 	TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1378 	    rpcerr.re_errno);
1379 
1380 	return (rpcerr.re_errno);
1381 }
1382 
1383 #ifdef DEBUG
1384 static int acl2call_hits = 0;
1385 static int acl2call_misses = 0;
1386 #endif
1387 
1388 int
1389 acl2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1390     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1391     enum nfsstat *statusp, int flags, failinfo_t *fi)
1392 {
1393 	int rpcerror;
1394 
1395 	rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1396 	    cr, douprintf, flags, fi);
1397 	if (!rpcerror) {
1398 		/*
1399 		 * See comments with crnetadjust().
1400 		 */
1401 		if (*statusp == NFSERR_ACCES &&
1402 		    (cr = crnetadjust(cr)) != NULL) {
1403 #ifdef DEBUG
1404 			acl2call_hits++;
1405 #endif
1406 			rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres,
1407 			    resp, cr, douprintf, flags, fi);
1408 			crfree(cr);
1409 #ifdef DEBUG
1410 			if (*statusp == NFSERR_ACCES)
1411 				acl2call_misses++;
1412 #endif
1413 		}
1414 	}
1415 
1416 	return (rpcerror);
1417 }
1418 
1419 #ifdef DEBUG
1420 static int acl3call_hits = 0;
1421 static int acl3call_misses = 0;
1422 #endif
1423 
1424 int
1425 acl3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1426     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1427     nfsstat3 *statusp, int flags, failinfo_t *fi)
1428 {
1429 	int rpcerror;
1430 	int user_informed;
1431 
1432 	user_informed = 0;
1433 
1434 	do {
1435 		rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1436 		    cr, douprintf, flags, fi);
1437 		if (!rpcerror) {
1438 			cred_t *crr;
1439 			if (*statusp == NFS3ERR_JUKEBOX) {
1440 				if (!user_informed) {
1441 					user_informed = 1;
1442 					uprintf(
1443 		"file temporarily unavailable on the server, retrying...\n");
1444 				}
1445 				delay(nfs3_jukebox_delay);
1446 			}
1447 			/*
1448 			 * See crnetadjust() for comments.
1449 			 */
1450 			else if (*statusp == NFS3ERR_ACCES &&
1451 			    (crr = crnetadjust(cr)) != NULL) {
1452 #ifdef DEBUG
1453 				acl3call_hits++;
1454 #endif
1455 				rpcerror = aclcall(mi, which, xdrargs, argsp,
1456 				    xdrres, resp, crr, douprintf, flags, fi);
1457 
1458 				crfree(crr);
1459 #ifdef DEBUG
1460 				if (*statusp == NFS3ERR_ACCES)
1461 					acl3call_misses++;
1462 #endif
1463 			}
1464 		}
1465 	} while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
1466 
1467 	return (rpcerror);
1468 }
1469 
1470 static int
1471 aclcall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1472     xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
1473     int flags, failinfo_t *fi)
1474 {
1475 	CLIENT *client;
1476 	struct chtab *ch;
1477 	cred_t *cr = icr;
1478 	bool_t cred_cloned = FALSE;
1479 	enum clnt_stat status;
1480 	struct rpc_err rpcerr;
1481 	struct timeval wait;
1482 	int timeo;		/* in units of hz */
1483 #if 0 /* notyet */
1484 	int my_rsize, my_wsize;
1485 #endif
1486 	bool_t tryagain;
1487 	k_sigset_t smask;
1488 	servinfo_t *svp;
1489 	struct nfs_clnt *nfscl;
1490 	zoneid_t zoneid = getzoneid();
1491 #ifdef DEBUG
1492 	char *bufp;
1493 #endif
1494 
1495 #if 0 /* notyet */
1496 	TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
1497 	    "rfscall_start:which %d mi %p", which, mi);
1498 #endif
1499 
1500 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
1501 	ASSERT(nfscl != NULL);
1502 
1503 	nfscl->nfscl_stat.calls.value.ui64++;
1504 	mi->mi_aclreqs[which].value.ui64++;
1505 
1506 	rpcerr.re_status = RPC_SUCCESS;
1507 
1508 	if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1509 		rpcerr.re_status = RPC_FAILED;
1510 		rpcerr.re_errno = EIO;
1511 		return (rpcerr.re_errno);
1512 	}
1513 
1514 #if 0 /* notyet */
1515 	/*
1516 	 * Remember the transfer sizes in case
1517 	 * nfs_feedback changes them underneath us.
1518 	 */
1519 	my_rsize = mi->mi_curread;
1520 	my_wsize = mi->mi_curwrite;
1521 #endif
1522 
1523 	/*
1524 	 * NFS client failover support
1525 	 *
1526 	 * If this rnode is not in sync with the current server (VALID_FH),
1527 	 * we'd like to do a remap to get in sync.  We can be interrupted
1528 	 * in failover_remap(), and if so we'll bail.  Otherwise, we'll
1529 	 * use the best info we have to try the RPC.  Part of that is
1530 	 * unconditionally updating the filehandle copy kept for V3.
1531 	 *
1532 	 * Locking: INC_READERS/DEC_READERS is a poor man's interrruptible
1533 	 * rw_enter(); we're trying to keep the current server from being
1534 	 * changed on us until we're done with the remapping and have a
1535 	 * matching client handle.  We don't want to sending a filehandle
1536 	 * to the wrong host.
1537 	 */
1538 failoverretry:
1539 	if (FAILOVER_MOUNT(mi)) {
1540 		mutex_enter(&mi->mi_lock);
1541 		if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1542 			if (failover_wait(mi)) {
1543 				mutex_exit(&mi->mi_lock);
1544 				return (EINTR);
1545 			}
1546 		}
1547 		INC_READERS(mi);
1548 		mutex_exit(&mi->mi_lock);
1549 		if (fi) {
1550 			if (!VALID_FH(fi) &&
1551 			    !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1552 				int remaperr;
1553 
1554 				svp = mi->mi_curr_serv;
1555 				remaperr = failover_remap(fi);
1556 				if (remaperr != 0) {
1557 #ifdef DEBUG
1558 					if (remaperr != EINTR)
1559 						nfs_cmn_err(remaperr, CE_WARN,
1560 					    "aclcall couldn't failover: %m");
1561 #endif
1562 					mutex_enter(&mi->mi_lock);
1563 					DEC_READERS(mi);
1564 					mutex_exit(&mi->mi_lock);
1565 
1566 					/*
1567 					 * If failover_remap returns ETIMEDOUT
1568 					 * and the filesystem is hard mounted
1569 					 * we have to retry the call with a new
1570 					 * server.
1571 					 */
1572 					if ((mi->mi_flags & MI_HARD) &&
1573 					    IS_RECOVERABLE_ERROR(remaperr)) {
1574 						if (svp == mi->mi_curr_serv)
1575 							failover_newserver(mi);
1576 						rpcerr.re_status = RPC_SUCCESS;
1577 						goto failoverretry;
1578 					}
1579 					return (remaperr);
1580 				}
1581 			}
1582 			if (fi->fhp && fi->copyproc)
1583 				(*fi->copyproc)(fi->fhp, fi->vp);
1584 		}
1585 	}
1586 
1587 	/* For TSOL, use a new cred which has net_mac_aware flag */
1588 	if (!cred_cloned && is_system_labeled()) {
1589 		cred_cloned = TRUE;
1590 		cr = crdup(icr);
1591 		(void) setpflags(NET_MAC_AWARE, 1, cr);
1592 	}
1593 
1594 	/*
1595 	 * acl_clget() calls clnt_tli_kinit() which clears the xid, so we
1596 	 * are guaranteed to reprocess the retry as a new request.
1597 	 */
1598 	svp = mi->mi_curr_serv;
1599 	rpcerr.re_errno = acl_clget(mi, svp, cr, &client, &ch, nfscl);
1600 	if (FAILOVER_MOUNT(mi)) {
1601 		mutex_enter(&mi->mi_lock);
1602 		DEC_READERS(mi);
1603 		mutex_exit(&mi->mi_lock);
1604 
1605 		if ((rpcerr.re_errno == ETIMEDOUT ||
1606 		    rpcerr.re_errno == ECONNRESET) &&
1607 		    failover_safe(fi)) {
1608 			if (svp == mi->mi_curr_serv)
1609 				failover_newserver(mi);
1610 			goto failoverretry;
1611 		}
1612 	}
1613 	if (rpcerr.re_errno != 0) {
1614 		if (cred_cloned)
1615 			crfree(cr);
1616 		return (rpcerr.re_errno);
1617 	}
1618 
1619 	if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1620 	    svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1621 		timeo = (mi->mi_timeo * hz) / 10;
1622 	} else {
1623 		mutex_enter(&mi->mi_lock);
1624 		timeo = CLNT_SETTIMERS(client,
1625 		    &(mi->mi_timers[mi->mi_acl_timer_type[which]]),
1626 		    &(mi->mi_timers[NFS_CALLTYPES]),
1627 		    (minimum_timeo[mi->mi_acl_call_type[which]]*hz)>>3,
1628 		    (void (*)()) 0, (caddr_t)mi, 0);
1629 		mutex_exit(&mi->mi_lock);
1630 	}
1631 
1632 	/*
1633 	 * If hard mounted fs, retry call forever unless hard error occurs.
1634 	 */
1635 	do {
1636 		tryagain = FALSE;
1637 
1638 		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1639 			status = RPC_FAILED;
1640 			rpcerr.re_status = RPC_FAILED;
1641 			rpcerr.re_errno = EIO;
1642 			break;
1643 		}
1644 
1645 		TICK_TO_TIMEVAL(timeo, &wait);
1646 
1647 		/*
1648 		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1649 		 * and SIGTERM. (Preserving the existing masks).
1650 		 * Mask out SIGINT if mount option nointr is specified.
1651 		 */
1652 		sigintr(&smask, (int)mi->mi_flags & MI_INT);
1653 		if (!(mi->mi_flags & MI_INT))
1654 			client->cl_nosignal = TRUE;
1655 
1656 		/*
1657 		 * If there is a current signal, then don't bother
1658 		 * even trying to send out the request because we
1659 		 * won't be able to block waiting for the response.
1660 		 * Simply assume RPC_INTR and get on with it.
1661 		 */
1662 		if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1663 			status = RPC_INTR;
1664 		else {
1665 			status = CLNT_CALL(client, which, xdrargs, argsp,
1666 			    xdrres, resp, wait);
1667 		}
1668 
1669 		if (!(mi->mi_flags & MI_INT))
1670 			client->cl_nosignal = FALSE;
1671 		/*
1672 		 * restore original signal mask
1673 		 */
1674 		sigunintr(&smask);
1675 
1676 		switch (status) {
1677 		case RPC_SUCCESS:
1678 #if 0 /* notyet */
1679 			if ((mi->mi_flags & MI_DYNAMIC) &&
1680 			    mi->mi_timer_type[which] != 0 &&
1681 			    (mi->mi_curread != my_rsize ||
1682 			    mi->mi_curwrite != my_wsize))
1683 				(void) nfs_feedback(FEEDBACK_OK, which, mi);
1684 #endif
1685 			break;
1686 
1687 		/*
1688 		 * Unfortunately, there are servers in the world which
1689 		 * are not coded correctly.  They are not prepared to
1690 		 * handle RPC requests to the NFS port which are not
1691 		 * NFS requests.  Thus, they may try to process the
1692 		 * NFS_ACL request as if it were an NFS request.  This
1693 		 * does not work.  Generally, an error will be generated
1694 		 * on the client because it will not be able to decode
1695 		 * the response from the server.  However, it seems
1696 		 * possible that the server may not be able to decode
1697 		 * the arguments.  Thus, the criteria for deciding
1698 		 * whether the server supports NFS_ACL or not is whether
1699 		 * the following RPC errors are returned from CLNT_CALL.
1700 		 */
1701 		case RPC_CANTDECODERES:
1702 		case RPC_PROGUNAVAIL:
1703 		case RPC_CANTDECODEARGS:
1704 		case RPC_PROGVERSMISMATCH:
1705 			mutex_enter(&mi->mi_lock);
1706 			mi->mi_flags &= ~(MI_ACL | MI_EXTATTR);
1707 			mutex_exit(&mi->mi_lock);
1708 			break;
1709 
1710 		/*
1711 		 * If the server supports NFS_ACL but not the new ops
1712 		 * for extended attributes, make sure we don't retry.
1713 		 */
1714 		case RPC_PROCUNAVAIL:
1715 			mutex_enter(&mi->mi_lock);
1716 			mi->mi_flags &= ~MI_EXTATTR;
1717 			mutex_exit(&mi->mi_lock);
1718 			break;
1719 
1720 		case RPC_INTR:
1721 			/*
1722 			 * There is no way to recover from this error,
1723 			 * even if mount option nointr is specified.
1724 			 * SIGKILL, for example, cannot be blocked.
1725 			 */
1726 			rpcerr.re_status = RPC_INTR;
1727 			rpcerr.re_errno = EINTR;
1728 			break;
1729 
1730 		case RPC_UDERROR:
1731 			/*
1732 			 * If the NFS server is local (vold) and
1733 			 * it goes away then we get RPC_UDERROR.
1734 			 * This is a retryable error, so we would
1735 			 * loop, so check to see if the specific
1736 			 * error was ECONNRESET, indicating that
1737 			 * target did not exist at all.  If so,
1738 			 * return with RPC_PROGUNAVAIL and
1739 			 * ECONNRESET to indicate why.
1740 			 */
1741 			CLNT_GETERR(client, &rpcerr);
1742 			if (rpcerr.re_errno == ECONNRESET) {
1743 				rpcerr.re_status = RPC_PROGUNAVAIL;
1744 				rpcerr.re_errno = ECONNRESET;
1745 				break;
1746 			}
1747 			/*FALLTHROUGH*/
1748 
1749 		default:		/* probably RPC_TIMEDOUT */
1750 			if (IS_UNRECOVERABLE_RPC(status))
1751 				break;
1752 
1753 			/*
1754 			 * increment server not responding count
1755 			 */
1756 			mutex_enter(&mi->mi_lock);
1757 			mi->mi_noresponse++;
1758 			mutex_exit(&mi->mi_lock);
1759 #ifdef DEBUG
1760 			nfscl->nfscl_stat.noresponse.value.ui64++;
1761 #endif
1762 
1763 			if (!(mi->mi_flags & MI_HARD)) {
1764 				if (!(mi->mi_flags & MI_SEMISOFT) ||
1765 				    (mi->mi_acl_ss_call_type[which] == 0))
1766 					break;
1767 			}
1768 
1769 			/*
1770 			 * The call is in progress (over COTS).
1771 			 * Try the CLNT_CALL again, but don't
1772 			 * print a noisy error message.
1773 			 */
1774 			if (status == RPC_INPROGRESS) {
1775 				tryagain = TRUE;
1776 				break;
1777 			}
1778 
1779 			if (flags & RFSCALL_SOFT)
1780 				break;
1781 
1782 			/*
1783 			 * On zone shutdown, just move on.
1784 			 */
1785 			if (zone_status_get(curproc->p_zone) >=
1786 			    ZONE_IS_SHUTTING_DOWN) {
1787 				rpcerr.re_status = RPC_FAILED;
1788 				rpcerr.re_errno = EIO;
1789 				break;
1790 			}
1791 
1792 			/*
1793 			 * NFS client failover support
1794 			 *
1795 			 * If the current server just failed us, we'll
1796 			 * start the process of finding a new server.
1797 			 * After that, we can just retry.
1798 			 */
1799 			if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1800 				if (svp == mi->mi_curr_serv)
1801 					failover_newserver(mi);
1802 				clfree_impl(client, ch, nfscl);
1803 				goto failoverretry;
1804 			}
1805 
1806 			tryagain = TRUE;
1807 			timeo = backoff(timeo);
1808 			mutex_enter(&mi->mi_lock);
1809 			if (!(mi->mi_flags & MI_PRINTED)) {
1810 				mi->mi_flags |= MI_PRINTED;
1811 				mutex_exit(&mi->mi_lock);
1812 #ifdef DEBUG
1813 				zprintf(zoneid,
1814 			"NFS_ACL%d server %s not responding still trying\n",
1815 				    mi->mi_vers, svp->sv_hostname);
1816 #else
1817 				zprintf(zoneid,
1818 			    "NFS server %s not responding still trying\n",
1819 				    svp->sv_hostname);
1820 #endif
1821 			} else
1822 				mutex_exit(&mi->mi_lock);
1823 			if (*douprintf && nfs_has_ctty()) {
1824 				*douprintf = 0;
1825 				if (!(mi->mi_flags & MI_NOPRINT))
1826 #ifdef DEBUG
1827 					uprintf(
1828 			"NFS_ACL%d server %s not responding still trying\n",
1829 					    mi->mi_vers, svp->sv_hostname);
1830 #else
1831 					uprintf(
1832 			    "NFS server %s not responding still trying\n",
1833 					    svp->sv_hostname);
1834 #endif
1835 			}
1836 
1837 #if 0 /* notyet */
1838 			/*
1839 			 * If doing dynamic adjustment of transfer
1840 			 * size and if it's a read or write call
1841 			 * and if the transfer size changed while
1842 			 * retransmitting or if the feedback routine
1843 			 * changed the transfer size,
1844 			 * then exit rfscall so that the transfer
1845 			 * size can be adjusted at the vnops level.
1846 			 */
1847 			if ((mi->mi_flags & MI_DYNAMIC) &&
1848 			    mi->mi_acl_timer_type[which] != 0 &&
1849 			    (mi->mi_curread != my_rsize ||
1850 			    mi->mi_curwrite != my_wsize ||
1851 			    nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1852 				/*
1853 				 * On read or write calls, return
1854 				 * back to the vnode ops level if
1855 				 * the transfer size changed.
1856 				 */
1857 				clfree_impl(client, ch, nfscl);
1858 				if (cred_cloned)
1859 					crfree(cr);
1860 				return (ENFS_TRYAGAIN);
1861 			}
1862 #endif
1863 		}
1864 	} while (tryagain);
1865 
1866 	if (status != RPC_SUCCESS) {
1867 		/*
1868 		 * Let soft mounts use the timed out message.
1869 		 */
1870 		if (status == RPC_INPROGRESS)
1871 			status = RPC_TIMEDOUT;
1872 		nfscl->nfscl_stat.badcalls.value.ui64++;
1873 		if (status == RPC_CANTDECODERES ||
1874 		    status == RPC_PROGUNAVAIL ||
1875 		    status == RPC_PROCUNAVAIL ||
1876 		    status == RPC_CANTDECODEARGS ||
1877 		    status == RPC_PROGVERSMISMATCH)
1878 			CLNT_GETERR(client, &rpcerr);
1879 		else if (status != RPC_INTR) {
1880 			mutex_enter(&mi->mi_lock);
1881 			mi->mi_flags |= MI_DOWN;
1882 			mutex_exit(&mi->mi_lock);
1883 			CLNT_GETERR(client, &rpcerr);
1884 #ifdef DEBUG
1885 			bufp = clnt_sperror(client, svp->sv_hostname);
1886 			zprintf(zoneid, "NFS_ACL%d %s failed for %s\n",
1887 			    mi->mi_vers, mi->mi_aclnames[which], bufp);
1888 			if (nfs_has_ctty()) {
1889 				if (!(mi->mi_flags & MI_NOPRINT)) {
1890 					uprintf("NFS_ACL%d %s failed for %s\n",
1891 					    mi->mi_vers, mi->mi_aclnames[which],
1892 					    bufp);
1893 				}
1894 			}
1895 			kmem_free(bufp, MAXPATHLEN);
1896 #else
1897 			zprintf(zoneid,
1898 			    "NFS %s failed for server %s: error %d (%s)\n",
1899 			    mi->mi_aclnames[which], svp->sv_hostname,
1900 			    status, clnt_sperrno(status));
1901 			if (nfs_has_ctty()) {
1902 				if (!(mi->mi_flags & MI_NOPRINT))
1903 					uprintf(
1904 				"NFS %s failed for server %s: error %d (%s)\n",
1905 					    mi->mi_aclnames[which],
1906 					    svp->sv_hostname, status,
1907 					    clnt_sperrno(status));
1908 			}
1909 #endif
1910 			/*
1911 			 * when CLNT_CALL() fails with RPC_AUTHERROR,
1912 			 * re_errno is set appropriately depending on
1913 			 * the authentication error
1914 			 */
1915 			if (status == RPC_VERSMISMATCH ||
1916 			    status == RPC_PROGVERSMISMATCH)
1917 				rpcerr.re_errno = EIO;
1918 		}
1919 	} else {
1920 		/*
1921 		 * Test the value of mi_down and mi_printed without
1922 		 * holding the mi_lock mutex.  If they are both zero,
1923 		 * then it is okay to skip the down and printed
1924 		 * processing.  This saves on a mutex_enter and
1925 		 * mutex_exit pair for a normal, successful RPC.
1926 		 * This was just complete overhead.
1927 		 */
1928 		if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1929 			mutex_enter(&mi->mi_lock);
1930 			mi->mi_flags &= ~MI_DOWN;
1931 			if (mi->mi_flags & MI_PRINTED) {
1932 				mi->mi_flags &= ~MI_PRINTED;
1933 				mutex_exit(&mi->mi_lock);
1934 #ifdef DEBUG
1935 				zprintf(zoneid, "NFS_ACL%d server %s ok\n",
1936 				    mi->mi_vers, svp->sv_hostname);
1937 #else
1938 				zprintf(zoneid, "NFS server %s ok\n",
1939 				    svp->sv_hostname);
1940 #endif
1941 			} else
1942 				mutex_exit(&mi->mi_lock);
1943 		}
1944 
1945 		if (*douprintf == 0) {
1946 			if (!(mi->mi_flags & MI_NOPRINT))
1947 #ifdef DEBUG
1948 				uprintf("NFS_ACL%d server %s ok\n",
1949 				    mi->mi_vers, svp->sv_hostname);
1950 #else
1951 				uprintf("NFS server %s ok\n", svp->sv_hostname);
1952 #endif
1953 			*douprintf = 1;
1954 		}
1955 	}
1956 
1957 	clfree_impl(client, ch, nfscl);
1958 	if (cred_cloned)
1959 		crfree(cr);
1960 
1961 	ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1962 
1963 #if 0 /* notyet */
1964 	TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1965 	    rpcerr.re_errno);
1966 #endif
1967 
1968 	return (rpcerr.re_errno);
1969 }
1970 
1971 int
1972 vattr_to_sattr(struct vattr *vap, struct nfssattr *sa)
1973 {
1974 	uint_t mask = vap->va_mask;
1975 
1976 	if (!(mask & AT_MODE))
1977 		sa->sa_mode = (uint32_t)-1;
1978 	else
1979 		sa->sa_mode = vap->va_mode;
1980 	if (!(mask & AT_UID))
1981 		sa->sa_uid = (uint32_t)-1;
1982 	else
1983 		sa->sa_uid = (uint32_t)vap->va_uid;
1984 	if (!(mask & AT_GID))
1985 		sa->sa_gid = (uint32_t)-1;
1986 	else
1987 		sa->sa_gid = (uint32_t)vap->va_gid;
1988 	if (!(mask & AT_SIZE))
1989 		sa->sa_size = (uint32_t)-1;
1990 	else
1991 		sa->sa_size = (uint32_t)vap->va_size;
1992 	if (!(mask & AT_ATIME))
1993 		sa->sa_atime.tv_sec = sa->sa_atime.tv_usec = (int32_t)-1;
1994 	else {
1995 		/* check time validity */
1996 		if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
1997 			return (EOVERFLOW);
1998 		}
1999 		sa->sa_atime.tv_sec = vap->va_atime.tv_sec;
2000 		sa->sa_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2001 	}
2002 	if (!(mask & AT_MTIME))
2003 		sa->sa_mtime.tv_sec = sa->sa_mtime.tv_usec = (int32_t)-1;
2004 	else {
2005 		/* check time validity */
2006 		if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2007 			return (EOVERFLOW);
2008 		}
2009 		sa->sa_mtime.tv_sec = vap->va_mtime.tv_sec;
2010 		sa->sa_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2011 	}
2012 	return (0);
2013 }
2014 
2015 int
2016 vattr_to_sattr3(struct vattr *vap, sattr3 *sa)
2017 {
2018 	uint_t mask = vap->va_mask;
2019 
2020 	if (!(mask & AT_MODE))
2021 		sa->mode.set_it = FALSE;
2022 	else {
2023 		sa->mode.set_it = TRUE;
2024 		sa->mode.mode = (mode3)vap->va_mode;
2025 	}
2026 	if (!(mask & AT_UID))
2027 		sa->uid.set_it = FALSE;
2028 	else {
2029 		sa->uid.set_it = TRUE;
2030 		sa->uid.uid = (uid3)vap->va_uid;
2031 	}
2032 	if (!(mask & AT_GID))
2033 		sa->gid.set_it = FALSE;
2034 	else {
2035 		sa->gid.set_it = TRUE;
2036 		sa->gid.gid = (gid3)vap->va_gid;
2037 	}
2038 	if (!(mask & AT_SIZE))
2039 		sa->size.set_it = FALSE;
2040 	else {
2041 		sa->size.set_it = TRUE;
2042 		sa->size.size = (size3)vap->va_size;
2043 	}
2044 	if (!(mask & AT_ATIME))
2045 		sa->atime.set_it = DONT_CHANGE;
2046 	else {
2047 		/* check time validity */
2048 		if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
2049 			return (EOVERFLOW);
2050 		}
2051 		sa->atime.set_it = SET_TO_CLIENT_TIME;
2052 		sa->atime.atime.seconds = (uint32)vap->va_atime.tv_sec;
2053 		sa->atime.atime.nseconds = (uint32)vap->va_atime.tv_nsec;
2054 	}
2055 	if (!(mask & AT_MTIME))
2056 		sa->mtime.set_it = DONT_CHANGE;
2057 	else {
2058 		/* check time validity */
2059 		if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2060 			return (EOVERFLOW);
2061 		}
2062 		sa->mtime.set_it = SET_TO_CLIENT_TIME;
2063 		sa->mtime.mtime.seconds = (uint32)vap->va_mtime.tv_sec;
2064 		sa->mtime.mtime.nseconds = (uint32)vap->va_mtime.tv_nsec;
2065 	}
2066 	return (0);
2067 }
2068 
2069 void
2070 setdiropargs(struct nfsdiropargs *da, char *nm, vnode_t *dvp)
2071 {
2072 
2073 	da->da_fhandle = VTOFH(dvp);
2074 	da->da_name = nm;
2075 	da->da_flags = 0;
2076 }
2077 
2078 void
2079 setdiropargs3(diropargs3 *da, char *nm, vnode_t *dvp)
2080 {
2081 
2082 	da->dirp = VTOFH3(dvp);
2083 	da->name = nm;
2084 }
2085 
2086 int
2087 setdirgid(vnode_t *dvp, gid_t *gidp, cred_t *cr)
2088 {
2089 	int error;
2090 	rnode_t *rp;
2091 	struct vattr va;
2092 
2093 	va.va_mask = AT_MODE | AT_GID;
2094 	error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2095 	if (error)
2096 		return (error);
2097 
2098 	/*
2099 	 * To determine the expected group-id of the created file:
2100 	 *  1)	If the filesystem was not mounted with the Old-BSD-compatible
2101 	 *	GRPID option, and the directory's set-gid bit is clear,
2102 	 *	then use the process's gid.
2103 	 *  2)	Otherwise, set the group-id to the gid of the parent directory.
2104 	 */
2105 	rp = VTOR(dvp);
2106 	mutex_enter(&rp->r_statelock);
2107 	if (!(VTOMI(dvp)->mi_flags & MI_GRPID) && !(va.va_mode & VSGID))
2108 		*gidp = crgetgid(cr);
2109 	else
2110 		*gidp = va.va_gid;
2111 	mutex_exit(&rp->r_statelock);
2112 	return (0);
2113 }
2114 
2115 int
2116 setdirmode(vnode_t *dvp, mode_t *omp, cred_t *cr)
2117 {
2118 	int error;
2119 	struct vattr va;
2120 
2121 	va.va_mask = AT_MODE;
2122 	error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2123 	if (error)
2124 		return (error);
2125 
2126 	/*
2127 	 * Modify the expected mode (om) so that the set-gid bit matches
2128 	 * that of the parent directory (dvp).
2129 	 */
2130 	if (va.va_mode & VSGID)
2131 		*omp |= VSGID;
2132 	else
2133 		*omp &= ~VSGID;
2134 	return (0);
2135 }
2136 
2137 void
2138 nfs_setswaplike(vnode_t *vp, vattr_t *vap)
2139 {
2140 
2141 	if (vp->v_type == VREG && (vap->va_mode & (VEXEC | VSVTX)) == VSVTX) {
2142 		if (!(vp->v_flag & VSWAPLIKE)) {
2143 			mutex_enter(&vp->v_lock);
2144 			vp->v_flag |= VSWAPLIKE;
2145 			mutex_exit(&vp->v_lock);
2146 		}
2147 	} else {
2148 		if (vp->v_flag & VSWAPLIKE) {
2149 			mutex_enter(&vp->v_lock);
2150 			vp->v_flag &= ~VSWAPLIKE;
2151 			mutex_exit(&vp->v_lock);
2152 		}
2153 	}
2154 }
2155 
2156 /*
2157  * Free the resources associated with an rnode.
2158  */
2159 static void
2160 rinactive(rnode_t *rp, cred_t *cr)
2161 {
2162 	vnode_t *vp;
2163 	cred_t *cred;
2164 	char *contents;
2165 	int size;
2166 	vsecattr_t *vsp;
2167 	int error;
2168 	nfs3_pathconf_info *info;
2169 
2170 	/*
2171 	 * Before freeing anything, wait until all asynchronous
2172 	 * activity is done on this rnode.  This will allow all
2173 	 * asynchronous read ahead and write behind i/o's to
2174 	 * finish.
2175 	 */
2176 	mutex_enter(&rp->r_statelock);
2177 	while (rp->r_count > 0)
2178 		cv_wait(&rp->r_cv, &rp->r_statelock);
2179 	mutex_exit(&rp->r_statelock);
2180 
2181 	/*
2182 	 * Flush and invalidate all pages associated with the vnode.
2183 	 */
2184 	vp = RTOV(rp);
2185 	if (vn_has_cached_data(vp)) {
2186 		ASSERT(vp->v_type != VCHR);
2187 		if ((rp->r_flags & RDIRTY) && !rp->r_error) {
2188 			error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL);
2189 			if (error && (error == ENOSPC || error == EDQUOT)) {
2190 				mutex_enter(&rp->r_statelock);
2191 				if (!rp->r_error)
2192 					rp->r_error = error;
2193 				mutex_exit(&rp->r_statelock);
2194 			}
2195 		}
2196 		nfs_invalidate_pages(vp, (u_offset_t)0, cr);
2197 	}
2198 
2199 	/*
2200 	 * Free any held credentials and caches which may be associated
2201 	 * with this rnode.
2202 	 */
2203 	mutex_enter(&rp->r_statelock);
2204 	cred = rp->r_cred;
2205 	rp->r_cred = NULL;
2206 	contents = rp->r_symlink.contents;
2207 	size = rp->r_symlink.size;
2208 	rp->r_symlink.contents = NULL;
2209 	vsp = rp->r_secattr;
2210 	rp->r_secattr = NULL;
2211 	info = rp->r_pathconf;
2212 	rp->r_pathconf = NULL;
2213 	mutex_exit(&rp->r_statelock);
2214 
2215 	/*
2216 	 * Free the held credential.
2217 	 */
2218 	if (cred != NULL)
2219 		crfree(cred);
2220 
2221 	/*
2222 	 * Free the access cache entries.
2223 	 */
2224 	(void) nfs_access_purge_rp(rp);
2225 
2226 	/*
2227 	 * Free the readdir cache entries.
2228 	 */
2229 	if (HAVE_RDDIR_CACHE(rp))
2230 		nfs_purge_rddir_cache(vp);
2231 
2232 	/*
2233 	 * Free the symbolic link cache.
2234 	 */
2235 	if (contents != NULL) {
2236 
2237 		kmem_free((void *)contents, size);
2238 	}
2239 
2240 	/*
2241 	 * Free any cached ACL.
2242 	 */
2243 	if (vsp != NULL)
2244 		nfs_acl_free(vsp);
2245 
2246 	/*
2247 	 * Free any cached pathconf information.
2248 	 */
2249 	if (info != NULL)
2250 		kmem_free(info, sizeof (*info));
2251 }
2252 
2253 /*
2254  * Return a vnode for the given NFS Version 2 file handle.
2255  * If no rnode exists for this fhandle, create one and put it
2256  * into the hash queues.  If the rnode for this fhandle
2257  * already exists, return it.
2258  *
2259  * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2260  */
2261 vnode_t *
2262 makenfsnode(fhandle_t *fh, struct nfsfattr *attr, struct vfs *vfsp,
2263     hrtime_t t, cred_t *cr, char *dnm, char *nm)
2264 {
2265 	int newnode;
2266 	int index;
2267 	vnode_t *vp;
2268 	nfs_fhandle nfh;
2269 	vattr_t va;
2270 
2271 	nfh.fh_len = NFS_FHSIZE;
2272 	bcopy(fh, nfh.fh_buf, NFS_FHSIZE);
2273 
2274 	index = rtablehash(&nfh);
2275 	rw_enter(&rtable[index].r_lock, RW_READER);
2276 
2277 	vp = make_rnode(&nfh, &rtable[index], vfsp, nfs_vnodeops,
2278 	    nfs_putapage, nfs_rddir_compar, &newnode, cr, dnm, nm);
2279 
2280 	if (attr != NULL) {
2281 		if (!newnode) {
2282 			rw_exit(&rtable[index].r_lock);
2283 			(void) nfs_cache_fattr(vp, attr, &va, t, cr);
2284 		} else {
2285 			if (attr->na_type < NFNON || attr->na_type > NFSOC)
2286 				vp->v_type = VBAD;
2287 			else
2288 				vp->v_type = n2v_type(attr);
2289 			/*
2290 			 * A translation here seems to be necessary
2291 			 * because this function can be called
2292 			 * with `attr' that has come from the wire,
2293 			 * and been operated on by vattr_to_nattr().
2294 			 * See nfsrootvp()->VOP_GETTATTR()->nfsgetattr()
2295 			 * ->nfs_getattr_otw()->rfscall()->vattr_to_nattr()
2296 			 * ->makenfsnode().
2297 			 */
2298 			if ((attr->na_rdev & 0xffff0000) == 0)
2299 				vp->v_rdev = nfsv2_expdev(attr->na_rdev);
2300 			else
2301 				vp->v_rdev = expldev(n2v_rdev(attr));
2302 			nfs_attrcache(vp, attr, t);
2303 			rw_exit(&rtable[index].r_lock);
2304 		}
2305 	} else {
2306 		if (newnode) {
2307 			PURGE_ATTRCACHE(vp);
2308 		}
2309 		rw_exit(&rtable[index].r_lock);
2310 	}
2311 
2312 	return (vp);
2313 }
2314 
2315 /*
2316  * Return a vnode for the given NFS Version 3 file handle.
2317  * If no rnode exists for this fhandle, create one and put it
2318  * into the hash queues.  If the rnode for this fhandle
2319  * already exists, return it.
2320  *
2321  * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2322  */
2323 vnode_t *
2324 makenfs3node_va(nfs_fh3 *fh, vattr_t *vap, struct vfs *vfsp, hrtime_t t,
2325     cred_t *cr, char *dnm, char *nm)
2326 {
2327 	int newnode;
2328 	int index;
2329 	vnode_t *vp;
2330 
2331 	index = rtablehash((nfs_fhandle *)fh);
2332 	rw_enter(&rtable[index].r_lock, RW_READER);
2333 
2334 	vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2335 	    nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2336 	    dnm, nm);
2337 
2338 	if (vap == NULL) {
2339 		if (newnode) {
2340 			PURGE_ATTRCACHE(vp);
2341 		}
2342 		rw_exit(&rtable[index].r_lock);
2343 		return (vp);
2344 	}
2345 
2346 	if (!newnode) {
2347 		rw_exit(&rtable[index].r_lock);
2348 		nfs_attr_cache(vp, vap, t, cr);
2349 	} else {
2350 		rnode_t *rp = VTOR(vp);
2351 
2352 		vp->v_type = vap->va_type;
2353 		vp->v_rdev = vap->va_rdev;
2354 
2355 		mutex_enter(&rp->r_statelock);
2356 		if (rp->r_mtime <= t)
2357 			nfs_attrcache_va(vp, vap);
2358 		mutex_exit(&rp->r_statelock);
2359 		rw_exit(&rtable[index].r_lock);
2360 	}
2361 
2362 	return (vp);
2363 }
2364 
2365 vnode_t *
2366 makenfs3node(nfs_fh3 *fh, fattr3 *attr, struct vfs *vfsp, hrtime_t t,
2367     cred_t *cr, char *dnm, char *nm)
2368 {
2369 	int newnode;
2370 	int index;
2371 	vnode_t *vp;
2372 	vattr_t va;
2373 
2374 	index = rtablehash((nfs_fhandle *)fh);
2375 	rw_enter(&rtable[index].r_lock, RW_READER);
2376 
2377 	vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2378 	    nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2379 	    dnm, nm);
2380 
2381 	if (attr == NULL) {
2382 		if (newnode) {
2383 			PURGE_ATTRCACHE(vp);
2384 		}
2385 		rw_exit(&rtable[index].r_lock);
2386 		return (vp);
2387 	}
2388 
2389 	if (!newnode) {
2390 		rw_exit(&rtable[index].r_lock);
2391 		(void) nfs3_cache_fattr3(vp, attr, &va, t, cr);
2392 	} else {
2393 		if (attr->type < NF3REG || attr->type > NF3FIFO)
2394 			vp->v_type = VBAD;
2395 		else
2396 			vp->v_type = nf3_to_vt[attr->type];
2397 		vp->v_rdev = makedevice(attr->rdev.specdata1,
2398 		    attr->rdev.specdata2);
2399 		nfs3_attrcache(vp, attr, t);
2400 		rw_exit(&rtable[index].r_lock);
2401 	}
2402 
2403 	return (vp);
2404 }
2405 
2406 /*
2407  * Read this comment before making changes to rtablehash()!
2408  * This is a hash function in which seemingly obvious and harmless
2409  * changes can cause escalations costing million dollars!
2410  * Know what you are doing.
2411  *
2412  * rtablehash() implements Jenkins' one-at-a-time hash algorithm.  The
2413  * algorithm is currently detailed here:
2414  *
2415  *   http://burtleburtle.net/bob/hash/doobs.html
2416  *
2417  * Of course, the above link may not be valid by the time you are reading
2418  * this, but suffice it to say that the one-at-a-time algorithm works well in
2419  * almost all cases.  If you are changing the algorithm be sure to verify that
2420  * the hash algorithm still provides even distribution in all cases and with
2421  * any server returning filehandles in whatever order (sequential or random).
2422  */
2423 static int
2424 rtablehash(nfs_fhandle *fh)
2425 {
2426 	ulong_t hash, len, i;
2427 	char *key;
2428 
2429 	key = fh->fh_buf;
2430 	len = (ulong_t)fh->fh_len;
2431 	for (hash = 0, i = 0; i < len; i++) {
2432 		hash += key[i];
2433 		hash += (hash << 10);
2434 		hash ^= (hash >> 6);
2435 	}
2436 	hash += (hash << 3);
2437 	hash ^= (hash >> 11);
2438 	hash += (hash << 15);
2439 	return (hash & rtablemask);
2440 }
2441 
2442 static vnode_t *
2443 make_rnode(nfs_fhandle *fh, rhashq_t *rhtp, struct vfs *vfsp,
2444     struct vnodeops *vops,
2445     int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *),
2446     int (*compar)(const void *, const void *),
2447     int *newnode, cred_t *cr, char *dnm, char *nm)
2448 {
2449 	rnode_t *rp;
2450 	rnode_t *trp;
2451 	vnode_t *vp;
2452 	mntinfo_t *mi;
2453 
2454 	ASSERT(RW_READ_HELD(&rhtp->r_lock));
2455 
2456 	mi = VFTOMI(vfsp);
2457 start:
2458 	if ((rp = rfind(rhtp, fh, vfsp)) != NULL) {
2459 		vp = RTOV(rp);
2460 		nfs_set_vroot(vp);
2461 		*newnode = 0;
2462 		return (vp);
2463 	}
2464 	rw_exit(&rhtp->r_lock);
2465 
2466 	mutex_enter(&rpfreelist_lock);
2467 	if (rpfreelist != NULL && rnew >= nrnode) {
2468 		rp = rpfreelist;
2469 		rp_rmfree(rp);
2470 		mutex_exit(&rpfreelist_lock);
2471 
2472 		vp = RTOV(rp);
2473 
2474 		if (rp->r_flags & RHASHED) {
2475 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2476 			mutex_enter(&vp->v_lock);
2477 			if (vp->v_count > 1) {
2478 				vp->v_count--;
2479 				mutex_exit(&vp->v_lock);
2480 				rw_exit(&rp->r_hashq->r_lock);
2481 				rw_enter(&rhtp->r_lock, RW_READER);
2482 				goto start;
2483 			}
2484 			mutex_exit(&vp->v_lock);
2485 			rp_rmhash_locked(rp);
2486 			rw_exit(&rp->r_hashq->r_lock);
2487 		}
2488 
2489 		rinactive(rp, cr);
2490 
2491 		mutex_enter(&vp->v_lock);
2492 		if (vp->v_count > 1) {
2493 			vp->v_count--;
2494 			mutex_exit(&vp->v_lock);
2495 			rw_enter(&rhtp->r_lock, RW_READER);
2496 			goto start;
2497 		}
2498 		mutex_exit(&vp->v_lock);
2499 		vn_invalid(vp);
2500 		/*
2501 		 * destroy old locks before bzero'ing and
2502 		 * recreating the locks below.
2503 		 */
2504 		nfs_rw_destroy(&rp->r_rwlock);
2505 		nfs_rw_destroy(&rp->r_lkserlock);
2506 		mutex_destroy(&rp->r_statelock);
2507 		cv_destroy(&rp->r_cv);
2508 		cv_destroy(&rp->r_commit.c_cv);
2509 		nfs_free_r_path(rp);
2510 		avl_destroy(&rp->r_dir);
2511 		/*
2512 		 * Make sure that if rnode is recycled then
2513 		 * VFS count is decremented properly before
2514 		 * reuse.
2515 		 */
2516 		VFS_RELE(vp->v_vfsp);
2517 		vn_reinit(vp);
2518 	} else {
2519 		vnode_t *new_vp;
2520 
2521 		mutex_exit(&rpfreelist_lock);
2522 
2523 		rp = kmem_cache_alloc(rnode_cache, KM_SLEEP);
2524 		new_vp = vn_alloc(KM_SLEEP);
2525 
2526 		atomic_add_long((ulong_t *)&rnew, 1);
2527 #ifdef DEBUG
2528 		clstat_debug.nrnode.value.ui64++;
2529 #endif
2530 		vp = new_vp;
2531 	}
2532 
2533 	bzero(rp, sizeof (*rp));
2534 	rp->r_vnode = vp;
2535 	nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
2536 	nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
2537 	mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
2538 	cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
2539 	cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
2540 	rp->r_fh.fh_len = fh->fh_len;
2541 	bcopy(fh->fh_buf, rp->r_fh.fh_buf, fh->fh_len);
2542 	rp->r_server = mi->mi_curr_serv;
2543 	if (FAILOVER_MOUNT(mi)) {
2544 		/*
2545 		 * If replicated servers, stash pathnames
2546 		 */
2547 		if (dnm != NULL && nm != NULL) {
2548 			char *s, *p;
2549 			uint_t len;
2550 
2551 			len = (uint_t)(strlen(dnm) + strlen(nm) + 2);
2552 			rp->r_path = kmem_alloc(len, KM_SLEEP);
2553 #ifdef DEBUG
2554 			clstat_debug.rpath.value.ui64 += len;
2555 #endif
2556 			s = rp->r_path;
2557 			for (p = dnm; *p; p++)
2558 				*s++ = *p;
2559 			*s++ = '/';
2560 			for (p = nm; *p; p++)
2561 				*s++ = *p;
2562 			*s = '\0';
2563 		} else {
2564 			/* special case for root */
2565 			rp->r_path = kmem_alloc(2, KM_SLEEP);
2566 #ifdef DEBUG
2567 			clstat_debug.rpath.value.ui64 += 2;
2568 #endif
2569 			*rp->r_path = '.';
2570 			*(rp->r_path + 1) = '\0';
2571 		}
2572 	}
2573 	VFS_HOLD(vfsp);
2574 	rp->r_putapage = putapage;
2575 	rp->r_hashq = rhtp;
2576 	rp->r_flags = RREADDIRPLUS;
2577 	avl_create(&rp->r_dir, compar, sizeof (rddir_cache),
2578 	    offsetof(rddir_cache, tree));
2579 	vn_setops(vp, vops);
2580 	vp->v_data = (caddr_t)rp;
2581 	vp->v_vfsp = vfsp;
2582 	vp->v_type = VNON;
2583 	nfs_set_vroot(vp);
2584 
2585 	/*
2586 	 * There is a race condition if someone else
2587 	 * alloc's the rnode while no locks are held, so we
2588 	 * check again and recover if found.
2589 	 */
2590 	rw_enter(&rhtp->r_lock, RW_WRITER);
2591 	if ((trp = rfind(rhtp, fh, vfsp)) != NULL) {
2592 		vp = RTOV(trp);
2593 		nfs_set_vroot(vp);
2594 		*newnode = 0;
2595 		rw_exit(&rhtp->r_lock);
2596 		rp_addfree(rp, cr);
2597 		rw_enter(&rhtp->r_lock, RW_READER);
2598 		return (vp);
2599 	}
2600 	rp_addhash(rp);
2601 	*newnode = 1;
2602 	return (vp);
2603 }
2604 
2605 static void
2606 nfs_set_vroot(vnode_t *vp)
2607 {
2608 	rnode_t *rp;
2609 	nfs_fhandle *rootfh;
2610 
2611 	rp = VTOR(vp);
2612 	rootfh = &rp->r_server->sv_fhandle;
2613 	if (rootfh->fh_len == rp->r_fh.fh_len &&
2614 	    bcmp(rootfh->fh_buf, rp->r_fh.fh_buf, rp->r_fh.fh_len) == 0) {
2615 		if (!(vp->v_flag & VROOT)) {
2616 			mutex_enter(&vp->v_lock);
2617 			vp->v_flag |= VROOT;
2618 			mutex_exit(&vp->v_lock);
2619 		}
2620 	}
2621 }
2622 
2623 static void
2624 nfs_free_r_path(rnode_t *rp)
2625 {
2626 	char *path;
2627 	size_t len;
2628 
2629 	path = rp->r_path;
2630 	if (path) {
2631 		rp->r_path = NULL;
2632 		len = strlen(path) + 1;
2633 		kmem_free(path, len);
2634 #ifdef DEBUG
2635 		clstat_debug.rpath.value.ui64 -= len;
2636 #endif
2637 	}
2638 }
2639 
2640 /*
2641  * Put an rnode on the free list.
2642  *
2643  * Rnodes which were allocated above and beyond the normal limit
2644  * are immediately freed.
2645  */
2646 void
2647 rp_addfree(rnode_t *rp, cred_t *cr)
2648 {
2649 	vnode_t *vp;
2650 	struct vfs *vfsp;
2651 
2652 	vp = RTOV(rp);
2653 	ASSERT(vp->v_count >= 1);
2654 	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
2655 
2656 	/*
2657 	 * If we have too many rnodes allocated and there are no
2658 	 * references to this rnode, or if the rnode is no longer
2659 	 * accessible by it does not reside in the hash queues,
2660 	 * or if an i/o error occurred while writing to the file,
2661 	 * then just free it instead of putting it on the rnode
2662 	 * freelist.
2663 	 */
2664 	vfsp = vp->v_vfsp;
2665 	if (((rnew > nrnode || !(rp->r_flags & RHASHED) || rp->r_error ||
2666 	    (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
2667 		if (rp->r_flags & RHASHED) {
2668 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2669 			mutex_enter(&vp->v_lock);
2670 			if (vp->v_count > 1) {
2671 				vp->v_count--;
2672 				mutex_exit(&vp->v_lock);
2673 				rw_exit(&rp->r_hashq->r_lock);
2674 				return;
2675 			}
2676 			mutex_exit(&vp->v_lock);
2677 			rp_rmhash_locked(rp);
2678 			rw_exit(&rp->r_hashq->r_lock);
2679 		}
2680 
2681 		rinactive(rp, cr);
2682 
2683 		/*
2684 		 * Recheck the vnode reference count.  We need to
2685 		 * make sure that another reference has not been
2686 		 * acquired while we were not holding v_lock.  The
2687 		 * rnode is not in the rnode hash queues, so the
2688 		 * only way for a reference to have been acquired
2689 		 * is for a VOP_PUTPAGE because the rnode was marked
2690 		 * with RDIRTY or for a modified page.  This
2691 		 * reference may have been acquired before our call
2692 		 * to rinactive.  The i/o may have been completed,
2693 		 * thus allowing rinactive to complete, but the
2694 		 * reference to the vnode may not have been released
2695 		 * yet.  In any case, the rnode can not be destroyed
2696 		 * until the other references to this vnode have been
2697 		 * released.  The other references will take care of
2698 		 * either destroying the rnode or placing it on the
2699 		 * rnode freelist.  If there are no other references,
2700 		 * then the rnode may be safely destroyed.
2701 		 */
2702 		mutex_enter(&vp->v_lock);
2703 		if (vp->v_count > 1) {
2704 			vp->v_count--;
2705 			mutex_exit(&vp->v_lock);
2706 			return;
2707 		}
2708 		mutex_exit(&vp->v_lock);
2709 
2710 		destroy_rnode(rp);
2711 		return;
2712 	}
2713 
2714 	/*
2715 	 * Lock the hash queue and then recheck the reference count
2716 	 * to ensure that no other threads have acquired a reference
2717 	 * to indicate that the rnode should not be placed on the
2718 	 * freelist.  If another reference has been acquired, then
2719 	 * just release this one and let the other thread complete
2720 	 * the processing of adding this rnode to the freelist.
2721 	 */
2722 	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2723 
2724 	mutex_enter(&vp->v_lock);
2725 	if (vp->v_count > 1) {
2726 		vp->v_count--;
2727 		mutex_exit(&vp->v_lock);
2728 		rw_exit(&rp->r_hashq->r_lock);
2729 		return;
2730 	}
2731 	mutex_exit(&vp->v_lock);
2732 
2733 	/*
2734 	 * If there is no cached data or metadata for this file, then
2735 	 * put the rnode on the front of the freelist so that it will
2736 	 * be reused before other rnodes which may have cached data or
2737 	 * metadata associated with them.
2738 	 */
2739 	mutex_enter(&rpfreelist_lock);
2740 	if (rpfreelist == NULL) {
2741 		rp->r_freef = rp;
2742 		rp->r_freeb = rp;
2743 		rpfreelist = rp;
2744 	} else {
2745 		rp->r_freef = rpfreelist;
2746 		rp->r_freeb = rpfreelist->r_freeb;
2747 		rpfreelist->r_freeb->r_freef = rp;
2748 		rpfreelist->r_freeb = rp;
2749 		if (!vn_has_cached_data(vp) &&
2750 		    !HAVE_RDDIR_CACHE(rp) &&
2751 		    rp->r_symlink.contents == NULL &&
2752 		    rp->r_secattr == NULL &&
2753 		    rp->r_pathconf == NULL)
2754 			rpfreelist = rp;
2755 	}
2756 	mutex_exit(&rpfreelist_lock);
2757 
2758 	rw_exit(&rp->r_hashq->r_lock);
2759 }
2760 
2761 /*
2762  * Remove an rnode from the free list.
2763  *
2764  * The caller must be holding rpfreelist_lock and the rnode
2765  * must be on the freelist.
2766  */
2767 static void
2768 rp_rmfree(rnode_t *rp)
2769 {
2770 
2771 	ASSERT(MUTEX_HELD(&rpfreelist_lock));
2772 	ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);
2773 
2774 	if (rp == rpfreelist) {
2775 		rpfreelist = rp->r_freef;
2776 		if (rp == rpfreelist)
2777 			rpfreelist = NULL;
2778 	}
2779 
2780 	rp->r_freeb->r_freef = rp->r_freef;
2781 	rp->r_freef->r_freeb = rp->r_freeb;
2782 
2783 	rp->r_freef = rp->r_freeb = NULL;
2784 }
2785 
2786 /*
2787  * Put a rnode in the hash table.
2788  *
2789  * The caller must be holding the exclusive hash queue lock.
2790  */
2791 static void
2792 rp_addhash(rnode_t *rp)
2793 {
2794 
2795 	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2796 	ASSERT(!(rp->r_flags & RHASHED));
2797 
2798 	rp->r_hashf = rp->r_hashq->r_hashf;
2799 	rp->r_hashq->r_hashf = rp;
2800 	rp->r_hashb = (rnode_t *)rp->r_hashq;
2801 	rp->r_hashf->r_hashb = rp;
2802 
2803 	mutex_enter(&rp->r_statelock);
2804 	rp->r_flags |= RHASHED;
2805 	mutex_exit(&rp->r_statelock);
2806 }
2807 
2808 /*
2809  * Remove a rnode from the hash table.
2810  *
2811  * The caller must be holding the hash queue lock.
2812  */
2813 static void
2814 rp_rmhash_locked(rnode_t *rp)
2815 {
2816 
2817 	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2818 	ASSERT(rp->r_flags & RHASHED);
2819 
2820 	rp->r_hashb->r_hashf = rp->r_hashf;
2821 	rp->r_hashf->r_hashb = rp->r_hashb;
2822 
2823 	mutex_enter(&rp->r_statelock);
2824 	rp->r_flags &= ~RHASHED;
2825 	mutex_exit(&rp->r_statelock);
2826 }
2827 
2828 /*
2829  * Remove a rnode from the hash table.
2830  *
2831  * The caller must not be holding the hash queue lock.
2832  */
2833 void
2834 rp_rmhash(rnode_t *rp)
2835 {
2836 
2837 	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2838 	rp_rmhash_locked(rp);
2839 	rw_exit(&rp->r_hashq->r_lock);
2840 }
2841 
2842 /*
2843  * Lookup a rnode by fhandle.
2844  *
2845  * The caller must be holding the hash queue lock, either shared or exclusive.
2846  */
2847 static rnode_t *
2848 rfind(rhashq_t *rhtp, nfs_fhandle *fh, struct vfs *vfsp)
2849 {
2850 	rnode_t *rp;
2851 	vnode_t *vp;
2852 
2853 	ASSERT(RW_LOCK_HELD(&rhtp->r_lock));
2854 
2855 	for (rp = rhtp->r_hashf; rp != (rnode_t *)rhtp; rp = rp->r_hashf) {
2856 		vp = RTOV(rp);
2857 		if (vp->v_vfsp == vfsp &&
2858 		    rp->r_fh.fh_len == fh->fh_len &&
2859 		    bcmp(rp->r_fh.fh_buf, fh->fh_buf, fh->fh_len) == 0) {
2860 			/*
2861 			 * remove rnode from free list, if necessary.
2862 			 */
2863 			if (rp->r_freef != NULL) {
2864 				mutex_enter(&rpfreelist_lock);
2865 				/*
2866 				 * If the rnode is on the freelist,
2867 				 * then remove it and use that reference
2868 				 * as the new reference.  Otherwise,
2869 				 * need to increment the reference count.
2870 				 */
2871 				if (rp->r_freef != NULL) {
2872 					rp_rmfree(rp);
2873 					mutex_exit(&rpfreelist_lock);
2874 				} else {
2875 					mutex_exit(&rpfreelist_lock);
2876 					VN_HOLD(vp);
2877 				}
2878 			} else
2879 				VN_HOLD(vp);
2880 			return (rp);
2881 		}
2882 	}
2883 	return (NULL);
2884 }
2885 
2886 /*
2887  * Return 1 if there is a active vnode belonging to this vfs in the
2888  * rtable cache.
2889  *
2890  * Several of these checks are done without holding the usual
2891  * locks.  This is safe because destroy_rtable(), rp_addfree(),
2892  * etc. will redo the necessary checks before actually destroying
2893  * any rnodes.
2894  */
2895 int
2896 check_rtable(struct vfs *vfsp)
2897 {
2898 	int index;
2899 	rnode_t *rp;
2900 	vnode_t *vp;
2901 
2902 	for (index = 0; index < rtablesize; index++) {
2903 		rw_enter(&rtable[index].r_lock, RW_READER);
2904 		for (rp = rtable[index].r_hashf;
2905 		    rp != (rnode_t *)(&rtable[index]);
2906 		    rp = rp->r_hashf) {
2907 			vp = RTOV(rp);
2908 			if (vp->v_vfsp == vfsp) {
2909 				if (rp->r_freef == NULL ||
2910 				    (vn_has_cached_data(vp) &&
2911 				    (rp->r_flags & RDIRTY)) ||
2912 				    rp->r_count > 0) {
2913 					rw_exit(&rtable[index].r_lock);
2914 					return (1);
2915 				}
2916 			}
2917 		}
2918 		rw_exit(&rtable[index].r_lock);
2919 	}
2920 	return (0);
2921 }
2922 
2923 /*
2924  * Destroy inactive vnodes from the hash queues which belong to this
2925  * vfs.  It is essential that we destroy all inactive vnodes during a
2926  * forced unmount as well as during a normal unmount.
2927  */
2928 void
2929 destroy_rtable(struct vfs *vfsp, cred_t *cr)
2930 {
2931 	int index;
2932 	rnode_t *rp;
2933 	rnode_t *rlist;
2934 	rnode_t *r_hashf;
2935 	vnode_t *vp;
2936 
2937 	rlist = NULL;
2938 
2939 	for (index = 0; index < rtablesize; index++) {
2940 		rw_enter(&rtable[index].r_lock, RW_WRITER);
2941 		for (rp = rtable[index].r_hashf;
2942 		    rp != (rnode_t *)(&rtable[index]);
2943 		    rp = r_hashf) {
2944 			/* save the hash pointer before destroying */
2945 			r_hashf = rp->r_hashf;
2946 			vp = RTOV(rp);
2947 			if (vp->v_vfsp == vfsp) {
2948 				mutex_enter(&rpfreelist_lock);
2949 				if (rp->r_freef != NULL) {
2950 					rp_rmfree(rp);
2951 					mutex_exit(&rpfreelist_lock);
2952 					rp_rmhash_locked(rp);
2953 					rp->r_hashf = rlist;
2954 					rlist = rp;
2955 				} else
2956 					mutex_exit(&rpfreelist_lock);
2957 			}
2958 		}
2959 		rw_exit(&rtable[index].r_lock);
2960 	}
2961 
2962 	for (rp = rlist; rp != NULL; rp = rlist) {
2963 		rlist = rp->r_hashf;
2964 		/*
2965 		 * This call to rp_addfree will end up destroying the
2966 		 * rnode, but in a safe way with the appropriate set
2967 		 * of checks done.
2968 		 */
2969 		rp_addfree(rp, cr);
2970 	}
2971 
2972 }
2973 
2974 /*
2975  * This routine destroys all the resources associated with the rnode
2976  * and then the rnode itself.
2977  */
2978 static void
2979 destroy_rnode(rnode_t *rp)
2980 {
2981 	vnode_t *vp;
2982 	vfs_t *vfsp;
2983 
2984 	vp = RTOV(rp);
2985 	vfsp = vp->v_vfsp;
2986 
2987 	ASSERT(vp->v_count == 1);
2988 	ASSERT(rp->r_count == 0);
2989 	ASSERT(rp->r_lmpl == NULL);
2990 	ASSERT(rp->r_mapcnt == 0);
2991 	ASSERT(!(rp->r_flags & RHASHED));
2992 	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
2993 	atomic_add_long((ulong_t *)&rnew, -1);
2994 #ifdef DEBUG
2995 	clstat_debug.nrnode.value.ui64--;
2996 #endif
2997 	nfs_rw_destroy(&rp->r_rwlock);
2998 	nfs_rw_destroy(&rp->r_lkserlock);
2999 	mutex_destroy(&rp->r_statelock);
3000 	cv_destroy(&rp->r_cv);
3001 	cv_destroy(&rp->r_commit.c_cv);
3002 	if (rp->r_flags & RDELMAPLIST)
3003 		list_destroy(&rp->r_indelmap);
3004 	nfs_free_r_path(rp);
3005 	avl_destroy(&rp->r_dir);
3006 	vn_invalid(vp);
3007 	vn_free(vp);
3008 	kmem_cache_free(rnode_cache, rp);
3009 	VFS_RELE(vfsp);
3010 }
3011 
3012 /*
3013  * Flush all vnodes in this (or every) vfs.
3014  * Used by nfs_sync and by nfs_unmount.
3015  */
3016 void
3017 rflush(struct vfs *vfsp, cred_t *cr)
3018 {
3019 	int index;
3020 	rnode_t *rp;
3021 	vnode_t *vp, **vplist;
3022 	long num, cnt;
3023 
3024 	/*
3025 	 * Check to see whether there is anything to do.
3026 	 */
3027 	num = rnew;
3028 	if (num == 0)
3029 		return;
3030 
3031 	/*
3032 	 * Allocate a slot for all currently active rnodes on the
3033 	 * supposition that they all may need flushing.
3034 	 */
3035 	vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
3036 	cnt = 0;
3037 
3038 	/*
3039 	 * Walk the hash queues looking for rnodes with page
3040 	 * lists associated with them.  Make a list of these
3041 	 * files.
3042 	 */
3043 	for (index = 0; index < rtablesize; index++) {
3044 		rw_enter(&rtable[index].r_lock, RW_READER);
3045 		for (rp = rtable[index].r_hashf;
3046 		    rp != (rnode_t *)(&rtable[index]);
3047 		    rp = rp->r_hashf) {
3048 			vp = RTOV(rp);
3049 			/*
3050 			 * Don't bother sync'ing a vp if it
3051 			 * is part of virtual swap device or
3052 			 * if VFS is read-only
3053 			 */
3054 			if (IS_SWAPVP(vp) || vn_is_readonly(vp))
3055 				continue;
3056 			/*
3057 			 * If flushing all mounted file systems or
3058 			 * the vnode belongs to this vfs, has pages
3059 			 * and is marked as either dirty or mmap'd,
3060 			 * hold and add this vnode to the list of
3061 			 * vnodes to flush.
3062 			 */
3063 			if ((vfsp == NULL || vp->v_vfsp == vfsp) &&
3064 			    vn_has_cached_data(vp) &&
3065 			    ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) {
3066 				VN_HOLD(vp);
3067 				vplist[cnt++] = vp;
3068 				if (cnt == num) {
3069 					rw_exit(&rtable[index].r_lock);
3070 					goto toomany;
3071 				}
3072 			}
3073 		}
3074 		rw_exit(&rtable[index].r_lock);
3075 	}
3076 toomany:
3077 
3078 	/*
3079 	 * Flush and release all of the files on the list.
3080 	 */
3081 	while (cnt-- > 0) {
3082 		vp = vplist[cnt];
3083 		(void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL);
3084 		VN_RELE(vp);
3085 	}
3086 
3087 	/*
3088 	 * Free the space allocated to hold the list.
3089 	 */
3090 	kmem_free(vplist, num * sizeof (*vplist));
3091 }
3092 
3093 /*
3094  * This probably needs to be larger than or equal to
3095  * log2(sizeof (struct rnode)) due to the way that rnodes are
3096  * allocated.
3097  */
3098 #define	ACACHE_SHIFT_BITS	9
3099 
3100 static int
3101 acachehash(rnode_t *rp, cred_t *cr)
3102 {
3103 
3104 	return ((((intptr_t)rp >> ACACHE_SHIFT_BITS) + crgetuid(cr)) &
3105 	    acachemask);
3106 }
3107 
3108 #ifdef DEBUG
3109 static long nfs_access_cache_hits = 0;
3110 static long nfs_access_cache_misses = 0;
3111 #endif
3112 
3113 nfs_access_type_t
3114 nfs_access_check(rnode_t *rp, uint32_t acc, cred_t *cr)
3115 {
3116 	vnode_t *vp;
3117 	acache_t *ap;
3118 	acache_hash_t *hp;
3119 	nfs_access_type_t all;
3120 
3121 	vp = RTOV(rp);
3122 	if (!ATTRCACHE_VALID(vp) || nfs_waitfor_purge_complete(vp))
3123 		return (NFS_ACCESS_UNKNOWN);
3124 
3125 	if (rp->r_acache != NULL) {
3126 		hp = &acache[acachehash(rp, cr)];
3127 		rw_enter(&hp->lock, RW_READER);
3128 		ap = hp->next;
3129 		while (ap != (acache_t *)hp) {
3130 			if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3131 				if ((ap->known & acc) == acc) {
3132 #ifdef DEBUG
3133 					nfs_access_cache_hits++;
3134 #endif
3135 					if ((ap->allowed & acc) == acc)
3136 						all = NFS_ACCESS_ALLOWED;
3137 					else
3138 						all = NFS_ACCESS_DENIED;
3139 				} else {
3140 #ifdef DEBUG
3141 					nfs_access_cache_misses++;
3142 #endif
3143 					all = NFS_ACCESS_UNKNOWN;
3144 				}
3145 				rw_exit(&hp->lock);
3146 				return (all);
3147 			}
3148 			ap = ap->next;
3149 		}
3150 		rw_exit(&hp->lock);
3151 	}
3152 
3153 #ifdef DEBUG
3154 	nfs_access_cache_misses++;
3155 #endif
3156 	return (NFS_ACCESS_UNKNOWN);
3157 }
3158 
3159 void
3160 nfs_access_cache(rnode_t *rp, uint32_t acc, uint32_t resacc, cred_t *cr)
3161 {
3162 	acache_t *ap;
3163 	acache_t *nap;
3164 	acache_hash_t *hp;
3165 
3166 	hp = &acache[acachehash(rp, cr)];
3167 
3168 	/*
3169 	 * Allocate now assuming that mostly an allocation will be
3170 	 * required.  This allows the allocation to happen without
3171 	 * holding the hash bucket locked.
3172 	 */
3173 	nap = kmem_cache_alloc(acache_cache, KM_NOSLEEP);
3174 	if (nap != NULL) {
3175 		nap->known = acc;
3176 		nap->allowed = resacc;
3177 		nap->rnode = rp;
3178 		crhold(cr);
3179 		nap->cred = cr;
3180 		nap->hashq = hp;
3181 	}
3182 
3183 	rw_enter(&hp->lock, RW_WRITER);
3184 
3185 	if (rp->r_acache != NULL) {
3186 		ap = hp->next;
3187 		while (ap != (acache_t *)hp) {
3188 			if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3189 				ap->known |= acc;
3190 				ap->allowed &= ~acc;
3191 				ap->allowed |= resacc;
3192 				rw_exit(&hp->lock);
3193 				if (nap != NULL) {
3194 					crfree(nap->cred);
3195 					kmem_cache_free(acache_cache, nap);
3196 				}
3197 				return;
3198 			}
3199 			ap = ap->next;
3200 		}
3201 	}
3202 
3203 	if (nap != NULL) {
3204 #ifdef DEBUG
3205 		clstat_debug.access.value.ui64++;
3206 #endif
3207 		nap->next = hp->next;
3208 		hp->next = nap;
3209 		nap->next->prev = nap;
3210 		nap->prev = (acache_t *)hp;
3211 
3212 		mutex_enter(&rp->r_statelock);
3213 		nap->list = rp->r_acache;
3214 		rp->r_acache = nap;
3215 		mutex_exit(&rp->r_statelock);
3216 	}
3217 
3218 	rw_exit(&hp->lock);
3219 }
3220 
3221 int
3222 nfs_access_purge_rp(rnode_t *rp)
3223 {
3224 	acache_t *ap;
3225 	acache_t *tmpap;
3226 	acache_t *rplist;
3227 
3228 	/*
3229 	 * If there aren't any cached entries, then there is nothing
3230 	 * to free.
3231 	 */
3232 	if (rp->r_acache == NULL)
3233 		return (0);
3234 
3235 	mutex_enter(&rp->r_statelock);
3236 	rplist = rp->r_acache;
3237 	rp->r_acache = NULL;
3238 	mutex_exit(&rp->r_statelock);
3239 
3240 	/*
3241 	 * Loop through each entry in the list pointed to in the
3242 	 * rnode.  Remove each of these entries from the hash
3243 	 * queue that it is on and remove it from the list in
3244 	 * the rnode.
3245 	 */
3246 	for (ap = rplist; ap != NULL; ap = tmpap) {
3247 		rw_enter(&ap->hashq->lock, RW_WRITER);
3248 		ap->prev->next = ap->next;
3249 		ap->next->prev = ap->prev;
3250 		rw_exit(&ap->hashq->lock);
3251 
3252 		tmpap = ap->list;
3253 		crfree(ap->cred);
3254 		kmem_cache_free(acache_cache, ap);
3255 #ifdef DEBUG
3256 		clstat_debug.access.value.ui64--;
3257 #endif
3258 	}
3259 
3260 	return (1);
3261 }
3262 
3263 static const char prefix[] = ".nfs";
3264 
3265 static kmutex_t newnum_lock;
3266 
3267 int
3268 newnum(void)
3269 {
3270 	static uint_t newnum = 0;
3271 	uint_t id;
3272 
3273 	mutex_enter(&newnum_lock);
3274 	if (newnum == 0)
3275 		newnum = gethrestime_sec() & 0xffff;
3276 	id = newnum++;
3277 	mutex_exit(&newnum_lock);
3278 	return (id);
3279 }
3280 
3281 char *
3282 newname(void)
3283 {
3284 	char *news;
3285 	char *s;
3286 	const char *p;
3287 	uint_t id;
3288 
3289 	id = newnum();
3290 	news = kmem_alloc(MAXNAMELEN, KM_SLEEP);
3291 	s = news;
3292 	p = prefix;
3293 	while (*p != '\0')
3294 		*s++ = *p++;
3295 	while (id != 0) {
3296 		*s++ = "0123456789ABCDEF"[id & 0x0f];
3297 		id >>= 4;
3298 	}
3299 	*s = '\0';
3300 	return (news);
3301 }
3302 
3303 /*
3304  * Snapshot callback for nfs:0:nfs_client as registered with the kstat
3305  * framework.
3306  */
3307 static int
3308 cl_snapshot(kstat_t *ksp, void *buf, int rw)
3309 {
3310 	ksp->ks_snaptime = gethrtime();
3311 	if (rw == KSTAT_WRITE) {
3312 		bcopy(buf, ksp->ks_private, sizeof (clstat_tmpl));
3313 #ifdef DEBUG
3314 		/*
3315 		 * Currently only the global zone can write to kstats, but we
3316 		 * add the check just for paranoia.
3317 		 */
3318 		if (INGLOBALZONE(curproc))
3319 			bcopy((char *)buf + sizeof (clstat_tmpl), &clstat_debug,
3320 			    sizeof (clstat_debug));
3321 #endif
3322 	} else {
3323 		bcopy(ksp->ks_private, buf, sizeof (clstat_tmpl));
3324 #ifdef DEBUG
3325 		/*
3326 		 * If we're displaying the "global" debug kstat values, we
3327 		 * display them as-is to all zones since in fact they apply to
3328 		 * the system as a whole.
3329 		 */
3330 		bcopy(&clstat_debug, (char *)buf + sizeof (clstat_tmpl),
3331 		    sizeof (clstat_debug));
3332 #endif
3333 	}
3334 	return (0);
3335 }
3336 
3337 static void *
3338 clinit_zone(zoneid_t zoneid)
3339 {
3340 	kstat_t *nfs_client_kstat;
3341 	struct nfs_clnt *nfscl;
3342 	uint_t ndata;
3343 
3344 	nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP);
3345 	mutex_init(&nfscl->nfscl_chtable_lock, NULL, MUTEX_DEFAULT, NULL);
3346 	nfscl->nfscl_chtable = NULL;
3347 	nfscl->nfscl_zoneid = zoneid;
3348 
3349 	bcopy(&clstat_tmpl, &nfscl->nfscl_stat, sizeof (clstat_tmpl));
3350 	ndata = sizeof (clstat_tmpl) / sizeof (kstat_named_t);
3351 #ifdef DEBUG
3352 	ndata += sizeof (clstat_debug) / sizeof (kstat_named_t);
3353 #endif
3354 	if ((nfs_client_kstat = kstat_create_zone("nfs", 0, "nfs_client",
3355 	    "misc", KSTAT_TYPE_NAMED, ndata,
3356 	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) {
3357 		nfs_client_kstat->ks_private = &nfscl->nfscl_stat;
3358 		nfs_client_kstat->ks_snapshot = cl_snapshot;
3359 		kstat_install(nfs_client_kstat);
3360 	}
3361 	mutex_enter(&nfs_clnt_list_lock);
3362 	list_insert_head(&nfs_clnt_list, nfscl);
3363 	mutex_exit(&nfs_clnt_list_lock);
3364 	return (nfscl);
3365 }
3366 
3367 /*ARGSUSED*/
3368 static void
3369 clfini_zone(zoneid_t zoneid, void *arg)
3370 {
3371 	struct nfs_clnt *nfscl = arg;
3372 	chhead_t *chp, *next;
3373 
3374 	if (nfscl == NULL)
3375 		return;
3376 	mutex_enter(&nfs_clnt_list_lock);
3377 	list_remove(&nfs_clnt_list, nfscl);
3378 	mutex_exit(&nfs_clnt_list_lock);
3379 	clreclaim_zone(nfscl, 0);
3380 	for (chp = nfscl->nfscl_chtable; chp != NULL; chp = next) {
3381 		ASSERT(chp->ch_list == NULL);
3382 		kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1);
3383 		next = chp->ch_next;
3384 		kmem_free(chp, sizeof (*chp));
3385 	}
3386 	kstat_delete_byname_zone("nfs", 0, "nfs_client", zoneid);
3387 	mutex_destroy(&nfscl->nfscl_chtable_lock);
3388 	kmem_free(nfscl, sizeof (*nfscl));
3389 }
3390 
3391 /*
3392  * Called by endpnt_destructor to make sure the client handles are
3393  * cleaned up before the RPC endpoints.  This becomes a no-op if
3394  * clfini_zone (above) is called first.  This function is needed
3395  * (rather than relying on clfini_zone to clean up) because the ZSD
3396  * callbacks have no ordering mechanism, so we have no way to ensure
3397  * that clfini_zone is called before endpnt_destructor.
3398  */
3399 void
3400 clcleanup_zone(zoneid_t zoneid)
3401 {
3402 	struct nfs_clnt *nfscl;
3403 
3404 	mutex_enter(&nfs_clnt_list_lock);
3405 	nfscl = list_head(&nfs_clnt_list);
3406 	for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) {
3407 		if (nfscl->nfscl_zoneid == zoneid) {
3408 			clreclaim_zone(nfscl, 0);
3409 			break;
3410 		}
3411 	}
3412 	mutex_exit(&nfs_clnt_list_lock);
3413 }
3414 
3415 int
3416 nfs_subrinit(void)
3417 {
3418 	int i;
3419 	ulong_t nrnode_max;
3420 
3421 	/*
3422 	 * Allocate and initialize the rnode hash queues
3423 	 */
3424 	if (nrnode <= 0)
3425 		nrnode = ncsize;
3426 	nrnode_max = (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode));
3427 	if (nrnode > nrnode_max || (nrnode == 0 && ncsize == 0)) {
3428 		zcmn_err(GLOBAL_ZONEID, CE_NOTE,
3429 		    "setting nrnode to max value of %ld", nrnode_max);
3430 		nrnode = nrnode_max;
3431 	}
3432 
3433 	rtablesize = 1 << highbit(nrnode / hashlen);
3434 	rtablemask = rtablesize - 1;
3435 	rtable = kmem_alloc(rtablesize * sizeof (*rtable), KM_SLEEP);
3436 	for (i = 0; i < rtablesize; i++) {
3437 		rtable[i].r_hashf = (rnode_t *)(&rtable[i]);
3438 		rtable[i].r_hashb = (rnode_t *)(&rtable[i]);
3439 		rw_init(&rtable[i].r_lock, NULL, RW_DEFAULT, NULL);
3440 	}
3441 	rnode_cache = kmem_cache_create("rnode_cache", sizeof (rnode_t),
3442 	    0, NULL, NULL, nfs_reclaim, NULL, NULL, 0);
3443 
3444 	/*
3445 	 * Allocate and initialize the access cache
3446 	 */
3447 
3448 	/*
3449 	 * Initial guess is one access cache entry per rnode unless
3450 	 * nacache is set to a non-zero value and then it is used to
3451 	 * indicate a guess at the number of access cache entries.
3452 	 */
3453 	if (nacache > 0)
3454 		acachesize = 1 << highbit(nacache / hashlen);
3455 	else
3456 		acachesize = rtablesize;
3457 	acachemask = acachesize - 1;
3458 	acache = kmem_alloc(acachesize * sizeof (*acache), KM_SLEEP);
3459 	for (i = 0; i < acachesize; i++) {
3460 		acache[i].next = (acache_t *)&acache[i];
3461 		acache[i].prev = (acache_t *)&acache[i];
3462 		rw_init(&acache[i].lock, NULL, RW_DEFAULT, NULL);
3463 	}
3464 	acache_cache = kmem_cache_create("nfs_access_cache",
3465 	    sizeof (acache_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
3466 	/*
3467 	 * Allocate and initialize the client handle cache
3468 	 */
3469 	chtab_cache = kmem_cache_create("client_handle_cache",
3470 	    sizeof (struct chtab), 0, NULL, NULL, clreclaim, NULL, NULL, 0);
3471 	/*
3472 	 * Initialize the list of per-zone client handles (and associated data).
3473 	 * This needs to be done before we call zone_key_create().
3474 	 */
3475 	list_create(&nfs_clnt_list, sizeof (struct nfs_clnt),
3476 	    offsetof(struct nfs_clnt, nfscl_node));
3477 	/*
3478 	 * Initialize the zone_key for per-zone client handle lists.
3479 	 */
3480 	zone_key_create(&nfsclnt_zone_key, clinit_zone, NULL, clfini_zone);
3481 	/*
3482 	 * Initialize the various mutexes and reader/writer locks
3483 	 */
3484 	mutex_init(&rpfreelist_lock, NULL, MUTEX_DEFAULT, NULL);
3485 	mutex_init(&newnum_lock, NULL, MUTEX_DEFAULT, NULL);
3486 	mutex_init(&nfs_minor_lock, NULL, MUTEX_DEFAULT, NULL);
3487 
3488 	/*
3489 	 * Assign unique major number for all nfs mounts
3490 	 */
3491 	if ((nfs_major = getudev()) == -1) {
3492 		zcmn_err(GLOBAL_ZONEID, CE_WARN,
3493 		    "nfs: init: can't get unique device number");
3494 		nfs_major = 0;
3495 	}
3496 	nfs_minor = 0;
3497 
3498 	if (nfs3_jukebox_delay == 0)
3499 		nfs3_jukebox_delay = NFS3_JUKEBOX_DELAY;
3500 
3501 	return (0);
3502 }
3503 
3504 void
3505 nfs_subrfini(void)
3506 {
3507 	int i;
3508 
3509 	/*
3510 	 * Deallocate the rnode hash queues
3511 	 */
3512 	kmem_cache_destroy(rnode_cache);
3513 
3514 	for (i = 0; i < rtablesize; i++)
3515 		rw_destroy(&rtable[i].r_lock);
3516 	kmem_free(rtable, rtablesize * sizeof (*rtable));
3517 
3518 	/*
3519 	 * Deallocated the access cache
3520 	 */
3521 	kmem_cache_destroy(acache_cache);
3522 
3523 	for (i = 0; i < acachesize; i++)
3524 		rw_destroy(&acache[i].lock);
3525 	kmem_free(acache, acachesize * sizeof (*acache));
3526 
3527 	/*
3528 	 * Deallocate the client handle cache
3529 	 */
3530 	kmem_cache_destroy(chtab_cache);
3531 
3532 	/*
3533 	 * Destroy the various mutexes and reader/writer locks
3534 	 */
3535 	mutex_destroy(&rpfreelist_lock);
3536 	mutex_destroy(&newnum_lock);
3537 	mutex_destroy(&nfs_minor_lock);
3538 	(void) zone_key_delete(nfsclnt_zone_key);
3539 }
3540 
3541 enum nfsstat
3542 puterrno(int error)
3543 {
3544 
3545 	switch (error) {
3546 	case EOPNOTSUPP:
3547 		return (NFSERR_OPNOTSUPP);
3548 	case ENAMETOOLONG:
3549 		return (NFSERR_NAMETOOLONG);
3550 	case ENOTEMPTY:
3551 		return (NFSERR_NOTEMPTY);
3552 	case EDQUOT:
3553 		return (NFSERR_DQUOT);
3554 	case ESTALE:
3555 		return (NFSERR_STALE);
3556 	case EREMOTE:
3557 		return (NFSERR_REMOTE);
3558 	case ENOSYS:
3559 		return (NFSERR_OPNOTSUPP);
3560 	case EOVERFLOW:
3561 		return (NFSERR_INVAL);
3562 	default:
3563 		return ((enum nfsstat)error);
3564 	}
3565 	/* NOTREACHED */
3566 }
3567 
3568 int
3569 geterrno(enum nfsstat status)
3570 {
3571 
3572 	switch (status) {
3573 	case NFSERR_OPNOTSUPP:
3574 		return (EOPNOTSUPP);
3575 	case NFSERR_NAMETOOLONG:
3576 		return (ENAMETOOLONG);
3577 	case NFSERR_NOTEMPTY:
3578 		return (ENOTEMPTY);
3579 	case NFSERR_DQUOT:
3580 		return (EDQUOT);
3581 	case NFSERR_STALE:
3582 		return (ESTALE);
3583 	case NFSERR_REMOTE:
3584 		return (EREMOTE);
3585 	case NFSERR_WFLUSH:
3586 		return (EIO);
3587 	default:
3588 		return ((int)status);
3589 	}
3590 	/* NOTREACHED */
3591 }
3592 
3593 enum nfsstat3
3594 puterrno3(int error)
3595 {
3596 
3597 #ifdef DEBUG
3598 	switch (error) {
3599 	case 0:
3600 		return (NFS3_OK);
3601 	case EPERM:
3602 		return (NFS3ERR_PERM);
3603 	case ENOENT:
3604 		return (NFS3ERR_NOENT);
3605 	case EIO:
3606 		return (NFS3ERR_IO);
3607 	case ENXIO:
3608 		return (NFS3ERR_NXIO);
3609 	case EACCES:
3610 		return (NFS3ERR_ACCES);
3611 	case EEXIST:
3612 		return (NFS3ERR_EXIST);
3613 	case EXDEV:
3614 		return (NFS3ERR_XDEV);
3615 	case ENODEV:
3616 		return (NFS3ERR_NODEV);
3617 	case ENOTDIR:
3618 		return (NFS3ERR_NOTDIR);
3619 	case EISDIR:
3620 		return (NFS3ERR_ISDIR);
3621 	case EINVAL:
3622 		return (NFS3ERR_INVAL);
3623 	case EFBIG:
3624 		return (NFS3ERR_FBIG);
3625 	case ENOSPC:
3626 		return (NFS3ERR_NOSPC);
3627 	case EROFS:
3628 		return (NFS3ERR_ROFS);
3629 	case EMLINK:
3630 		return (NFS3ERR_MLINK);
3631 	case ENAMETOOLONG:
3632 		return (NFS3ERR_NAMETOOLONG);
3633 	case ENOTEMPTY:
3634 		return (NFS3ERR_NOTEMPTY);
3635 	case EDQUOT:
3636 		return (NFS3ERR_DQUOT);
3637 	case ESTALE:
3638 		return (NFS3ERR_STALE);
3639 	case EREMOTE:
3640 		return (NFS3ERR_REMOTE);
3641 	case ENOSYS:
3642 	case EOPNOTSUPP:
3643 		return (NFS3ERR_NOTSUPP);
3644 	case EOVERFLOW:
3645 		return (NFS3ERR_INVAL);
3646 	default:
3647 		zcmn_err(getzoneid(), CE_WARN,
3648 		    "puterrno3: got error %d", error);
3649 		return ((enum nfsstat3)error);
3650 	}
3651 #else
3652 	switch (error) {
3653 	case ENAMETOOLONG:
3654 		return (NFS3ERR_NAMETOOLONG);
3655 	case ENOTEMPTY:
3656 		return (NFS3ERR_NOTEMPTY);
3657 	case EDQUOT:
3658 		return (NFS3ERR_DQUOT);
3659 	case ESTALE:
3660 		return (NFS3ERR_STALE);
3661 	case ENOSYS:
3662 	case EOPNOTSUPP:
3663 		return (NFS3ERR_NOTSUPP);
3664 	case EREMOTE:
3665 		return (NFS3ERR_REMOTE);
3666 	case EOVERFLOW:
3667 		return (NFS3ERR_INVAL);
3668 	default:
3669 		return ((enum nfsstat3)error);
3670 	}
3671 #endif
3672 }
3673 
3674 int
3675 geterrno3(enum nfsstat3 status)
3676 {
3677 
3678 #ifdef DEBUG
3679 	switch (status) {
3680 	case NFS3_OK:
3681 		return (0);
3682 	case NFS3ERR_PERM:
3683 		return (EPERM);
3684 	case NFS3ERR_NOENT:
3685 		return (ENOENT);
3686 	case NFS3ERR_IO:
3687 		return (EIO);
3688 	case NFS3ERR_NXIO:
3689 		return (ENXIO);
3690 	case NFS3ERR_ACCES:
3691 		return (EACCES);
3692 	case NFS3ERR_EXIST:
3693 		return (EEXIST);
3694 	case NFS3ERR_XDEV:
3695 		return (EXDEV);
3696 	case NFS3ERR_NODEV:
3697 		return (ENODEV);
3698 	case NFS3ERR_NOTDIR:
3699 		return (ENOTDIR);
3700 	case NFS3ERR_ISDIR:
3701 		return (EISDIR);
3702 	case NFS3ERR_INVAL:
3703 		return (EINVAL);
3704 	case NFS3ERR_FBIG:
3705 		return (EFBIG);
3706 	case NFS3ERR_NOSPC:
3707 		return (ENOSPC);
3708 	case NFS3ERR_ROFS:
3709 		return (EROFS);
3710 	case NFS3ERR_MLINK:
3711 		return (EMLINK);
3712 	case NFS3ERR_NAMETOOLONG:
3713 		return (ENAMETOOLONG);
3714 	case NFS3ERR_NOTEMPTY:
3715 		return (ENOTEMPTY);
3716 	case NFS3ERR_DQUOT:
3717 		return (EDQUOT);
3718 	case NFS3ERR_STALE:
3719 		return (ESTALE);
3720 	case NFS3ERR_REMOTE:
3721 		return (EREMOTE);
3722 	case NFS3ERR_BADHANDLE:
3723 		return (ESTALE);
3724 	case NFS3ERR_NOT_SYNC:
3725 		return (EINVAL);
3726 	case NFS3ERR_BAD_COOKIE:
3727 		return (ENOENT);
3728 	case NFS3ERR_NOTSUPP:
3729 		return (EOPNOTSUPP);
3730 	case NFS3ERR_TOOSMALL:
3731 		return (EINVAL);
3732 	case NFS3ERR_SERVERFAULT:
3733 		return (EIO);
3734 	case NFS3ERR_BADTYPE:
3735 		return (EINVAL);
3736 	case NFS3ERR_JUKEBOX:
3737 		return (ENXIO);
3738 	default:
3739 		zcmn_err(getzoneid(), CE_WARN,
3740 		    "geterrno3: got status %d", status);
3741 		return ((int)status);
3742 	}
3743 #else
3744 	switch (status) {
3745 	case NFS3ERR_NAMETOOLONG:
3746 		return (ENAMETOOLONG);
3747 	case NFS3ERR_NOTEMPTY:
3748 		return (ENOTEMPTY);
3749 	case NFS3ERR_DQUOT:
3750 		return (EDQUOT);
3751 	case NFS3ERR_STALE:
3752 	case NFS3ERR_BADHANDLE:
3753 		return (ESTALE);
3754 	case NFS3ERR_NOTSUPP:
3755 		return (EOPNOTSUPP);
3756 	case NFS3ERR_REMOTE:
3757 		return (EREMOTE);
3758 	case NFS3ERR_NOT_SYNC:
3759 	case NFS3ERR_TOOSMALL:
3760 	case NFS3ERR_BADTYPE:
3761 		return (EINVAL);
3762 	case NFS3ERR_BAD_COOKIE:
3763 		return (ENOENT);
3764 	case NFS3ERR_SERVERFAULT:
3765 		return (EIO);
3766 	case NFS3ERR_JUKEBOX:
3767 		return (ENXIO);
3768 	default:
3769 		return ((int)status);
3770 	}
3771 #endif
3772 }
3773 
3774 rddir_cache *
3775 rddir_cache_alloc(int flags)
3776 {
3777 	rddir_cache *rc;
3778 
3779 	rc = kmem_alloc(sizeof (*rc), flags);
3780 	if (rc != NULL) {
3781 		rc->entries = NULL;
3782 		rc->flags = RDDIR;
3783 		cv_init(&rc->cv, NULL, CV_DEFAULT, NULL);
3784 		mutex_init(&rc->lock, NULL, MUTEX_DEFAULT, NULL);
3785 		rc->count = 1;
3786 #ifdef DEBUG
3787 		atomic_add_64(&clstat_debug.dirent.value.ui64, 1);
3788 #endif
3789 	}
3790 	return (rc);
3791 }
3792 
3793 static void
3794 rddir_cache_free(rddir_cache *rc)
3795 {
3796 
3797 #ifdef DEBUG
3798 	atomic_add_64(&clstat_debug.dirent.value.ui64, -1);
3799 #endif
3800 	if (rc->entries != NULL) {
3801 #ifdef DEBUG
3802 		rddir_cache_buf_free(rc->entries, rc->buflen);
3803 #else
3804 		kmem_free(rc->entries, rc->buflen);
3805 #endif
3806 	}
3807 	cv_destroy(&rc->cv);
3808 	mutex_destroy(&rc->lock);
3809 	kmem_free(rc, sizeof (*rc));
3810 }
3811 
3812 void
3813 rddir_cache_hold(rddir_cache *rc)
3814 {
3815 
3816 	mutex_enter(&rc->lock);
3817 	rc->count++;
3818 	mutex_exit(&rc->lock);
3819 }
3820 
3821 void
3822 rddir_cache_rele(rddir_cache *rc)
3823 {
3824 
3825 	mutex_enter(&rc->lock);
3826 	ASSERT(rc->count > 0);
3827 	if (--rc->count == 0) {
3828 		mutex_exit(&rc->lock);
3829 		rddir_cache_free(rc);
3830 	} else
3831 		mutex_exit(&rc->lock);
3832 }
3833 
#ifdef DEBUG
/*
 * DEBUG-only wrapper around kmem_alloc() for readdir cache buffers.
 * A successful allocation adds its size to the dirents debug counter.
 */
char *
rddir_cache_buf_alloc(size_t size, int flags)
{
	char *buf;

	buf = kmem_alloc(size, flags);
	if (buf == NULL)
		return (NULL);

	atomic_add_64(&clstat_debug.dirents.value.ui64, size);
	return (buf);
}

/*
 * DEBUG-only counterpart of rddir_cache_buf_alloc(): subtract the
 * buffer's size from the dirents debug counter and free the buffer.
 */
void
rddir_cache_buf_free(void *addr, size_t size)
{
	int64_t delta = -(int64_t)size;

	atomic_add_64(&clstat_debug.dirents.value.ui64, delta);
	kmem_free(addr, size);
}
#endif
3854 
3855 static int
3856 nfs_free_data_reclaim(rnode_t *rp)
3857 {
3858 	char *contents;
3859 	int size;
3860 	vsecattr_t *vsp;
3861 	nfs3_pathconf_info *info;
3862 	int freed;
3863 	cred_t *cred;
3864 
3865 	/*
3866 	 * Free any held credentials and caches which
3867 	 * may be associated with this rnode.
3868 	 */
3869 	mutex_enter(&rp->r_statelock);
3870 	cred = rp->r_cred;
3871 	rp->r_cred = NULL;
3872 	contents = rp->r_symlink.contents;
3873 	size = rp->r_symlink.size;
3874 	rp->r_symlink.contents = NULL;
3875 	vsp = rp->r_secattr;
3876 	rp->r_secattr = NULL;
3877 	info = rp->r_pathconf;
3878 	rp->r_pathconf = NULL;
3879 	mutex_exit(&rp->r_statelock);
3880 
3881 	if (cred != NULL)
3882 		crfree(cred);
3883 
3884 	/*
3885 	 * Free the access cache entries.
3886 	 */
3887 	freed = nfs_access_purge_rp(rp);
3888 
3889 	if (!HAVE_RDDIR_CACHE(rp) &&
3890 	    contents == NULL &&
3891 	    vsp == NULL &&
3892 	    info == NULL)
3893 		return (freed);
3894 
3895 	/*
3896 	 * Free the readdir cache entries
3897 	 */
3898 	if (HAVE_RDDIR_CACHE(rp))
3899 		nfs_purge_rddir_cache(RTOV(rp));
3900 
3901 	/*
3902 	 * Free the symbolic link cache.
3903 	 */
3904 	if (contents != NULL) {
3905 
3906 		kmem_free((void *)contents, size);
3907 	}
3908 
3909 	/*
3910 	 * Free any cached ACL.
3911 	 */
3912 	if (vsp != NULL)
3913 		nfs_acl_free(vsp);
3914 
3915 	/*
3916 	 * Free any cached pathconf information.
3917 	 */
3918 	if (info != NULL)
3919 		kmem_free(info, sizeof (*info));
3920 
3921 	return (1);
3922 }
3923 
3924 static int
3925 nfs_active_data_reclaim(rnode_t *rp)
3926 {
3927 	char *contents;
3928 	int size;
3929 	vsecattr_t *vsp;
3930 	nfs3_pathconf_info *info;
3931 	int freed;
3932 
3933 	/*
3934 	 * Free any held credentials and caches which
3935 	 * may be associated with this rnode.
3936 	 */
3937 	if (!mutex_tryenter(&rp->r_statelock))
3938 		return (0);
3939 	contents = rp->r_symlink.contents;
3940 	size = rp->r_symlink.size;
3941 	rp->r_symlink.contents = NULL;
3942 	vsp = rp->r_secattr;
3943 	rp->r_secattr = NULL;
3944 	info = rp->r_pathconf;
3945 	rp->r_pathconf = NULL;
3946 	mutex_exit(&rp->r_statelock);
3947 
3948 	/*
3949 	 * Free the access cache entries.
3950 	 */
3951 	freed = nfs_access_purge_rp(rp);
3952 
3953 	if (!HAVE_RDDIR_CACHE(rp) &&
3954 	    contents == NULL &&
3955 	    vsp == NULL &&
3956 	    info == NULL)
3957 		return (freed);
3958 
3959 	/*
3960 	 * Free the readdir cache entries
3961 	 */
3962 	if (HAVE_RDDIR_CACHE(rp))
3963 		nfs_purge_rddir_cache(RTOV(rp));
3964 
3965 	/*
3966 	 * Free the symbolic link cache.
3967 	 */
3968 	if (contents != NULL) {
3969 
3970 		kmem_free((void *)contents, size);
3971 	}
3972 
3973 	/*
3974 	 * Free any cached ACL.
3975 	 */
3976 	if (vsp != NULL)
3977 		nfs_acl_free(vsp);
3978 
3979 	/*
3980 	 * Free any cached pathconf information.
3981 	 */
3982 	if (info != NULL)
3983 		kmem_free(info, sizeof (*info));
3984 
3985 	return (1);
3986 }
3987 
3988 static int
3989 nfs_free_reclaim(void)
3990 {
3991 	int freed;
3992 	rnode_t *rp;
3993 
3994 #ifdef DEBUG
3995 	clstat_debug.f_reclaim.value.ui64++;
3996 #endif
3997 	freed = 0;
3998 	mutex_enter(&rpfreelist_lock);
3999 	rp = rpfreelist;
4000 	if (rp != NULL) {
4001 		do {
4002 			if (nfs_free_data_reclaim(rp))
4003 				freed = 1;
4004 		} while ((rp = rp->r_freef) != rpfreelist);
4005 	}
4006 	mutex_exit(&rpfreelist_lock);
4007 	return (freed);
4008 }
4009 
4010 static int
4011 nfs_active_reclaim(void)
4012 {
4013 	int freed;
4014 	int index;
4015 	rnode_t *rp;
4016 
4017 #ifdef DEBUG
4018 	clstat_debug.a_reclaim.value.ui64++;
4019 #endif
4020 	freed = 0;
4021 	for (index = 0; index < rtablesize; index++) {
4022 		rw_enter(&rtable[index].r_lock, RW_READER);
4023 		for (rp = rtable[index].r_hashf;
4024 		    rp != (rnode_t *)(&rtable[index]);
4025 		    rp = rp->r_hashf) {
4026 			if (nfs_active_data_reclaim(rp))
4027 				freed = 1;
4028 		}
4029 		rw_exit(&rtable[index].r_lock);
4030 	}
4031 	return (freed);
4032 }
4033 
4034 static int
4035 nfs_rnode_reclaim(void)
4036 {
4037 	int freed;
4038 	rnode_t *rp;
4039 	vnode_t *vp;
4040 
4041 #ifdef DEBUG
4042 	clstat_debug.r_reclaim.value.ui64++;
4043 #endif
4044 	freed = 0;
4045 	mutex_enter(&rpfreelist_lock);
4046 	while ((rp = rpfreelist) != NULL) {
4047 		rp_rmfree(rp);
4048 		mutex_exit(&rpfreelist_lock);
4049 		if (rp->r_flags & RHASHED) {
4050 			vp = RTOV(rp);
4051 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4052 			mutex_enter(&vp->v_lock);
4053 			if (vp->v_count > 1) {
4054 				vp->v_count--;
4055 				mutex_exit(&vp->v_lock);
4056 				rw_exit(&rp->r_hashq->r_lock);
4057 				mutex_enter(&rpfreelist_lock);
4058 				continue;
4059 			}
4060 			mutex_exit(&vp->v_lock);
4061 			rp_rmhash_locked(rp);
4062 			rw_exit(&rp->r_hashq->r_lock);
4063 		}
4064 		/*
4065 		 * This call to rp_addfree will end up destroying the
4066 		 * rnode, but in a safe way with the appropriate set
4067 		 * of checks done.
4068 		 */
4069 		rp_addfree(rp, CRED());
4070 		mutex_enter(&rpfreelist_lock);
4071 	}
4072 	mutex_exit(&rpfreelist_lock);
4073 	return (freed);
4074 }
4075 
/*
 * Reclaim callback registered for rnode_cache in nfs_subrinit().  The
 * three reclaim stages are tried in order and the routine stops at
 * the first one that reports having freed something: cached data of
 * rnodes on the freelist, then cached data of rnodes on the hash
 * queues, and finally the freelist rnodes themselves.
 */
/*ARGSUSED*/
static void
nfs_reclaim(void *cdrarg)
{

#ifdef DEBUG
	clstat_debug.reclaim.value.ui64++;
#endif
	if (!nfs_free_reclaim() && !nfs_active_reclaim())
		(void) nfs_rnode_reclaim();
}
4092 
4093 /*
4094  * NFS client failover support
4095  *
4096  * Routines to copy filehandles
4097  */
4098 void
4099 nfscopyfh(caddr_t fhp, vnode_t *vp)
4100 {
4101 	fhandle_t *dest = (fhandle_t *)fhp;
4102 
4103 	if (dest != NULL)
4104 		*dest = *VTOFH(vp);
4105 }
4106 
4107 void
4108 nfs3copyfh(caddr_t fhp, vnode_t *vp)
4109 {
4110 	nfs_fh3 *dest = (nfs_fh3 *)fhp;
4111 
4112 	if (dest != NULL)
4113 		*dest = *VTOFH3(vp);
4114 }
4115 
4116 /*
4117  * NFS client failover support
4118  *
4119  * failover_safe() will test various conditions to ensure that
4120  * failover is permitted for this vnode.  It will be denied
4121  * if:
4122  *	1) the operation in progress does not support failover (NULL fi)
4123  *	2) there are no available replicas (NULL mi_servers->sv_next)
4124  *	3) any locks are outstanding on this file
4125  */
4126 static int
4127 failover_safe(failinfo_t *fi)
4128 {
4129 
4130 	/*
4131 	 * Does this op permit failover?
4132 	 */
4133 	if (fi == NULL || fi->vp == NULL)
4134 		return (0);
4135 
4136 	/*
4137 	 * Are there any alternates to failover to?
4138 	 */
4139 	if (VTOMI(fi->vp)->mi_servers->sv_next == NULL)
4140 		return (0);
4141 
4142 	/*
4143 	 * Disable check; we've forced local locking
4144 	 *
4145 	 * if (flk_has_remote_locks(fi->vp))
4146 	 *	return (0);
4147 	 */
4148 
4149 	/*
4150 	 * If we have no partial path, we can't do anything
4151 	 */
4152 	if (VTOR(fi->vp)->r_path == NULL)
4153 		return (0);
4154 
4155 	return (1);
4156 }
4157 
4158 #include <sys/thread.h>
4159 
4160 /*
4161  * NFS client failover support
4162  *
4163  * failover_newserver() will start a search for a new server,
4164  * preferably by starting an async thread to do the work.  If
4165  * someone is already doing this (recognizable by MI_BINDINPROG
4166  * being set), it will simply return and the calling thread
4167  * will queue on the mi_failover_cv condition variable.
4168  */
4169 static void
4170 failover_newserver(mntinfo_t *mi)
4171 {
4172 	/*
4173 	 * Check if someone else is doing this already
4174 	 */
4175 	mutex_enter(&mi->mi_lock);
4176 	if (mi->mi_flags & MI_BINDINPROG) {
4177 		mutex_exit(&mi->mi_lock);
4178 		return;
4179 	}
4180 	mi->mi_flags |= MI_BINDINPROG;
4181 
4182 	/*
4183 	 * Need to hold the vfs struct so that it can't be released
4184 	 * while the failover thread is selecting a new server.
4185 	 */
4186 	VFS_HOLD(mi->mi_vfsp);
4187 
4188 	/*
4189 	 * Start a thread to do the real searching.
4190 	 */
4191 	(void) zthread_create(NULL, 0, failover_thread, mi, 0, minclsyspri);
4192 
4193 	mutex_exit(&mi->mi_lock);
4194 }
4195 
/*
 * NFS client failover support
 *
 * failover_thread() will find a new server to replace the one
 * currently in use, wake up other threads waiting on this mount
 * point, and die.  It will start at the head of the server list
 * and poll servers until it finds one with an NFS server which is
 * registered and responds to a NULL procedure ping.
 *
 * The thread is created by failover_newserver() with MI_BINDINPROG
 * set and a hold on mi->mi_vfsp; both are undone here before the
 * thread exits.  The search loops until a server answers or the file
 * system/zone is found to be gone (FS_OR_ZONE_GONE), in which case no
 * new server is selected.
 *
 * XXX failover_thread is unsafe within the scope of the
 * present model defined for cpr to suspend the system.
 * Specifically, over-the-wire calls made by the thread
 * are unsafe. The thread needs to be reevaluated in case of
 * future updates to the cpr suspend model.
 */
static void
failover_thread(mntinfo_t *mi)
{
	servinfo_t *svp = NULL;
	CLIENT *cl;
	enum clnt_stat status;
	struct timeval tv;
	int error;
	/* Set once a full pass over the server list has failed. */
	int oncethru = 0;
	callb_cpr_t cprinfo;
	rnode_t *rp;
	int index;
	/* Only assigned (and later freed) when oncethru gets set. */
	char *srvnames;
	size_t srvnames_len;
	struct nfs_clnt *nfscl = NULL;
	zoneid_t zoneid = getzoneid();

#ifdef DEBUG
	/*
	 * This is currently only needed to access counters which exist on
	 * DEBUG kernels, hence we don't want to pay the penalty of the lookup
	 * on non-DEBUG kernels.
	 */
	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
	ASSERT(nfscl != NULL);
#endif

	/*
	 * It's safe to piggyback on the mi_lock since failover_newserver()
	 * code guarantees that there will be only one failover thread
	 * per mountinfo at any instance.
	 */
	CALLB_CPR_INIT(&cprinfo, &mi->mi_lock, callb_generic_cpr,
	    "failover_thread");

	/*
	 * Do not start the search until mi_readers has dropped to zero.
	 */
	mutex_enter(&mi->mi_lock);
	while (mi->mi_readers) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(&mi->mi_failover_cv, &mi->mi_lock);
		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
	}
	mutex_exit(&mi->mi_lock);

	/*
	 * tv is handed to CLNT_CALL() below (presumably as the timeout
	 * of each ping).
	 */
	tv.tv_sec = 2;
	tv.tv_usec = 0;

	/*
	 * Ping the null NFS procedure of every server in
	 * the list until one responds.  We always start
	 * at the head of the list and always skip the one
	 * that is current, since it's caused us a problem.
	 * (Once a complete pass has failed, i.e. oncethru is
	 * set, the current server is pinged as well.)
	 */
	while (svp == NULL) {
		for (svp = mi->mi_servers; svp; svp = svp->sv_next) {
			if (!oncethru && svp == mi->mi_curr_serv)
				continue;

			/*
			 * If the file system was forcibly umounted
			 * while trying to do a failover, then just
			 * give up on the failover.  It won't matter
			 * what the server is.
			 */
			if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
				svp = NULL;
				goto done;
			}

			/*
			 * A server for which no client handle can be
			 * created is skipped.
			 */
			error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
			    NFS_PROGRAM, NFS_VERSION, 0, 1, CRED(), &cl);
			if (error)
				continue;

			/*
			 * cl_nosignal is set around the call only when
			 * the mount does not have MI_INT set.
			 */
			if (!(mi->mi_flags & MI_INT))
				cl->cl_nosignal = TRUE;
			status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
			    xdr_void, NULL, tv);
			if (!(mi->mi_flags & MI_INT))
				cl->cl_nosignal = FALSE;
			AUTH_DESTROY(cl->cl_auth);
			CLNT_DESTROY(cl);
			if (status == RPC_SUCCESS) {
				if (svp == mi->mi_curr_serv) {
#ifdef DEBUG
					zcmn_err(zoneid, CE_NOTE,
			"NFS%d: failing over: selecting original server %s",
					    mi->mi_vers, svp->sv_hostname);
#else
					zcmn_err(zoneid, CE_NOTE,
			"NFS: failing over: selecting original server %s",
					    svp->sv_hostname);
#endif
				} else {
#ifdef DEBUG
					zcmn_err(zoneid, CE_NOTE,
				    "NFS%d: failing over from %s to %s",
					    mi->mi_vers,
					    mi->mi_curr_serv->sv_hostname,
					    svp->sv_hostname);
#else
					zcmn_err(zoneid, CE_NOTE,
				    "NFS: failing over from %s to %s",
					    mi->mi_curr_serv->sv_hostname,
					    svp->sv_hostname);
#endif
				}
				/* svp now names the server to switch to. */
				break;
			}
		}

		/*
		 * Nobody answered on this pass.  Report it once (which
		 * also sets oncethru), then sleep for hz ticks in a
		 * CPR-safe section before trying again.
		 */
		if (svp == NULL) {
			if (!oncethru) {
				srvnames = nfs_getsrvnames(mi, &srvnames_len);
#ifdef DEBUG
				zprintf(zoneid,
				    "NFS%d servers %s not responding "
				    "still trying\n", mi->mi_vers, srvnames);
#else
				zprintf(zoneid, "NFS servers %s not responding "
				    "still trying\n", srvnames);
#endif
				oncethru = 1;
			}
			mutex_enter(&mi->mi_lock);
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			mutex_exit(&mi->mi_lock);
			delay(hz);
			mutex_enter(&mi->mi_lock);
			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
			mutex_exit(&mi->mi_lock);
		}
	}

	/*
	 * If the "not responding" message was printed, follow it up now
	 * that a server has answered.
	 */
	if (oncethru) {
#ifdef DEBUG
		zprintf(zoneid, "NFS%d servers %s ok\n", mi->mi_vers, srvnames);
#else
		zprintf(zoneid, "NFS servers %s ok\n", srvnames);
#endif
	}

	/*
	 * Switching to a different server: purge the DNLC for this vfs
	 * and, if an rnode is found for the old server's sv_fhandle
	 * (presumably the root of the mount -- confirm), move it over to
	 * the new server's filehandle.  The rnode is unhashed, pointed
	 * at the new server, stripped of its cached data and rehashed
	 * under the new filehandle.
	 */
	if (svp != mi->mi_curr_serv) {
		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
		index = rtablehash(&mi->mi_curr_serv->sv_fhandle);
		rw_enter(&rtable[index].r_lock, RW_WRITER);
		rp = rfind(&rtable[index], &mi->mi_curr_serv->sv_fhandle,
		    mi->mi_vfsp);
		if (rp != NULL) {
			if (rp->r_flags & RHASHED)
				rp_rmhash_locked(rp);
			rw_exit(&rtable[index].r_lock);
			rp->r_server = svp;
			rp->r_fh = svp->sv_fhandle;
			(void) nfs_free_data_reclaim(rp);
			index = rtablehash(&rp->r_fh);
			rp->r_hashq = &rtable[index];
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			vn_exists(RTOV(rp));
			rp_addhash(rp);
			rw_exit(&rp->r_hashq->r_lock);
			/* Presumably drops the hold obtained via rfind(). */
			VN_RELE(RTOV(rp));
		} else
			rw_exit(&rtable[index].r_lock);
	}

done:
	/* srvnames was only obtained if the first pass failed. */
	if (oncethru)
		kmem_free(srvnames, srvnames_len);
	/*
	 * Clear MI_BINDINPROG, record the new server (svp is NULL when
	 * the failover was abandoned) and wake up everyone waiting on
	 * mi_failover_cv.
	 */
	mutex_enter(&mi->mi_lock);
	mi->mi_flags &= ~MI_BINDINPROG;
	if (svp != NULL) {
		mi->mi_curr_serv = svp;
		mi->mi_failover++;
#ifdef DEBUG
	nfscl->nfscl_stat.failover.value.ui64++;
#endif
	}
	cv_broadcast(&mi->mi_failover_cv);
	/*
	 * NOTE(review): there is no explicit mutex_exit() for mi_lock
	 * after this point; CALLB_CPR_EXIT() presumably releases it --
	 * confirm.
	 */
	CALLB_CPR_EXIT(&cprinfo);
	/* Matches the VFS_HOLD() done in failover_newserver(). */
	VFS_RELE(mi->mi_vfsp);
	zthread_exit();
	/* NOTREACHED */
}
4394 
4395 /*
4396  * NFS client failover support
4397  *
4398  * failover_wait() will put the thread to sleep until MI_BINDINPROG
4399  * is cleared, meaning that failover is complete.  Called with
4400  * mi_lock mutex held.
4401  */
4402 static int
4403 failover_wait(mntinfo_t *mi)
4404 {
4405 	k_sigset_t smask;
4406 
4407 	/*
4408 	 * If someone else is hunting for a living server,
4409 	 * sleep until it's done.  After our sleep, we may
4410 	 * be bound to the right server and get off cheaply.
4411 	 */
4412 	while (mi->mi_flags & MI_BINDINPROG) {
4413 		/*
4414 		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
4415 		 * and SIGTERM. (Preserving the existing masks).
4416 		 * Mask out SIGINT if mount option nointr is specified.
4417 		 */
4418 		sigintr(&smask, (int)mi->mi_flags & MI_INT);
4419 		if (!cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock)) {
4420 			/*
4421 			 * restore original signal mask
4422 			 */
4423 			sigunintr(&smask);
4424 			return (EINTR);
4425 		}
4426 		/*
4427 		 * restore original signal mask
4428 		 */
4429 		sigunintr(&smask);
4430 	}
4431 	return (0);
4432 }
4433 
4434 /*
4435  * NFS client failover support
4436  *
4437  * failover_remap() will do a partial pathname lookup and find the
4438  * desired vnode on the current server.  The interim vnode will be
4439  * discarded after we pilfer the new filehandle.
4440  *
4441  * Side effects:
4442  * - This routine will also update the filehandle in the args structure
4443  *    pointed to by the fi->fhp pointer if it is non-NULL.
4444  */
4445 
4446 static int
4447 failover_remap(failinfo_t *fi)
4448 {
4449 	vnode_t *vp, *nvp, *rootvp;
4450 	rnode_t *rp, *nrp;
4451 	mntinfo_t *mi;
4452 	int error;
4453 #ifdef DEBUG
4454 	struct nfs_clnt *nfscl;
4455 
4456 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
4457 	ASSERT(nfscl != NULL);
4458 #endif
4459 	/*
4460 	 * Sanity check
4461 	 */
4462 	if (fi == NULL || fi->vp == NULL || fi->lookupproc == NULL)
4463 		return (EINVAL);
4464 	vp = fi->vp;
4465 	rp = VTOR(vp);
4466 	mi = VTOMI(vp);
4467 
4468 	if (!(vp->v_flag & VROOT)) {
4469 		/*
4470 		 * Given the root fh, use the path stored in
4471 		 * the rnode to find the fh for the new server.
4472 		 */
4473 		error = VFS_ROOT(mi->mi_vfsp, &rootvp);
4474 		if (error)
4475 			return (error);
4476 
4477 		error = failover_lookup(rp->r_path, rootvp,
4478 		    fi->lookupproc, fi->xattrdirproc, &nvp);
4479 
4480 		VN_RELE(rootvp);
4481 
4482 		if (error)
4483 			return (error);
4484 
4485 		/*
4486 		 * If we found the same rnode, we're done now
4487 		 */
4488 		if (nvp == vp) {
4489 			/*
4490 			 * Failed and the new server may physically be same
4491 			 * OR may share a same disk subsystem. In this case
4492 			 * file handle for a particular file path is not going
4493 			 * to change, given the same filehandle lookup will
4494 			 * always locate the same rnode as the existing one.
4495 			 * All we might need to do is to update the r_server
4496 			 * with the current servinfo.
4497 			 */
4498 			if (!VALID_FH(fi)) {
4499 				rp->r_server = mi->mi_curr_serv;
4500 			}
4501 			VN_RELE(nvp);
4502 			return (0);
4503 		}
4504 
4505 		/*
4506 		 * Try to make it so that no one else will find this
4507 		 * vnode because it is just a temporary to hold the
4508 		 * new file handle until that file handle can be
4509 		 * copied to the original vnode/rnode.
4510 		 */
4511 		nrp = VTOR(nvp);
4512 		mutex_enter(&mi->mi_remap_lock);
4513 		/*
4514 		 * Some other thread could have raced in here and could
4515 		 * have done the remap for this particular rnode before
4516 		 * this thread here. Check for rp->r_server and
4517 		 * mi->mi_curr_serv and return if they are same.
4518 		 */
4519 		if (VALID_FH(fi)) {
4520 			mutex_exit(&mi->mi_remap_lock);
4521 			VN_RELE(nvp);
4522 			return (0);
4523 		}
4524 
4525 		if (nrp->r_flags & RHASHED)
4526 			rp_rmhash(nrp);
4527 
4528 		/*
4529 		 * As a heuristic check on the validity of the new
4530 		 * file, check that the size and type match against
4531 		 * that we remember from the old version.
4532 		 */
4533 		if (rp->r_size != nrp->r_size || vp->v_type != nvp->v_type) {
4534 			mutex_exit(&mi->mi_remap_lock);
4535 			zcmn_err(mi->mi_zone->zone_id, CE_WARN,
4536 			    "NFS replicas %s and %s: file %s not same.",
4537 			    rp->r_server->sv_hostname,
4538 			    nrp->r_server->sv_hostname, rp->r_path);
4539 			VN_RELE(nvp);
4540 			return (EINVAL);
4541 		}
4542 
4543 		/*
4544 		 * snarf the filehandle from the new rnode
4545 		 * then release it, again while updating the
4546 		 * hash queues for the rnode.
4547 		 */
4548 		if (rp->r_flags & RHASHED)
4549 			rp_rmhash(rp);
4550 		rp->r_server = mi->mi_curr_serv;
4551 		rp->r_fh = nrp->r_fh;
4552 		rp->r_hashq = nrp->r_hashq;
4553 		/*
4554 		 * Copy the attributes from the new rnode to the old
4555 		 * rnode.  This will help to reduce unnecessary page
4556 		 * cache flushes.
4557 		 */
4558 		rp->r_attr = nrp->r_attr;
4559 		rp->r_attrtime = nrp->r_attrtime;
4560 		rp->r_mtime = nrp->r_mtime;
4561 		(void) nfs_free_data_reclaim(rp);
4562 		nfs_setswaplike(vp, &rp->r_attr);
4563 		rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4564 		rp_addhash(rp);
4565 		rw_exit(&rp->r_hashq->r_lock);
4566 		mutex_exit(&mi->mi_remap_lock);
4567 		VN_RELE(nvp);
4568 	}
4569 
4570 	/*
4571 	 * Update successful failover remap count
4572 	 */
4573 	mutex_enter(&mi->mi_lock);
4574 	mi->mi_remap++;
4575 	mutex_exit(&mi->mi_lock);
4576 #ifdef DEBUG
4577 	nfscl->nfscl_stat.remap.value.ui64++;
4578 #endif
4579 
4580 	/*
4581 	 * If we have a copied filehandle to update, do it now.
4582 	 */
4583 	if (fi->fhp != NULL && fi->copyproc != NULL)
4584 		(*fi->copyproc)(fi->fhp, vp);
4585 
4586 	return (0);
4587 }
4588 
4589 /*
4590  * NFS client failover support
4591  *
4592  * We want a simple pathname lookup routine to parse the pieces
4593  * of path in rp->r_path.  We know that the path was a created
4594  * as rnodes were made, so we know we have only to deal with
4595  * paths that look like:
4596  *	dir1/dir2/dir3/file
4597  * Any evidence of anything like .., symlinks, and ENOTDIR
4598  * are hard errors, because they mean something in this filesystem
4599  * is different from the one we came from, or has changed under
4600  * us in some way.  If this is true, we want the failure.
4601  *
4602  * Extended attributes: if the filesystem is mounted with extended
4603  * attributes enabled (-o xattr), the attribute directory will be
4604  * represented in the r_path as the magic name XATTR_RPATH. So if
4605  * we see that name in the pathname, is must be because this node
4606  * is an extended attribute.  Therefore, look it up that way.
4607  */
4608 static int
4609 failover_lookup(char *path, vnode_t *root,
4610     int (*lookupproc)(vnode_t *, char *, vnode_t **, struct pathname *, int,
4611 	vnode_t *, cred_t *, int),
4612     int (*xattrdirproc)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
4613     vnode_t **new)
4614 {
4615 	vnode_t *dvp, *nvp;
4616 	int error = EINVAL;
4617 	char *s, *p, *tmppath;
4618 	size_t len;
4619 	mntinfo_t *mi;
4620 	bool_t xattr;
4621 
4622 	/* Make local copy of path */
4623 	len = strlen(path) + 1;
4624 	tmppath = kmem_alloc(len, KM_SLEEP);
4625 	(void) strcpy(tmppath, path);
4626 	s = tmppath;
4627 
4628 	dvp = root;
4629 	VN_HOLD(dvp);
4630 	mi = VTOMI(root);
4631 	xattr = mi->mi_flags & MI_EXTATTR;
4632 
4633 	do {
4634 		p = strchr(s, '/');
4635 		if (p != NULL)
4636 			*p = '\0';
4637 		if (xattr && strcmp(s, XATTR_RPATH) == 0) {
4638 			error = (*xattrdirproc)(dvp, &nvp, FALSE, CRED(),
4639 			    RFSCALL_SOFT);
4640 		} else {
4641 			error = (*lookupproc)(dvp, s, &nvp, NULL, 0, NULL,
4642 			    CRED(), RFSCALL_SOFT);
4643 		}
4644 		if (p != NULL)
4645 			*p++ = '/';
4646 		if (error) {
4647 			VN_RELE(dvp);
4648 			kmem_free(tmppath, len);
4649 			return (error);
4650 		}
4651 		s = p;
4652 		VN_RELE(dvp);
4653 		dvp = nvp;
4654 	} while (p != NULL);
4655 
4656 	if (nvp != NULL && new != NULL)
4657 		*new = nvp;
4658 	kmem_free(tmppath, len);
4659 	return (0);
4660 }
4661 
4662 /*
4663  * NFS client failover support
4664  *
4665  * sv_free() frees the malloc'd portion of a "servinfo_t".
4666  */
4667 void
4668 sv_free(servinfo_t *svp)
4669 {
4670 	servinfo_t *next;
4671 	struct knetconfig *knconf;
4672 
4673 	while (svp != NULL) {
4674 		next = svp->sv_next;
4675 		if (svp->sv_secdata)
4676 			sec_clnt_freeinfo(svp->sv_secdata);
4677 		if (svp->sv_hostname && svp->sv_hostnamelen > 0)
4678 			kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
4679 		knconf = svp->sv_knconf;
4680 		if (knconf != NULL) {
4681 			if (knconf->knc_protofmly != NULL)
4682 				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4683 			if (knconf->knc_proto != NULL)
4684 				kmem_free(knconf->knc_proto, KNC_STRSIZE);
4685 			kmem_free(knconf, sizeof (*knconf));
4686 		}
4687 		knconf = svp->sv_origknconf;
4688 		if (knconf != NULL) {
4689 			if (knconf->knc_protofmly != NULL)
4690 				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4691 			if (knconf->knc_proto != NULL)
4692 				kmem_free(knconf->knc_proto, KNC_STRSIZE);
4693 			kmem_free(knconf, sizeof (*knconf));
4694 		}
4695 		if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0)
4696 			kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen);
4697 		mutex_destroy(&svp->sv_lock);
4698 		kmem_free(svp, sizeof (*svp));
4699 		svp = next;
4700 	}
4701 }
4702 
4703 /*
4704  * Only can return non-zero if intr != 0.
4705  */
4706 int
4707 nfs_rw_enter_sig(nfs_rwlock_t *l, krw_t rw, int intr)
4708 {
4709 
4710 	mutex_enter(&l->lock);
4711 
4712 	/*
4713 	 * If this is a nested enter, then allow it.  There
4714 	 * must be as many exits as enters through.
4715 	 */
4716 	if (l->owner == curthread) {
4717 		/* lock is held for writing by current thread */
4718 		ASSERT(rw == RW_READER || rw == RW_WRITER);
4719 		l->count--;
4720 	} else if (rw == RW_READER) {
4721 		/*
4722 		 * While there is a writer active or writers waiting,
4723 		 * then wait for them to finish up and move on.  Then,
4724 		 * increment the count to indicate that a reader is
4725 		 * active.
4726 		 */
4727 		while (l->count < 0 || l->waiters > 0) {
4728 			if (intr) {
4729 				klwp_t *lwp = ttolwp(curthread);
4730 
4731 				if (lwp != NULL)
4732 					lwp->lwp_nostop++;
4733 				if (!cv_wait_sig(&l->cv, &l->lock)) {
4734 					if (lwp != NULL)
4735 						lwp->lwp_nostop--;
4736 					mutex_exit(&l->lock);
4737 					return (EINTR);
4738 				}
4739 				if (lwp != NULL)
4740 					lwp->lwp_nostop--;
4741 			} else
4742 				cv_wait(&l->cv, &l->lock);
4743 		}
4744 		ASSERT(l->count < INT_MAX);
4745 #ifdef	DEBUG
4746 		if ((l->count % 10000) == 9999)
4747 			cmn_err(CE_WARN, "nfs_rw_enter_sig: count %d on"
4748 			    "rwlock @ %p\n", l->count, (void *)&l);
4749 #endif
4750 		l->count++;
4751 	} else {
4752 		ASSERT(rw == RW_WRITER);
4753 		/*
4754 		 * While there are readers active or a writer
4755 		 * active, then wait for all of the readers
4756 		 * to finish or for the writer to finish.
4757 		 * Then, set the owner field to curthread and
4758 		 * decrement count to indicate that a writer
4759 		 * is active.
4760 		 */
4761 		while (l->count > 0 || l->owner != NULL) {
4762 			l->waiters++;
4763 			if (intr) {
4764 				klwp_t *lwp = ttolwp(curthread);
4765 
4766 				if (lwp != NULL)
4767 					lwp->lwp_nostop++;
4768 				if (!cv_wait_sig(&l->cv, &l->lock)) {
4769 					if (lwp != NULL)
4770 						lwp->lwp_nostop--;
4771 					l->waiters--;
4772 					cv_broadcast(&l->cv);
4773 					mutex_exit(&l->lock);
4774 					return (EINTR);
4775 				}
4776 				if (lwp != NULL)
4777 					lwp->lwp_nostop--;
4778 			} else
4779 				cv_wait(&l->cv, &l->lock);
4780 			l->waiters--;
4781 		}
4782 		l->owner = curthread;
4783 		l->count--;
4784 	}
4785 
4786 	mutex_exit(&l->lock);
4787 
4788 	return (0);
4789 }
4790 
4791 /*
4792  * If the lock is available, obtain it and return non-zero.  If there is
4793  * already a conflicting lock, return 0 immediately.
4794  */
4795 
4796 int
4797 nfs_rw_tryenter(nfs_rwlock_t *l, krw_t rw)
4798 {
4799 	mutex_enter(&l->lock);
4800 
4801 	/*
4802 	 * If this is a nested enter, then allow it.  There
4803 	 * must be as many exits as enters through.
4804 	 */
4805 	if (l->owner == curthread) {
4806 		/* lock is held for writing by current thread */
4807 		ASSERT(rw == RW_READER || rw == RW_WRITER);
4808 		l->count--;
4809 	} else if (rw == RW_READER) {
4810 		/*
4811 		 * If there is a writer active or writers waiting, deny the
4812 		 * lock.  Otherwise, bump the count of readers.
4813 		 */
4814 		if (l->count < 0 || l->waiters > 0) {
4815 			mutex_exit(&l->lock);
4816 			return (0);
4817 		}
4818 		l->count++;
4819 	} else {
4820 		ASSERT(rw == RW_WRITER);
4821 		/*
4822 		 * If there are readers active or a writer active, deny the
4823 		 * lock.  Otherwise, set the owner field to curthread and
4824 		 * decrement count to indicate that a writer is active.
4825 		 */
4826 		if (l->count > 0 || l->owner != NULL) {
4827 			mutex_exit(&l->lock);
4828 			return (0);
4829 		}
4830 		l->owner = curthread;
4831 		l->count--;
4832 	}
4833 
4834 	mutex_exit(&l->lock);
4835 
4836 	return (1);
4837 }
4838 
4839 void
4840 nfs_rw_exit(nfs_rwlock_t *l)
4841 {
4842 
4843 	mutex_enter(&l->lock);
4844 	/*
4845 	 * If this is releasing a writer lock, then increment count to
4846 	 * indicate that there is one less writer active.  If this was
4847 	 * the last of possibly nested writer locks, then clear the owner
4848 	 * field as well to indicate that there is no writer active
4849 	 * and wakeup any possible waiting writers or readers.
4850 	 *
4851 	 * If releasing a reader lock, then just decrement count to
4852 	 * indicate that there is one less reader active.  If this was
4853 	 * the last active reader and there are writer(s) waiting,
4854 	 * then wake up the first.
4855 	 */
4856 	if (l->owner != NULL) {
4857 		ASSERT(l->owner == curthread);
4858 		l->count++;
4859 		if (l->count == 0) {
4860 			l->owner = NULL;
4861 			cv_broadcast(&l->cv);
4862 		}
4863 	} else {
4864 		ASSERT(l->count > 0);
4865 		l->count--;
4866 		if (l->count == 0 && l->waiters > 0)
4867 			cv_broadcast(&l->cv);
4868 	}
4869 	mutex_exit(&l->lock);
4870 }
4871 
4872 int
4873 nfs_rw_lock_held(nfs_rwlock_t *l, krw_t rw)
4874 {
4875 
4876 	if (rw == RW_READER)
4877 		return (l->count > 0);
4878 	ASSERT(rw == RW_WRITER);
4879 	return (l->count < 0);
4880 }
4881 
4882 /* ARGSUSED */
4883 void
4884 nfs_rw_init(nfs_rwlock_t *l, char *name, krw_type_t type, void *arg)
4885 {
4886 
4887 	l->count = 0;
4888 	l->waiters = 0;
4889 	l->owner = NULL;
4890 	mutex_init(&l->lock, NULL, MUTEX_DEFAULT, NULL);
4891 	cv_init(&l->cv, NULL, CV_DEFAULT, NULL);
4892 }
4893 
4894 void
4895 nfs_rw_destroy(nfs_rwlock_t *l)
4896 {
4897 
4898 	mutex_destroy(&l->lock);
4899 	cv_destroy(&l->cv);
4900 }
4901 
4902 int
4903 nfs3_rddir_compar(const void *x, const void *y)
4904 {
4905 	rddir_cache *a = (rddir_cache *)x;
4906 	rddir_cache *b = (rddir_cache *)y;
4907 
4908 	if (a->nfs3_cookie == b->nfs3_cookie) {
4909 		if (a->buflen == b->buflen)
4910 			return (0);
4911 		if (a->buflen < b->buflen)
4912 			return (-1);
4913 		return (1);
4914 	}
4915 
4916 	if (a->nfs3_cookie < b->nfs3_cookie)
4917 		return (-1);
4918 
4919 	return (1);
4920 }
4921 
4922 int
4923 nfs_rddir_compar(const void *x, const void *y)
4924 {
4925 	rddir_cache *a = (rddir_cache *)x;
4926 	rddir_cache *b = (rddir_cache *)y;
4927 
4928 	if (a->nfs_cookie == b->nfs_cookie) {
4929 		if (a->buflen == b->buflen)
4930 			return (0);
4931 		if (a->buflen < b->buflen)
4932 			return (-1);
4933 		return (1);
4934 	}
4935 
4936 	if (a->nfs_cookie < b->nfs_cookie)
4937 		return (-1);
4938 
4939 	return (1);
4940 }
4941 
4942 static char *
4943 nfs_getsrvnames(mntinfo_t *mi, size_t *len)
4944 {
4945 	servinfo_t *s;
4946 	char *srvnames;
4947 	char *namep;
4948 	size_t length;
4949 
4950 	/*
4951 	 * Calculate the length of the string required to hold all
4952 	 * of the server names plus either a comma or a null
4953 	 * character following each individual one.
4954 	 */
4955 	length = 0;
4956 	for (s = mi->mi_servers; s != NULL; s = s->sv_next)
4957 		length += s->sv_hostnamelen;
4958 
4959 	srvnames = kmem_alloc(length, KM_SLEEP);
4960 
4961 	namep = srvnames;
4962 	for (s = mi->mi_servers; s != NULL; s = s->sv_next) {
4963 		(void) strcpy(namep, s->sv_hostname);
4964 		namep += s->sv_hostnamelen - 1;
4965 		*namep++ = ',';
4966 	}
4967 	*--namep = '\0';
4968 
4969 	*len = length;
4970 
4971 	return (srvnames);
4972 }
4973 
4974 /*
4975  * These two functions are temporary and designed for the upgrade-workaround
4976  * only.  They cannot be used for general zone-crossing NFS client support, and
4977  * will be removed shortly.
4978  *
4979  * When the workaround is enabled, all NFS traffic is forced into the global
4980  * zone.  These functions are called when the code needs to refer to the state
4981  * of the underlying network connection.  They're not called when the function
4982  * needs to refer to the state of the process that invoked the system call.
4983  * (E.g., when checking whether the zone is shutting down during the mount()
4984  * call.)
4985  */
4986 
4987 struct zone *
4988 nfs_zone(void)
4989 {
4990 	return (nfs_global_client_only != 0 ? global_zone : curproc->p_zone);
4991 }
4992 
4993 zoneid_t
4994 nfs_zoneid(void)
4995 {
4996 	return (nfs_global_client_only != 0 ? GLOBAL_ZONEID : getzoneid());
4997 }
4998 
4999 /*
5000  * nfs_mount_label_policy:
5001  *	Determine whether the mount is allowed according to MAC check,
5002  *	by comparing (where appropriate) label of the remote server
5003  *	against the label of the zone being mounted into.
5004  *
5005  *	Returns:
5006  *		 0 :	access allowed
5007  *		-1 :	read-only access allowed (i.e., read-down)
5008  *		>0 :	error code, such as EACCES
5009  */
5010 int
5011 nfs_mount_label_policy(vfs_t *vfsp, struct netbuf *addr,
5012     struct knetconfig *knconf, cred_t *cr)
5013 {
5014 	int		addr_type;
5015 	void		*ipaddr;
5016 	bslabel_t	*server_sl, *mntlabel;
5017 	zone_t		*mntzone = NULL;
5018 	ts_label_t	*zlabel;
5019 	tsol_tpc_t	*tp;
5020 	ts_label_t	*tsl = NULL;
5021 	int		retv;
5022 
5023 	/*
5024 	 * Get the zone's label.  Each zone on a labeled system has a label.
5025 	 */
5026 	mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
5027 	zlabel = mntzone->zone_slabel;
5028 	ASSERT(zlabel != NULL);
5029 	label_hold(zlabel);
5030 
5031 	if (strcmp(knconf->knc_protofmly, NC_INET) == 0) {
5032 		addr_type = IPV4_VERSION;
5033 		ipaddr = &((struct sockaddr_in *)addr->buf)->sin_addr;
5034 	} else if (strcmp(knconf->knc_protofmly, NC_INET6) == 0) {
5035 		addr_type = IPV6_VERSION;
5036 		ipaddr = &((struct sockaddr_in6 *)addr->buf)->sin6_addr;
5037 	} else {
5038 		retv = 0;
5039 		goto out;
5040 	}
5041 
5042 	retv = EACCES;				/* assume the worst */
5043 
5044 	/*
5045 	 * Next, get the assigned label of the remote server.
5046 	 */
5047 	tp = find_tpc(ipaddr, addr_type, B_FALSE);
5048 	if (tp == NULL)
5049 		goto out;			/* error getting host entry */
5050 
5051 	if (tp->tpc_tp.tp_doi != zlabel->tsl_doi)
5052 		goto rel_tpc;			/* invalid domain */
5053 	if ((tp->tpc_tp.host_type != SUN_CIPSO) &&
5054 	    (tp->tpc_tp.host_type != UNLABELED))
5055 		goto rel_tpc;			/* invalid hosttype */
5056 
5057 	if (tp->tpc_tp.host_type == SUN_CIPSO) {
5058 		tsl = getflabel_cipso(vfsp);
5059 		if (tsl == NULL)
5060 			goto rel_tpc;		/* error getting server lbl */
5061 
5062 		server_sl = label2bslabel(tsl);
5063 	} else {	/* UNLABELED */
5064 		server_sl = &tp->tpc_tp.tp_def_label;
5065 	}
5066 
5067 	mntlabel = label2bslabel(zlabel);
5068 
5069 	/*
5070 	 * Now compare labels to complete the MAC check.  If the labels
5071 	 * are equal or if the requestor is in the global zone and has
5072 	 * NET_MAC_AWARE, then allow read-write access.   (Except for
5073 	 * mounts into the global zone itself; restrict these to
5074 	 * read-only.)
5075 	 *
5076 	 * If the requestor is in some other zone, but his label
5077 	 * dominates the server, then allow read-down.
5078 	 *
5079 	 * Otherwise, access is denied.
5080 	 */
5081 	if (blequal(mntlabel, server_sl) ||
5082 	    (crgetzoneid(cr) == GLOBAL_ZONEID &&
5083 	    getpflags(NET_MAC_AWARE, cr) != 0)) {
5084 		if ((mntzone == global_zone) ||
5085 		    !blequal(mntlabel, server_sl))
5086 			retv = -1;		/* read-only */
5087 		else
5088 			retv = 0;		/* access OK */
5089 	} else if (bldominates(mntlabel, server_sl)) {
5090 		retv = -1;			/* read-only */
5091 	} else {
5092 		retv = EACCES;
5093 	}
5094 
5095 	if (tsl != NULL)
5096 		label_rele(tsl);
5097 
5098 rel_tpc:
5099 	TPC_RELE(tp);
5100 out:
5101 	if (mntzone)
5102 		zone_rele(mntzone);
5103 	label_rele(zlabel);
5104 	return (retv);
5105 }
5106 
5107 boolean_t
5108 nfs_has_ctty(void)
5109 {
5110 	boolean_t rv;
5111 	mutex_enter(&curproc->p_splock);
5112 	rv = (curproc->p_sessp->s_vp != NULL);
5113 	mutex_exit(&curproc->p_splock);
5114 	return (rv);
5115 }
5116 
5117 /*
5118  * TX NFS routine used by NFSv3 and NFSv4 to do label check
5119  * on client label and server's file object lable.
5120  */
5121 boolean_t
5122 do_rfs_label_check(bslabel_t *clabel, vnode_t *vp, int flag)
5123 {
5124 	bslabel_t *slabel;
5125 	ts_label_t *tslabel;
5126 	boolean_t result;
5127 
5128 	if ((tslabel = nfs_getflabel(vp)) == NULL) {
5129 		return (B_FALSE);
5130 	}
5131 	slabel = label2bslabel(tslabel);
5132 	DTRACE_PROBE4(tx__rfs__log__info__labelcheck, char *,
5133 	    "comparing server's file label(1) with client label(2) (vp(3))",
5134 	    bslabel_t *, slabel, bslabel_t *, clabel, vnode_t *, vp);
5135 
5136 	if (flag == EQUALITY_CHECK)
5137 		result = blequal(clabel, slabel);
5138 	else
5139 		result = bldominates(clabel, slabel);
5140 	label_rele(tslabel);
5141 	return (result);
5142 }
5143 
5144 /*
5145  * See if xattr directory to see if it has any generic user attributes
5146  */
5147 int
5148 do_xattr_exists_check(vnode_t *vp, ulong_t *valp, cred_t *cr)
5149 {
5150 	struct uio uio;
5151 	struct iovec iov;
5152 	char *dbuf;
5153 	struct dirent64 *dp;
5154 	size_t dlen = 8 * 1024;
5155 	size_t dbuflen;
5156 	int eof = 0;
5157 	int error;
5158 
5159 	*valp = 0;
5160 	dbuf = kmem_alloc(dlen, KM_SLEEP);
5161 	uio.uio_iov = &iov;
5162 	uio.uio_iovcnt = 1;
5163 	uio.uio_segflg = UIO_SYSSPACE;
5164 	uio.uio_fmode = 0;
5165 	uio.uio_extflg = UIO_COPY_CACHED;
5166 	uio.uio_loffset = 0;
5167 	uio.uio_resid = dlen;
5168 	iov.iov_base = dbuf;
5169 	iov.iov_len = dlen;
5170 	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
5171 	error = VOP_READDIR(vp, &uio, cr, &eof, NULL, 0);
5172 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
5173 
5174 	dbuflen = dlen - uio.uio_resid;
5175 
5176 	if (error || dbuflen == 0) {
5177 		kmem_free(dbuf, dlen);
5178 		return (error);
5179 	}
5180 
5181 	dp = (dirent64_t *)dbuf;
5182 
5183 	while ((intptr_t)dp < (intptr_t)dbuf + dbuflen) {
5184 		if (strcmp(dp->d_name, ".") == 0 ||
5185 		    strcmp(dp->d_name, "..") == 0 || strcmp(dp->d_name,
5186 		    VIEW_READWRITE) == 0 || strcmp(dp->d_name,
5187 		    VIEW_READONLY) == 0) {
5188 			dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen);
5189 			continue;
5190 		}
5191 
5192 		*valp = 1;
5193 		break;
5194 	}
5195 	kmem_free(dbuf, dlen);
5196 	return (0);
5197 }
5198