fs/nfs/nfs_subr.c

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
 */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/proc.h>
#include <sys/user.h>
#include <sys/time.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/tiuser.h>
#include <sys/swap.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/kstat.h>
#include <sys/cmn_err.h>
#include <sys/vtrace.h>
#include <sys/session.h>
#include <sys/dnlc.h>
#include <sys/bitmap.h>
#include <sys/acl.h>
#include <sys/ddi.h>
#include <sys/pathname.h>
#include <sys/flock.h>
#include <sys/dirent.h>
#include <sys/flock.h>
#include <sys/callb.h>
#include <sys/atomic.h>
#include <sys/list.h>
#include <sys/tsol/tnet.h>
#include <sys/priv.h>
#include <sys/sdt.h>
#include <sys/attr.h>

#include <inet/ip6.h>

#include <rpc/types.h>
#include <rpc/xdr.h>
#include <rpc/auth.h>
#include <rpc/clnt.h>

#include <nfs/nfs.h>
#include <nfs/nfs4.h>
#include <nfs/nfs_clnt.h>
#include <nfs/rnode.h>
#include <nfs/nfs_acl.h>

#include <sys/tsol/label.h>

/*
 * The hash queues for the access to active and cached rnodes
 * are organized as doubly linked lists.  A reader/writer lock
 * for each hash bucket is used to control access and to synchronize
 * lookups, additions, and deletions from the hash queue.
 *
 * The rnode freelist is organized as a doubly linked list with
 * a head pointer.  Additions and deletions are synchronized via
 * a single mutex.
 *
 * In order to add an rnode to the free list, it must be hashed into
 * a hash queue and the exclusive lock to the hash queue be held.
 * If an rnode is not hashed into a hash queue, then it is destroyed
 * because it represents no valuable information that can be reused
 * about the file.  The exclusive lock to the hash queue must be
 * held in order to prevent a lookup in the hash queue from finding
 * the rnode and using it and assuming that the rnode is not on the
 * freelist.  The lookup in the hash queue will have the hash queue
 * locked, either exclusive or shared.
 *
 * The vnode reference count for each rnode is not allowed to drop
 * below 1.  This prevents external entities, such as the VM
 * subsystem, from acquiring references to vnodes already on the
 * freelist and then trying to place them back on the freelist
 * when their reference is released.  This means that when an
 * rnode is looked up in the hash queues, then either the rnode
 * is removed from the freelist and that reference is transferred to
 * the new reference or the vnode reference count must be incremented
 * accordingly.  The mutex for the freelist must be held in order to
 * accurately test to see if the rnode is on the freelist or not.
 * The hash queue lock might be held shared and it is possible that
 * two different threads may race to remove the rnode from the
 * freelist.  This race can be resolved by holding the mutex for the
 * freelist.  Please note that the mutex for the freelist does not
 * need to be held if the rnode is not on the freelist.  It can not be
 * placed on the freelist due to the requirement that the thread
 * putting the rnode on the freelist must hold the exclusive lock
 * to the hash queue and the thread doing the lookup in the hash
 * queue is holding either a shared or exclusive lock to the hash
 * queue.
 *
 * The lock ordering is:
 *
 *	hash bucket lock -> vnode lock
 *	hash bucket lock -> freelist lock
 */
static rhashq_t *rtable;

static kmutex_t rpfreelist_lock;
static rnode_t *rpfreelist = NULL;
static long rnew = 0;
long nrnode = 0;

static int rtablesize;
static int rtablemask;

static int hashlen = 4;

static struct kmem_cache *rnode_cache;

/*
 * Mutex to protect the following variables:
 *	nfs_major
 *	nfs_minor
 */
kmutex_t nfs_minor_lock;
int nfs_major;
int nfs_minor;

/* Do we allow preepoch (negative) time values otw? */
bool_t nfs_allow_preepoch_time = FALSE;	/* default: do not allow preepoch */

/*
 * Access cache
 */
static acache_hash_t *acache;
static long nacache;	/* used strictly to size the number of hash queues */

static int acachesize;
static int acachemask;
static struct kmem_cache *acache_cache;

/*
 * Client side utilities
 */

/*
 * client side statistics
 *
 * Names and data types of the client kstat counters.  The per-counter
 * notes below describe the nfscl_stat fields of the same name that are
 * updated in this file.  NOTE(review): that nfscl_stat is initialized
 * from this template is assumed here; confirm against the zone setup
 * code.
 */
static const struct clstat clstat_tmpl = {
	{ "calls",	KSTAT_DATA_UINT64 },	/* one per rfscall() */
	{ "badcalls",	KSTAT_DATA_UINT64 },
	{ "clgets",	KSTAT_DATA_UINT64 },	/* one per clget_impl() */
	{ "cltoomany",	KSTAT_DATA_UINT64 },	/* no cached handle to reuse */
#ifdef DEBUG
	{ "clalloc",	KSTAT_DATA_UINT64 },	/* client handles allocated */
	{ "noresponse",	KSTAT_DATA_UINT64 },	/* server not responding */
	{ "failover",	KSTAT_DATA_UINT64 },
	{ "remap",	KSTAT_DATA_UINT64 },
#endif
};

/*
 * The following are statistics that describe the behavior of the system as
 * a whole and do not correspond to any one particular zone.
 */
#ifdef DEBUG
static struct clstat_debug {
	kstat_named_t	nrnode;			/* number of allocated rnodes */
	kstat_named_t	access;			/* size of access cache */
	kstat_named_t	dirent;			/* size of readdir cache */
	kstat_named_t	dirents;		/* size of readdir buf cache */
	kstat_named_t	reclaim;		/* number of reclaims */
	kstat_named_t	clreclaim;		/* number of cl reclaims */
	kstat_named_t	f_reclaim;		/* number of free reclaims */
	kstat_named_t	a_reclaim;		/* number of active reclaims */
	kstat_named_t	r_reclaim;		/* number of rnode reclaims */
	kstat_named_t	rpath;			/* bytes used to store rpaths */
} clstat_debug = {
	{ "nrnode",	KSTAT_DATA_UINT64 },
	{ "access",	KSTAT_DATA_UINT64 },
	{ "dirent",	KSTAT_DATA_UINT64 },
	{ "dirents",	KSTAT_DATA_UINT64 },
	{ "reclaim",	KSTAT_DATA_UINT64 },
	{ "clreclaim",	KSTAT_DATA_UINT64 },
	{ "f_reclaim",	KSTAT_DATA_UINT64 },
	{ "a_reclaim",	KSTAT_DATA_UINT64 },
	{ "r_reclaim",	KSTAT_DATA_UINT64 },
	{ "r_path",	KSTAT_DATA_UINT64 },
};
#endif	/* DEBUG */

/*
 * We keep a global list of per-zone client data, so we can clean up all zones
 * if we get low on memory.
 */
static list_t nfs_clnt_list;
static kmutex_t nfs_clnt_list_lock;
static zone_key_t nfsclnt_zone_key;

static struct kmem_cache *chtab_cache;

/*
 * Some servers do not properly update the attributes of the
 * directory when changes are made.  To allow interoperability
 * with these broken servers, the nfs_disable_rddir_cache
 * parameter must be set in /etc/system
 */
int nfs_disable_rddir_cache = 0;

int		clget(clinfo_t *, servinfo_t *, cred_t *, CLIENT **,
		    struct chtab **);
void		clfree(CLIENT *, struct chtab *);
static int	acl_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
		    struct chtab **, struct nfs_clnt *);
static int	nfs_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
		    struct chtab **, struct nfs_clnt *);
static void	clreclaim(void *);
static int	nfs_feedback(int, int, mntinfo_t *);
static int	rfscall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
		    caddr_t, cred_t *, int *, enum clnt_stat *, int,
		    failinfo_t *);
static int	aclcall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
		    caddr_t, cred_t *, int *, int, failinfo_t *);
static void	rinactive(rnode_t *, cred_t *);
static int	rtablehash(nfs_fhandle *);
static vnode_t	*make_rnode(nfs_fhandle *, rhashq_t *, struct vfs *,
		    struct vnodeops *,
		    int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
			cred_t *),
		    int (*)(const void *, const void *), int *, cred_t *,
		    char *, char *);
static void	rp_rmfree(rnode_t *);
static void	rp_addhash(rnode_t *);
static void	rp_rmhash_locked(rnode_t *);
static rnode_t	*rfind(rhashq_t *, nfs_fhandle *, struct vfs *);
static void	destroy_rnode(rnode_t *);
static void	rddir_cache_free(rddir_cache *);
static int	nfs_free_data_reclaim(rnode_t *);
static int	nfs_active_data_reclaim(rnode_t *);
static int	nfs_free_reclaim(void);
static int	nfs_active_reclaim(void);
static int	nfs_rnode_reclaim(void);
static void	nfs_reclaim(void *);
static int	failover_safe(failinfo_t *);
static void	failover_newserver(mntinfo_t *mi);
static void	failover_thread(mntinfo_t *mi);
static int	failover_wait(mntinfo_t *);
static int	failover_remap(failinfo_t *);
static int	failover_lookup(char *, vnode_t *,
		    int (*)(vnode_t *, char *, vnode_t **,
			struct pathname *, int, vnode_t *, cred_t *, int),
		    int (*)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
		    vnode_t **);
static void	nfs_free_r_path(rnode_t *);
static void	nfs_set_vroot(vnode_t *);
static char	*nfs_getsrvnames(mntinfo_t *, size_t *);

/*
 * from rpcsec module (common/rpcsec)
 */
extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **);
extern void sec_clnt_freeh(AUTH *);
extern void sec_clnt_freeinfo(struct sec_data *);

/*
 * used in mount policy
 */
extern ts_label_t *getflabel_cipso(vfs_t *);

/*
 * EINTR and EIO are not recoverable errors; every other errno value is
 * treated as recoverable.  The argument is evaluated twice, so it must
 * not have side effects.
 */
#define	IS_RECOVERABLE_ERROR(error)	\
	(!(((error) == EINTR) || ((error) == EIO)))

#ifdef DEBUG
#define	SRV_QFULL_MSG	"send queue to NFS%d server %s is full; still trying\n"
#define	SRV_NOTRESP_MSG	"NFS%d server %s not responding still trying\n"
#else
#define	SRV_QFULL_MSG	"send queue to NFS server %s is full still trying\n"
#define	SRV_NOTRESP_MSG	"NFS server %s not responding still trying\n"
#endif
/*
 * Common handle get program for NFS, NFS ACL, and NFS AUTH client.
 *
 * Looks up (or creates) the cache head matching the RPC program, version,
 * transport device and protocol family, then either reuses a cached client
 * handle from that head or creates a new one.  In both cases a fresh auth
 * handle is obtained with sec_clnt_geth().
 *
 * Arguments:
 *	ci	program, version, read size, retransmit count and flags
 *	svp	server whose knetconfig and address are used
 *	cr	credentials passed to the RPC and security layers
 *	newcl	out: the client handle (set to NULL on failure)
 *	chp	out: the cache entry that owns *newcl (set to NULL on failure)
 *	nfscl	per-zone client state holding the handle cache
 *
 * Returns 0 on success, EINVAL if newcl, chp or ci is NULL, the error
 * reported by clnt_tli_kcreate() or sec_clnt_geth(), or EINTR when
 * sec_clnt_geth() reported no error but produced no auth handle.
 */
static int
clget_impl(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
    struct chtab **chp, struct nfs_clnt *nfscl)
{
	struct chhead *ch, *newch;
	struct chhead **plistp;
	struct chtab *cp;
	int error;
	k_sigset_t smask;

	if (newcl == NULL || chp == NULL || ci == NULL)
		return (EINVAL);

	*newcl = NULL;
	*chp = NULL;

	/*
	 * Find an unused handle or create one
	 */
	newch = NULL;
	nfscl->nfscl_stat.clgets.value.ui64++;
top:
	/*
	 * Find the correct entry in the cache to check for free
	 * client handles.  The search is based on the RPC program
	 * number, program version number, dev_t for the transport
	 * device, and the protocol family.
	 */
	mutex_enter(&nfscl->nfscl_chtable_lock);
	plistp = &nfscl->nfscl_chtable;
	for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
		if (ch->ch_prog == ci->cl_prog &&
		    ch->ch_vers == ci->cl_vers &&
		    ch->ch_dev == svp->sv_knconf->knc_rdev &&
		    (strcmp(ch->ch_protofmly,
		    svp->sv_knconf->knc_protofmly) == 0))
			break;
		plistp = &ch->ch_next;
	}

	/*
	 * If we didn't find a cache entry for this quadruple, then
	 * create one.  If we don't have one already preallocated,
	 * then drop the cache lock, create one, and then start over.
	 * If we did have a preallocated entry, then just add it to
	 * the front of the list.
	 */
	if (ch == NULL) {
		if (newch == NULL) {
			/*
			 * KM_SLEEP allocations are made with the cache
			 * lock dropped, which is why the lookup has to
			 * be repeated afterwards.
			 */
			mutex_exit(&nfscl->nfscl_chtable_lock);
			newch = kmem_alloc(sizeof (*newch), KM_SLEEP);
			newch->ch_timesused = 0;
			newch->ch_prog = ci->cl_prog;
			newch->ch_vers = ci->cl_vers;
			newch->ch_dev = svp->sv_knconf->knc_rdev;
			newch->ch_protofmly = kmem_alloc(
			    strlen(svp->sv_knconf->knc_protofmly) + 1,
			    KM_SLEEP);
			(void) strcpy(newch->ch_protofmly,
			    svp->sv_knconf->knc_protofmly);
			newch->ch_list = NULL;
			goto top;
		}
		ch = newch;
		newch = NULL;
		ch->ch_next = nfscl->nfscl_chtable;
		nfscl->nfscl_chtable = ch;
	/*
	 * We found a cache entry, but if it isn't on the front of the
	 * list, then move it to the front of the list to try to take
	 * advantage of locality of operations.
	 */
	} else if (ch != nfscl->nfscl_chtable) {
		*plistp = ch->ch_next;
		ch->ch_next = nfscl->nfscl_chtable;
		nfscl->nfscl_chtable = ch;
	}

	/*
	 * If there was a free client handle cached, then remove it
	 * from the list, init it, and use it.
	 */
	if (ch->ch_list != NULL) {
		cp = ch->ch_list;
		ch->ch_list = cp->ch_list;
		mutex_exit(&nfscl->nfscl_chtable_lock);
		/*
		 * A preallocated head is left over when a matching head
		 * appeared while the lock was dropped; discard it.
		 */
		if (newch != NULL) {
			kmem_free(newch->ch_protofmly,
			    strlen(newch->ch_protofmly) + 1);
			kmem_free(newch, sizeof (*newch));
		}
		(void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf,
		    &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr);
		error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
		    &cp->ch_client->cl_auth);
		if (error || cp->ch_client->cl_auth == NULL) {
			CLNT_DESTROY(cp->ch_client);
			kmem_cache_free(chtab_cache, cp);
#ifdef DEBUG
			/*
			 * This handle was counted in clalloc when it was
			 * created.  It no longer exists, so drop the
			 * count just as the other teardown paths do.
			 */
			atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64);
#endif
			return ((error != 0) ? error : EINTR);
		}
		ch->ch_timesused++;
		*newcl = cp->ch_client;
		*chp = cp;
		return (0);
	}

	/*
	 * There weren't any free client handles which fit, so allocate
	 * a new one and use that.
	 */
#ifdef DEBUG
	atomic_inc_64(&nfscl->nfscl_stat.clalloc.value.ui64);
#endif
	mutex_exit(&nfscl->nfscl_chtable_lock);

	nfscl->nfscl_stat.cltoomany.value.ui64++;
	if (newch != NULL) {
		kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1);
		kmem_free(newch, sizeof (*newch));
	}

	cp = kmem_cache_alloc(chtab_cache, KM_SLEEP);
	cp->ch_head = ch;

	/*
	 * Handle creation runs with the signal mask adjusted according
	 * to the MI_INT bit of the caller's flags.
	 */
	sigintr(&smask, (int)ci->cl_flags & MI_INT);
	error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog,
	    ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client);
	sigunintr(&smask);

	if (error != 0) {
		kmem_cache_free(chtab_cache, cp);
#ifdef DEBUG
		atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64);
#endif
		/*
		 * Warning is unnecessary if error is EINTR.
		 */
		if (error != EINTR) {
			nfs_cmn_err(error, CE_WARN,
			    "clget: couldn't create handle: %m\n");
		}
		return (error);
	}
	(void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL);
	/*
	 * Replace the auth handle the new client came with by one
	 * obtained from the security layer for this server.
	 */
	auth_destroy(cp->ch_client->cl_auth);
	error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
	    &cp->ch_client->cl_auth);
	if (error || cp->ch_client->cl_auth == NULL) {
		CLNT_DESTROY(cp->ch_client);
		kmem_cache_free(chtab_cache, cp);
#ifdef DEBUG
		atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64);
#endif
		return ((error != 0) ? error : EINTR);
	}
	ch->ch_timesused++;
	*newcl = cp->ch_client;
	ASSERT(cp->ch_client->cl_nosignal == FALSE);
	*chp = cp;
	return (0);
}

/*
 * Get a client handle using the client state of the zone returned by
 * nfs_zone().  See clget_impl() for the arguments and return values.
 */
int
clget(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
    struct chtab **chp)
{
	struct nfs_clnt *nfscl = zone_getspecific(nfsclnt_zone_key,
	    nfs_zone());

	ASSERT(nfscl != NULL);
	return (clget_impl(ci, svp, cr, newcl, chp, nfscl));
}

/*
 * Get a client handle for the NFS ACL program on behalf of mount mi.
 *
 * Fills in a clinfo_t from the mount and calls clget_impl(), retrying
 * only while a hard, non-failover mount keeps seeing ETIMEDOUT or
 * ECONNRESET.  Returns 0 on success, EIO when the file system or zone
 * is gone, otherwise the last clget_impl() error.
 */
static int
acl_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
    struct chtab **chp, struct nfs_clnt *nfscl)
{
	clinfo_t ci;
	int error;

	/*
	 * The read buffer is rsize plus room for the RPC headers; a
	 * zero size is passed through untouched.
	 */
	ci.cl_readsize = mi->mi_tsize;
	if (ci.cl_readsize != 0)
		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);

	/*
	 * A soft mount whose server is down gets a single try, i.e.
	 * no retransmissions.  Everything else uses the mount's count.
	 */
	if ((mi->mi_flags & MI_HARD) || !(mi->mi_flags & MI_DOWN))
		ci.cl_retrans = mi->mi_retrans;
	else
		ci.cl_retrans = 0;

	ci.cl_flags = mi->mi_flags;
	ci.cl_vers = mi->mi_vers;
	ci.cl_prog = NFS_ACL_PROGRAM;

	/*
	 * clget calls sec_clnt_geth() to get an auth handle.  With the
	 * RPCSEC_GSS flavor that means contacting the server to set up
	 * a security context; if that connection times out or is reset
	 * (e.g. server reboot), try again.
	 */
	for (;;) {
		error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
		if (error == 0)
			return (0);

		/* Forced unmount or zone shutdown: bail out, no retry. */
		if (FS_OR_ZONE_GONE(mi->mi_vfsp))
			return (EIO);

		/* Soft mounts never retry here. */
		if (!(mi->mi_flags & MI_HARD))
			return (error);

		/* Failover mounts are handled by the caller. */
		if (FAILOVER_MOUNT(mi))
			return (error);

		if (error != ETIMEDOUT && error != ECONNRESET)
			return (error);
	}
}

/*
 * Get a client handle for the mount's own RPC program (mi_prog).
 *
 * Fills in a clinfo_t from the mount and calls clget_impl(), retrying
 * only while a hard, non-failover mount keeps seeing ETIMEDOUT or
 * ECONNRESET.  Returns 0 on success, EIO when the file system or zone
 * is gone, otherwise the last clget_impl() error.
 */
static int
nfs_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
    struct chtab **chp, struct nfs_clnt *nfscl)
{
	clinfo_t ci;
	int error;

	/*
	 * The read buffer is rsize plus room for the RPC headers; a
	 * zero size is passed through untouched.
	 */
	ci.cl_readsize = mi->mi_tsize;
	if (ci.cl_readsize != 0)
		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);

	/*
	 * A soft mount whose server is down gets a single try, i.e.
	 * no retransmissions.  Everything else uses the mount's count.
	 */
	if ((mi->mi_flags & MI_HARD) || !(mi->mi_flags & MI_DOWN))
		ci.cl_retrans = mi->mi_retrans;
	else
		ci.cl_retrans = 0;

	ci.cl_flags = mi->mi_flags;
	ci.cl_vers = mi->mi_vers;
	ci.cl_prog = mi->mi_prog;

	/*
	 * clget calls sec_clnt_geth() to get an auth handle.  With the
	 * RPCSEC_GSS flavor that means contacting the server to set up
	 * a security context; if that connection times out or is reset
	 * (e.g. server reboot), try again.
	 */
	for (;;) {
		error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
		if (error == 0)
			return (0);

		/* Forced unmount or zone shutdown: bail out, no retry. */
		if (FS_OR_ZONE_GONE(mi->mi_vfsp))
			return (EIO);

		/* Soft mounts never retry here. */
		if (!(mi->mi_flags & MI_HARD))
			return (error);

		/* Failover mounts are handled by the caller. */
		if (FAILOVER_MOUNT(mi))
			return (error);

		if (error != ETIMEDOUT && error != ECONNRESET)
			return (error);
	}
}

/*
 * Return client handle cl, owned by cache entry cp, to the handle cache
 * of nfscl.  Any auth handle still attached to cl is released first.
 */
static void
clfree_impl(CLIENT *cl, struct chtab *cp, struct nfs_clnt *nfscl)
{
	AUTH *auth = cl->cl_auth;

	if (auth != NULL) {
		sec_clnt_freeh(auth);
		cl->cl_auth = NULL;
	}

	/* Record when the entry was last used. */
	cp->ch_freed = gethrestime_sec();

	/*
	 * Push the entry onto the front of its head's free list, which
	 * keeps that list ordered from youngest to oldest.
	 */
	mutex_enter(&nfscl->nfscl_chtable_lock);
	cp->ch_list = cp->ch_head->ch_list;
	cp->ch_head->ch_list = cp;
	mutex_exit(&nfscl->nfscl_chtable_lock);
}

/*
 * Return a client handle to the handle cache of the zone returned by
 * nfs_zone().  See clfree_impl().
 */
void
clfree(CLIENT *cl, struct chtab *cp)
{
	struct nfs_clnt *nfscl = zone_getspecific(nfsclnt_zone_key,
	    nfs_zone());

	ASSERT(nfscl != NULL);
	clfree_impl(cl, cp, nfscl);
}

#define	CL_HOLDTIME	60	/* time to hold client handles */

/*
 * Destroy the cached client handles of one zone that have been sitting
 * unused for at least cl_holdtime seconds.
 *
 * Eligible entries are unlinked from their cache heads while the cache
 * lock is held and destroyed only after the lock has been dropped.
 */
static void
clreclaim_zone(struct nfs_clnt *nfscl, uint_t cl_holdtime)
{
	struct chhead *head;
	struct chtab *doomed = NULL;	/* entries unlinked for destruction */
	struct chtab *expired;
	struct chtab *tail;
	struct chtab *next;
	struct chtab **link;
#ifdef DEBUG
	int n = 0;
#endif

	mutex_enter(&nfscl->nfscl_chtable_lock);

	/*
	 * Visit every quadruple.  The entries cut from the quadruple
	 * visited last end up at the front of the doomed list, with the
	 * entries gathered earlier chained on behind them.
	 */
	for (head = nfscl->nfscl_chtable; head != NULL;
	    head = head->ch_next) {
		/*
		 * Free lists run from youngest to oldest, so the first
		 * entry that is old enough marks the start of a tail
		 * in which every entry is old enough.
		 */
		link = &head->ch_list;
		for (expired = *link; expired != NULL; expired = *link) {
			if (expired->ch_freed + cl_holdtime <=
			    gethrestime_sec())
				break;
			link = &expired->ch_list;
		}
		if (expired == NULL)
			continue;

		/* Cut the old tail off and splice it in front. */
		*link = NULL;
		if (doomed != NULL) {
			tail = expired;
			while (tail->ch_list != NULL)
				tail = tail->ch_list;
			tail->ch_list = doomed;
		}
		doomed = expired;
	}

	mutex_exit(&nfscl->nfscl_chtable_lock);

	/* Nothing was old enough. */
	if (doomed == NULL)
		return;

	/*
	 * Destroy each client handle and hand its cache entry back to
	 * the kmem cache.
	 */
	do {
		next = doomed->ch_list;
		CLNT_DESTROY(doomed->ch_client);
		kmem_cache_free(chtab_cache, doomed);
		doomed = next;
#ifdef DEBUG
		n++;
#endif
	} while (doomed != NULL);

#ifdef DEBUG
	/*
	 * Update clalloc so that nfsstat shows the current number
	 * of allocated client handles.
	 */
	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n);
#endif
}

/*
 * Reclaim idle client handles from every zone on nfs_clnt_list.  The
 * argument is unused.
 */
/* ARGSUSED */
static void
clreclaim(void *all)
{
	struct nfs_clnt *nfscl;

#ifdef DEBUG
	clstat_debug.clreclaim.value.ui64++;
#endif
	/*
	 * The system is low on memory, so walk the global list and let
	 * each zone give back what it can.
	 */
	mutex_enter(&nfs_clnt_list_lock);
	nfscl = list_head(&nfs_clnt_list);
	while (nfscl != NULL) {
		clreclaim_zone(nfscl, CL_HOLDTIME);
		nfscl = list_next(&nfs_clnt_list, nfscl);
	}
	mutex_exit(&nfs_clnt_list_lock);
}

/*
 * Minimum time-out values indexed by call type
 * These units are in "eighths" of a second to avoid multiplies
 */
static unsigned int minimum_timeo[] = {
	6, 7, 10
};

/*
 * Back off for retransmission timeout.  MAXTIMO is expressed in units
 * of hz (20 * hz).  backoff() doubles a timeout that is still below
 * MAXTIMO, clamping the doubled value to MAXTIMO, and returns a timeout
 * that has already reached MAXTIMO unchanged.  Both macros evaluate
 * their argument more than once.
 */
#define	MAXTIMO	(20*hz)
#define	backoff(tim)	(((tim) < MAXTIMO) ? dobackoff(tim) : (tim))
#define	dobackoff(tim)	((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))

/*
 * Limits used by nfs_feedback() when it adjusts the transfer sizes.
 */
#define	MIN_NFS_TSIZE 512	/* minimum "chunk" of NFS IO */
#define	REDUCE_NFS_TIME (hz/2)	/* rtxcur we try to keep under */
#define	INCREASE_NFS_TIME (hz/3*8) /* srtt we try to keep under (scaled*8) */

/*
 * Function called when rfscall notices that we have been
 * re-transmitting, or when we get a response without retransmissions.
 * Return 1 if the transfer size was adjusted down - 0 if no change.
 */
static int
nfs_feedback(int flag, int which, mntinfo_t *mi)
{
	int shrunk = 0;
	int timer;
	int rtx_low;

	mutex_enter(&mi->mi_lock);

	if (flag == FEEDBACK_REXMIT1) {
		/*
		 * Retransmitting.  Leave the sizes alone while the
		 * current retransmit timeout is known (non-zero) and
		 * still below the threshold; otherwise halve both
		 * transfer sizes, never going below MIN_NFS_TSIZE.
		 */
		rtx_low = (mi->mi_timers[NFS_CALLTYPES].rt_rtxcur != 0 &&
		    mi->mi_timers[NFS_CALLTYPES].rt_rtxcur < REDUCE_NFS_TIME);
		if (!rtx_low) {
			if (mi->mi_curread > MIN_NFS_TSIZE) {
				mi->mi_curread /= 2;
				if (mi->mi_curread < MIN_NFS_TSIZE)
					mi->mi_curread = MIN_NFS_TSIZE;
				shrunk = 1;
			}

			if (mi->mi_curwrite > MIN_NFS_TSIZE) {
				mi->mi_curwrite /= 2;
				if (mi->mi_curwrite < MIN_NFS_TSIZE)
					mi->mi_curwrite = MIN_NFS_TSIZE;
				shrunk = 1;
			}
		}
	} else if (flag == FEEDBACK_OK) {
		/*
		 * A reply arrived without retransmission.  If this call
		 * has a timer and its smoothed round trip time is under
		 * the threshold, grow the matching transfer size by one
		 * chunk, jumping straight to the maximum once more than
		 * half of the maximum has been reached.
		 */
		timer = mi->mi_timer_type[which];
		if (timer != 0 &&
		    mi->mi_timers[timer].rt_srtt < INCREASE_NFS_TIME) {
			switch (timer) {
			case 1:
				if (mi->mi_curread < mi->mi_tsize) {
					mi->mi_curread += MIN_NFS_TSIZE;
					if (mi->mi_curread > mi->mi_tsize/2)
						mi->mi_curread = mi->mi_tsize;
				}
				break;
			case 2:
				if (mi->mi_curwrite < mi->mi_stsize) {
					mi->mi_curwrite += MIN_NFS_TSIZE;
					if (mi->mi_curwrite > mi->mi_stsize/2)
						mi->mi_curwrite = mi->mi_stsize;
				}
				break;
			default:
				break;
			}
		}
	}

	mutex_exit(&mi->mi_lock);
	return (shrunk);
}

#ifdef DEBUG
static int rfs2call_hits = 0;
static int rfs2call_misses = 0;
#endif

/*
 * Make an NFS version 2 call through rfscall().
 *
 * If the call succeeds at the RPC level but the server answers
 * NFSERR_ACCES, the call is repeated once with the credential returned
 * by crnetadjust(), provided it returns one.  If rfscall() fails with an
 * RPC status of RPC_PROCUNAVAIL, the failure is turned into a successful
 * return with *statusp set to NFSERR_OPNOTSUPP.
 *
 * statusp must not be NULL.  The remaining arguments are passed to
 * rfscall() unchanged.
 *
 * Returns 0 or the error returned by rfscall().
 */
int
rfs2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
    xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
    enum nfsstat *statusp, int flags, failinfo_t *fi)
{
	int rpcerror;
	/*
	 * rfscall() has error returns that never store through its
	 * rpc_status argument, so start from a value that cannot be
	 * mistaken for RPC_PROCUNAVAIL below.
	 */
	enum clnt_stat rpc_status = RPC_FAILED;

	ASSERT(statusp != NULL);

	rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
	    cr, douprintf, &rpc_status, flags, fi);
	if (!rpcerror) {
		/*
		 * See crnetadjust() for comments.
		 */
		if (*statusp == NFSERR_ACCES &&
		    (cr = crnetadjust(cr)) != NULL) {
#ifdef DEBUG
			rfs2call_hits++;
#endif
			rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres,
			    resp, cr, douprintf, NULL, flags, fi);
			/* cr now refers to the adjusted credential. */
			crfree(cr);
#ifdef DEBUG
			if (*statusp == NFSERR_ACCES)
				rfs2call_misses++;
#endif
		}
	} else if (rpc_status == RPC_PROCUNAVAIL) {
		*statusp = NFSERR_OPNOTSUPP;
		rpcerror = 0;
	}

	return (rpcerror);
}

/*
 * Ten seconds expressed in units of hz.  Presumably the default for
 * nfs3_jukebox_delay, which is assigned outside this view; confirm.
 * Parenthesized so that it expands safely inside larger expressions.
 */
#define	NFS3_JUKEBOX_DELAY	(10 * hz)

static clock_t nfs3_jukebox_delay = 0;

#ifdef DEBUG
static int rfs3call_hits = 0;
static int rfs3call_misses = 0;
#endif

/*
 * Make an NFS version 3 call through rfscall().
 *
 * The call is repeated for as long as it succeeds at the RPC level and
 * the server answers NFS3ERR_JUKEBOX, sleeping nfs3_jukebox_delay
 * between attempts; a thread belonging to p0 gets EAGAIN instead of
 * waiting.  An NFS3ERR_ACCES answer is retried once with the credential
 * returned by crnetadjust(), provided it returns one.
 *
 * Returns 0, EAGAIN, or the error returned by rfscall().
 */
int
rfs3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
    xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
    nfsstat3 *statusp, int flags, failinfo_t *fi)
{
	int rpcerror;
	int user_informed = 0;

	for (;;) {
		cred_t *crr;

		rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
		    cr, douprintf, NULL, flags, fi);
		if (rpcerror != 0)
			break;

		if (*statusp == NFS3ERR_JUKEBOX) {
			if (ttoproc(curthread) == &p0) {
				rpcerror = EAGAIN;
				break;
			}
			/* Tell the user only once per call. */
			if (!user_informed) {
				user_informed = 1;
				uprintf(
		"file temporarily unavailable on the server, retrying...\n");
			}
			delay(nfs3_jukebox_delay);
			continue;
		}

		/*
		 * See crnetadjust() for comments.
		 */
		if (*statusp == NFS3ERR_ACCES &&
		    (crr = crnetadjust(cr)) != NULL) {
#ifdef DEBUG
			rfs3call_hits++;
#endif
			rpcerror = rfscall(mi, which, xdrargs, argsp,
			    xdrres, resp, crr, douprintf,
			    NULL, flags, fi);

			crfree(crr);
#ifdef DEBUG
			if (*statusp == NFS3ERR_ACCES)
				rfs3call_misses++;
#endif
		}

		/* The retry itself may have produced a jukebox answer. */
		if (rpcerror != 0 || *statusp != NFS3ERR_JUKEBOX)
			break;
	}

	return (rpcerror);
}

/*
 * VALID_FH is true when the rnode behind fi->vp was last set up against
 * the mount's current server.
 *
 * INC_READERS/DEC_READERS maintain mi_readers; DEC_READERS wakes the
 * waiters on mi_failover_cv when the count reaches zero.  The callers in
 * this file use them with mi_lock held.  The arguments are parenthesized
 * so that any pointer expression may be passed; they are evaluated more
 * than once.
 */
#define	VALID_FH(fi)	\
	(VTOR((fi)->vp)->r_server == VTOMI((fi)->vp)->mi_curr_serv)
#define	INC_READERS(mi)		{ \
	(mi)->mi_readers++; \
}
#define	DEC_READERS(mi)		{ \
	(mi)->mi_readers--; \
	if ((mi)->mi_readers == 0) \
		cv_broadcast(&(mi)->mi_failover_cv); \
}

static int
rfscall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
    xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
    enum clnt_stat *rpc_status, int flags, failinfo_t *fi)
{
	CLIENT *client;
	struct chtab *ch;
	cred_t *cr = icr;
	enum clnt_stat status;
	struct rpc_err rpcerr, rpcerr_tmp;
	struct timeval wait;
	int timeo;		/* in units of hz */
	int my_rsize, my_wsize;
	bool_t tryagain;
	bool_t cred_cloned = FALSE;
	k_sigset_t smask;
	servinfo_t *svp;
	struct nfs_clnt *nfscl;
	zoneid_t zoneid = getzoneid();
	char *msg;
#ifdef DEBUG
	char *bufp;
#endif


	TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
	    "rfscall_start:which %d mi %p", which, mi);

	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
	ASSERT(nfscl != NULL);

	nfscl->nfscl_stat.calls.value.ui64++;
	mi->mi_reqs[which].value.ui64++;

	rpcerr.re_status = RPC_SUCCESS;

	/*
	 * In case of forced unmount or zone shutdown, return EIO.
	 */

	if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
		rpcerr.re_status = RPC_FAILED;
		rpcerr.re_errno = EIO;
		return (rpcerr.re_errno);
	}

	/*
	 * Remember the transfer sizes in case
	 * nfs_feedback changes them underneath us.
	 */
	my_rsize = mi->mi_curread;
	my_wsize = mi->mi_curwrite;

	/*
	 * NFS client failover support
	 *
	 * If this rnode is not in sync with the current server (VALID_FH),
	 * we'd like to do a remap to get in sync.  We can be interrupted
	 * in failover_remap(), and if so we'll bail.  Otherwise, we'll
	 * use the best info we have to try the RPC.  Part of that is
	 * unconditionally updating the filehandle copy kept for V3.
	 *
	 * Locking: INC_READERS/DEC_READERS is a poor man's interruptible
	 * rw_enter(); we're trying to keep the current server from being
	 * changed on us until we're done with the remapping and have a
	 * matching client handle.  We don't want to send a filehandle
	 * to the wrong host.
	 */
failoverretry:
	if (FAILOVER_MOUNT(mi)) {
		mutex_enter(&mi->mi_lock);
		if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
			if (failover_wait(mi)) {
				mutex_exit(&mi->mi_lock);
				return (EINTR);
			}
		}
		INC_READERS(mi);
		mutex_exit(&mi->mi_lock);
		if (fi) {
			if (!VALID_FH(fi) &&
			    !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
				int remaperr;

				svp = mi->mi_curr_serv;
				remaperr = failover_remap(fi);
				if (remaperr != 0) {
#ifdef DEBUG
					if (remaperr != EINTR)
						nfs_cmn_err(remaperr, CE_WARN,
					    "rfscall couldn't failover: %m");
#endif
					mutex_enter(&mi->mi_lock);
					DEC_READERS(mi);
					mutex_exit(&mi->mi_lock);
					/*
					 * If failover_remap returns ETIMEDOUT
					 * and the filesystem is hard mounted
					 * we have to retry the call with a new
					 * server.
					 */
					if ((mi->mi_flags & MI_HARD) &&
					    IS_RECOVERABLE_ERROR(remaperr)) {
						if (svp == mi->mi_curr_serv)
							failover_newserver(mi);
						rpcerr.re_status = RPC_SUCCESS;
						goto failoverretry;
					}
					rpcerr.re_errno = remaperr;
					return (remaperr);
				}
			}
			if (fi->fhp && fi->copyproc)
				(*fi->copyproc)(fi->fhp, fi->vp);
		}
	}

	/* For TSOL, use a new cred which has net_mac_aware flag */
	if (!cred_cloned && is_system_labeled()) {
		cred_cloned = TRUE;
		cr = crdup(icr);
		(void) setpflags(NET_MAC_AWARE, 1, cr);
	}

	/*
	 * clget() calls clnt_tli_kinit() which clears the xid, so we
	 * are guaranteed to reprocess the retry as a new request.
	 */
	svp = mi->mi_curr_serv;
	rpcerr.re_errno = nfs_clget(mi, svp, cr, &client, &ch, nfscl);

	if (FAILOVER_MOUNT(mi)) {
		mutex_enter(&mi->mi_lock);
		DEC_READERS(mi);
		mutex_exit(&mi->mi_lock);

		if ((rpcerr.re_errno == ETIMEDOUT ||
		    rpcerr.re_errno == ECONNRESET) &&
		    failover_safe(fi)) {
			if (svp == mi->mi_curr_serv)
				failover_newserver(mi);
			goto failoverretry;
		}
	}
	if (rpcerr.re_errno != 0)
		return (rpcerr.re_errno);

	if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
	    svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
		timeo = (mi->mi_timeo * hz) / 10;
	} else {
		mutex_enter(&mi->mi_lock);
		timeo = CLNT_SETTIMERS(client,
		    &(mi->mi_timers[mi->mi_timer_type[which]]),
		    &(mi->mi_timers[NFS_CALLTYPES]),
		    (minimum_timeo[mi->mi_call_type[which]]*hz)>>3,
		    (void (*)())NULL, (caddr_t)mi, 0);
		mutex_exit(&mi->mi_lock);
	}

	/*
	 * If hard mounted fs, retry call forever unless hard error occurs.
	 */
	do {
		tryagain = FALSE;

		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
			status = RPC_FAILED;
			rpcerr.re_status = RPC_FAILED;
			rpcerr.re_errno = EIO;
			break;
		}

		TICK_TO_TIMEVAL(timeo, &wait);

		/*
		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
		 * and SIGTERM. (Preserving the existing masks).
		 * Mask out SIGINT if mount option nointr is specified.
		 */
		sigintr(&smask, (int)mi->mi_flags & MI_INT);
		if (!(mi->mi_flags & MI_INT))
			client->cl_nosignal = TRUE;

		/*
		 * If there is a current signal, then don't bother
		 * even trying to send out the request because we
		 * won't be able to block waiting for the response.
		 * Simply assume RPC_INTR and get on with it.
		 */
		if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
			status = RPC_INTR;
		else {
			status = CLNT_CALL(client, which, xdrargs, argsp,
			    xdrres, resp, wait);
		}

		if (!(mi->mi_flags & MI_INT))
			client->cl_nosignal = FALSE;
		/*
		 * restore original signal mask
		 */
		sigunintr(&smask);

		switch (status) {
		case RPC_SUCCESS:
			if ((mi->mi_flags & MI_DYNAMIC) &&
			    mi->mi_timer_type[which] != 0 &&
			    (mi->mi_curread != my_rsize ||
			    mi->mi_curwrite != my_wsize))
				(void) nfs_feedback(FEEDBACK_OK, which, mi);
			break;

		case RPC_INTR:
			/*
			 * There is no way to recover from this error,
			 * even if mount option nointr is specified.
			 * SIGKILL, for example, cannot be blocked.
			 */
			rpcerr.re_status = RPC_INTR;
			rpcerr.re_errno = EINTR;
			break;

		case RPC_UDERROR:
			/*
			 * If the NFS server is local (vold) and
			 * it goes away then we get RPC_UDERROR.
			 * This is a retryable error, so we would
			 * loop, so check to see if the specific
			 * error was ECONNRESET, indicating that
			 * target did not exist at all.  If so,
			 * return with RPC_PROGUNAVAIL and
			 * ECONNRESET to indicate why.
			 */
			CLNT_GETERR(client, &rpcerr);
			if (rpcerr.re_errno == ECONNRESET) {
				rpcerr.re_status = RPC_PROGUNAVAIL;
				rpcerr.re_errno = ECONNRESET;
				break;
			}
			/*FALLTHROUGH*/

		default:		/* probably RPC_TIMEDOUT */
			if (IS_UNRECOVERABLE_RPC(status))
				break;

			/*
			 * increment server not responding count
			 */
			mutex_enter(&mi->mi_lock);
			mi->mi_noresponse++;
			mutex_exit(&mi->mi_lock);
#ifdef DEBUG
			nfscl->nfscl_stat.noresponse.value.ui64++;
#endif

			if (!(mi->mi_flags & MI_HARD)) {
				if (!(mi->mi_flags & MI_SEMISOFT) ||
				    (mi->mi_ss_call_type[which] == 0))
					break;
			}

			/*
			 * The call is in progress (over COTS).
			 * Try the CLNT_CALL again, but don't
			 * print a noisy error message.
			 */
			if (status == RPC_INPROGRESS) {
				tryagain = TRUE;
				break;
			}

			if (flags & RFSCALL_SOFT)
				break;

			/*
			 * On zone shutdown, just move on.
			 */
			if (zone_status_get(curproc->p_zone) >=
			    ZONE_IS_SHUTTING_DOWN) {
				rpcerr.re_status = RPC_FAILED;
				rpcerr.re_errno = EIO;
				break;
			}

			/*
			 * NFS client failover support
			 *
			 * If the current server just failed us, we'll
			 * start the process of finding a new server.
			 * After that, we can just retry.
			 */
			if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
				if (svp == mi->mi_curr_serv)
					failover_newserver(mi);
				clfree_impl(client, ch, nfscl);
				goto failoverretry;
			}

			tryagain = TRUE;
			timeo = backoff(timeo);

			CLNT_GETERR(client, &rpcerr_tmp);
			if ((status == RPC_CANTSEND) &&
			    (rpcerr_tmp.re_errno == ENOBUFS))
				msg = SRV_QFULL_MSG;
			else
				msg = SRV_NOTRESP_MSG;

			mutex_enter(&mi->mi_lock);
			if (!(mi->mi_flags & MI_PRINTED)) {
				mi->mi_flags |= MI_PRINTED;
				mutex_exit(&mi->mi_lock);
#ifdef DEBUG
				zprintf(zoneid, msg, mi->mi_vers,
				    svp->sv_hostname);
#else
				zprintf(zoneid, msg, svp->sv_hostname);
#endif
			} else
				mutex_exit(&mi->mi_lock);
			if (*douprintf && nfs_has_ctty()) {
				*douprintf = 0;
				if (!(mi->mi_flags & MI_NOPRINT))
#ifdef DEBUG
					uprintf(msg, mi->mi_vers,
					    svp->sv_hostname);
#else
					uprintf(msg, svp->sv_hostname);
#endif
			}

			/*
			 * If doing dynamic adjustment of transfer
			 * size and if it's a read or write call
			 * and if the transfer size changed while
			 * retransmitting or if the feedback routine
			 * changed the transfer size,
			 * then exit rfscall so that the transfer
			 * size can be adjusted at the vnops level.
			 */
			if ((mi->mi_flags & MI_DYNAMIC) &&
			    mi->mi_timer_type[which] != 0 &&
			    (mi->mi_curread != my_rsize ||
			    mi->mi_curwrite != my_wsize ||
			    nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
				/*
				 * On read or write calls, return
				 * back to the vnode ops level if
				 * the transfer size changed.
				 */
				clfree_impl(client, ch, nfscl);
				if (cred_cloned)
					crfree(cr);
				return (ENFS_TRYAGAIN);
			}
		}
	} while (tryagain);

	if (status != RPC_SUCCESS) {
		/*
		 * Let soft mounts use the timed out message.
		 */
		if (status == RPC_INPROGRESS)
			status = RPC_TIMEDOUT;
		nfscl->nfscl_stat.badcalls.value.ui64++;
		if (status != RPC_INTR) {
			mutex_enter(&mi->mi_lock);
			mi->mi_flags |= MI_DOWN;
			mutex_exit(&mi->mi_lock);
			CLNT_GETERR(client, &rpcerr);
#ifdef DEBUG
			bufp = clnt_sperror(client, svp->sv_hostname);
			zprintf(zoneid, "NFS%d %s failed for %s\n",
			    mi->mi_vers, mi->mi_rfsnames[which], bufp);
			if (nfs_has_ctty()) {
				if (!(mi->mi_flags & MI_NOPRINT)) {
					uprintf("NFS%d %s failed for %s\n",
					    mi->mi_vers, mi->mi_rfsnames[which],
					    bufp);
				}
			}
			kmem_free(bufp, MAXPATHLEN);
#else
			zprintf(zoneid,
			    "NFS %s failed for server %s: error %d (%s)\n",
			    mi->mi_rfsnames[which], svp->sv_hostname,
			    status, clnt_sperrno(status));
			if (nfs_has_ctty()) {
				if (!(mi->mi_flags & MI_NOPRINT)) {
					uprintf(
				"NFS %s failed for server %s: error %d (%s)\n",
					    mi->mi_rfsnames[which],
					    svp->sv_hostname, status,
					    clnt_sperrno(status));
				}
			}
#endif
			/*
			 * when CLNT_CALL() fails with RPC_AUTHERROR,
			 * re_errno is set appropriately depending on
			 * the authentication error
			 */
			if (status == RPC_VERSMISMATCH ||
			    status == RPC_PROGVERSMISMATCH)
				rpcerr.re_errno = EIO;
		}
	} else {
		/*
		 * Test the value of mi_down and mi_printed without
		 * holding the mi_lock mutex.  If they are both zero,
		 * then it is okay to skip the down and printed
		 * processing.  This saves on a mutex_enter and
		 * mutex_exit pair for a normal, successful RPC.
		 * This was just complete overhead.
		 */
		if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
			mutex_enter(&mi->mi_lock);
			mi->mi_flags &= ~MI_DOWN;
			if (mi->mi_flags & MI_PRINTED) {
				mi->mi_flags &= ~MI_PRINTED;
				mutex_exit(&mi->mi_lock);
#ifdef DEBUG
			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
				zprintf(zoneid, "NFS%d server %s ok\n",
				    mi->mi_vers, svp->sv_hostname);
#else
			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
				zprintf(zoneid, "NFS server %s ok\n",
				    svp->sv_hostname);
#endif
			} else
				mutex_exit(&mi->mi_lock);
		}

		if (*douprintf == 0) {
			if (!(mi->mi_flags & MI_NOPRINT))
#ifdef DEBUG
				if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
					uprintf("NFS%d server %s ok\n",
					    mi->mi_vers, svp->sv_hostname);
#else
			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
				uprintf("NFS server %s ok\n", svp->sv_hostname);
#endif
			*douprintf = 1;
		}
	}

	clfree_impl(client, ch, nfscl);
	if (cred_cloned)
		crfree(cr);

	ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);

	if (rpc_status != NULL)
		*rpc_status = rpcerr.re_status;

	TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
	    rpcerr.re_errno);

	return (rpcerr.re_errno);
}

#ifdef DEBUG
static int acl2call_hits = 0;
static int acl2call_misses = 0;
#endif

/*
 * Issue an NFS_ACL (version 2) request through aclcall().
 *
 * If the RPC itself succeeds but the status stored in *statusp is
 * NFSERR_ACCES, and crnetadjust() supplies an alternate credential,
 * the request is sent one more time with that credential.
 *
 * Returns the error from the last aclcall() that was made (0 if the
 * RPC went through).
 */
int
acl2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
    xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
    enum nfsstat *statusp, int flags, failinfo_t *fi)
{
	cred_t *adjcr;
	int rpcerror;

	rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
	    cr, douprintf, flags, fi);
	if (rpcerror != 0)
		return (rpcerror);

	/*
	 * Only an access failure is worth a second attempt.
	 * See comments with crnetadjust().
	 */
	if (*statusp != NFSERR_ACCES)
		return (0);

	adjcr = crnetadjust(cr);
	if (adjcr == NULL)
		return (0);

#ifdef DEBUG
	acl2call_hits++;
#endif
	rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
	    adjcr, douprintf, flags, fi);
	/* The adjusted credential is ours to release. */
	crfree(adjcr);
#ifdef DEBUG
	if (*statusp == NFSERR_ACCES)
		acl2call_misses++;
#endif

	return (rpcerror);
}

#ifdef DEBUG
static int acl3call_hits = 0;
static int acl3call_misses = 0;
#endif

/*
 * Issue an NFS_ACL (version 3) request through aclcall().
 *
 * While the RPC succeeds but the status in *statusp is NFS3ERR_JUKEBOX,
 * the request is repeated after a delay of nfs3_jukebox_delay; the user
 * is told about this once.  If the status is NFS3ERR_ACCES and
 * crnetadjust() supplies an alternate credential, the request is sent
 * again with that credential.
 *
 * Returns the error from the last aclcall() that was made (0 if the
 * RPC went through).
 */
int
acl3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
    xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
    nfsstat3 *statusp, int flags, failinfo_t *fi)
{
	int user_informed = 0;
	int rpcerror;

	for (;;) {
		rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
		    cr, douprintf, flags, fi);
		if (rpcerror != 0)
			break;

		if (*statusp == NFS3ERR_JUKEBOX) {
			/* Tell the user only on the first occurrence. */
			if (!user_informed) {
				user_informed = 1;
				uprintf(
		"file temporarily unavailable on the server, retrying...\n");
			}
			delay(nfs3_jukebox_delay);
			continue;
		}

		/*
		 * See crnetadjust() for comments.
		 */
		if (*statusp == NFS3ERR_ACCES) {
			cred_t *crr = crnetadjust(cr);

			if (crr != NULL) {
#ifdef DEBUG
				acl3call_hits++;
#endif
				rpcerror = aclcall(mi, which, xdrargs, argsp,
				    xdrres, resp, crr, douprintf, flags, fi);

				crfree(crr);
#ifdef DEBUG
				if (*statusp == NFS3ERR_ACCES)
					acl3call_misses++;
#endif
				/*
				 * The retry itself may have been answered
				 * with JUKEBOX; if so go around again.
				 */
				if (rpcerror == 0 &&
				    *statusp == NFS3ERR_JUKEBOX)
					continue;
			}
		}
		break;
	}

	return (rpcerror);
}

/*
 * Common back end for the NFS_ACL calls (acl2call() and acl3call()).
 *
 * Obtains a client handle for the mount's current server with acl_clget(),
 * issues the RPC with CLNT_CALL() and, depending on the mount flags
 * (MI_HARD, MI_SEMISOFT) and on RFSCALL_SOFT in `flags', retries it with
 * a backed-off timeout.  On a failover mount a failing server causes a
 * new server to be selected and the sequence to restart at failoverretry.
 *
 * RPC errors which suggest that the server does not implement NFS_ACL
 * (or its extended attribute procedures) clear MI_ACL and/or MI_EXTATTR
 * in mi->mi_flags.
 *
 * On a labeled system the caller's credential `icr' is cloned; the clone
 * is released on every return path of this function.
 *
 * Returns 0 on success, otherwise an errno value: the re_errno of the
 * failed RPC, EIO if the file system or zone is going away, EINTR if
 * interrupted, or the error returned by failover_remap().
 */
static int
aclcall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
    xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
    int flags, failinfo_t *fi)
{
	CLIENT *client;
	struct chtab *ch;
	cred_t *cr = icr;
	bool_t cred_cloned = FALSE;
	enum clnt_stat status;
	struct rpc_err rpcerr;
	struct timeval wait;
	int timeo;		/* in units of hz */
#if 0 /* notyet */
	int my_rsize, my_wsize;
#endif
	bool_t tryagain;
	k_sigset_t smask;
	servinfo_t *svp;
	struct nfs_clnt *nfscl;
	zoneid_t zoneid = getzoneid();
#ifdef DEBUG
	char *bufp;
#endif

#if 0 /* notyet */
	TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
	    "rfscall_start:which %d mi %p", which, mi);
#endif

	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
	ASSERT(nfscl != NULL);

	nfscl->nfscl_stat.calls.value.ui64++;
	mi->mi_aclreqs[which].value.ui64++;

	rpcerr.re_status = RPC_SUCCESS;

	if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
		rpcerr.re_status = RPC_FAILED;
		rpcerr.re_errno = EIO;
		return (rpcerr.re_errno);
	}

#if 0 /* notyet */
	/*
	 * Remember the transfer sizes in case
	 * nfs_feedback changes them underneath us.
	 */
	my_rsize = mi->mi_curread;
	my_wsize = mi->mi_curwrite;
#endif

	/*
	 * NFS client failover support
	 *
	 * If this rnode is not in sync with the current server (VALID_FH),
	 * we'd like to do a remap to get in sync.  We can be interrupted
	 * in failover_remap(), and if so we'll bail.  Otherwise, we'll
	 * use the best info we have to try the RPC.  Part of that is
	 * unconditionally updating the filehandle copy kept for V3.
	 *
	 * Locking: INC_READERS/DEC_READERS is a poor man's interruptible
	 * rw_enter(); we're trying to keep the current server from being
	 * changed on us until we're done with the remapping and have a
	 * matching client handle.  We don't want to send a filehandle
	 * to the wrong host.
	 */
failoverretry:
	if (FAILOVER_MOUNT(mi)) {
		mutex_enter(&mi->mi_lock);
		if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
			if (failover_wait(mi)) {
				mutex_exit(&mi->mi_lock);
				/*
				 * We can get here via "goto failoverretry"
				 * after the credential was already cloned
				 * below; release the clone so that it is
				 * not leaked.
				 */
				if (cred_cloned)
					crfree(cr);
				return (EINTR);
			}
		}
		INC_READERS(mi);
		mutex_exit(&mi->mi_lock);
		if (fi) {
			if (!VALID_FH(fi) &&
			    !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
				int remaperr;

				svp = mi->mi_curr_serv;
				remaperr = failover_remap(fi);
				if (remaperr != 0) {
#ifdef DEBUG
					if (remaperr != EINTR)
						nfs_cmn_err(remaperr, CE_WARN,
					    "aclcall couldn't failover: %m");
#endif
					mutex_enter(&mi->mi_lock);
					DEC_READERS(mi);
					mutex_exit(&mi->mi_lock);

					/*
					 * If failover_remap returns ETIMEDOUT
					 * and the filesystem is hard mounted
					 * we have to retry the call with a new
					 * server.
					 */
					if ((mi->mi_flags & MI_HARD) &&
					    IS_RECOVERABLE_ERROR(remaperr)) {
						if (svp == mi->mi_curr_serv)
							failover_newserver(mi);
						rpcerr.re_status = RPC_SUCCESS;
						goto failoverretry;
					}
					/*
					 * As above: a previous pass may have
					 * cloned the credential already.
					 */
					if (cred_cloned)
						crfree(cr);
					return (remaperr);
				}
			}
			if (fi->fhp && fi->copyproc)
				(*fi->copyproc)(fi->fhp, fi->vp);
		}
	}

	/* For TSOL, use a new cred which has net_mac_aware flag */
	if (!cred_cloned && is_system_labeled()) {
		cred_cloned = TRUE;
		cr = crdup(icr);
		(void) setpflags(NET_MAC_AWARE, 1, cr);
	}

	/*
	 * acl_clget() calls clnt_tli_kinit() which clears the xid, so we
	 * are guaranteed to reprocess the retry as a new request.
	 */
	svp = mi->mi_curr_serv;
	rpcerr.re_errno = acl_clget(mi, svp, cr, &client, &ch, nfscl);
	if (FAILOVER_MOUNT(mi)) {
		mutex_enter(&mi->mi_lock);
		DEC_READERS(mi);
		mutex_exit(&mi->mi_lock);

		if ((rpcerr.re_errno == ETIMEDOUT ||
		    rpcerr.re_errno == ECONNRESET) &&
		    failover_safe(fi)) {
			if (svp == mi->mi_curr_serv)
				failover_newserver(mi);
			goto failoverretry;
		}
	}
	if (rpcerr.re_errno != 0) {
		if (cred_cloned)
			crfree(cr);
		return (rpcerr.re_errno);
	}

	/*
	 * Connection-oriented transports use the mount's fixed timeout;
	 * otherwise the timeout comes from CLNT_SETTIMERS().
	 */
	if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
	    svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
		timeo = (mi->mi_timeo * hz) / 10;
	} else {
		mutex_enter(&mi->mi_lock);
		timeo = CLNT_SETTIMERS(client,
		    &(mi->mi_timers[mi->mi_acl_timer_type[which]]),
		    &(mi->mi_timers[NFS_CALLTYPES]),
		    (minimum_timeo[mi->mi_acl_call_type[which]]*hz)>>3,
		    (void (*)()) 0, (caddr_t)mi, 0);
		mutex_exit(&mi->mi_lock);
	}

	/*
	 * If hard mounted fs, retry call forever unless hard error occurs.
	 */
	do {
		tryagain = FALSE;

		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
			status = RPC_FAILED;
			rpcerr.re_status = RPC_FAILED;
			rpcerr.re_errno = EIO;
			break;
		}

		TICK_TO_TIMEVAL(timeo, &wait);

		/*
		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
		 * and SIGTERM. (Preserving the existing masks).
		 * Mask out SIGINT if mount option nointr is specified.
		 */
		sigintr(&smask, (int)mi->mi_flags & MI_INT);
		if (!(mi->mi_flags & MI_INT))
			client->cl_nosignal = TRUE;

		/*
		 * If there is a current signal, then don't bother
		 * even trying to send out the request because we
		 * won't be able to block waiting for the response.
		 * Simply assume RPC_INTR and get on with it.
		 */
		if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
			status = RPC_INTR;
		else {
			status = CLNT_CALL(client, which, xdrargs, argsp,
			    xdrres, resp, wait);
		}

		if (!(mi->mi_flags & MI_INT))
			client->cl_nosignal = FALSE;
		/*
		 * restore original signal mask
		 */
		sigunintr(&smask);

		switch (status) {
		case RPC_SUCCESS:
#if 0 /* notyet */
			if ((mi->mi_flags & MI_DYNAMIC) &&
			    mi->mi_timer_type[which] != 0 &&
			    (mi->mi_curread != my_rsize ||
			    mi->mi_curwrite != my_wsize))
				(void) nfs_feedback(FEEDBACK_OK, which, mi);
#endif
			break;

		/*
		 * Unfortunately, there are servers in the world which
		 * are not coded correctly.  They are not prepared to
		 * handle RPC requests to the NFS port which are not
		 * NFS requests.  Thus, they may try to process the
		 * NFS_ACL request as if it were an NFS request.  This
		 * does not work.  Generally, an error will be generated
		 * on the client because it will not be able to decode
		 * the response from the server.  However, it seems
		 * possible that the server may not be able to decode
		 * the arguments.  Thus, the criteria for deciding
		 * whether the server supports NFS_ACL or not is whether
		 * the following RPC errors are returned from CLNT_CALL.
		 */
		case RPC_CANTDECODERES:
		case RPC_PROGUNAVAIL:
		case RPC_CANTDECODEARGS:
		case RPC_PROGVERSMISMATCH:
			mutex_enter(&mi->mi_lock);
			mi->mi_flags &= ~(MI_ACL | MI_EXTATTR);
			mutex_exit(&mi->mi_lock);
			break;

		/*
		 * If the server supports NFS_ACL but not the new ops
		 * for extended attributes, make sure we don't retry.
		 */
		case RPC_PROCUNAVAIL:
			mutex_enter(&mi->mi_lock);
			mi->mi_flags &= ~MI_EXTATTR;
			mutex_exit(&mi->mi_lock);
			break;

		case RPC_INTR:
			/*
			 * There is no way to recover from this error,
			 * even if mount option nointr is specified.
			 * SIGKILL, for example, cannot be blocked.
			 */
			rpcerr.re_status = RPC_INTR;
			rpcerr.re_errno = EINTR;
			break;

		case RPC_UDERROR:
			/*
			 * If the NFS server is local (vold) and
			 * it goes away then we get RPC_UDERROR.
			 * This is a retryable error, so we would
			 * loop, so check to see if the specific
			 * error was ECONNRESET, indicating that
			 * target did not exist at all.  If so,
			 * return with RPC_PROGUNAVAIL and
			 * ECONNRESET to indicate why.
			 */
			CLNT_GETERR(client, &rpcerr);
			if (rpcerr.re_errno == ECONNRESET) {
				rpcerr.re_status = RPC_PROGUNAVAIL;
				rpcerr.re_errno = ECONNRESET;
				break;
			}
			/*FALLTHROUGH*/

		default:		/* probably RPC_TIMEDOUT */
			if (IS_UNRECOVERABLE_RPC(status))
				break;

			/*
			 * increment server not responding count
			 */
			mutex_enter(&mi->mi_lock);
			mi->mi_noresponse++;
			mutex_exit(&mi->mi_lock);
#ifdef DEBUG
			nfscl->nfscl_stat.noresponse.value.ui64++;
#endif

			/*
			 * Mounts that are not hard give up here, unless
			 * the mount is semisoft and this call type is
			 * flagged in mi_acl_ss_call_type.
			 */
			if (!(mi->mi_flags & MI_HARD)) {
				if (!(mi->mi_flags & MI_SEMISOFT) ||
				    (mi->mi_acl_ss_call_type[which] == 0))
					break;
			}

			/*
			 * The call is in progress (over COTS).
			 * Try the CLNT_CALL again, but don't
			 * print a noisy error message.
			 */
			if (status == RPC_INPROGRESS) {
				tryagain = TRUE;
				break;
			}

			if (flags & RFSCALL_SOFT)
				break;

			/*
			 * On zone shutdown, just move on.
			 */
			if (zone_status_get(curproc->p_zone) >=
			    ZONE_IS_SHUTTING_DOWN) {
				rpcerr.re_status = RPC_FAILED;
				rpcerr.re_errno = EIO;
				break;
			}

			/*
			 * NFS client failover support
			 *
			 * If the current server just failed us, we'll
			 * start the process of finding a new server.
			 * After that, we can just retry.
			 */
			if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
				if (svp == mi->mi_curr_serv)
					failover_newserver(mi);
				clfree_impl(client, ch, nfscl);
				goto failoverretry;
			}

			tryagain = TRUE;
			timeo = backoff(timeo);
			mutex_enter(&mi->mi_lock);
			if (!(mi->mi_flags & MI_PRINTED)) {
				mi->mi_flags |= MI_PRINTED;
				mutex_exit(&mi->mi_lock);
#ifdef DEBUG
				zprintf(zoneid,
			"NFS_ACL%d server %s not responding still trying\n",
				    mi->mi_vers, svp->sv_hostname);
#else
				zprintf(zoneid,
			    "NFS server %s not responding still trying\n",
				    svp->sv_hostname);
#endif
			} else
				mutex_exit(&mi->mi_lock);
			if (*douprintf && nfs_has_ctty()) {
				*douprintf = 0;
				if (!(mi->mi_flags & MI_NOPRINT))
#ifdef DEBUG
					uprintf(
			"NFS_ACL%d server %s not responding still trying\n",
					    mi->mi_vers, svp->sv_hostname);
#else
					uprintf(
			    "NFS server %s not responding still trying\n",
					    svp->sv_hostname);
#endif
			}

#if 0 /* notyet */
			/*
			 * If doing dynamic adjustment of transfer
			 * size and if it's a read or write call
			 * and if the transfer size changed while
			 * retransmitting or if the feedback routine
			 * changed the transfer size,
			 * then exit rfscall so that the transfer
			 * size can be adjusted at the vnops level.
			 */
			if ((mi->mi_flags & MI_DYNAMIC) &&
			    mi->mi_acl_timer_type[which] != 0 &&
			    (mi->mi_curread != my_rsize ||
			    mi->mi_curwrite != my_wsize ||
			    nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
				/*
				 * On read or write calls, return
				 * back to the vnode ops level if
				 * the transfer size changed.
				 */
				clfree_impl(client, ch, nfscl);
				if (cred_cloned)
					crfree(cr);
				return (ENFS_TRYAGAIN);
			}
#endif
		}
	} while (tryagain);

	if (status != RPC_SUCCESS) {
		/*
		 * Let soft mounts use the timed out message.
		 */
		if (status == RPC_INPROGRESS)
			status = RPC_TIMEDOUT;
		nfscl->nfscl_stat.badcalls.value.ui64++;
		/*
		 * The "server lacks NFS_ACL" errors are fetched quietly:
		 * the mount is not marked down and nothing is printed.
		 */
		if (status == RPC_CANTDECODERES ||
		    status == RPC_PROGUNAVAIL ||
		    status == RPC_PROCUNAVAIL ||
		    status == RPC_CANTDECODEARGS ||
		    status == RPC_PROGVERSMISMATCH)
			CLNT_GETERR(client, &rpcerr);
		else if (status != RPC_INTR) {
			mutex_enter(&mi->mi_lock);
			mi->mi_flags |= MI_DOWN;
			mutex_exit(&mi->mi_lock);
			CLNT_GETERR(client, &rpcerr);
#ifdef DEBUG
			bufp = clnt_sperror(client, svp->sv_hostname);
			zprintf(zoneid, "NFS_ACL%d %s failed for %s\n",
			    mi->mi_vers, mi->mi_aclnames[which], bufp);
			if (nfs_has_ctty()) {
				if (!(mi->mi_flags & MI_NOPRINT)) {
					uprintf("NFS_ACL%d %s failed for %s\n",
					    mi->mi_vers, mi->mi_aclnames[which],
					    bufp);
				}
			}
			kmem_free(bufp, MAXPATHLEN);
#else
			zprintf(zoneid,
			    "NFS %s failed for server %s: error %d (%s)\n",
			    mi->mi_aclnames[which], svp->sv_hostname,
			    status, clnt_sperrno(status));
			if (nfs_has_ctty()) {
				if (!(mi->mi_flags & MI_NOPRINT))
					uprintf(
				"NFS %s failed for server %s: error %d (%s)\n",
					    mi->mi_aclnames[which],
					    svp->sv_hostname, status,
					    clnt_sperrno(status));
			}
#endif
			/*
			 * when CLNT_CALL() fails with RPC_AUTHERROR,
			 * re_errno is set appropriately depending on
			 * the authentication error
			 */
			if (status == RPC_VERSMISMATCH ||
			    status == RPC_PROGVERSMISMATCH)
				rpcerr.re_errno = EIO;
		}
	} else {
		/*
		 * Test the value of mi_down and mi_printed without
		 * holding the mi_lock mutex.  If they are both zero,
		 * then it is okay to skip the down and printed
		 * processing.  This saves on a mutex_enter and
		 * mutex_exit pair for a normal, successful RPC.
		 * This was just complete overhead.
		 */
		if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
			mutex_enter(&mi->mi_lock);
			mi->mi_flags &= ~MI_DOWN;
			if (mi->mi_flags & MI_PRINTED) {
				mi->mi_flags &= ~MI_PRINTED;
				mutex_exit(&mi->mi_lock);
#ifdef DEBUG
				zprintf(zoneid, "NFS_ACL%d server %s ok\n",
				    mi->mi_vers, svp->sv_hostname);
#else
				zprintf(zoneid, "NFS server %s ok\n",
				    svp->sv_hostname);
#endif
			} else
				mutex_exit(&mi->mi_lock);
		}

		/*
		 * *douprintf was cleared when the "not responding"
		 * message went to the terminal; report the recovery
		 * and re-arm it.
		 */
		if (*douprintf == 0) {
			if (!(mi->mi_flags & MI_NOPRINT))
#ifdef DEBUG
				uprintf("NFS_ACL%d server %s ok\n",
				    mi->mi_vers, svp->sv_hostname);
#else
				uprintf("NFS server %s ok\n", svp->sv_hostname);
#endif
			*douprintf = 1;
		}
	}

	clfree_impl(client, ch, nfscl);
	if (cred_cloned)
		crfree(cr);

	ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);

#if 0 /* notyet */
	TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
	    rpcerr.re_errno);
#endif

	return (rpcerr.re_errno);
}

/*
 * Fill in an NFS Version 2 settable-attributes structure from a vattr.
 *
 * Each attribute selected in vap->va_mask is copied; every attribute
 * that is not selected is stored as -1.  Fields are filled in order, so
 * on failure the fields preceding the offending time have already been
 * written.
 *
 * Returns 0, or EOVERFLOW if a selected access or modification time
 * fails the NFS_TIME_T_OK() check.
 */
int
vattr_to_sattr(struct vattr *vap, struct nfssattr *sa)
{
	uint_t wanted = vap->va_mask;

	if (wanted & AT_MODE) {
		sa->sa_mode = vap->va_mode;
	} else {
		sa->sa_mode = (uint32_t)-1;
	}

	if (wanted & AT_UID) {
		sa->sa_uid = (uint32_t)vap->va_uid;
	} else {
		sa->sa_uid = (uint32_t)-1;
	}

	if (wanted & AT_GID) {
		sa->sa_gid = (uint32_t)vap->va_gid;
	} else {
		sa->sa_gid = (uint32_t)-1;
	}

	if (wanted & AT_SIZE) {
		sa->sa_size = (uint32_t)vap->va_size;
	} else {
		sa->sa_size = (uint32_t)-1;
	}

	if (wanted & AT_ATIME) {
		/* check time validity */
		if (!NFS_TIME_T_OK(vap->va_atime.tv_sec))
			return (EOVERFLOW);
		sa->sa_atime.tv_sec = vap->va_atime.tv_sec;
		/* nanoseconds in the vattr, microseconds in the sattr */
		sa->sa_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
	} else {
		sa->sa_atime.tv_sec = sa->sa_atime.tv_usec = (int32_t)-1;
	}

	if (wanted & AT_MTIME) {
		/* check time validity */
		if (!NFS_TIME_T_OK(vap->va_mtime.tv_sec))
			return (EOVERFLOW);
		sa->sa_mtime.tv_sec = vap->va_mtime.tv_sec;
		sa->sa_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
	} else {
		sa->sa_mtime.tv_sec = sa->sa_mtime.tv_usec = (int32_t)-1;
	}

	return (0);
}

/*
 * Fill in an NFS Version 3 sattr3 from a vattr.
 *
 * For each of mode, uid, gid and size the set_it discriminant reflects
 * whether the attribute is selected in vap->va_mask, and the value is
 * copied only when it is.  The times are marked SET_TO_CLIENT_TIME when
 * selected and DONT_CHANGE otherwise.  Fields are filled in order, so
 * on failure the fields preceding the offending time have already been
 * written.
 *
 * Returns 0, or EOVERFLOW if a selected access or modification time
 * fails the NFS_TIME_T_OK() check.
 */
int
vattr_to_sattr3(struct vattr *vap, sattr3 *sa)
{
	uint_t wanted = vap->va_mask;

	if (wanted & AT_MODE) {
		sa->mode.set_it = TRUE;
		sa->mode.mode = (mode3)vap->va_mode;
	} else {
		sa->mode.set_it = FALSE;
	}

	if (wanted & AT_UID) {
		sa->uid.set_it = TRUE;
		sa->uid.uid = (uid3)vap->va_uid;
	} else {
		sa->uid.set_it = FALSE;
	}

	if (wanted & AT_GID) {
		sa->gid.set_it = TRUE;
		sa->gid.gid = (gid3)vap->va_gid;
	} else {
		sa->gid.set_it = FALSE;
	}

	if (wanted & AT_SIZE) {
		sa->size.set_it = TRUE;
		sa->size.size = (size3)vap->va_size;
	} else {
		sa->size.set_it = FALSE;
	}

	if (wanted & AT_ATIME) {
		/* check time validity */
		if (!NFS_TIME_T_OK(vap->va_atime.tv_sec))
			return (EOVERFLOW);
		sa->atime.set_it = SET_TO_CLIENT_TIME;
		sa->atime.atime.seconds = (uint32)vap->va_atime.tv_sec;
		sa->atime.atime.nseconds = (uint32)vap->va_atime.tv_nsec;
	} else {
		sa->atime.set_it = DONT_CHANGE;
	}

	if (wanted & AT_MTIME) {
		/* check time validity */
		if (!NFS_TIME_T_OK(vap->va_mtime.tv_sec))
			return (EOVERFLOW);
		sa->mtime.set_it = SET_TO_CLIENT_TIME;
		sa->mtime.mtime.seconds = (uint32)vap->va_mtime.tv_sec;
		sa->mtime.mtime.nseconds = (uint32)vap->va_mtime.tv_nsec;
	} else {
		sa->mtime.set_it = DONT_CHANGE;
	}

	return (0);
}

/*
 * Initialize NFS Version 2 directory operation arguments: the name `nm'
 * within the directory `dvp', with no flags set.  Only the pointer `nm'
 * is stored; the string is not copied.
 */
void
setdiropargs(struct nfsdiropargs *da, char *nm, vnode_t *dvp)
{
	da->da_flags = 0;
	da->da_name = nm;
	da->da_fhandle = VTOFH(dvp);
}

/*
 * Initialize NFS Version 3 directory operation arguments: the name `nm'
 * within the directory `dvp'.  Only the pointer `nm' is stored; the
 * string is not copied.
 */
void
setdiropargs3(diropargs3 *da, char *nm, vnode_t *dvp)
{
	da->name = nm;
	da->dirp = VTOFH3(dvp);
}

/*
 * Work out the group id that a file created in directory `dvp' is
 * expected to get and store it in *gidp.
 *
 * Returns 0 on success, or the error from VOP_GETATTR() on the
 * directory, in which case *gidp is left untouched.
 */
int
setdirgid(vnode_t *dvp, gid_t *gidp, cred_t *cr)
{
	struct vattr dirattr;
	rnode_t *drp;
	int err;

	dirattr.va_mask = AT_MODE | AT_GID;
	err = VOP_GETATTR(dvp, &dirattr, 0, cr, NULL);
	if (err != 0)
		return (err);

	/*
	 * The expected group-id of the created file is:
	 *  - the gid of the parent directory, if the filesystem was
	 *    mounted with the Old-BSD-compatible GRPID option or the
	 *    directory's set-gid bit is set;
	 *  - the process's gid otherwise.
	 */
	drp = VTOR(dvp);
	mutex_enter(&drp->r_statelock);
	if ((VTOMI(dvp)->mi_flags & MI_GRPID) || (dirattr.va_mode & VSGID))
		*gidp = dirattr.va_gid;
	else
		*gidp = crgetgid(cr);
	mutex_exit(&drp->r_statelock);

	return (0);
}

/*
 * Adjust the expected mode *omp so that its set-gid bit matches that
 * of the parent directory `dvp'.
 *
 * Returns 0 on success, or the error from VOP_GETATTR() on the
 * directory, in which case *omp is left untouched.
 */
int
setdirmode(vnode_t *dvp, mode_t *omp, cred_t *cr)
{
	struct vattr dirattr;
	int err;

	dirattr.va_mask = AT_MODE;
	err = VOP_GETATTR(dvp, &dirattr, 0, cr, NULL);
	if (err != 0)
		return (err);

	/* Mirror the directory's VSGID bit into the expected mode. */
	if (dirattr.va_mode & VSGID) {
		*omp |= VSGID;
	} else {
		*omp &= ~VSGID;
	}

	return (0);
}

/*
 * Keep the VSWAPLIKE flag of `vp' in step with its attributes: the flag
 * is set for a regular file whose mode has VSVTX set and VEXEC clear,
 * and cleared for anything else.  v_flag is first tested without the
 * lock; v_lock is taken only when the flag actually has to change.
 */
void
nfs_setswaplike(vnode_t *vp, vattr_t *vap)
{
	int swaplike;

	swaplike = (vp->v_type == VREG &&
	    (vap->va_mode & (VEXEC | VSVTX)) == VSVTX);

	if (swaplike) {
		if (vp->v_flag & VSWAPLIKE)
			return;
		mutex_enter(&vp->v_lock);
		vp->v_flag |= VSWAPLIKE;
		mutex_exit(&vp->v_lock);
		return;
	}

	if (!(vp->v_flag & VSWAPLIKE))
		return;
	mutex_enter(&vp->v_lock);
	vp->v_flag &= ~VSWAPLIKE;
	mutex_exit(&vp->v_lock);
}

/*
 * Release everything an rnode is holding on to: wait for r_count to
 * drain, flush and invalidate cached pages, then drop the held
 * credential, the access and readdir caches, the cached symbolic link
 * contents, the cached ACL and the cached pathconf information.
 */
static void
rinactive(rnode_t *rp, cred_t *cr)
{
	vnode_t *vp;
	cred_t *held_cred;
	char *linkbuf;
	int linklen;
	vsecattr_t *acl;
	nfs3_pathconf_info *pcinfo;
	int err;

	/*
	 * Nothing may be freed while asynchronous activity (read
	 * ahead, write behind) is still outstanding on this rnode,
	 * so wait for it to finish first.
	 */
	mutex_enter(&rp->r_statelock);
	while (rp->r_count > 0)
		cv_wait(&rp->r_cv, &rp->r_statelock);
	mutex_exit(&rp->r_statelock);

	/*
	 * Push out dirty pages, then invalidate whatever is cached
	 * for the vnode.
	 */
	vp = RTOV(rp);
	if (vn_has_cached_data(vp)) {
		ASSERT(vp->v_type != VCHR);
		if ((rp->r_flags & RDIRTY) && !rp->r_error) {
			err = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL);
			/*
			 * Only out-of-space and quota errors are
			 * recorded, and only if no error has been
			 * recorded already.
			 */
			if (err == ENOSPC || err == EDQUOT) {
				mutex_enter(&rp->r_statelock);
				if (!rp->r_error)
					rp->r_error = err;
				mutex_exit(&rp->r_statelock);
			}
		}
		nfs_invalidate_pages(vp, (u_offset_t)0, cr);
	}

	/*
	 * Detach the credential and the cached data from the rnode
	 * under the lock; they are freed below with the lock dropped.
	 */
	mutex_enter(&rp->r_statelock);
	held_cred = rp->r_cred;
	rp->r_cred = NULL;
	linkbuf = rp->r_symlink.contents;
	linklen = rp->r_symlink.size;
	rp->r_symlink.contents = NULL;
	acl = rp->r_secattr;
	rp->r_secattr = NULL;
	pcinfo = rp->r_pathconf;
	rp->r_pathconf = NULL;
	mutex_exit(&rp->r_statelock);

	/* The held credential. */
	if (held_cred != NULL)
		crfree(held_cred);

	/* The access cache entries. */
	(void) nfs_access_purge_rp(rp);

	/* The readdir cache entries. */
	if (HAVE_RDDIR_CACHE(rp))
		nfs_purge_rddir_cache(vp);

	/* The symbolic link cache. */
	if (linkbuf != NULL)
		kmem_free(linkbuf, linklen);

	/* Any cached ACL. */
	if (acl != NULL)
		nfs_acl_free(acl);

	/* Any cached pathconf information. */
	if (pcinfo != NULL)
		kmem_free(pcinfo, sizeof (*pcinfo));
}

/*
 * Return a vnode for the given NFS Version 2 file handle.
 * An existing rnode for the file handle is reused; otherwise a new
 * one is created and entered into the hash queues.
 *
 * If `attr' is supplied it is cached on the vnode; for a new node it
 * also provides the vnode type and device.  If `attr' is NULL, a new
 * node has its attribute cache purged.
 *
 * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
 */
vnode_t *
makenfsnode(fhandle_t *fh, struct nfsfattr *attr, struct vfs *vfsp,
    hrtime_t t, cred_t *cr, char *dnm, char *nm)
{
	nfs_fhandle nfh;
	rhashq_t *bucket;
	vnode_t *vp;
	vattr_t va;
	int newnode;

	nfh.fh_len = NFS_FHSIZE;
	bcopy(fh, nfh.fh_buf, NFS_FHSIZE);

	bucket = &rtable[rtablehash(&nfh)];
	rw_enter(&bucket->r_lock, RW_READER);

	vp = make_rnode(&nfh, bucket, vfsp, nfs_vnodeops,
	    nfs_putapage, nfs_rddir_compar, &newnode, cr, dnm, nm);

	/* No attributes supplied. */
	if (attr == NULL) {
		if (newnode) {
			PURGE_ATTRCACHE(vp);
		}
		rw_exit(&bucket->r_lock);
		return (vp);
	}

	/* Existing node: drop the bucket lock, then cache. */
	if (!newnode) {
		rw_exit(&bucket->r_lock);
		(void) nfs_cache_fattr(vp, attr, &va, t, cr);
		return (vp);
	}

	/* New node: take type and device from the attributes. */
	if (attr->na_type < NFNON || attr->na_type > NFSOC)
		vp->v_type = VBAD;
	else
		vp->v_type = n2v_type(attr);
	/*
	 * A translation here seems to be necessary
	 * because this function can be called
	 * with `attr' that has come from the wire,
	 * and been operated on by vattr_to_nattr().
	 * See nfsrootvp()->VOP_GETATTR()->nfsgetattr()
	 * ->nfs_getattr_otw()->rfscall()->vattr_to_nattr()
	 * ->makenfsnode().
	 */
	if ((attr->na_rdev & 0xffff0000) == 0)
		vp->v_rdev = nfsv2_expdev(attr->na_rdev);
	else
		vp->v_rdev = expldev(n2v_rdev(attr));
	nfs_attrcache(vp, attr, t);
	rw_exit(&bucket->r_lock);

	return (vp);
}

/*
 * Return a vnode for the given NFS Version 3 file handle, with the
 * attributes supplied as a vattr.
 * An existing rnode for the file handle is reused; otherwise a new
 * one is created and entered into the hash queues.
 *
 * If `vap' is NULL, a new node has its attribute cache purged.
 *
 * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
 */
vnode_t *
makenfs3node_va(nfs_fh3 *fh, vattr_t *vap, struct vfs *vfsp, hrtime_t t,
    cred_t *cr, char *dnm, char *nm)
{
	rhashq_t *bucket;
	vnode_t *vp;
	int newnode;

	bucket = &rtable[rtablehash((nfs_fhandle *)fh)];
	rw_enter(&bucket->r_lock, RW_READER);

	vp = make_rnode((nfs_fhandle *)fh, bucket, vfsp,
	    nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
	    dnm, nm);

	if (vap != NULL) {
		if (newnode) {
			rnode_t *rp = VTOR(vp);

			/* New node: type and device come from vap. */
			vp->v_type = vap->va_type;
			vp->v_rdev = vap->va_rdev;

			/* Cache the attributes unless r_mtime is past t. */
			mutex_enter(&rp->r_statelock);
			if (rp->r_mtime <= t)
				nfs_attrcache_va(vp, vap);
			mutex_exit(&rp->r_statelock);
			rw_exit(&bucket->r_lock);
		} else {
			/* Existing node: drop the bucket lock, then cache. */
			rw_exit(&bucket->r_lock);
			nfs_attr_cache(vp, vap, t, cr);
		}
		return (vp);
	}

	/* No attributes supplied. */
	if (newnode) {
		PURGE_ATTRCACHE(vp);
	}
	rw_exit(&bucket->r_lock);

	return (vp);
}

/*
 * Return a vnode for the given NFS Version 3 file handle, with the
 * attributes supplied as a fattr3.
 * An existing rnode for the file handle is reused; otherwise a new
 * one is created by make_rnode().
 *
 * If `attr' is supplied it is cached on the vnode; for a new node it
 * also provides the vnode type and device.  If `attr' is NULL, a new
 * node has its attribute cache purged.
 */
vnode_t *
makenfs3node(nfs_fh3 *fh, fattr3 *attr, struct vfs *vfsp, hrtime_t t,
    cred_t *cr, char *dnm, char *nm)
{
	rhashq_t *bucket;
	vnode_t *vp;
	vattr_t va;
	int newnode;

	bucket = &rtable[rtablehash((nfs_fhandle *)fh)];
	rw_enter(&bucket->r_lock, RW_READER);

	vp = make_rnode((nfs_fhandle *)fh, bucket, vfsp,
	    nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
	    dnm, nm);

	if (attr != NULL) {
		if (newnode) {
			/* Types outside NF3REG..NF3FIFO are marked bad. */
			if (attr->type < NF3REG || attr->type > NF3FIFO)
				vp->v_type = VBAD;
			else
				vp->v_type = nf3_to_vt[attr->type];
			vp->v_rdev = makedevice(attr->rdev.specdata1,
			    attr->rdev.specdata2);
			nfs3_attrcache(vp, attr, t);
			rw_exit(&bucket->r_lock);
		} else {
			/* Existing node: drop the bucket lock, then cache. */
			rw_exit(&bucket->r_lock);
			(void) nfs3_cache_fattr3(vp, attr, &va, t, cr);
		}
		return (vp);
	}

	/* No attributes supplied. */
	if (newnode) {
		PURGE_ATTRCACHE(vp);
	}
	rw_exit(&bucket->r_lock);

	return (vp);
}

/*
 * Read this comment before making changes to rtablehash()!
 * This is a hash function in which seemingly obvious and harmless
 * changes can cause escalations costing millions of dollars!
 * Know what you are doing.
 *
 * rtablehash() implements Jenkins' one-at-a-time hash algorithm.  The
 * algorithm is currently detailed here:
 *
 *   http://burtleburtle.net/bob/hash/doobs.html
 *
 * Of course, the above link may not be valid by the time you are reading
 * this, but suffice it to say that the one-at-a-time algorithm works well in
 * almost all cases.  If you are changing the algorithm be sure to verify that
 * the hash algorithm still provides even distribution in all cases and with
 * any server returning filehandles in whatever order (sequential or random).
 */
/*
 * Hash the fh_len bytes of the file handle buffer and return the
 * result masked with rtablemask.  The mixing steps and the plain
 * `char' element type are kept exactly as they were.
 */
static int
rtablehash(nfs_fhandle *fh)
{
	ulong_t h = 0;
	ulong_t remaining = (ulong_t)fh->fh_len;
	char *bytep = fh->fh_buf;

	/* Per-byte mixing step. */
	while (remaining != 0) {
		h += *bytep;
		h += (h << 10);
		h ^= (h >> 6);
		bytep++;
		remaining--;
	}

	/* Final avalanche. */
	h += (h << 3);
	h ^= (h >> 11);
	h += (h << 15);

	return (h & rtablemask);
}

/*
 * Find the rnode for filehandle fh on vfsp in hash queue rhtp, or
 * build a new one and insert it, returning the associated vnode.
 *
 * The caller must hold rhtp->r_lock as reader on entry.  The lock is
 * dropped while a replacement rnode is recycled from the freelist or
 * freshly allocated, and it is held again on every return path: as
 * writer when a new rnode was hashed (*newnode is set to 1), as reader
 * when an existing rnode was found (*newnode is set to 0).
 *
 * vops, putapage and compar are installed in the new vnode/rnode.
 * dnm and nm are only used on failover mounts, where they are joined
 * as "dnm/nm" into r_path; if either is NULL the path is ".".
 */
static vnode_t *
make_rnode(nfs_fhandle *fh, rhashq_t *rhtp, struct vfs *vfsp,
    struct vnodeops *vops,
    int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *),
    int (*compar)(const void *, const void *),
    int *newnode, cred_t *cr, char *dnm, char *nm)
{
	rnode_t *rp;
	rnode_t *trp;
	vnode_t *vp;
	mntinfo_t *mi;

	ASSERT(RW_READ_HELD(&rhtp->r_lock));

	mi = VFTOMI(vfsp);
start:
	/* Fast path: the rnode is already in the hash queue. */
	if ((rp = rfind(rhtp, fh, vfsp)) != NULL) {
		vp = RTOV(rp);
		nfs_set_vroot(vp);
		*newnode = 0;
		return (vp);
	}
	rw_exit(&rhtp->r_lock);

	/*
	 * Recycle an rnode from the freelist only once the number of
	 * allocated rnodes has reached nrnode; otherwise allocate.
	 */
	mutex_enter(&rpfreelist_lock);
	if (rpfreelist != NULL && rnew >= nrnode) {
		rp = rpfreelist;
		rp_rmfree(rp);
		mutex_exit(&rpfreelist_lock);

		vp = RTOV(rp);

		if (rp->r_flags & RHASHED) {
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			/*
			 * Someone else holds a reference to this vnode:
			 * drop ours, retake the caller's lock as reader
			 * and start over.
			 */
			if (vp->v_count > 1) {
				VN_RELE_LOCKED(vp);
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				rw_enter(&rhtp->r_lock, RW_READER);
				goto start;
			}
			mutex_exit(&vp->v_lock);
			rp_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}

		rinactive(rp, cr);

		/*
		 * Recheck the reference count now that rinactive() has
		 * run; if another reference exists, give up on this
		 * rnode and retry from the top.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			VN_RELE_LOCKED(vp);
			mutex_exit(&vp->v_lock);
			rw_enter(&rhtp->r_lock, RW_READER);
			goto start;
		}
		mutex_exit(&vp->v_lock);
		vn_invalid(vp);
		/*
		 * destroy old locks before bzero'ing and
		 * recreating the locks below.
		 */
		nfs_rw_destroy(&rp->r_rwlock);
		nfs_rw_destroy(&rp->r_lkserlock);
		mutex_destroy(&rp->r_statelock);
		cv_destroy(&rp->r_cv);
		cv_destroy(&rp->r_commit.c_cv);
		nfs_free_r_path(rp);
		avl_destroy(&rp->r_dir);
		/*
		 * Make sure that if rnode is recycled then
		 * VFS count is decremented properly before
		 * reuse.
		 */
		VFS_RELE(vp->v_vfsp);
		vn_reinit(vp);
	} else {
		vnode_t *new_vp;

		mutex_exit(&rpfreelist_lock);

		rp = kmem_cache_alloc(rnode_cache, KM_SLEEP);
		new_vp = vn_alloc(KM_SLEEP);

		/* Account for the newly allocated rnode. */
		atomic_inc_ulong((ulong_t *)&rnew);
#ifdef DEBUG
		clstat_debug.nrnode.value.ui64++;
#endif
		vp = new_vp;
	}

	/* Common initialization for recycled and fresh rnodes. */
	bzero(rp, sizeof (*rp));
	rp->r_vnode = vp;
	nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
	nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
	mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
	rp->r_fh.fh_len = fh->fh_len;
	bcopy(fh->fh_buf, rp->r_fh.fh_buf, fh->fh_len);
	rp->r_server = mi->mi_curr_serv;
	if (FAILOVER_MOUNT(mi)) {
		/*
		 * If replicated servers, stash pathnames
		 */
		if (dnm != NULL && nm != NULL) {
			char *s, *p;
			uint_t len;

			/* Room for dnm, '/', nm and the terminating NUL. */
			len = (uint_t)(strlen(dnm) + strlen(nm) + 2);
			rp->r_path = kmem_alloc(len, KM_SLEEP);
#ifdef DEBUG
			clstat_debug.rpath.value.ui64 += len;
#endif
			s = rp->r_path;
			for (p = dnm; *p; p++)
				*s++ = *p;
			*s++ = '/';
			for (p = nm; *p; p++)
				*s++ = *p;
			*s = '\0';
		} else {
			/* special case for root */
			rp->r_path = kmem_alloc(2, KM_SLEEP);
#ifdef DEBUG
			clstat_debug.rpath.value.ui64 += 2;
#endif
			*rp->r_path = '.';
			*(rp->r_path + 1) = '\0';
		}
	}
	/* The rnode holds the vfs for as long as it points at it. */
	VFS_HOLD(vfsp);
	rp->r_putapage = putapage;
	rp->r_hashq = rhtp;
	rp->r_flags = RREADDIRPLUS;
	avl_create(&rp->r_dir, compar, sizeof (rddir_cache),
	    offsetof(rddir_cache, tree));
	vn_setops(vp, vops);
	vp->v_data = (caddr_t)rp;
	vp->v_vfsp = vfsp;
	vp->v_type = VNON;
	vp->v_flag |= VMODSORT;
	nfs_set_vroot(vp);

	/*
	 * There is a race condition if someone else
	 * alloc's the rnode while no locks are held, so we
	 * check again and recover if found.
	 */
	rw_enter(&rhtp->r_lock, RW_WRITER);
	if ((trp = rfind(rhtp, fh, vfsp)) != NULL) {
		vp = RTOV(trp);
		nfs_set_vroot(vp);
		*newnode = 0;
		/*
		 * Lost the race: dispose of the rnode built above
		 * without holding the hash queue lock, then retake
		 * the lock as reader for the caller.
		 */
		rw_exit(&rhtp->r_lock);
		rp_addfree(rp, cr);
		rw_enter(&rhtp->r_lock, RW_READER);
		return (vp);
	}
	/* Won the race: hash the new rnode; the writer lock stays held. */
	rp_addhash(rp);
	*newnode = 1;
	return (vp);
}

/*
 * Page callback deciding whether a page should be marked modified.
 * A page whose p_fsdata is not yet C_NOCOMMIT is switched to
 * C_NOCOMMIT and 1 is returned; otherwise the page is left alone
 * and 0 is returned.
 */
int
nfs_setmod_check(page_t *pp)
{
	if (pp->p_fsdata == C_NOCOMMIT)
		return (0);

	pp->p_fsdata = C_NOCOMMIT;
	return (1);
}

/*
 * Flag vp as the root of its file system (VROOT) when its filehandle
 * is identical to the filehandle recorded for the rnode's server.
 * v_lock is only taken when the flag actually has to be set.
 */
static void
nfs_set_vroot(vnode_t *vp)
{
	rnode_t *rp = VTOR(vp);
	nfs_fhandle *rootfh = &rp->r_server->sv_fhandle;

	if (rootfh->fh_len != rp->r_fh.fh_len)
		return;
	if (bcmp(rootfh->fh_buf, rp->r_fh.fh_buf, rp->r_fh.fh_len) != 0)
		return;
	if (vp->v_flag & VROOT)
		return;

	mutex_enter(&vp->v_lock);
	vp->v_flag |= VROOT;
	mutex_exit(&vp->v_lock);
}

/*
 * Release the pathname string stashed in rp->r_path, if there is one,
 * and clear the pointer.  The allocation size is recovered from the
 * string length plus its terminating NUL.
 */
static void
nfs_free_r_path(rnode_t *rp)
{
	char *path = rp->r_path;
	size_t len;

	if (path == NULL)
		return;

	rp->r_path = NULL;
	len = strlen(path) + 1;
	kmem_free(path, len);
#ifdef DEBUG
	clstat_debug.rpath.value.ui64 -= len;
#endif
}

/*
 * Put an rnode on the free list.
 *
 * Rnodes which were allocated above and beyond the normal limit
 * are immediately freed.
 *
 * The caller passes in its reference on the rnode's vnode.  If any
 * of the rechecks below finds that another reference exists, this
 * reference is simply released and the rnode is left alone.
 * cr is handed to rinactive() when the rnode is being destroyed.
 */
void
rp_addfree(rnode_t *rp, cred_t *cr)
{
	vnode_t *vp;
	struct vfs *vfsp;

	vp = RTOV(rp);
	ASSERT(vp->v_count >= 1);
	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);

	/*
	 * If we have too many rnodes allocated and there are no
	 * references to this rnode, or if the rnode is no longer
	 * accessible because it does not reside in the hash queues,
	 * or if an i/o error occurred while writing to the file,
	 * then just free it instead of putting it on the rnode
	 * freelist.
	 */
	vfsp = vp->v_vfsp;
	if (((rnew > nrnode || !(rp->r_flags & RHASHED) || rp->r_error ||
	    (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
		if (rp->r_flags & RHASHED) {
			/*
			 * Unhash under the queue's writer lock, but only
			 * if ours is the sole reference to the vnode.
			 */
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			if (vp->v_count > 1) {
				VN_RELE_LOCKED(vp);
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				return;
			}
			mutex_exit(&vp->v_lock);
			rp_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}

		rinactive(rp, cr);

		/*
		 * Recheck the vnode reference count.  We need to
		 * make sure that another reference has not been
		 * acquired while we were not holding v_lock.  The
		 * rnode is not in the rnode hash queues, so the
		 * only way for a reference to have been acquired
		 * is for a VOP_PUTPAGE because the rnode was marked
		 * with RDIRTY or for a modified page.  This
		 * reference may have been acquired before our call
		 * to rinactive.  The i/o may have been completed,
		 * thus allowing rinactive to complete, but the
		 * reference to the vnode may not have been released
		 * yet.  In any case, the rnode can not be destroyed
		 * until the other references to this vnode have been
		 * released.  The other references will take care of
		 * either destroying the rnode or placing it on the
		 * rnode freelist.  If there are no other references,
		 * then the rnode may be safely destroyed.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			VN_RELE_LOCKED(vp);
			mutex_exit(&vp->v_lock);
			return;
		}
		mutex_exit(&vp->v_lock);

		destroy_rnode(rp);
		return;
	}

	/*
	 * Lock the hash queue and then recheck the reference count
	 * to ensure that no other threads have acquired a reference
	 * to indicate that the rnode should not be placed on the
	 * freelist.  If another reference has been acquired, then
	 * just release this one and let the other thread complete
	 * the processing of adding this rnode to the freelist.
	 */
	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);

	mutex_enter(&vp->v_lock);
	if (vp->v_count > 1) {
		VN_RELE_LOCKED(vp);
		mutex_exit(&vp->v_lock);
		rw_exit(&rp->r_hashq->r_lock);
		return;
	}
	mutex_exit(&vp->v_lock);

	/*
	 * If there is no cached data or metadata for this file, then
	 * put the rnode on the front of the freelist so that it will
	 * be reused before other rnodes which may have cached data or
	 * metadata associated with them.
	 */
	mutex_enter(&rpfreelist_lock);
	if (rpfreelist == NULL) {
		/* Empty list: the rnode becomes a one-element ring. */
		rp->r_freef = rp;
		rp->r_freeb = rp;
		rpfreelist = rp;
	} else {
		/*
		 * Link the rnode in just behind the current head, i.e.
		 * at the tail of the ring; moving the head pointer to it
		 * below is what turns that position into the front.
		 */
		rp->r_freef = rpfreelist;
		rp->r_freeb = rpfreelist->r_freeb;
		rpfreelist->r_freeb->r_freef = rp;
		rpfreelist->r_freeb = rp;
		if (!vn_has_cached_data(vp) &&
		    !HAVE_RDDIR_CACHE(rp) &&
		    rp->r_symlink.contents == NULL &&
		    rp->r_secattr == NULL &&
		    rp->r_pathconf == NULL)
			rpfreelist = rp;
	}
	mutex_exit(&rpfreelist_lock);

	rw_exit(&rp->r_hashq->r_lock);
}

/*
 * Unlink an rnode from the circular free list.
 *
 * rpfreelist_lock must be held by the caller and the rnode must
 * currently be on the freelist.  If the rnode is the list head, the
 * head moves to its successor, or becomes NULL when the rnode was
 * the only element.
 */
static void
rp_rmfree(rnode_t *rp)
{
	rnode_t *fwd;
	rnode_t *back;

	ASSERT(MUTEX_HELD(&rpfreelist_lock));
	ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);

	fwd = rp->r_freef;
	back = rp->r_freeb;

	if (rpfreelist == rp)
		rpfreelist = (fwd == rp) ? NULL : fwd;

	back->r_freef = fwd;
	fwd->r_freeb = back;

	rp->r_freef = NULL;
	rp->r_freeb = NULL;
}

/*
 * Insert an rnode at the head of the hash queue named by rp->r_hashq
 * and mark it RHASHED.
 *
 * The caller must hold that hash queue's lock as writer.
 */
static void
rp_addhash(rnode_t *rp)
{
	rhashq_t *hq = rp->r_hashq;
	rnode_t *first;

	ASSERT(RW_WRITE_HELD(&hq->r_lock));
	ASSERT(!(rp->r_flags & RHASHED));

	first = hq->r_hashf;
	rp->r_hashf = first;
	rp->r_hashb = (rnode_t *)hq;
	hq->r_hashf = rp;
	first->r_hashb = rp;

	mutex_enter(&rp->r_statelock);
	rp->r_flags |= RHASHED;
	mutex_exit(&rp->r_statelock);
}

/*
 * Unlink an rnode from its hash queue and clear RHASHED.
 *
 * The caller must already hold the hash queue lock as writer.
 */
static void
rp_rmhash_locked(rnode_t *rp)
{
	rnode_t *prev;
	rnode_t *next;

	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
	ASSERT(rp->r_flags & RHASHED);

	prev = rp->r_hashb;
	next = rp->r_hashf;
	prev->r_hashf = next;
	next->r_hashb = prev;

	mutex_enter(&rp->r_statelock);
	rp->r_flags &= ~RHASHED;
	mutex_exit(&rp->r_statelock);
}

/*
 * Unlink an rnode from its hash queue, taking and dropping the
 * queue's writer lock around the removal.
 *
 * The caller must not be holding the hash queue lock.
 */
void
rp_rmhash(rnode_t *rp)
{
	rhashq_t *hq = rp->r_hashq;

	rw_enter(&hq->r_lock, RW_WRITER);
	rp_rmhash_locked(rp);
	rw_exit(&hq->r_lock);
}

/*
 * Search hash queue rhtp for the rnode with filehandle fh on vfsp.
 *
 * The caller must hold the hash queue lock, shared or exclusive.
 * On a match the rnode is returned with a reference for the caller:
 * if the rnode is sitting on the freelist it is taken off and the
 * freelist's reference is reused, otherwise the vnode is held.
 * Returns NULL when no rnode matches.
 */
static rnode_t *
rfind(rhashq_t *rhtp, nfs_fhandle *fh, struct vfs *vfsp)
{
	rnode_t *rp;
	vnode_t *vp;

	ASSERT(RW_LOCK_HELD(&rhtp->r_lock));

	for (rp = rhtp->r_hashf; rp != (rnode_t *)rhtp; rp = rp->r_hashf) {
		vp = RTOV(rp);
		if (vp->v_vfsp != vfsp ||
		    rp->r_fh.fh_len != fh->fh_len ||
		    bcmp(rp->r_fh.fh_buf, fh->fh_buf, fh->fh_len) != 0)
			continue;

		/* Not on the freelist: just take a new hold. */
		if (rp->r_freef == NULL) {
			VN_HOLD(vp);
			return (rp);
		}

		/*
		 * Looks like it is on the freelist; look again now
		 * that the freelist lock is held, since it may have
		 * been removed in the meantime.
		 */
		mutex_enter(&rpfreelist_lock);
		if (rp->r_freef != NULL) {
			rp_rmfree(rp);
			mutex_exit(&rpfreelist_lock);
		} else {
			mutex_exit(&rpfreelist_lock);
			VN_HOLD(vp);
		}
		return (rp);
	}

	return (NULL);
}

/*
 * Return 1 if there is a active vnode belonging to this vfs in the
 * rtable cache.
 *
 * Several of these checks are done without holding the usual
 * locks.  This is safe because destroy_rtable(), rp_addfree(),
 * etc. will redo the necessary checks before actually destroying
 * any rnodes.
 */
int
check_rtable(struct vfs *vfsp)
{
	int index;
	rnode_t *rp;
	vnode_t *vp;

	for (index = 0; index < rtablesize; index++) {
		rw_enter(&rtable[index].r_lock, RW_READER);
		for (rp = rtable[index].r_hashf;
		    rp != (rnode_t *)(&rtable[index]);
		    rp = rp->r_hashf) {
			vp = RTOV(rp);
			if (vp->v_vfsp == vfsp) {
				if (rp->r_freef == NULL ||
				    (vn_has_cached_data(vp) &&
				    (rp->r_flags & RDIRTY)) ||
				    rp->r_count > 0) {
					rw_exit(&rtable[index].r_lock);
					return (1);
				}
			}
		}
		rw_exit(&rtable[index].r_lock);
	}
	return (0);
}

/*
 * Destroy the inactive rnodes of this vfs that sit in the hash
 * queues.  It is essential that all inactive vnodes are destroyed
 * during a forced unmount as well as during a normal unmount.
 *
 * Pass one pulls every freelist-resident rnode of the vfs off both
 * the freelist and its hash queue and chains it, through r_hashf,
 * onto a private list.  Pass two hands each collected rnode to
 * rp_addfree().
 */
void
destroy_rtable(struct vfs *vfsp, cred_t *cr)
{
	int index;
	rhashq_t *hq;
	rnode_t *rp;
	rnode_t *next;
	rnode_t *doomed = NULL;
	vnode_t *vp;

	for (index = 0; index < rtablesize; index++) {
		hq = &rtable[index];
		rw_enter(&hq->r_lock, RW_WRITER);
		for (rp = hq->r_hashf; rp != (rnode_t *)hq; rp = next) {
			/* remember the successor before unhashing rp */
			next = rp->r_hashf;
			vp = RTOV(rp);
			if (vp->v_vfsp != vfsp)
				continue;

			mutex_enter(&rpfreelist_lock);
			if (rp->r_freef == NULL) {
				mutex_exit(&rpfreelist_lock);
				continue;
			}
			rp_rmfree(rp);
			mutex_exit(&rpfreelist_lock);

			rp_rmhash_locked(rp);
			rp->r_hashf = doomed;
			doomed = rp;
		}
		rw_exit(&hq->r_lock);
	}

	while ((rp = doomed) != NULL) {
		doomed = rp->r_hashf;
		/*
		 * This call to rp_addfree will end up destroying the
		 * rnode, but in a safe way with the appropriate set
		 * of checks done.
		 */
		rp_addfree(rp, cr);
	}
}

/*
 * This routine destroys all the resources associated with the rnode
 * and then the rnode itself.
 *
 * As the assertions below spell out, the rnode must be unhashed, off
 * the freelist, and have exactly one vnode reference left, with no
 * outstanding r_count, r_lmpl or mappings.
 */
static void
destroy_rnode(rnode_t *rp)
{
	vnode_t *vp;
	vfs_t *vfsp;

	vp = RTOV(rp);
	/* Remember the vfs now; vp is freed before the VFS_RELE below. */
	vfsp = vp->v_vfsp;

	ASSERT(vp->v_count == 1);
	ASSERT(rp->r_count == 0);
	ASSERT(rp->r_lmpl == NULL);
	ASSERT(rp->r_mapcnt == 0);
	ASSERT(!(rp->r_flags & RHASHED));
	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
	/* One fewer rnode allocated. */
	atomic_dec_ulong((ulong_t *)&rnew);
#ifdef DEBUG
	clstat_debug.nrnode.value.ui64--;
#endif
	nfs_rw_destroy(&rp->r_rwlock);
	nfs_rw_destroy(&rp->r_lkserlock);
	mutex_destroy(&rp->r_statelock);
	cv_destroy(&rp->r_cv);
	cv_destroy(&rp->r_commit.c_cv);
	/* r_indelmap is only torn down when RDELMAPLIST is set. */
	if (rp->r_flags & RDELMAPLIST)
		list_destroy(&rp->r_indelmap);
	nfs_free_r_path(rp);
	avl_destroy(&rp->r_dir);
	vn_invalid(vp);
	vn_free(vp);
	kmem_cache_free(rnode_cache, rp);
	/* Drop the hold on the vfs that the rnode was keeping. */
	VFS_RELE(vfsp);
}

/*
 * Start an asynchronous flush of the vnodes in this vfs, or in every
 * vfs when vfsp is NULL.  Used by nfs_sync and by nfs_unmount.
 *
 * Candidate vnodes are first collected (and held) while walking the
 * hash queues under their reader locks; the page flushes are issued
 * afterwards with no hash queue lock held.
 */
void
rflush(struct vfs *vfsp, cred_t *cr)
{
	int index;
	int full;
	rhashq_t *hq;
	rnode_t *rp;
	vnode_t *vp, **vplist;
	long num, cnt;
	size_t listsz;

	/*
	 * Nothing to do when no rnodes are allocated.
	 */
	num = rnew;
	if (num == 0)
		return;

	/*
	 * Size the list for the worst case in which every currently
	 * allocated rnode needs flushing.
	 */
	listsz = num * sizeof (*vplist);
	vplist = kmem_alloc(listsz, KM_SLEEP);
	cnt = 0;

	/*
	 * Collect the vnodes to flush, stopping early once the list
	 * is full.
	 */
	full = 0;
	for (index = 0; !full && index < rtablesize; index++) {
		hq = &rtable[index];
		rw_enter(&hq->r_lock, RW_READER);
		for (rp = hq->r_hashf; rp != (rnode_t *)hq;
		    rp = rp->r_hashf) {
			vp = RTOV(rp);
			/*
			 * Skip vnodes that are part of the virtual swap
			 * device and vnodes that are read-only.
			 */
			if (IS_SWAPVP(vp) || vn_is_readonly(vp))
				continue;
			/*
			 * Only vnodes of the requested vfs (or of any
			 * vfs when flushing everything) that have pages
			 * and are either dirty or mmap'd qualify.
			 */
			if (vfsp != NULL && vp->v_vfsp != vfsp)
				continue;
			if (!vn_has_cached_data(vp))
				continue;
			if (!((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0))
				continue;

			VN_HOLD(vp);
			vplist[cnt++] = vp;
			if (cnt == num) {
				full = 1;
				break;
			}
		}
		rw_exit(&hq->r_lock);
	}

	/*
	 * Flush and release the collected vnodes, last one first.
	 */
	for (; cnt > 0; cnt--) {
		vp = vplist[cnt - 1];
		(void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL);
		VN_RELE(vp);
	}

	/*
	 * Give back the list itself.
	 */
	kmem_free(vplist, listsz);
}

/*
 * Number of low-order address bits dropped before an rnode pointer
 * is hashed.  This probably needs to be larger than or equal to
 * log2(sizeof (struct rnode)) due to the way that rnodes are
 * allocated.
 */
#define	ACACHE_SHIFT_BITS	9

/*
 * Compute the access cache bucket index for an (rnode, credential)
 * pair from the shifted rnode address and the credential's uid.
 */
static int
acachehash(rnode_t *rp, cred_t *cr)
{
	intptr_t addrbits;

	addrbits = (intptr_t)rp >> ACACHE_SHIFT_BITS;
	return ((addrbits + crgetuid(cr)) & acachemask);
}

#ifdef DEBUG
/* Debug-only counters of access cache lookups that hit or missed. */
static long nfs_access_cache_hits = 0;
static long nfs_access_cache_misses = 0;
#endif

/*
 * Consult the access cache for rnode rp and credential cr about the
 * access bits in acc.
 *
 * Returns NFS_ACCESS_UNKNOWN when the attribute cache is not valid,
 * when nfs_waitfor_purge_complete() returns non-zero, when no entry
 * matches, or when the matching entry does not know all of the
 * requested bits.  Otherwise returns NFS_ACCESS_ALLOWED if every
 * requested bit is allowed and NFS_ACCESS_DENIED if not.
 */
nfs_access_type_t
nfs_access_check(rnode_t *rp, uint32_t acc, cred_t *cr)
{
	vnode_t *vp = RTOV(rp);
	acache_t *ap;
	acache_hash_t *hp;
	nfs_access_type_t verdict;

	if (!ATTRCACHE_VALID(vp) || nfs_waitfor_purge_complete(vp))
		return (NFS_ACCESS_UNKNOWN);

	if (rp->r_acache != NULL) {
		hp = &acache[acachehash(rp, cr)];
		rw_enter(&hp->lock, RW_READER);
		for (ap = hp->next; ap != (acache_t *)hp; ap = ap->next) {
			if (crcmp(ap->cred, cr) != 0 || ap->rnode != rp)
				continue;

			if ((ap->known & acc) != acc) {
#ifdef DEBUG
				nfs_access_cache_misses++;
#endif
				verdict = NFS_ACCESS_UNKNOWN;
			} else {
#ifdef DEBUG
				nfs_access_cache_hits++;
#endif
				verdict = ((ap->allowed & acc) == acc) ?
				    NFS_ACCESS_ALLOWED : NFS_ACCESS_DENIED;
			}
			rw_exit(&hp->lock);
			return (verdict);
		}
		rw_exit(&hp->lock);
	}

#ifdef DEBUG
	nfs_access_cache_misses++;
#endif
	return (NFS_ACCESS_UNKNOWN);
}

/*
 * Record in the access cache that, for rnode rp and credential cr,
 * the access bits in acc were checked and those in resacc granted.
 *
 * An existing entry for the pair is updated in place; otherwise a
 * new entry is linked into the hash bucket and onto the rnode's
 * r_acache list.  The entry is allocated with KM_NOSLEEP, so if
 * that allocation fails and no entry exists, nothing is cached.
 */
void
nfs_access_cache(rnode_t *rp, uint32_t acc, uint32_t resacc, cred_t *cr)
{
	acache_t *ap;
	acache_t *nap;
	acache_hash_t *hp = &acache[acachehash(rp, cr)];

	/*
	 * Allocate up front on the assumption that a new entry is
	 * usually needed, so the allocation happens without the hash
	 * bucket locked.
	 */
	nap = kmem_cache_alloc(acache_cache, KM_NOSLEEP);
	if (nap != NULL) {
		crhold(cr);
		nap->cred = cr;
		nap->rnode = rp;
		nap->hashq = hp;
		nap->known = acc;
		nap->allowed = resacc;
	}

	rw_enter(&hp->lock, RW_WRITER);

	if (rp->r_acache != NULL) {
		for (ap = hp->next; ap != (acache_t *)hp; ap = ap->next) {
			if (crcmp(ap->cred, cr) != 0 || ap->rnode != rp)
				continue;

			/* Existing entry: merge in the new result. */
			ap->known |= acc;
			ap->allowed = (ap->allowed & ~acc) | resacc;
			rw_exit(&hp->lock);

			/* The speculative allocation was not needed. */
			if (nap != NULL) {
				crfree(nap->cred);
				kmem_cache_free(acache_cache, nap);
			}
			return;
		}
	}

	if (nap == NULL) {
		rw_exit(&hp->lock);
		return;
	}

#ifdef DEBUG
	clstat_debug.access.value.ui64++;
#endif
	/* Link the new entry in at the head of the bucket. */
	nap->next = hp->next;
	hp->next = nap;
	nap->next->prev = nap;
	nap->prev = (acache_t *)hp;

	/* And push it onto the rnode's own list of entries. */
	mutex_enter(&rp->r_statelock);
	nap->list = rp->r_acache;
	rp->r_acache = nap;
	mutex_exit(&rp->r_statelock);

	rw_exit(&hp->lock);
}

/*
 * Throw away every access cache entry attached to rnode rp.
 * Returns 0 if the rnode had no entries, 1 if entries were freed.
 */
int
nfs_access_purge_rp(rnode_t *rp)
{
	acache_t *ap;
	acache_t *following;
	acache_hash_t *hq;

	/*
	 * No cached entries means nothing to free.
	 */
	if (rp->r_acache == NULL)
		return (0);

	/* Detach the whole list from the rnode in one step. */
	mutex_enter(&rp->r_statelock);
	ap = rp->r_acache;
	rp->r_acache = NULL;
	mutex_exit(&rp->r_statelock);

	/*
	 * Walk the detached list; unlink each entry from the hash
	 * bucket it lives in and then free it.
	 */
	while (ap != NULL) {
		hq = ap->hashq;
		rw_enter(&hq->lock, RW_WRITER);
		ap->prev->next = ap->next;
		ap->next->prev = ap->prev;
		rw_exit(&hq->lock);

		following = ap->list;
		crfree(ap->cred);
		kmem_cache_free(acache_cache, ap);
#ifdef DEBUG
		clstat_debug.access.value.ui64--;
#endif
		ap = following;
	}

	return (1);
}

/* Leading characters of every name produced by newname(). */
static const char prefix[] = ".nfs";

/* Serializes access to the counter inside newnum(). */
static kmutex_t newnum_lock;

/*
 * Hand out the next value of a counter.  Whenever the counter is
 * found to be zero it is first seeded from the low 16 bits of the
 * current time in seconds.
 */
int
newnum(void)
{
	static uint_t next_id = 0;
	uint_t id;

	mutex_enter(&newnum_lock);
	if (next_id == 0)
		next_id = gethrestime_sec() & 0xffff;
	id = next_id;
	next_id = id + 1;
	mutex_exit(&newnum_lock);

	return (id);
}

/*
 * Build a fresh name consisting of the ".nfs" prefix followed by the
 * hexadecimal digits of a newnum() value, least significant digit
 * first.  The string lives in a MAXNAMELEN-byte kmem_alloc() buffer
 * which is returned to the caller.
 */
char *
newname(void)
{
	static const char hexdigits[] = "0123456789ABCDEF";
	char *news;
	char *s;
	const char *p;
	uint_t id;

	id = newnum();
	news = kmem_alloc(MAXNAMELEN, KM_SLEEP);

	s = news;
	for (p = prefix; *p != '\0'; p++)
		*s++ = *p;
	for (; id != 0; id >>= 4)
		*s++ = hexdigits[id & 0x0f];
	*s = '\0';

	return (news);
}

/*
 * kstat snapshot callback for nfs:0:nfs_client.
 *
 * Copies the clstat_tmpl-sized statistics between the kstat's private
 * data and buf in the direction selected by rw.  On DEBUG kernels the
 * global clstat_debug values follow them in buf.  Always returns 0.
 */
static int
cl_snapshot(kstat_t *ksp, void *buf, int rw)
{
	ksp->ks_snaptime = gethrtime();

	if (rw != KSTAT_WRITE) {
		bcopy(ksp->ks_private, buf, sizeof (clstat_tmpl));
#ifdef DEBUG
		/*
		 * The "global" debug kstat values describe the system
		 * as a whole, so they are shown unchanged to all zones.
		 */
		bcopy(&clstat_debug, (char *)buf + sizeof (clstat_tmpl),
		    sizeof (clstat_debug));
#endif
		return (0);
	}

	bcopy(buf, ksp->ks_private, sizeof (clstat_tmpl));
#ifdef DEBUG
	/*
	 * Only the global zone can currently write to kstats, but
	 * check anyway out of paranoia.
	 */
	if (INGLOBALZONE(curproc))
		bcopy((char *)buf + sizeof (clstat_tmpl), &clstat_debug,
		    sizeof (clstat_debug));
#endif
	return (0);
}

/*
 * Zone creation callback: allocate and initialize the per-zone NFS
 * client state, create the zone's nfs_client kstat if possible, and
 * add the state to nfs_clnt_list.  Returns the new state.
 */
static void *
clinit_zone(zoneid_t zoneid)
{
	kstat_t *ksp;
	struct nfs_clnt *nfscl;
	uint_t ndata;

	nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP);
	nfscl->nfscl_zoneid = zoneid;
	nfscl->nfscl_chtable = NULL;
	mutex_init(&nfscl->nfscl_chtable_lock, NULL, MUTEX_DEFAULT, NULL);
	bcopy(&clstat_tmpl, &nfscl->nfscl_stat, sizeof (clstat_tmpl));

	/* Count the named statistics the kstat will expose. */
	ndata = sizeof (clstat_tmpl) / sizeof (kstat_named_t);
#ifdef DEBUG
	ndata += sizeof (clstat_debug) / sizeof (kstat_named_t);
#endif
	ksp = kstat_create_zone("nfs", 0, "nfs_client", "misc",
	    KSTAT_TYPE_NAMED, ndata,
	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid);
	if (ksp != NULL) {
		ksp->ks_private = &nfscl->nfscl_stat;
		ksp->ks_snapshot = cl_snapshot;
		kstat_install(ksp);
	}

	mutex_enter(&nfs_clnt_list_lock);
	list_insert_head(&nfs_clnt_list, nfscl);
	mutex_exit(&nfs_clnt_list_lock);

	return (nfscl);
}

/*
 * Zone destruction callback: take the per-zone NFS client state off
 * nfs_clnt_list, reclaim its client handles, free the client handle
 * table headers, delete the zone's nfs_client kstat and free the
 * state itself.  A NULL arg is ignored.
 */
/*ARGSUSED*/
static void
clfini_zone(zoneid_t zoneid, void *arg)
{
	struct nfs_clnt *nfscl = arg;
	chhead_t *chp, *next;

	if (nfscl == NULL)
		return;

	mutex_enter(&nfs_clnt_list_lock);
	list_remove(&nfs_clnt_list, nfscl);
	mutex_exit(&nfs_clnt_list_lock);

	clreclaim_zone(nfscl, 0);

	chp = nfscl->nfscl_chtable;
	while (chp != NULL) {
		ASSERT(chp->ch_list == NULL);
		next = chp->ch_next;
		kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1);
		kmem_free(chp, sizeof (*chp));
		chp = next;
	}

	kstat_delete_byname_zone("nfs", 0, "nfs_client", zoneid);
	mutex_destroy(&nfscl->nfscl_chtable_lock);
	kmem_free(nfscl, sizeof (*nfscl));
}

/*
 * Called by endpnt_destructor to make sure the client handles are
 * cleaned up before the RPC endpoints.  If clfini_zone has already
 * run for the zone, no matching entry is found and this is a no-op.
 * The function exists (instead of relying on clfini_zone alone)
 * because ZSD callbacks have no ordering mechanism, so there is no
 * way to ensure that clfini_zone runs before endpnt_destructor.
 */
void
clcleanup_zone(zoneid_t zoneid)
{
	struct nfs_clnt *nfscl;

	mutex_enter(&nfs_clnt_list_lock);
	for (nfscl = list_head(&nfs_clnt_list); nfscl != NULL;
	    nfscl = list_next(&nfs_clnt_list, nfscl)) {
		if (nfscl->nfscl_zoneid != zoneid)
			continue;
		clreclaim_zone(nfscl, 0);
		break;
	}
	mutex_exit(&nfs_clnt_list_lock);
}

/*
 * One-time initialization for this file: sizes and allocates the
 * rnode hash table and the access cache, creates the kmem caches,
 * registers the per-zone client state callbacks, initializes the
 * global locks and picks the device major number used for NFS
 * mounts.  Always returns 0.
 */
int
nfs_subrinit(void)
{
	int i;
	ulong_t nrnode_max;

	/*
	 * Allocate and initialize the rnode hash queues
	 */
	if (nrnode <= 0)
		nrnode = ncsize;
	/* Cap nrnode so that rnodes use at most 1/4 of kmem_maxavail(). */
	nrnode_max = (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode));
	if (nrnode > nrnode_max || (nrnode == 0 && ncsize == 0)) {
		/* nrnode_max is unsigned long, so print it with %lu. */
		zcmn_err(GLOBAL_ZONEID, CE_NOTE,
		    "!setting nrnode to max value of %lu", nrnode_max);
		nrnode = nrnode_max;
	}

	/* The table size is a power of two so a mask can select a bucket. */
	rtablesize = 1 << highbit(nrnode / hashlen);
	rtablemask = rtablesize - 1;
	rtable = kmem_alloc(rtablesize * sizeof (*rtable), KM_SLEEP);
	for (i = 0; i < rtablesize; i++) {
		/* An empty queue points back at its own header. */
		rtable[i].r_hashf = (rnode_t *)(&rtable[i]);
		rtable[i].r_hashb = (rnode_t *)(&rtable[i]);
		rw_init(&rtable[i].r_lock, NULL, RW_DEFAULT, NULL);
	}
	rnode_cache = kmem_cache_create("rnode_cache", sizeof (rnode_t),
	    0, NULL, NULL, nfs_reclaim, NULL, NULL, 0);

	/*
	 * Allocate and initialize the access cache
	 */

	/*
	 * Initial guess is one access cache entry per rnode unless
	 * nacache is set to a non-zero value and then it is used to
	 * indicate a guess at the number of access cache entries.
	 */
	if (nacache > 0)
		acachesize = 1 << highbit(nacache / hashlen);
	else
		acachesize = rtablesize;
	acachemask = acachesize - 1;
	acache = kmem_alloc(acachesize * sizeof (*acache), KM_SLEEP);
	for (i = 0; i < acachesize; i++) {
		/* An empty bucket points back at its own header. */
		acache[i].next = (acache_t *)&acache[i];
		acache[i].prev = (acache_t *)&acache[i];
		rw_init(&acache[i].lock, NULL, RW_DEFAULT, NULL);
	}
	acache_cache = kmem_cache_create("nfs_access_cache",
	    sizeof (acache_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	/*
	 * Allocate and initialize the client handle cache
	 */
	chtab_cache = kmem_cache_create("client_handle_cache",
	    sizeof (struct chtab), 0, NULL, NULL, clreclaim, NULL, NULL, 0);
	/*
	 * Initialize the list of per-zone client handles (and associated data).
	 * This needs to be done before we call zone_key_create().
	 */
	list_create(&nfs_clnt_list, sizeof (struct nfs_clnt),
	    offsetof(struct nfs_clnt, nfscl_node));
	/*
	 * Initialize the zone_key for per-zone client handle lists.
	 */
	zone_key_create(&nfsclnt_zone_key, clinit_zone, NULL, clfini_zone);
	/*
	 * Initialize the various mutexes and reader/writer locks
	 */
	mutex_init(&rpfreelist_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&newnum_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&nfs_minor_lock, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * Assign unique major number for all nfs mounts
	 */
	if ((nfs_major = getudev()) == -1) {
		zcmn_err(GLOBAL_ZONEID, CE_WARN,
		    "nfs: init: can't get unique device number");
		nfs_major = 0;
	}
	nfs_minor = 0;

	/* Fall back to the default delay when none has been set. */
	if (nfs3_jukebox_delay == 0)
		nfs3_jukebox_delay = NFS3_JUKEBOX_DELAY;

	return (0);
}

/*
 * Undo nfs_subrinit(): destroy the kmem caches, the rnode hash
 * table, the access cache and the global locks, and delete the
 * per-zone key.
 */
void
nfs_subrfini(void)
{
	int idx;

	/*
	 * Tear down the rnode cache and hash queues.
	 */
	kmem_cache_destroy(rnode_cache);

	idx = 0;
	while (idx < rtablesize) {
		rw_destroy(&rtable[idx].r_lock);
		idx++;
	}
	kmem_free(rtable, rtablesize * sizeof (*rtable));

	/*
	 * Tear down the access cache.
	 */
	kmem_cache_destroy(acache_cache);

	idx = 0;
	while (idx < acachesize) {
		rw_destroy(&acache[idx].lock);
		idx++;
	}
	kmem_free(acache, acachesize * sizeof (*acache));

	/*
	 * Tear down the client handle cache.
	 */
	kmem_cache_destroy(chtab_cache);

	/*
	 * Destroy the global mutexes, then drop the zone key.
	 */
	mutex_destroy(&rpfreelist_lock);
	mutex_destroy(&newnum_lock);
	mutex_destroy(&nfs_minor_lock);
	(void) zone_key_delete(nfsclnt_zone_key);
}

enum nfsstat
puterrno(int error)
{

	switch (error) {
	case EOPNOTSUPP:
		return (NFSERR_OPNOTSUPP);
	case ENAMETOOLONG:
		return (NFSERR_NAMETOOLONG);
	case ENOTEMPTY:
		return (NFSERR_NOTEMPTY);
	case EDQUOT:
		return (NFSERR_DQUOT);
	case ESTALE:
		return (NFSERR_STALE);
	case EREMOTE:
		return (NFSERR_REMOTE);
	case ENOSYS:
		return (NFSERR_OPNOTSUPP);
	case EOVERFLOW:
		return (NFSERR_INVAL);
	default:
		return ((enum nfsstat)error);
	}
	/* NOTREACHED */
}

/*
 * Translate an NFS version 2 status into a local errno value.  Only
 * the statuses listed need an explicit mapping; every other status
 * is passed through as the errno with the same numeric value.
 */
int
geterrno(enum nfsstat status)
{
	int error;

	switch (status) {
	case NFSERR_OPNOTSUPP:
		error = EOPNOTSUPP;
		break;
	case NFSERR_NAMETOOLONG:
		error = ENAMETOOLONG;
		break;
	case NFSERR_NOTEMPTY:
		error = ENOTEMPTY;
		break;
	case NFSERR_DQUOT:
		error = EDQUOT;
		break;
	case NFSERR_STALE:
		error = ESTALE;
		break;
	case NFSERR_REMOTE:
		error = EREMOTE;
		break;
	case NFSERR_WFLUSH:
		error = EIO;
		break;
	default:
		error = (int)status;
		break;
	}

	return (error);
}

enum nfsstat3
puterrno3(int error)
{

#ifdef DEBUG
	switch (error) {
	case 0:
		return (NFS3_OK);
	case EPERM:
		return (NFS3ERR_PERM);
	case ENOENT:
		return (NFS3ERR_NOENT);
	case EIO:
		return (NFS3ERR_IO);
	case ENXIO:
		return (NFS3ERR_NXIO);
	case EACCES:
		return (NFS3ERR_ACCES);
	case EEXIST:
		return (NFS3ERR_EXIST);
	case EXDEV:
		return (NFS3ERR_XDEV);
	case ENODEV:
		return (NFS3ERR_NODEV);
	case ENOTDIR:
		return (NFS3ERR_NOTDIR);
	case EISDIR:
		return (NFS3ERR_ISDIR);
	case EINVAL:
		return (NFS3ERR_INVAL);
	case EFBIG:
		return (NFS3ERR_FBIG);
	case ENOSPC:
		return (NFS3ERR_NOSPC);
	case EROFS:
		return (NFS3ERR_ROFS);
	case EMLINK:
		return (NFS3ERR_MLINK);
	case ENAMETOOLONG:
		return (NFS3ERR_NAMETOOLONG);
	case ENOTEMPTY:
		return (NFS3ERR_NOTEMPTY);
	case EDQUOT:
		return (NFS3ERR_DQUOT);
	case ESTALE:
		return (NFS3ERR_STALE);
	case EREMOTE:
		return (NFS3ERR_REMOTE);
	case ENOSYS:
	case EOPNOTSUPP:
		return (NFS3ERR_NOTSUPP);
	case EOVERFLOW:
		return (NFS3ERR_INVAL);
	default:
		zcmn_err(getzoneid(), CE_WARN,
		    "puterrno3: got error %d", error);
		return ((enum nfsstat3)error);
	}
#else
	switch (error) {
	case ENAMETOOLONG:
		return (NFS3ERR_NAMETOOLONG);
	case ENOTEMPTY:
		return (NFS3ERR_NOTEMPTY);
	case EDQUOT:
		return (NFS3ERR_DQUOT);
	case ESTALE:
		return (NFS3ERR_STALE);
	case ENOSYS:
	case EOPNOTSUPP:
		return (NFS3ERR_NOTSUPP);
	case EREMOTE:
		return (NFS3ERR_REMOTE);
	case EOVERFLOW:
		return (NFS3ERR_INVAL);
	default:
		return ((enum nfsstat3)error);
	}
#endif
}

/*
 * Translate an NFS version 3 status into a local errno value.
 *
 * The DEBUG build spells out every known mapping and logs a warning
 * for a status it does not recognize; the non-DEBUG build only lists
 * the statuses needing translation.  In both builds an unlisted
 * status is passed through with the same numeric value.
 */
int
geterrno3(enum nfsstat3 status)
{
	int error;

#ifdef DEBUG
	switch (status) {
	case NFS3_OK:
		error = 0;
		break;
	case NFS3ERR_PERM:
		error = EPERM;
		break;
	case NFS3ERR_NOENT:
	case NFS3ERR_BAD_COOKIE:
		error = ENOENT;
		break;
	case NFS3ERR_IO:
	case NFS3ERR_SERVERFAULT:
		error = EIO;
		break;
	case NFS3ERR_NXIO:
	case NFS3ERR_JUKEBOX:
		error = ENXIO;
		break;
	case NFS3ERR_ACCES:
		error = EACCES;
		break;
	case NFS3ERR_EXIST:
		error = EEXIST;
		break;
	case NFS3ERR_XDEV:
		error = EXDEV;
		break;
	case NFS3ERR_NODEV:
		error = ENODEV;
		break;
	case NFS3ERR_NOTDIR:
		error = ENOTDIR;
		break;
	case NFS3ERR_ISDIR:
		error = EISDIR;
		break;
	case NFS3ERR_INVAL:
	case NFS3ERR_NOT_SYNC:
	case NFS3ERR_TOOSMALL:
	case NFS3ERR_BADTYPE:
		error = EINVAL;
		break;
	case NFS3ERR_FBIG:
		error = EFBIG;
		break;
	case NFS3ERR_NOSPC:
		error = ENOSPC;
		break;
	case NFS3ERR_ROFS:
		error = EROFS;
		break;
	case NFS3ERR_MLINK:
		error = EMLINK;
		break;
	case NFS3ERR_NAMETOOLONG:
		error = ENAMETOOLONG;
		break;
	case NFS3ERR_NOTEMPTY:
		error = ENOTEMPTY;
		break;
	case NFS3ERR_DQUOT:
		error = EDQUOT;
		break;
	case NFS3ERR_STALE:
	case NFS3ERR_BADHANDLE:
		error = ESTALE;
		break;
	case NFS3ERR_REMOTE:
		error = EREMOTE;
		break;
	case NFS3ERR_NOTSUPP:
		error = EOPNOTSUPP;
		break;
	default:
		zcmn_err(getzoneid(), CE_WARN,
		    "geterrno3: got status %d", status);
		error = (int)status;
		break;
	}
#else
	switch (status) {
	case NFS3ERR_NAMETOOLONG:
		error = ENAMETOOLONG;
		break;
	case NFS3ERR_NOTEMPTY:
		error = ENOTEMPTY;
		break;
	case NFS3ERR_DQUOT:
		error = EDQUOT;
		break;
	case NFS3ERR_STALE:
	case NFS3ERR_BADHANDLE:
		error = ESTALE;
		break;
	case NFS3ERR_NOTSUPP:
		error = EOPNOTSUPP;
		break;
	case NFS3ERR_REMOTE:
		error = EREMOTE;
		break;
	case NFS3ERR_NOT_SYNC:
	case NFS3ERR_TOOSMALL:
	case NFS3ERR_BADTYPE:
		error = EINVAL;
		break;
	case NFS3ERR_BAD_COOKIE:
		error = ENOENT;
		break;
	case NFS3ERR_SERVERFAULT:
		error = EIO;
		break;
	case NFS3ERR_JUKEBOX:
		error = ENXIO;
		break;
	default:
		error = (int)status;
		break;
	}
#endif

	return (error);
}

/*
 * Allocate a readdir cache entry using the given kmem flags.  The
 * entry starts with no entries buffer, the RDDIR flag set and a
 * reference count of one.  Returns NULL if the allocation fails.
 */
rddir_cache *
rddir_cache_alloc(int flags)
{
	rddir_cache *rc = kmem_alloc(sizeof (*rc), flags);

	if (rc == NULL)
		return (NULL);

	mutex_init(&rc->lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&rc->cv, NULL, CV_DEFAULT, NULL);
	rc->entries = NULL;
	rc->flags = RDDIR;
	rc->count = 1;
#ifdef DEBUG
	atomic_inc_64(&clstat_debug.dirent.value.ui64);
#endif
	return (rc);
}

/*
 * Free a readdir cache entry together with its entries buffer, if it
 * has one, after destroying the entry's condition variable and mutex.
 */
static void
rddir_cache_free(rddir_cache *rc)
{
	void *buf;

#ifdef DEBUG
	atomic_dec_64(&clstat_debug.dirent.value.ui64);
#endif
	buf = rc->entries;
	if (buf != NULL) {
#ifdef DEBUG
		rddir_cache_buf_free(buf, rc->buflen);
#else
		kmem_free(buf, rc->buflen);
#endif
	}

	cv_destroy(&rc->cv);
	mutex_destroy(&rc->lock);
	kmem_free(rc, sizeof (*rc));
}

/*
 * Take an additional reference on a readdir cache entry.
 */
void
rddir_cache_hold(rddir_cache *rc)
{
	mutex_enter(&rc->lock);
	rc->count += 1;
	mutex_exit(&rc->lock);
}

/*
 * Drop a reference on a readdir cache entry and free the entry when
 * the last reference goes away.
 */
void
rddir_cache_rele(rddir_cache *rc)
{
	int last;

	mutex_enter(&rc->lock);
	ASSERT(rc->count > 0);
	rc->count--;
	last = (rc->count == 0);
	mutex_exit(&rc->lock);

	if (last)
		rddir_cache_free(rc);
}

#ifdef DEBUG
/*
 * DEBUG-only wrapper around kmem_alloc() for readdir entry buffers
 * that adds the size of each successful allocation to the dirents
 * debug counter.
 */
char *
rddir_cache_buf_alloc(size_t size, int flags)
{
	char *buf = kmem_alloc(size, flags);

	if (buf == NULL)
		return (NULL);

	atomic_add_64(&clstat_debug.dirents.value.ui64, size);
	return (buf);
}

/*
 * DEBUG-only counterpart of rddir_cache_buf_alloc(): subtracts the
 * buffer size from the dirents debug counter and frees the buffer.
 */
void
rddir_cache_buf_free(void *addr, size_t size)
{
	int64_t delta = -(int64_t)size;

	atomic_add_64(&clstat_debug.dirents.value.ui64, delta);
	kmem_free(addr, size);
}
#endif

/*
 * Release the cached data hanging off an rnode: the held credential,
 * the access cache, the readdir cache, the cached symbolic link
 * contents, the cached ACL and the cached pathconf information.
 *
 * The cached pointers are detached from the rnode under r_statelock and
 * are freed only after the lock has been dropped.
 *
 * Returns the result of nfs_access_purge_rp() when there was nothing
 * besides the credential and access cache to release, and 1 otherwise.
 */
static int
nfs_free_data_reclaim(rnode_t *rp)
{
	cred_t *held_cred;
	char *linkbuf;
	int linklen;
	vsecattr_t *acl;
	nfs3_pathconf_info *pcinfo;
	int purged;

	/* Detach everything while holding the state lock. */
	mutex_enter(&rp->r_statelock);
	held_cred = rp->r_cred;
	linkbuf = rp->r_symlink.contents;
	linklen = rp->r_symlink.size;
	acl = rp->r_secattr;
	pcinfo = rp->r_pathconf;
	rp->r_cred = NULL;
	rp->r_symlink.contents = NULL;
	rp->r_secattr = NULL;
	rp->r_pathconf = NULL;
	mutex_exit(&rp->r_statelock);

	if (held_cred != NULL)
		crfree(held_cred);

	/* Drop the access cache entries. */
	purged = nfs_access_purge_rp(rp);

	/* Nothing beyond the access cache?  Report just that result. */
	if (!(HAVE_RDDIR_CACHE(rp) || linkbuf != NULL || acl != NULL ||
	    pcinfo != NULL))
		return (purged);

	/* Readdir cache entries. */
	if (HAVE_RDDIR_CACHE(rp))
		nfs_purge_rddir_cache(RTOV(rp));

	/* Cached symbolic link contents. */
	if (linkbuf != NULL)
		kmem_free((void *)linkbuf, linklen);

	/* Cached ACL. */
	if (acl != NULL)
		nfs_acl_free(acl);

	/* Cached pathconf information. */
	if (pcinfo != NULL)
		kmem_free(pcinfo, sizeof (*pcinfo));

	return (1);
}

/*
 * Release the cached data of an rnode that may be in active use: the
 * access cache, the readdir cache, the cached symbolic link contents,
 * the cached ACL and the cached pathconf information.  Unlike
 * nfs_free_data_reclaim(), the rnode's credential is left alone.
 *
 * r_statelock is only tried, never waited for; if it cannot be taken
 * immediately nothing is released and 0 is returned.
 *
 * Otherwise returns the result of nfs_access_purge_rp() when there was
 * nothing besides the access cache to release, and 1 if there was.
 */
static int
nfs_active_data_reclaim(rnode_t *rp)
{
	char *linkbuf;
	int linklen;
	vsecattr_t *acl;
	nfs3_pathconf_info *pcinfo;
	int purged;

	/* Do not block on a busy rnode; just skip it. */
	if (mutex_tryenter(&rp->r_statelock) == 0)
		return (0);

	/* Detach the cached pointers while holding the state lock. */
	linkbuf = rp->r_symlink.contents;
	linklen = rp->r_symlink.size;
	acl = rp->r_secattr;
	pcinfo = rp->r_pathconf;
	rp->r_symlink.contents = NULL;
	rp->r_secattr = NULL;
	rp->r_pathconf = NULL;
	mutex_exit(&rp->r_statelock);

	/* Drop the access cache entries. */
	purged = nfs_access_purge_rp(rp);

	/* Nothing beyond the access cache?  Report just that result. */
	if (!(HAVE_RDDIR_CACHE(rp) || linkbuf != NULL || acl != NULL ||
	    pcinfo != NULL))
		return (purged);

	/* Readdir cache entries. */
	if (HAVE_RDDIR_CACHE(rp))
		nfs_purge_rddir_cache(RTOV(rp));

	/* Cached symbolic link contents. */
	if (linkbuf != NULL)
		kmem_free((void *)linkbuf, linklen);

	/* Cached ACL. */
	if (acl != NULL)
		nfs_acl_free(acl);

	/* Cached pathconf information. */
	if (pcinfo != NULL)
		kmem_free(pcinfo, sizeof (*pcinfo));

	return (1);
}

/*
 * Walk the rnode freelist, with rpfreelist_lock held, and reclaim the
 * cached data of every rnode on it.  Returns 1 if
 * nfs_free_data_reclaim() returned non-zero for at least one rnode,
 * and 0 otherwise.
 */
static int
nfs_free_reclaim(void)
{
	rnode_t *rp;
	int freed = 0;

#ifdef DEBUG
	clstat_debug.f_reclaim.value.ui64++;
#endif
	mutex_enter(&rpfreelist_lock);
	if ((rp = rpfreelist) != NULL) {
		/* Follow r_freef until we are back at the list head. */
		for (;;) {
			if (nfs_free_data_reclaim(rp) != 0)
				freed = 1;
			rp = rp->r_freef;
			if (rp == rpfreelist)
				break;
		}
	}
	mutex_exit(&rpfreelist_lock);
	return (freed);
}

/*
 * Walk every bucket of the rnode hash table and reclaim the cached data
 * of each rnode found there.  Each bucket is traversed with its r_lock
 * held as a reader.  Returns 1 if nfs_active_data_reclaim() returned
 * non-zero for at least one rnode, and 0 otherwise.
 */
static int
nfs_active_reclaim(void)
{
	rnode_t *rp;
	int bucket;
	int freed = 0;

#ifdef DEBUG
	clstat_debug.a_reclaim.value.ui64++;
#endif
	for (bucket = 0; bucket < rtablesize; bucket++) {
		rw_enter(&rtable[bucket].r_lock, RW_READER);
		/* The chain ends when it points back at the bucket head. */
		rp = rtable[bucket].r_hashf;
		while (rp != (rnode_t *)(&rtable[bucket])) {
			if (nfs_active_data_reclaim(rp) != 0)
				freed = 1;
			rp = rp->r_hashf;
		}
		rw_exit(&rtable[bucket].r_lock);
	}
	return (freed);
}

/*
 * Most aggressive reclaim step: pull rnodes off the freelist one at a
 * time and hand them to rp_addfree() so that they get destroyed.
 *
 * rpfreelist_lock is dropped while each rnode is processed and retaken
 * before the freelist head is examined again.  A hashed rnode whose
 * vnode has picked up another reference in the meantime (v_count > 1)
 * is not destroyed; our reference is dropped with VN_RELE_LOCKED() and
 * the rnode is skipped.
 *
 * Returns 1 if at least one rnode was handed to rp_addfree(), and 0
 * otherwise.
 */
static int
nfs_rnode_reclaim(void)
{
	int freed;
	rnode_t *rp;
	vnode_t *vp;

#ifdef DEBUG
	clstat_debug.r_reclaim.value.ui64++;
#endif
	freed = 0;
	mutex_enter(&rpfreelist_lock);
	while ((rp = rpfreelist) != NULL) {
		rp_rmfree(rp);
		mutex_exit(&rpfreelist_lock);
		if (rp->r_flags & RHASHED) {
			vp = RTOV(rp);
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			if (vp->v_count > 1) {
				/*
				 * Someone else found the rnode through the
				 * hash table; leave it to them.
				 */
				VN_RELE_LOCKED(vp);
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				mutex_enter(&rpfreelist_lock);
				continue;
			}
			mutex_exit(&vp->v_lock);
			rp_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}
		/*
		 * This call to rp_addfree will end up destroying the
		 * rnode, but in a safe way with the appropriate set
		 * of checks done.
		 */
		rp_addfree(rp, CRED());
		/* Record the reclaim so the return value reflects it. */
		freed = 1;
		mutex_enter(&rpfreelist_lock);
	}
	mutex_exit(&rpfreelist_lock);
	return (freed);
}

/*
 * Reclaim entry point.  Tries progressively more aggressive steps and
 * stops at the first one that reports having freed something: first the
 * cached data of rnodes on the freelist, then the cached data of rnodes
 * in the hash table, and finally the freelist rnodes themselves.
 * "cdrarg" is not used.
 */
/*ARGSUSED*/
static void
nfs_reclaim(void *cdrarg)
{
#ifdef DEBUG
	clstat_debug.reclaim.value.ui64++;
#endif
	if (!nfs_free_reclaim() && !nfs_active_reclaim())
		(void) nfs_rnode_reclaim();
}

/*
 * NFS client failover support
 *
 * Routines to copy filehandles
 */
void
nfscopyfh(caddr_t fhp, vnode_t *vp)
{
	fhandle_t *target;

	/* A NULL destination means there is nothing to update. */
	if (fhp == NULL)
		return;

	/* Copy the vnode's file handle (VTOFH) into the caller's buffer. */
	target = (fhandle_t *)fhp;
	*target = *VTOFH(vp);
}

/*
 * Copy the vnode's nfs_fh3 file handle (VTOFH3) into the buffer at
 * "fhp".  Nothing is done when "fhp" is NULL.
 */
void
nfs3copyfh(caddr_t fhp, vnode_t *vp)
{
	nfs_fh3 *target;

	if (fhp == NULL)
		return;

	target = (nfs_fh3 *)fhp;
	*target = *VTOFH3(vp);
}

/*
 * NFS client failover support
 *
 * failover_safe() will test various conditions to ensure that
 * failover is permitted for this vnode.  It will be denied
 * if:
 *	1) the operation in progress does not support failover (NULL fi)
 *	2) there are no available replicas (NULL mi_servers->sv_next)
 *	3) any locks are outstanding on this file
 */
static int
failover_safe(failinfo_t *fi)
{
	/* The operation itself must support failover. */
	if (fi == NULL || fi->vp == NULL)
		return (0);

	/*
	 * There must be an alternate server to fail over to, and the
	 * rnode must have a partial path to look up on it.
	 *
	 * The check for outstanding remote locks,
	 * flk_has_remote_locks(fi->vp), is deliberately disabled
	 * because local locking has been forced.
	 */
	if (VTOMI(fi->vp)->mi_servers->sv_next == NULL ||
	    VTOR(fi->vp)->r_path == NULL)
		return (0);

	return (1);
}

#include <sys/thread.h>

/*
 * NFS client failover support
 *
 * failover_newserver() will start a search for a new server,
 * preferably by starting an async thread to do the work.  If
 * someone is already doing this (recognizable by MI_BINDINPROG
 * being set), it will simply return and the calling thread
 * will queue on the mi_failover_cv condition variable.
 */
static void
failover_newserver(mntinfo_t *mi)
{
	mutex_enter(&mi->mi_lock);

	/*
	 * Only the caller that finds MI_BINDINPROG clear starts a search;
	 * everyone else just returns.
	 */
	if (!(mi->mi_flags & MI_BINDINPROG)) {
		mi->mi_flags |= MI_BINDINPROG;

		/*
		 * Hold the vfs so that it cannot be released while the
		 * failover thread is selecting a new server.
		 */
		VFS_HOLD(mi->mi_vfsp);

		/* The thread does the actual searching. */
		(void) zthread_create(NULL, 0, failover_thread, mi, 0,
		    minclsyspri);
	}

	mutex_exit(&mi->mi_lock);
}

/*
 * NFS client failover support
 *
 * failover_thread() will find a new server to replace the one
 * currently in use, wake up other threads waiting on this mount
 * point, and die.  It will start at the head of the server list
 * and poll servers until it finds one with an NFS server which is
 * registered and responds to a NULL procedure ping.
 *
 * XXX failover_thread is unsafe within the scope of the
 * present model defined for cpr to suspend the system.
 * Specifically, over-the-wire calls made by the thread
 * are unsafe. The thread needs to be reevaluated in case of
 * future updates to the cpr suspend model.
 */
static void
failover_thread(mntinfo_t *mi)
{
	servinfo_t *svp = NULL;
	CLIENT *cl;
	enum clnt_stat status;
	struct timeval tv;
	int error;
	int oncethru = 0;
	callb_cpr_t cprinfo;
	rnode_t *rp;
	int index;
	/*
	 * srvnames and srvnames_len are only assigned when oncethru is
	 * set, which is why every later use is guarded by oncethru.
	 */
	char *srvnames;
	size_t srvnames_len;
	struct nfs_clnt *nfscl = NULL;
	zoneid_t zoneid = getzoneid();

#ifdef DEBUG
	/*
	 * This is currently only needed to access counters which exist on
	 * DEBUG kernels, hence we don't want to pay the penalty of the lookup
	 * on non-DEBUG kernels.
	 */
	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
	ASSERT(nfscl != NULL);
#endif

	/*
	 * It's safe to piggyback on the mi_lock since failover_newserver()
	 * code guarantees that there will be only one failover thread
	 * per mountinfo at any instance.
	 */
	CALLB_CPR_INIT(&cprinfo, &mi->mi_lock, callb_generic_cpr,
	    "failover_thread");

	/*
	 * Wait on mi_failover_cv until mi_readers drops to zero.  The
	 * wait itself is bracketed as CPR-safe.
	 */
	mutex_enter(&mi->mi_lock);
	while (mi->mi_readers) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(&mi->mi_failover_cv, &mi->mi_lock);
		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
	}
	mutex_exit(&mi->mi_lock);

	/* Timeout handed to each NULL procedure ping below: 2 seconds. */
	tv.tv_sec = 2;
	tv.tv_usec = 0;

	/*
	 * Ping the null NFS procedure of every server in
	 * the list until one responds.  We always start
	 * at the head of the list and always skip the one
	 * that is current, since it's caused us a problem.
	 */
	while (svp == NULL) {
		for (svp = mi->mi_servers; svp; svp = svp->sv_next) {
			/*
			 * The current server is only skipped until one
			 * full pass has failed (oncethru set); after that
			 * it is a candidate like any other.
			 */
			if (!oncethru && svp == mi->mi_curr_serv)
				continue;

			/*
			 * If the file system was forcibly umounted
			 * while trying to do a failover, then just
			 * give up on the failover.  It won't matter
			 * what the server is.
			 */
			if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
				svp = NULL;
				goto done;
			}

			error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
			    NFS_PROGRAM, NFS_VERSION, 0, 1, CRED(), &cl);
			/* No client handle for this server; try the next. */
			if (error)
				continue;

			/*
			 * cl_nosignal is set around the call unless the
			 * mount has MI_INT set.
			 */
			if (!(mi->mi_flags & MI_INT))
				cl->cl_nosignal = TRUE;
			status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
			    xdr_void, NULL, tv);
			if (!(mi->mi_flags & MI_INT))
				cl->cl_nosignal = FALSE;
			/* The handle was only needed for this one ping. */
			AUTH_DESTROY(cl->cl_auth);
			CLNT_DESTROY(cl);
			if (status == RPC_SUCCESS) {
				if (svp == mi->mi_curr_serv) {
#ifdef DEBUG
					zcmn_err(zoneid, CE_NOTE,
			"NFS%d: failing over: selecting original server %s",
					    mi->mi_vers, svp->sv_hostname);
#else
					zcmn_err(zoneid, CE_NOTE,
			"NFS: failing over: selecting original server %s",
					    svp->sv_hostname);
#endif
				} else {
#ifdef DEBUG
					zcmn_err(zoneid, CE_NOTE,
				    "NFS%d: failing over from %s to %s",
					    mi->mi_vers,
					    mi->mi_curr_serv->sv_hostname,
					    svp->sv_hostname);
#else
					zcmn_err(zoneid, CE_NOTE,
				    "NFS: failing over from %s to %s",
					    mi->mi_curr_serv->sv_hostname,
					    svp->sv_hostname);
#endif
				}
				/* svp is the server that answered. */
				break;
			}
		}

		/*
		 * The for loop ran off the end of the list: nobody
		 * answered on this pass.  Report it once, then sleep for
		 * hz ticks in a CPR-safe state before trying again.
		 */
		if (svp == NULL) {
			if (!oncethru) {
				srvnames = nfs_getsrvnames(mi, &srvnames_len);
#ifdef DEBUG
				zprintf(zoneid,
				    "NFS%d servers %s not responding "
				    "still trying\n", mi->mi_vers, srvnames);
#else
				zprintf(zoneid, "NFS servers %s not responding "
				    "still trying\n", srvnames);
#endif
				oncethru = 1;
			}
			mutex_enter(&mi->mi_lock);
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			mutex_exit(&mi->mi_lock);
			delay(hz);
			mutex_enter(&mi->mi_lock);
			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
			mutex_exit(&mi->mi_lock);
		}
	}

	/* Pair the earlier "not responding" message with an "ok". */
	if (oncethru) {
#ifdef DEBUG
		zprintf(zoneid, "NFS%d servers %s ok\n", mi->mi_vers, srvnames);
#else
		zprintf(zoneid, "NFS servers %s ok\n", srvnames);
#endif
	}

	/*
	 * Switching to a different server: purge the DNLC for this vfs,
	 * then take the rnode found under the old server's file handle
	 * and rehash it under the new server's file handle.
	 */
	if (svp != mi->mi_curr_serv) {
		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
		index = rtablehash(&mi->mi_curr_serv->sv_fhandle);
		rw_enter(&rtable[index].r_lock, RW_WRITER);
		rp = rfind(&rtable[index], &mi->mi_curr_serv->sv_fhandle,
		    mi->mi_vfsp);
		if (rp != NULL) {
			if (rp->r_flags & RHASHED)
				rp_rmhash_locked(rp);
			rw_exit(&rtable[index].r_lock);
			rp->r_server = svp;
			rp->r_fh = svp->sv_fhandle;
			/* Cached data belongs to the old server; drop it. */
			(void) nfs_free_data_reclaim(rp);
			index = rtablehash(&rp->r_fh);
			rp->r_hashq = &rtable[index];
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			vn_exists(RTOV(rp));
			rp_addhash(rp);
			rw_exit(&rp->r_hashq->r_lock);
			/*
			 * NOTE(review): presumably this drops a hold
			 * taken by rfind() -- confirm.
			 */
			VN_RELE(RTOV(rp));
		} else
			rw_exit(&rtable[index].r_lock);
	}

done:
	if (oncethru)
		kmem_free(srvnames, srvnames_len);
	/*
	 * Clear MI_BINDINPROG and wake everyone waiting on
	 * mi_failover_cv.  svp is NULL only when the search was abandoned
	 * (file system or zone gone), in which case the current server is
	 * left unchanged.
	 */
	mutex_enter(&mi->mi_lock);
	mi->mi_flags &= ~MI_BINDINPROG;
	if (svp != NULL) {
		mi->mi_curr_serv = svp;
		mi->mi_failover++;
#ifdef DEBUG
	nfscl->nfscl_stat.failover.value.ui64++;
#endif
	}
	cv_broadcast(&mi->mi_failover_cv);
	/*
	 * NOTE(review): there is no explicit mutex_exit() of mi_lock
	 * here; presumably CALLB_CPR_EXIT() releases it -- confirm.
	 */
	CALLB_CPR_EXIT(&cprinfo);
	/* Matches the VFS_HOLD() done by failover_newserver(). */
	VFS_RELE(mi->mi_vfsp);
	zthread_exit();
	/* NOTREACHED */
}

/*
 * NFS client failover support
 *
 * failover_wait() will put the thread to sleep until MI_BINDINPROG
 * is cleared, meaning that failover is complete.  Called with
 * mi_lock mutex held.
 */
static int
failover_wait(mntinfo_t *mi)
{
	k_sigset_t saved_mask;
	int woken;

	/*
	 * Sleep for as long as another thread is hunting for a living
	 * server.  When we wake up we may already be bound to the right
	 * server and get off cheaply.
	 */
	while (mi->mi_flags & MI_BINDINPROG) {
		/*
		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
		 * and SIGTERM (preserving the existing masks); SIGINT
		 * is masked too if the nointr mount option was given.
		 */
		sigintr(&saved_mask, (int)mi->mi_flags & MI_INT);
		woken = cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock);

		/* Restore the original signal mask on every path. */
		sigunintr(&saved_mask);

		/* A zero return means the wait was cut short by a signal. */
		if (!woken)
			return (EINTR);
	}
	return (0);
}

/*
 * NFS client failover support
 *
 * failover_remap() will do a partial pathname lookup and find the
 * desired vnode on the current server.  The interim vnode will be
 * discarded after we pilfer the new filehandle.
 *
 * Side effects:
 * - This routine will also update the filehandle in the args structure
 *    pointed to by the fi->fhp pointer if it is non-NULL.
 */

static int
failover_remap(failinfo_t *fi)
{
	vnode_t *vp, *nvp, *rootvp;
	rnode_t *rp, *nrp;
	mntinfo_t *mi;
	int error;
#ifdef DEBUG
	/* Only needed for the DEBUG-only remap counter bumped below. */
	struct nfs_clnt *nfscl;

	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
	ASSERT(nfscl != NULL);
#endif
	/*
	 * Sanity check
	 */
	if (fi == NULL || fi->vp == NULL || fi->lookupproc == NULL)
		return (EINVAL);
	vp = fi->vp;
	rp = VTOR(vp);
	mi = VTOMI(vp);

	/*
	 * A vnode flagged VROOT skips the lookup and rehash below; only
	 * the remap counters and the caller's copied filehandle are
	 * updated for it.
	 */
	if (!(vp->v_flag & VROOT)) {
		/*
		 * Given the root fh, use the path stored in
		 * the rnode to find the fh for the new server.
		 */
		error = VFS_ROOT(mi->mi_vfsp, &rootvp);
		if (error)
			return (error);

		error = failover_lookup(rp->r_path, rootvp,
		    fi->lookupproc, fi->xattrdirproc, &nvp);

		/* The root vnode was only needed as the lookup start. */
		VN_RELE(rootvp);

		if (error)
			return (error);

		/*
		 * If we found the same rnode, we're done now
		 */
		if (nvp == vp) {
			/*
			 * Failed and the new server may physically be same
			 * OR may share a same disk subsystem. In this case
			 * file handle for a particular file path is not going
			 * to change, given the same filehandle lookup will
			 * always locate the same rnode as the existing one.
			 * All we might need to do is to update the r_server
			 * with the current servinfo.
			 */
			if (!VALID_FH(fi)) {
				rp->r_server = mi->mi_curr_serv;
			}
			/* Drop the hold that came with the lookup result. */
			VN_RELE(nvp);
			return (0);
		}

		/*
		 * Try to make it so that no one else will find this
		 * vnode because it is just a temporary to hold the
		 * new file handle until that file handle can be
		 * copied to the original vnode/rnode.
		 */
		nrp = VTOR(nvp);
		mutex_enter(&mi->mi_remap_lock);
		/*
		 * Some other thread could have raced in here and could
		 * have done the remap for this particular rnode before
		 * this thread here. Check for rp->r_server and
		 * mi->mi_curr_serv and return if they are same.
		 */
		if (VALID_FH(fi)) {
			mutex_exit(&mi->mi_remap_lock);
			VN_RELE(nvp);
			return (0);
		}

		if (nrp->r_flags & RHASHED)
			rp_rmhash(nrp);

		/*
		 * As a heuristic check on the validity of the new
		 * file, check that the size and type match against
		 * that we remember from the old version.
		 */
		if (rp->r_size != nrp->r_size || vp->v_type != nvp->v_type) {
			mutex_exit(&mi->mi_remap_lock);
			zcmn_err(mi->mi_zone->zone_id, CE_WARN,
			    "NFS replicas %s and %s: file %s not same.",
			    rp->r_server->sv_hostname,
			    nrp->r_server->sv_hostname, rp->r_path);
			VN_RELE(nvp);
			return (EINVAL);
		}

		/*
		 * snarf the filehandle from the new rnode
		 * then release it, again while updating the
		 * hash queues for the rnode.
		 */
		if (rp->r_flags & RHASHED)
			rp_rmhash(rp);
		rp->r_server = mi->mi_curr_serv;
		rp->r_fh = nrp->r_fh;
		/* The old rnode takes over the temporary's hash queue. */
		rp->r_hashq = nrp->r_hashq;
		/*
		 * Copy the attributes from the new rnode to the old
		 * rnode.  This will help to reduce unnecessary page
		 * cache flushes.
		 */
		rp->r_attr = nrp->r_attr;
		rp->r_attrtime = nrp->r_attrtime;
		rp->r_mtime = nrp->r_mtime;
		/* Cached data still refers to the old server; drop it. */
		(void) nfs_free_data_reclaim(rp);
		nfs_setswaplike(vp, &rp->r_attr);
		rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
		rp_addhash(rp);
		rw_exit(&rp->r_hashq->r_lock);
		mutex_exit(&mi->mi_remap_lock);
		/* Done with the temporary vnode. */
		VN_RELE(nvp);
	}

	/*
	 * Update successful failover remap count
	 */
	mutex_enter(&mi->mi_lock);
	mi->mi_remap++;
	mutex_exit(&mi->mi_lock);
#ifdef DEBUG
	nfscl->nfscl_stat.remap.value.ui64++;
#endif

	/*
	 * If we have a copied filehandle to update, do it now.
	 */
	if (fi->fhp != NULL && fi->copyproc != NULL)
		(*fi->copyproc)(fi->fhp, vp);

	return (0);
}

/*
 * NFS client failover support
 *
 * We want a simple pathname lookup routine to parse the pieces
 * of path in rp->r_path.  We know that the path was created
 * as rnodes were made, so we know we have only to deal with
 * paths that look like:
 *	dir1/dir2/dir3/file
 * Any evidence of anything like .., symlinks, and ENOTDIR
 * are hard errors, because they mean something in this filesystem
 * is different from the one we came from, or has changed under
 * us in some way.  If this is true, we want the failure.
 *
 * Extended attributes: if the filesystem is mounted with extended
 * attributes enabled (-o xattr), the attribute directory will be
 * represented in the r_path as the magic name XATTR_RPATH. So if
 * we see that name in the pathname, it must be because this node
 * is an extended attribute.  Therefore, look it up that way.
 */
static int
failover_lookup(char *path, vnode_t *root,
    int (*lookupproc)(vnode_t *, char *, vnode_t **, struct pathname *, int,
    vnode_t *, cred_t *, int),
    int (*xattrdirproc)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
    vnode_t **new)
{
	vnode_t *dvp, *nvp;
	int error;
	char *s, *p, *tmppath;
	size_t len;
	mntinfo_t *mi;
	bool_t xattr;

	/*
	 * Work on a private copy of the path: each '/' is temporarily
	 * overwritten with a NUL so that one component at a time can be
	 * handed to the lookup routine.
	 */
	len = strlen(path) + 1;
	tmppath = kmem_alloc(len, KM_SLEEP);
	(void) strcpy(tmppath, path);
	s = tmppath;

	/*
	 * dvp is the directory the next component is looked up in.  It
	 * carries its own hold, which is dropped as soon as we step past
	 * it, so the caller's hold on root is left untouched.
	 */
	dvp = root;
	VN_HOLD(dvp);
	mi = VTOMI(root);
	xattr = mi->mi_flags & MI_EXTATTR;

	do {
		p = strchr(s, '/');
		if (p != NULL)
			*p = '\0';
		/*
		 * With extended attributes enabled, the magic component
		 * XATTR_RPATH stands for the attribute directory and is
		 * resolved through xattrdirproc instead of a name lookup.
		 */
		if (xattr && strcmp(s, XATTR_RPATH) == 0) {
			error = (*xattrdirproc)(dvp, &nvp, FALSE, CRED(),
			    RFSCALL_SOFT);
		} else {
			error = (*lookupproc)(dvp, s, &nvp, NULL, 0, NULL,
			    CRED(), RFSCALL_SOFT);
		}
		/* Put the separator back and step to the next component. */
		if (p != NULL)
			*p++ = '/';
		if (error) {
			VN_RELE(dvp);
			kmem_free(tmppath, len);
			return (error);
		}
		s = p;
		VN_RELE(dvp);
		dvp = nvp;
	} while (p != NULL);

	/*
	 * The final vnode comes back held.  Pass that hold on to the
	 * caller, or drop it when the caller did not ask for the vnode,
	 * so that it is not leaked.
	 */
	if (nvp != NULL) {
		if (new != NULL)
			*new = nvp;
		else
			VN_RELE(nvp);
	}
	kmem_free(tmppath, len);
	return (0);
}

/*
 * NFS client failover support
 *
 * sv_free() frees the malloc'd portion of a "servinfo_t".
 */
void
sv_free(servinfo_t *svp)
{
	struct knetconfig *confs[2];
	servinfo_t *cur, *following;
	int i;

	/* Walk the whole sv_next chain, freeing each element in turn. */
	for (cur = svp; cur != NULL; cur = following) {
		/* Remember the successor before cur is freed. */
		following = cur->sv_next;

		if (cur->sv_secdata)
			sec_clnt_freeinfo(cur->sv_secdata);

		if (cur->sv_hostname && cur->sv_hostnamelen > 0)
			kmem_free(cur->sv_hostname, cur->sv_hostnamelen);

		/*
		 * The current and the original knetconfig are torn down
		 * the same way: both strings are KNC_STRSIZE bytes.
		 */
		confs[0] = cur->sv_knconf;
		confs[1] = cur->sv_origknconf;
		for (i = 0; i < 2; i++) {
			if (confs[i] == NULL)
				continue;
			if (confs[i]->knc_protofmly != NULL)
				kmem_free(confs[i]->knc_protofmly,
				    KNC_STRSIZE);
			if (confs[i]->knc_proto != NULL)
				kmem_free(confs[i]->knc_proto, KNC_STRSIZE);
			kmem_free(confs[i], sizeof (*confs[i]));
		}

		if (cur->sv_addr.buf != NULL && cur->sv_addr.maxlen != 0)
			kmem_free(cur->sv_addr.buf, cur->sv_addr.maxlen);

		mutex_destroy(&cur->sv_lock);
		kmem_free(cur, sizeof (*cur));
	}
}

/*
 * Only can return non-zero if intr != 0.
 */
int
nfs_rw_enter_sig(nfs_rwlock_t *l, krw_t rw, int intr)
{

	/*
	 * State encoding, as used throughout this function:
	 *	count > 0	that many readers are active
	 *	count < 0	a writer is active; the magnitude is its
	 *			nesting depth
	 *	waiters		writers currently waiting
	 *	owner		thread holding the lock for writing
	 *
	 * Returns 0 once the lock has been acquired, or EINTR when intr
	 * is non-zero and the wait was interrupted by a signal.
	 */
	mutex_enter(&l->lock);

	/*
	 * If this is a nested enter, then allow it.  There
	 * must be as many exits as enters through.
	 */
	if (l->owner == curthread) {
		/* lock is held for writing by current thread */
		ASSERT(rw == RW_READER || rw == RW_WRITER);
		l->count--;
	} else if (rw == RW_READER) {
		/*
		 * While there is a writer active or writers waiting,
		 * then wait for them to finish up and move on.  Then,
		 * increment the count to indicate that a reader is
		 * active.
		 */
		while (l->count < 0 || l->waiters > 0) {
			if (intr) {
				klwp_t *lwp = ttolwp(curthread);

				/*
				 * lwp_nostop is raised for the duration of
				 * the interruptible wait and lowered again
				 * on every way out of it.
				 */
				if (lwp != NULL)
					lwp->lwp_nostop++;
				if (cv_wait_sig(&l->cv_rd, &l->lock) == 0) {
					if (lwp != NULL)
						lwp->lwp_nostop--;
					mutex_exit(&l->lock);
					return (EINTR);
				}
				if (lwp != NULL)
					lwp->lwp_nostop--;
			} else
				cv_wait(&l->cv_rd, &l->lock);
		}
		ASSERT(l->count < INT_MAX);
#ifdef	DEBUG
		/*
		 * Warn about suspiciously large reader counts.  The
		 * address reported is that of the lock itself.
		 */
		if ((l->count % 10000) == 9999)
			cmn_err(CE_WARN, "nfs_rw_enter_sig: count %d on "
			    "rwlock @ %p\n", l->count, (void *)l);
#endif
		l->count++;
	} else {
		ASSERT(rw == RW_WRITER);
		/*
		 * While there are readers active or a writer
		 * active, then wait for all of the readers
		 * to finish or for the writer to finish.
		 * Then, set the owner field to curthread and
		 * decrement count to indicate that a writer
		 * is active.
		 */
		while (l->count != 0) {
			l->waiters++;
			if (intr) {
				klwp_t *lwp = ttolwp(curthread);

				if (lwp != NULL)
					lwp->lwp_nostop++;
				if (cv_wait_sig(&l->cv, &l->lock) == 0) {
					if (lwp != NULL)
						lwp->lwp_nostop--;
					l->waiters--;
					/*
					 * If there are readers active and no
					 * writers waiting then wake up all of
					 * the waiting readers (if any).
					 */
					if (l->count > 0 && l->waiters == 0)
						cv_broadcast(&l->cv_rd);
					mutex_exit(&l->lock);
					return (EINTR);
				}
				if (lwp != NULL)
					lwp->lwp_nostop--;
			} else
				cv_wait(&l->cv, &l->lock);
			l->waiters--;
		}
		ASSERT(l->owner == NULL);
		l->owner = curthread;
		l->count--;
	}

	mutex_exit(&l->lock);

	return (0);
}

/*
 * If the lock is available, obtain it and return non-zero.  If there is
 * already a conflicting lock, return 0 immediately.
 */

int
nfs_rw_tryenter(nfs_rwlock_t *l, krw_t rw)
{
	int granted = 1;

	mutex_enter(&l->lock);

	if (l->owner == curthread) {
		/*
		 * Nested enter by the thread that already holds the lock
		 * for writing: always allowed, and it must be matched by
		 * an extra exit.
		 */
		ASSERT(rw == RW_READER || rw == RW_WRITER);
		l->count--;
	} else if (rw == RW_READER) {
		/*
		 * A reader gets in only when no writer is active and no
		 * writer is waiting.
		 */
		if (l->count >= 0 && l->waiters <= 0)
			l->count++;
		else
			granted = 0;
	} else {
		ASSERT(rw == RW_WRITER);
		/*
		 * A writer gets in only when the lock is completely idle;
		 * it then becomes the owner and count goes negative.
		 */
		if (l->count == 0) {
			ASSERT(l->owner == NULL);
			l->owner = curthread;
			l->count--;
		} else
			granted = 0;
	}

	mutex_exit(&l->lock);

	return (granted);
}

/*
 * Release one hold on an nfs_rwlock_t.  Whether it is a reader or a
 * writer hold is decided by whether the lock currently has an owner.
 */
void
nfs_rw_exit(nfs_rwlock_t *l)
{
	mutex_enter(&l->lock);

	if (l->owner == NULL) {
		/* Reader release: one less active reader. */
		ASSERT(l->count > 0);
		l->count--;
	} else {
		ASSERT(l->owner == curthread);

		/*
		 * Writer release: count moves one step back towards
		 * zero.  Only when the outermost of possibly nested
		 * writer holds is released does the lock lose its owner.
		 */
		ASSERT(l->count < 0);
		l->count++;
		if (l->count == 0) {
			l->owner = NULL;

			/*
			 * With no writers waiting, let all waiting
			 * readers (if any) proceed.
			 */
			if (l->waiters == 0)
				cv_broadcast(&l->cv_rd);
		}
	}

	/* The lock is now idle; hand it to one waiting writer, if any. */
	if (l->count == 0 && l->waiters > 0)
		cv_signal(&l->cv);
	mutex_exit(&l->lock);
}

/*
 * Report whether the lock is currently held in the given mode:
 * for RW_READER, whether any reader is active (count > 0); for
 * RW_WRITER, whether a writer is active (count < 0).  The count is
 * read without taking the lock's mutex.
 */
int
nfs_rw_lock_held(nfs_rwlock_t *l, krw_t rw)
{
	if (rw != RW_READER) {
		ASSERT(rw == RW_WRITER);
		return (l->count < 0);
	}
	return (l->count > 0);
}

/*
 * Initialize an nfs_rwlock_t to the idle state: no owner, no active
 * readers or writer, no waiting writers.  "name", "type" and "arg" are
 * accepted but not used.
 */
/* ARGSUSED */
void
nfs_rw_init(nfs_rwlock_t *l, char *name, krw_type_t type, void *arg)
{
	mutex_init(&l->lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&l->cv, NULL, CV_DEFAULT, NULL);
	cv_init(&l->cv_rd, NULL, CV_DEFAULT, NULL);

	l->owner = NULL;
	l->waiters = 0;
	l->count = 0;
}

/*
 * Destroy the mutex and both condition variables embedded in an
 * nfs_rwlock_t.
 */
void
nfs_rw_destroy(nfs_rwlock_t *l)
{
	cv_destroy(&l->cv_rd);
	cv_destroy(&l->cv);
	mutex_destroy(&l->lock);
}

/*
 * Three-way comparison of two rddir_cache entries: ordered by
 * nfs3_cookie first and by buflen second.  Returns -1, 0 or 1.
 */
int
nfs3_rddir_compar(const void *x, const void *y)
{
	const rddir_cache *lhs = x;
	const rddir_cache *rhs = y;

	if (lhs->nfs3_cookie < rhs->nfs3_cookie)
		return (-1);
	if (lhs->nfs3_cookie > rhs->nfs3_cookie)
		return (1);

	/* Same cookie: the buffer length breaks the tie. */
	if (lhs->buflen < rhs->buflen)
		return (-1);
	if (lhs->buflen > rhs->buflen)
		return (1);

	return (0);
}

/*
 * Three-way comparison of two rddir_cache entries: ordered by
 * nfs_cookie first and by buflen second.  Returns -1, 0 or 1.
 */
int
nfs_rddir_compar(const void *x, const void *y)
{
	const rddir_cache *lhs = x;
	const rddir_cache *rhs = y;

	if (lhs->nfs_cookie < rhs->nfs_cookie)
		return (-1);
	if (lhs->nfs_cookie > rhs->nfs_cookie)
		return (1);

	/* Same cookie: the buffer length breaks the tie. */
	if (lhs->buflen < rhs->buflen)
		return (-1);
	if (lhs->buflen > rhs->buflen)
		return (1);

	return (0);
}

/*
 * Build a comma-separated list of the host names of all servers on
 * mi->mi_servers.
 *
 * Returns a kmem_alloc()ed string; the size of the allocation is
 * stored in *len so that the caller can kmem_free() it.
 *
 * NOTE(review): the arithmetic below only works if sv_hostnamelen
 * counts the terminating NUL of sv_hostname -- confirm.
 */
static char *
nfs_getsrvnames(mntinfo_t *mi, size_t *len)
{
	servinfo_t *sv;
	char *list;
	char *cursor;
	size_t total = 0;

	/*
	 * Each name needs its own length plus one byte for whatever
	 * follows it: a comma, or the NUL after the last name.
	 */
	sv = mi->mi_servers;
	while (sv != NULL) {
		total += sv->sv_hostnamelen;
		sv = sv->sv_next;
	}

	list = kmem_alloc(total, KM_SLEEP);

	cursor = list;
	for (sv = mi->mi_servers; sv != NULL; sv = sv->sv_next) {
		(void) strcpy(cursor, sv->sv_hostname);
		/* Step onto the copied NUL and turn it into a comma. */
		cursor += sv->sv_hostnamelen - 1;
		*cursor = ',';
		cursor++;
	}

	/* The trailing comma becomes the string terminator. */
	cursor--;
	*cursor = '\0';

	*len = total;

	return (list);
}

/*
 * These two functions are temporary and designed for the upgrade-workaround
 * only.  They cannot be used for general zone-crossing NFS client support, and
 * will be removed shortly.
 *
 * When the workaround is enabled, all NFS traffic is forced into the global
 * zone.  These functions are called when the code needs to refer to the state
 * of the underlying network connection.  They're not called when the function
 * needs to refer to the state of the process that invoked the system call.
 * (E.g., when checking whether the zone is shutting down during the mount()
 * call.)
 */

struct zone *
nfs_zone(void)
{
	/* With the workaround enabled everything maps to the global zone. */
	if (nfs_global_client_only != 0)
		return (global_zone);

	return (curproc->p_zone);
}

/*
 * Zone id counterpart of nfs_zone(): GLOBAL_ZONEID when
 * nfs_global_client_only is set, otherwise the result of getzoneid().
 */
zoneid_t
nfs_zoneid(void)
{
	if (nfs_global_client_only != 0)
		return (GLOBAL_ZONEID);

	return (getzoneid());
}

/*
 * nfs_mount_label_policy:
 *	Determine whether the mount is allowed according to MAC check,
 *	by comparing (where appropriate) label of the remote server
 *	against the label of the zone being mounted into.
 *
 *	Returns:
 *		 0 :	access allowed
 *		-1 :	read-only access allowed (i.e., read-down)
 *		>0 :	error code, such as EACCES
 */
int
nfs_mount_label_policy(vfs_t *vfsp, struct netbuf *addr,
    struct knetconfig *knconf, cred_t *cr)
{
	int		addr_type;
	void		*ipaddr;
	bslabel_t	*server_sl, *mntlabel;
	zone_t		*mntzone = NULL;
	ts_label_t	*zlabel;
	tsol_tpc_t	*tp;
	ts_label_t	*tsl = NULL;
	int		retv;

	/*
	 * Get the zone's label.  Each zone on a labeled system has a label.
	 *
	 * NOTE(review): mntzone is dereferenced right here, yet it is
	 * tested against NULL at the "out" label below.  Either that
	 * test is redundant or this dereference is unsafe -- confirm
	 * whether zone_find_by_any_path() can return NULL.
	 */
	mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
	zlabel = mntzone->zone_slabel;
	ASSERT(zlabel != NULL);
	/* Held for the rest of the function; released at "out". */
	label_hold(zlabel);

	/*
	 * Pick the server's IP address out of addr according to the
	 * protocol family.  Any other family is allowed without a check.
	 */
	if (strcmp(knconf->knc_protofmly, NC_INET) == 0) {
		addr_type = IPV4_VERSION;
		ipaddr = &((struct sockaddr_in *)addr->buf)->sin_addr;
	} else if (strcmp(knconf->knc_protofmly, NC_INET6) == 0) {
		addr_type = IPV6_VERSION;
		ipaddr = &((struct sockaddr_in6 *)addr->buf)->sin6_addr;
	} else {
		retv = 0;
		goto out;
	}

	retv = EACCES;				/* assume the worst */

	/*
	 * Next, get the assigned label of the remote server.
	 */
	tp = find_tpc(ipaddr, addr_type, B_FALSE);
	if (tp == NULL)
		goto out;			/* error getting host entry */

	/* From here on tp must be released, hence rel_tpc. */
	if (tp->tpc_tp.tp_doi != zlabel->tsl_doi)
		goto rel_tpc;			/* invalid domain */
	if ((tp->tpc_tp.host_type != SUN_CIPSO) &&
	    (tp->tpc_tp.host_type != UNLABELED))
		goto rel_tpc;			/* invalid hosttype */

	if (tp->tpc_tp.host_type == SUN_CIPSO) {
		tsl = getflabel_cipso(vfsp);
		if (tsl == NULL)
			goto rel_tpc;		/* error getting server lbl */

		server_sl = label2bslabel(tsl);
	} else {	/* UNLABELED */
		server_sl = &tp->tpc_tp.tp_def_label;
	}

	mntlabel = label2bslabel(zlabel);

	/*
	 * Now compare labels to complete the MAC check.  If the labels
	 * are equal or if the requestor is in the global zone and has
	 * NET_MAC_AWARE, then allow read-write access.   (Except for
	 * mounts into the global zone itself; restrict these to
	 * read-only.)
	 *
	 * If the requestor is in some other zone, but their label
	 * dominates the server, then allow read-down.
	 *
	 * Otherwise, access is denied.
	 */
	if (blequal(mntlabel, server_sl) ||
	    (crgetzoneid(cr) == GLOBAL_ZONEID &&
	    getpflags(NET_MAC_AWARE, cr) != 0)) {
		if ((mntzone == global_zone) ||
		    !blequal(mntlabel, server_sl))
			retv = -1;		/* read-only */
		else
			retv = 0;		/* access OK */
	} else if (bldominates(mntlabel, server_sl)) {
		retv = -1;			/* read-only */
	} else {
		retv = EACCES;
	}

	/* tsl is only non-NULL on the SUN_CIPSO path. */
	if (tsl != NULL)
		label_rele(tsl);

rel_tpc:
	TPC_RELE(tp);
out:
	if (mntzone)
		zone_rele(mntzone);
	label_rele(zlabel);
	return (retv);
}

/*
 * Report whether the current process's session has a vnode attached
 * (p_sessp->s_vp != NULL).  The session is examined while holding
 * curproc->p_splock.
 */
boolean_t
nfs_has_ctty(void)
{
	boolean_t has_tty;

	mutex_enter(&curproc->p_splock);
	has_tty = (curproc->p_sessp->s_vp != NULL);
	mutex_exit(&curproc->p_splock);

	return (has_tty);
}

/*
 * Check the xattr directory to see if it has any generic user attributes
 */
int
do_xattr_exists_check(vnode_t *vp, ulong_t *valp, cred_t *cr)
{
	size_t buflen = 8 * 1024;
	size_t used;
	char *buf;
	struct dirent64 *dp;
	struct iovec iov;
	struct uio uio;
	int eof = 0;
	int error;

	/* Until an attribute is found the answer is "none". */
	*valp = 0;
	buf = kmem_alloc(buflen, KM_SLEEP);

	/* Describe a single kernel-space buffer for the readdir. */
	iov.iov_base = buf;
	iov.iov_len = buflen;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_fmode = 0;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = 0;
	uio.uio_resid = buflen;

	/* One readdir call, done under the vnode's read lock. */
	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
	error = VOP_READDIR(vp, &uio, cr, &eof, NULL, 0);
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);

	/* Number of bytes of directory entries actually returned. */
	used = buflen - uio.uio_resid;

	if (!error && used != 0) {
		/*
		 * Step through the entries by d_reclen.  ".", ".." and
		 * the two view names do not count; any other name means
		 * a generic attribute exists.
		 */
		for (dp = (dirent64_t *)buf;
		    (intptr_t)dp < (intptr_t)buf + used;
		    dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen)) {
			if (strcmp(dp->d_name, ".") == 0 ||
			    strcmp(dp->d_name, "..") == 0 ||
			    strcmp(dp->d_name, VIEW_READWRITE) == 0 ||
			    strcmp(dp->d_name, VIEW_READONLY) == 0)
				continue;

			*valp = 1;
			break;
		}
	}

	kmem_free(buf, buflen);
	return (error);
}