/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */


#include <nfs/nfs4_clnt.h>
#include <nfs/rnode4.h>
#include <sys/systm.h>
#include <sys/cmn_err.h>
#include <sys/atomic.h>

static void	nfs4_free_open_owner(nfs4_open_owner_t *, mntinfo4_t *);
static nfs4_open_owner_t *find_freed_open_owner(cred_t *,
				nfs4_oo_hash_bucket_t *, mntinfo4_t *);
static open_delegation_type4 get_dtype(rnode4_t *);

#ifdef DEBUG
int nfs4_client_foo_debug = 0x0;
int nfs4_client_open_dg = 0x0;
/*
 * If this is non-zero, the lockowner and openowner seqid sync primitives
 * will intermittently return errors.
 */
static int seqid_sync_faults = 0;
#endif

stateid4 clnt_special0 = {
	0,
	{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
};

stateid4 clnt_special1 = {
	0xffffffff,
	{
		(char)0xff, (char)0xff, (char)0xff, (char)0xff,
		(char)0xff, (char)0xff, (char)0xff, (char)0xff,
		(char)0xff, (char)0xff, (char)0xff, (char)0xff
	}
};

/* finds hash bucket and locks it */
static nfs4_oo_hash_bucket_t *
lock_bucket(cred_t *cr, mntinfo4_t *mi)
{
	nfs4_oo_hash_bucket_t *bucketp;
	uint32_t hash_key;

	hash_key = (uint32_t)(crgetuid(cr) + crgetruid(cr))
	    % NFS4_NUM_OO_BUCKETS;
	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "lock_bucket: "
	    "hash_key %d for cred %p", hash_key, (void*)cr));

	ASSERT(hash_key >= 0 && hash_key < NFS4_NUM_OO_BUCKETS);
	ASSERT(mi != NULL);
	ASSERT(mutex_owned(&mi->mi_lock));

	bucketp = &(mi->mi_oo_list[hash_key]);
	mutex_enter(&bucketp->b_lock);
	return (bucketp);
}

/* unlocks hash bucket pointed by bucket_ptr */
static void
unlock_bucket(nfs4_oo_hash_bucket_t *bucketp)
{
	mutex_exit(&bucketp->b_lock);
}

/*
 * Removes the lock owner from the rnode's lock_owners list and frees the
 * corresponding reference.
 */
void
nfs4_rnode_remove_lock_owner(rnode4_t *rp, nfs4_lock_owner_t *lop)
{
	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
	    "nfs4_rnode_remove_lock_owner"));

	mutex_enter(&rp->r_statev4_lock);

	if (lop->lo_next_rnode == NULL) {
		/* already removed from list */
		mutex_exit(&rp->r_statev4_lock);
		return;
	}

	ASSERT(lop->lo_prev_rnode != NULL);

	lop->lo_prev_rnode->lo_next_rnode = lop->lo_next_rnode;
	lop->lo_next_rnode->lo_prev_rnode = lop->lo_prev_rnode;

	lop->lo_next_rnode = lop->lo_prev_rnode = NULL;

	mutex_exit(&rp->r_statev4_lock);

	/*
	 * This would be an appropriate place for
	 * RELEASE_LOCKOWNER.  For now, this is overkill
	 * because in the common case, close is going to
	 * release any lockowners anyway.
	 */
	lock_owner_rele(lop);
}

/*
 * Remove all lock owners from the rnode's lock_owners list.  Frees up
 * their references from the list.
 */

void
nfs4_flush_lock_owners(rnode4_t *rp)
{
	nfs4_lock_owner_t *lop;

	mutex_enter(&rp->r_statev4_lock);
	while (rp->r_lo_head.lo_next_rnode != &rp->r_lo_head) {
		lop = rp->r_lo_head.lo_next_rnode;
		lop->lo_prev_rnode->lo_next_rnode = lop->lo_next_rnode;
		lop->lo_next_rnode->lo_prev_rnode = lop->lo_prev_rnode;
		lop->lo_next_rnode = lop->lo_prev_rnode = NULL;
		lock_owner_rele(lop);
	}
	mutex_exit(&rp->r_statev4_lock);
}

void
nfs4_clear_open_streams(rnode4_t *rp)
{
	nfs4_open_stream_t *osp;

	mutex_enter(&rp->r_os_lock);
	while ((osp = list_head(&rp->r_open_streams)) != NULL) {
		open_owner_rele(osp->os_open_owner);
		list_remove(&rp->r_open_streams, osp);
		mutex_destroy(&osp->os_sync_lock);
		osp->os_open_owner = NULL;
		kmem_free(osp, sizeof (*osp));
	}
	mutex_exit(&rp->r_os_lock);
}

void
open_owner_hold(nfs4_open_owner_t *oop)
{
	mutex_enter(&oop->oo_lock);
	oop->oo_ref_count++;
	mutex_exit(&oop->oo_lock);
}

/*
 * Frees the open owner if the ref count hits zero.
 */
void
open_owner_rele(nfs4_open_owner_t *oop)
{
	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
	    "open_owner_rele"));

	mutex_enter(&oop->oo_lock);
	oop->oo_ref_count--;
	if (oop->oo_ref_count == 0) {
		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
		    "open_owner_rele: freeing open owner"));
		oop->oo_valid = 0;
		mutex_exit(&oop->oo_lock);
		/*
		 * Ok, we don't destroy the open owner, nor do we put it on
		 * the mntinfo4's free list just yet.  We are lazy about it
		 * and let callers to find_open_owner() do that to keep locking
		 * simple.
		 */
	} else {
		mutex_exit(&oop->oo_lock);
	}
}

void
open_stream_hold(nfs4_open_stream_t *osp)
{
	mutex_enter(&osp->os_sync_lock);
	osp->os_ref_count++;
	mutex_exit(&osp->os_sync_lock);
}

/*
 * Frees the open stream and removes it from the rnode4's open streams list if
 * the ref count drops to zero.
 */
void
open_stream_rele(nfs4_open_stream_t *osp, rnode4_t *rp)
{
	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
	    "open_stream_rele"));

	ASSERT(!mutex_owned(&rp->r_os_lock));

	mutex_enter(&osp->os_sync_lock);
	ASSERT(osp->os_ref_count > 0);
	osp->os_ref_count--;
	if (osp->os_ref_count == 0) {
		nfs4_open_owner_t *tmp_oop;

		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
		    "open_stream_rele: freeing open stream"));
		osp->os_valid = 0;
		tmp_oop = osp->os_open_owner;
		mutex_exit(&osp->os_sync_lock);

		/* now see if we need to destroy the open owner */
		open_owner_rele(tmp_oop);

		mutex_enter(&rp->r_os_lock);
		list_remove(&rp->r_open_streams, osp);
		mutex_exit(&rp->r_os_lock);

		/* free up osp */
		mutex_destroy(&osp->os_sync_lock);
		osp->os_open_owner = NULL;
		kmem_free(osp, sizeof (*osp));
	} else {
		mutex_exit(&osp->os_sync_lock);
	}
}

void
lock_owner_hold(nfs4_lock_owner_t *lop)
{
	mutex_enter(&lop->lo_lock);
	lop->lo_ref_count++;
	mutex_exit(&lop->lo_lock);
}

/*
 * Frees the lock owner if the ref count hits zero and
 * the structure no longer has no locks.
 */
void
lock_owner_rele(nfs4_lock_owner_t *lop)
{
	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
	    "lock_owner_rele"));

	mutex_enter(&lop->lo_lock);
	lop->lo_ref_count--;
	if (lop->lo_ref_count == 0) {
		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
		    "lock_owner_rele: freeing lock owner: "
		    "%x", lop->lo_pid));
		lop->lo_valid = 0;
		/*
		 * If there are no references, the lock_owner should
		 * already be off the rnode's list.
		 */
		ASSERT(lop->lo_next_rnode == NULL);
		ASSERT(lop->lo_prev_rnode == NULL);
		ASSERT(!(lop->lo_flags & NFS4_LOCK_SEQID_INUSE));
		ASSERT(lop->lo_seqid_holder == NULL);
		mutex_exit(&lop->lo_lock);

		/* free up lop */
		cv_destroy(&lop->lo_cv_seqid_sync);
		mutex_destroy(&lop->lo_lock);
		kmem_free(lop, sizeof (*lop));
	} else {
		mutex_exit(&lop->lo_lock);
	}
}

/*
 * This increments the open owner ref count if found.
 * The argument 'just_created' determines whether we are looking for open
 * owners with the 'oo_just_created' flag set or not.
 */
nfs4_open_owner_t *
find_open_owner_nolock(cred_t *cr, int just_created, mntinfo4_t *mi)
{
	nfs4_open_owner_t	*oop = NULL, *next_oop;
	nfs4_oo_hash_bucket_t	*bucketp;

	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
	    "find_open_owner: cred %p, just_created %d",
	    (void*)cr, just_created));

	ASSERT(mi != NULL);
	ASSERT(mutex_owned(&mi->mi_lock));

	bucketp = lock_bucket(cr, mi);

	/* got hash bucket, search through open owners */
	for (oop = list_head(&bucketp->b_oo_hash_list); oop != NULL; ) {
		mutex_enter(&oop->oo_lock);
		if (!crcmp(oop->oo_cred, cr) &&
		    (oop->oo_just_created == just_created ||
		    just_created == NFS4_JUST_CREATED)) {
			/* match */
			if (oop->oo_valid == 0) {
				/* reactivate the open owner */
				oop->oo_valid = 1;
				ASSERT(oop->oo_ref_count == 0);
			}
			oop->oo_ref_count++;
			mutex_exit(&oop->oo_lock);
			unlock_bucket(bucketp);
			return (oop);
		}
		next_oop = list_next(&bucketp->b_oo_hash_list, oop);
		if (oop->oo_valid == 0) {
			list_remove(&bucketp->b_oo_hash_list, oop);

			/*
			 * Now we go ahead and put this open owner
			 * on the freed list.  This is our lazy method.
			 */
			nfs4_free_open_owner(oop, mi);
		}

		mutex_exit(&oop->oo_lock);
		oop = next_oop;
	}

	/* search through recently freed open owners */
	oop = find_freed_open_owner(cr, bucketp, mi);

	unlock_bucket(bucketp);

	return (oop);
}

nfs4_open_owner_t *
find_open_owner(cred_t *cr, int just_created, mntinfo4_t *mi)
{
	nfs4_open_owner_t *oop;

	mutex_enter(&mi->mi_lock);
	oop = find_open_owner_nolock(cr, just_created, mi);
	mutex_exit(&mi->mi_lock);

	return (oop);
}

/*
 * This increments osp's ref count if found.
 * Returns with 'os_sync_lock' held.
 */
nfs4_open_stream_t *
find_open_stream(nfs4_open_owner_t *oop, rnode4_t *rp)
{
	nfs4_open_stream_t	*osp;

	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
	    "find_open_stream"));

	mutex_enter(&rp->r_os_lock);
	/* Now, no one can add or delete to rp's open streams list */
	for (osp = list_head(&rp->r_open_streams); osp != NULL;
	    osp = list_next(&rp->r_open_streams, osp)) {
		mutex_enter(&osp->os_sync_lock);
		if (osp->os_open_owner == oop && osp->os_valid != 0) {
			/* match */
			NFS4_DEBUG(nfs4_client_state_debug,
			    (CE_NOTE, "find_open_stream "
			    "got a match"));

			osp->os_ref_count++;
			mutex_exit(&rp->r_os_lock);
			return (osp);
		}
		mutex_exit(&osp->os_sync_lock);
	}

	mutex_exit(&rp->r_os_lock);
	return (NULL);
}

/*
 * Find the lock owner for the given file and process ID.  If "which" is
 * LOWN_VALID_STATEID, require that the lock owner contain a valid stateid
 * from the server.
 *
 * This increments the lock owner's ref count if found.  Returns NULL if
 * there was no match.
 */
nfs4_lock_owner_t *
find_lock_owner(rnode4_t *rp, pid_t pid, lown_which_t which)
{
	nfs4_lock_owner_t	*lop, *next_lop;

	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
	    "find_lock_owner: pid %x, which %d", pid, which));

	ASSERT(which == LOWN_ANY || which == LOWN_VALID_STATEID);

	/* search by pid */
	mutex_enter(&rp->r_statev4_lock);

	lop = rp->r_lo_head.lo_next_rnode;
	while (lop != &rp->r_lo_head) {
		mutex_enter(&lop->lo_lock);
		if (lop->lo_pid == pid && lop->lo_valid != 0 &&
		    !(lop->lo_flags & NFS4_BAD_SEQID_LOCK)) {
			if (which == LOWN_ANY ||
			    lop->lo_just_created != NFS4_JUST_CREATED) {
				/* Found a matching lock owner */
				NFS4_DEBUG(nfs4_client_state_debug,
				    (CE_NOTE, "find_lock_owner: "
				    "got a match"));

				lop->lo_ref_count++;
				mutex_exit(&lop->lo_lock);
				mutex_exit(&rp->r_statev4_lock);
				return (lop);
			}
		}
		next_lop = lop->lo_next_rnode;
		mutex_exit(&lop->lo_lock);
		lop = next_lop;
	}

	mutex_exit(&rp->r_statev4_lock);
	return (NULL);
}

/*
 * This returns the delegation stateid as 'sid'. Returns 1 if a successful
 * delegation stateid was found, otherwise returns 0.
 */

static int
nfs4_get_deleg_stateid(rnode4_t *rp, nfs_opnum4 op, stateid4 *sid)
{
	ASSERT(!mutex_owned(&rp->r_statev4_lock));

	mutex_enter(&rp->r_statev4_lock);
	if (((rp->r_deleg_type == OPEN_DELEGATE_WRITE && op == OP_WRITE) ||
	    (rp->r_deleg_type != OPEN_DELEGATE_NONE && op != OP_WRITE)) &&
	    !rp->r_deleg_return_pending) {

		*sid = rp->r_deleg_stateid;
		mutex_exit(&rp->r_statev4_lock);
		return (1);
	}
	mutex_exit(&rp->r_statev4_lock);
	return (0);
}

/*
 * This returns the lock stateid as 'sid'. Returns 1 if a successful lock
 * stateid was found, otherwise returns 0.
 */
static int
nfs4_get_lock_stateid(rnode4_t *rp, pid_t pid, stateid4 *sid)
{
	nfs4_lock_owner_t *lop;

	lop = find_lock_owner(rp, pid, LOWN_VALID_STATEID);

	if (lop) {
		/*
		 * Found a matching lock owner, so use a lock
		 * stateid rather than an open stateid.
		 */
		mutex_enter(&lop->lo_lock);
		*sid = lop->lock_stateid;
		mutex_exit(&lop->lo_lock);
		lock_owner_rele(lop);
		return (1);
	}

	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
	    "nfs4_get_lock_stateid: no lop"));
	return (0);
}

/*
 * This returns the open stateid as 'sid'. Returns 1 if a successful open
 * stateid was found, otherwise returns 0.
 *
 * Once the stateid is returned to the caller, it is no longer protected;
 * so the caller must be prepared to handle OLD/BAD_STATEID where
 * appropiate.
 */
static int
nfs4_get_open_stateid(rnode4_t *rp, cred_t *cr, mntinfo4_t *mi, stateid4 *sid)
{
	nfs4_open_owner_t *oop;
	nfs4_open_stream_t *osp;

	ASSERT(mi != NULL);

	oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
	if (!oop) {
		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
		    "nfs4_get_open_stateid: no oop"));
		return (0);
	}

	osp = find_open_stream(oop, rp);
	open_owner_rele(oop);
	if (!osp) {
		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
		    "nfs4_get_open_stateid: no osp"));
		return (0);
	}

	if (osp->os_failed_reopen) {
		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
		    "nfs4_get_open_stateid: osp %p failed reopen",
		    (void *)osp));
		mutex_exit(&osp->os_sync_lock);
		open_stream_rele(osp, rp);
		return (0);
	}
	*sid = osp->open_stateid;
	mutex_exit(&osp->os_sync_lock);
	open_stream_rele(osp, rp);
	return (1);
}

/*
 * Returns the delegation stateid if this 'op' is OP_WRITE and the
 * delegation we hold is a write delegation, OR this 'op' is not
 * OP_WRITE and we have a delegation held (read or write), otherwise
 * returns the lock stateid if there is a lock owner, otherwise
 * returns the open stateid if there is a open stream, otherwise
 * returns special stateid <seqid = 0, other = 0>.
 *
 * Used for WRITE operations.
 */
stateid4
nfs4_get_w_stateid(cred_t *cr, rnode4_t *rp, pid_t pid, mntinfo4_t *mi,
    nfs_opnum4 op, nfs4_stateid_types_t *sid_tp)
{
	stateid4 sid;

	if (nfs4_get_deleg_stateid(rp, op, &sid)) {
		if (!stateid4_cmp(&sid, &sid_tp->d_sid)) {
			sid_tp->cur_sid_type = DEL_SID;
			return (sid);
		}
	}
	if (nfs4_get_lock_stateid(rp, pid, &sid)) {
		if (!stateid4_cmp(&sid, &sid_tp->l_sid)) {
			sid_tp->cur_sid_type = LOCK_SID;
			return (sid);
		}
	}
	if (nfs4_get_open_stateid(rp, cr, mi, &sid)) {
		if (!stateid4_cmp(&sid, &sid_tp->o_sid)) {
			sid_tp->cur_sid_type = OPEN_SID;
			return (sid);
		}
	}
	bzero(&sid, sizeof (stateid4));
	sid_tp->cur_sid_type = SPEC_SID;
	return (sid);
}

/*
 * Returns the delegation stateid if this 'op' is OP_WRITE and the
 * delegation we hold is a write delegation, OR this 'op' is not
 * OP_WRITE and we have a delegation held (read or write), otherwise
 * returns the lock stateid if there is a lock owner, otherwise
 * returns the open stateid if there is a open stream, otherwise
 * returns special stateid <seqid = 0, other = 0>.
 *
 * This also updates which stateid we are using in 'sid_tp', skips
 * previously attempted stateids, and skips checking higher priority
 * stateids than the current level as dictated by 'sid_tp->cur_sid_type'
 * for async reads.
 *
 * Used for READ and SETATTR operations.
 */
stateid4
nfs4_get_stateid(cred_t *cr, rnode4_t *rp, pid_t pid, mntinfo4_t *mi,
    nfs_opnum4 op, nfs4_stateid_types_t *sid_tp, bool_t async_read)
{
	stateid4 sid;

	/*
	 * For asynchronous READs, do not attempt to retry from the start of
	 * the stateid priority list, just continue from where you last left
	 * off.
	 */
	if (async_read) {
		switch (sid_tp->cur_sid_type) {
		case NO_SID:
			break;
		case DEL_SID:
			goto lock_stateid;
		case LOCK_SID:
			goto open_stateid;
		case OPEN_SID:
			goto special_stateid;
		case SPEC_SID:
		default:
			cmn_err(CE_PANIC, "nfs4_get_stateid: illegal current "
			    "stateid type %d", sid_tp->cur_sid_type);
		}
	}

	if (nfs4_get_deleg_stateid(rp, op, &sid)) {
		if (!stateid4_cmp(&sid, &sid_tp->d_sid)) {
			sid_tp->cur_sid_type = DEL_SID;
			return (sid);
		}
	}
lock_stateid:
	if (nfs4_get_lock_stateid(rp, pid, &sid)) {
		if (!stateid4_cmp(&sid, &sid_tp->l_sid)) {
			sid_tp->cur_sid_type = LOCK_SID;
			return (sid);
		}
	}
open_stateid:
	if (nfs4_get_open_stateid(rp, cr, mi, &sid)) {
		if (!stateid4_cmp(&sid, &sid_tp->o_sid)) {
			sid_tp->cur_sid_type = OPEN_SID;
			return (sid);
		}
	}
special_stateid:
	bzero(&sid, sizeof (stateid4));
	sid_tp->cur_sid_type = SPEC_SID;
	return	(sid);
}

void
nfs4_set_lock_stateid(nfs4_lock_owner_t *lop, stateid4 stateid)
{
	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
	    "nfs4_set_lock_stateid"));

	ASSERT(lop);
	ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE);

	mutex_enter(&lop->lo_lock);
	lop->lock_stateid = stateid;
	mutex_exit(&lop->lo_lock);
}

/*
 * Sequence number used when a new open owner is needed.
 * This is used so as to not confuse the server.  Since a open owner
 * is based off of cred, a cred could be re-used quickly, and the server
 * may not release all state for a cred.
 */
static uint64_t open_owner_seq_num = 0;

uint64_t
nfs4_get_new_oo_name(void)
{
	return (atomic_inc_64_nv(&open_owner_seq_num));
}

/*
 * Create a new open owner and add it to the open owner hash table.
 */
nfs4_open_owner_t *
create_open_owner(cred_t *cr, mntinfo4_t *mi)
{
	nfs4_open_owner_t	*oop;
	nfs4_oo_hash_bucket_t	*bucketp;

	oop = kmem_alloc(sizeof (nfs4_open_owner_t), KM_SLEEP);
	/*
	 * Make sure the cred doesn't go away when we put this open owner
	 * on the free list, as well as make crcmp() a valid check.
	 */
	crhold(cr);
	oop->oo_cred = cr;
	mutex_init(&oop->oo_lock, NULL, MUTEX_DEFAULT, NULL);
	oop->oo_ref_count = 1;
	oop->oo_valid = 1;
	oop->oo_just_created = NFS4_JUST_CREATED;
	oop->oo_seqid = 0;
	oop->oo_seqid_inuse = 0;
	oop->oo_last_good_seqid = 0;
	oop->oo_last_good_op = TAG_NONE;
	oop->oo_cred_otw = NULL;
	cv_init(&oop->oo_cv_seqid_sync, NULL, CV_DEFAULT, NULL);

	/*
	 * A Solaris open_owner is <oo_seq_num>
	 */
	oop->oo_name = nfs4_get_new_oo_name();

	/* now add the struct into the cred hash table */
	ASSERT(mutex_owned(&mi->mi_lock));
	bucketp = lock_bucket(cr, mi);
	list_insert_head(&bucketp->b_oo_hash_list, oop);
	unlock_bucket(bucketp);

	return (oop);
}

/*
 * Create a new open stream and it to the rnode's list.
 * Increments the ref count on oop.
 * Returns with 'os_sync_lock' held.
 */
nfs4_open_stream_t *
create_open_stream(nfs4_open_owner_t *oop, rnode4_t *rp)
{
	nfs4_open_stream_t	*osp;

#ifdef DEBUG
	mutex_enter(&oop->oo_lock);
	ASSERT(oop->oo_seqid_inuse);
	mutex_exit(&oop->oo_lock);
#endif

	osp = kmem_alloc(sizeof (nfs4_open_stream_t), KM_SLEEP);
	osp->os_open_ref_count = 1;
	osp->os_mapcnt = 0;
	osp->os_ref_count = 2;
	osp->os_valid = 1;
	osp->os_open_owner = oop;
	osp->os_orig_oo_name = oop->oo_name;
	bzero(&osp->open_stateid, sizeof (stateid4));
	osp->os_share_acc_read = 0;
	osp->os_share_acc_write = 0;
	osp->os_mmap_read = 0;
	osp->os_mmap_write = 0;
	osp->os_share_deny_none = 0;
	osp->os_share_deny_read = 0;
	osp->os_share_deny_write = 0;
	osp->os_delegation = 0;
	osp->os_dc_openacc = 0;
	osp->os_final_close = 0;
	osp->os_pending_close = 0;
	osp->os_failed_reopen = 0;
	osp->os_force_close = 0;
	mutex_init(&osp->os_sync_lock, NULL, MUTEX_DEFAULT, NULL);

	/* open owner gets a reference */
	open_owner_hold(oop);

	/* now add the open stream to rp */
	mutex_enter(&rp->r_os_lock);
	mutex_enter(&osp->os_sync_lock);
	list_insert_head(&rp->r_open_streams, osp);
	mutex_exit(&rp->r_os_lock);

	return (osp);
}

/*
 * Returns an open stream with 'os_sync_lock' held.
 * If the open stream is found (rather than created), its
 * 'os_open_ref_count' is bumped.
 *
 * There is no race with two threads entering this function
 * and creating two open streams for the same <oop, rp> pair.
 * This is because the open seqid sync must be acquired, thus
 * only allowing one thread in at a time.
 */
nfs4_open_stream_t *
find_or_create_open_stream(nfs4_open_owner_t *oop, rnode4_t *rp,
    int *created_osp)
{
	nfs4_open_stream_t *osp;

#ifdef DEBUG
	mutex_enter(&oop->oo_lock);
	ASSERT(oop->oo_seqid_inuse);
	mutex_exit(&oop->oo_lock);
#endif

	osp = find_open_stream(oop, rp);
	if (!osp) {
		osp = create_open_stream(oop, rp);
		if (osp)
			*created_osp = 1;
	} else {
		*created_osp = 0;
		osp->os_open_ref_count++;
	}

	return (osp);
}

static uint64_t lock_owner_seq_num = 0;

/*
 * Create a new lock owner and add it to the rnode's list.
 * Assumes the rnode's r_statev4_lock is held.
 * The created lock owner has a reference count of 2: one for the list and
 * one for the caller to use.  Returns the lock owner locked down.
 */
nfs4_lock_owner_t *
create_lock_owner(rnode4_t *rp, pid_t pid)
{
	nfs4_lock_owner_t	*lop;

	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
	    "create_lock_owner: pid %x", pid));

	ASSERT(mutex_owned(&rp->r_statev4_lock));

	lop = kmem_alloc(sizeof (nfs4_lock_owner_t), KM_SLEEP);
	lop->lo_ref_count = 2;
	lop->lo_valid = 1;
	bzero(&lop->lock_stateid, sizeof (stateid4));
	lop->lo_pid = pid;
	lop->lock_seqid = 0;
	lop->lo_pending_rqsts = 0;
	lop->lo_just_created = NFS4_JUST_CREATED;
	lop->lo_flags = 0;
	lop->lo_seqid_holder = NULL;

	/*
	 * A Solaris lock_owner is <seq_num><pid>
	 */
	lop->lock_owner_name.ln_seq_num =
	    atomic_inc_64_nv(&lock_owner_seq_num);
	lop->lock_owner_name.ln_pid = pid;

	cv_init(&lop->lo_cv_seqid_sync, NULL, CV_DEFAULT, NULL);
	mutex_init(&lop->lo_lock, NULL, MUTEX_DEFAULT, NULL);

	mutex_enter(&lop->lo_lock);

	/* now add the lock owner to rp */
	lop->lo_prev_rnode = &rp->r_lo_head;
	lop->lo_next_rnode = rp->r_lo_head.lo_next_rnode;
	rp->r_lo_head.lo_next_rnode->lo_prev_rnode = lop;
	rp->r_lo_head.lo_next_rnode = lop;

	return (lop);

}

/*
 * This sets the lock seqid of a lock owner.
 */
void
nfs4_set_lock_seqid(seqid4 seqid, nfs4_lock_owner_t *lop)
{
	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
	    "nfs4_set_lock_seqid"));

	ASSERT(lop != NULL);
	ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE);

	lop->lock_seqid = seqid;
}

static void
nfs4_set_new_lock_owner_args(lock_owner4 *owner, pid_t pid)
{
	nfs4_lo_name_t *cast_namep;

	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
	    "nfs4_set_new_lock_owner_args"));

	owner->owner_len = sizeof (*cast_namep);
	owner->owner_val = kmem_alloc(owner->owner_len, KM_SLEEP);
	/*
	 * A Solaris lock_owner is <seq_num><pid>
	 */
	cast_namep = (nfs4_lo_name_t *)owner->owner_val;
	cast_namep->ln_seq_num = atomic_inc_64_nv(&lock_owner_seq_num);
	cast_namep->ln_pid = pid;
}

/*
 * Fill in the lock owner args.
 */
void
nfs4_setlockowner_args(lock_owner4 *owner, rnode4_t *rp, pid_t pid)
{
	nfs4_lock_owner_t *lop;

	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
	    "nfs4_setlockowner_args"));

	/* This increments lop's ref count */
	lop = find_lock_owner(rp, pid, LOWN_VALID_STATEID);

	if (!lop)
		goto make_up_args;

	mutex_enter(&lop->lo_lock);
	owner->owner_len = sizeof (lop->lock_owner_name);
	owner->owner_val = kmem_alloc(owner->owner_len, KM_SLEEP);
	bcopy(&lop->lock_owner_name, owner->owner_val,
	    owner->owner_len);
	mutex_exit(&lop->lo_lock);
	lock_owner_rele(lop);
	return;

make_up_args:
	nfs4_set_new_lock_owner_args(owner, pid);
}

/*
 * This ends our use of the open owner's open seqid by setting
 * the appropiate flags and issuing a cv_signal to wake up another
 * thread waiting to use the open seqid.
 */

void
nfs4_end_open_seqid_sync(nfs4_open_owner_t *oop)
{
	mutex_enter(&oop->oo_lock);
	ASSERT(oop->oo_seqid_inuse);
	oop->oo_seqid_inuse = 0;
	cv_broadcast(&oop->oo_cv_seqid_sync);
	mutex_exit(&oop->oo_lock);
}

/*
 * This starts our use of the open owner's open seqid by setting
 * the oo_seqid_inuse to true.  We will wait (forever) with a
 * cv_wait() until we are woken up.
 *
 * Return values:
 * 0		no problems
 * EAGAIN	caller should retry (like a recovery retry)
 */
int
nfs4_start_open_seqid_sync(nfs4_open_owner_t *oop, mntinfo4_t *mi)
{
	int error = 0;
#ifdef DEBUG
	static int ops = 0;		/* fault injection */
#endif

#ifdef DEBUG
	if (seqid_sync_faults && curthread != mi->mi_recovthread &&
	    ++ops % 5 == 0)
		return (EAGAIN);
#endif

	mutex_enter(&mi->mi_lock);
	if ((mi->mi_flags & MI4_RECOV_ACTIV) &&
	    curthread != mi->mi_recovthread)
		error = EAGAIN;
	mutex_exit(&mi->mi_lock);
	if (error != 0)
		goto done;

	mutex_enter(&oop->oo_lock);

	while (oop->oo_seqid_inuse) {
		NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE,
		    "nfs4_start_open_seqid_sync waiting on cv"));

		cv_wait(&oop->oo_cv_seqid_sync, &oop->oo_lock);
	}

	oop->oo_seqid_inuse = 1;

	mutex_exit(&oop->oo_lock);

	mutex_enter(&mi->mi_lock);
	if ((mi->mi_flags & MI4_RECOV_ACTIV) &&
	    curthread != mi->mi_recovthread)
		error = EAGAIN;
	mutex_exit(&mi->mi_lock);

	if (error == EAGAIN)
		nfs4_end_open_seqid_sync(oop);

	NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE,
	    "nfs4_start_open_seqid_sync: error=%d", error));

done:
	return (error);
}

#ifdef	DEBUG
int bypass_otw[2];
#endif

/*
 * Checks to see if the OPEN OTW is necessary that is, if it's already
 * been opened with the same access and deny bits we are now asking for.
 * Note, this assumes that *vp is a rnode.
 */
int
nfs4_is_otw_open_necessary(nfs4_open_owner_t *oop, int flag, vnode_t *vp,
    int just_been_created, int *errorp, int acc, nfs4_recov_state_t *rsp)
{
	rnode4_t *rp;
	nfs4_open_stream_t *osp;
	open_delegation_type4 dt;

	rp = VTOR4(vp);

	/*
	 * Grab the delegation type.  This function is protected against
	 * the delegation being returned by virtue of start_op (called
	 * by nfs4open_otw) taking the r_deleg_recall_lock in read mode,
	 * delegreturn requires this lock in write mode to proceed.
	 */
	ASSERT(nfs_rw_lock_held(&rp->r_deleg_recall_lock, RW_READER));
	dt = get_dtype(rp);

	/* returns with 'os_sync_lock' held */
	osp = find_open_stream(oop, rp);

	if (osp) {
		uint32_t	do_otw = 0;

		if (osp->os_failed_reopen) {
			NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE,
			    "nfs4_is_otw_open_necessary: os_failed_reopen "
			    "set on osp %p, cr %p, rp %s", (void *)osp,
			    (void *)osp->os_open_owner->oo_cred,
			    rnode4info(rp)));
			do_otw = 1;
		}

		/*
		 * check access/deny bits
		 */
		if (!do_otw && (flag & FREAD))
			if (osp->os_share_acc_read == 0 &&
			    dt == OPEN_DELEGATE_NONE)
				do_otw = 1;

		if (!do_otw && (flag & FWRITE))
			if (osp->os_share_acc_write == 0 &&
			    dt != OPEN_DELEGATE_WRITE)
				do_otw = 1;

		if (!do_otw) {
			NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
			    "nfs4_is_otw_open_necessary: can skip this "
			    "open OTW"));
			if (!just_been_created) {
				osp->os_open_ref_count++;
				if (flag & FREAD)
					osp->os_share_acc_read++;
				if (flag & FWRITE)
					osp->os_share_acc_write++;
				osp->os_share_deny_none++;
			}

			/*
			 * Need to reset this bitfield for the possible case
			 * where we were going to OTW CLOSE the file, got a
			 * non-recoverable error, and before we could retry
			 * the CLOSE, OPENed the file again.
			 */
			ASSERT(osp->os_open_owner->oo_seqid_inuse);
			osp->os_final_close = 0;
			osp->os_force_close = 0;

			mutex_exit(&osp->os_sync_lock);
			open_stream_rele(osp, rp);

#ifdef	DEBUG
			bypass_otw[0]++;
#endif

			*errorp = 0;
			return (0);
		}
		mutex_exit(&osp->os_sync_lock);
		open_stream_rele(osp, rp);

	} else if (dt != OPEN_DELEGATE_NONE) {
		/*
		 * Even if there isn't an open_stream yet, we may still be
		 * able to bypass the otw open if the client owns a delegation.
		 *
		 * If you are asking for for WRITE, but I only have
		 * a read delegation, then you still have to go otw.
		 */

		if (flag & FWRITE && dt == OPEN_DELEGATE_READ)
			return (1);

		/*
		 * TODO - evaluate the nfsace4
		 */

		/*
		 * Check the access flags to make sure the caller
		 * had permission.
		 */
		if (flag & FREAD && !(acc & VREAD))
			return (1);

		if (flag & FWRITE && !(acc & VWRITE))
			return (1);

		/*
		 * create_open_stream will add a reference to oop,
		 * this will prevent the open_owner_rele done in
		 * nfs4open_otw from destroying the open_owner.
		 */

		/* returns with 'os_sync_lock' held */
		osp = create_open_stream(oop, rp);
		if (osp == NULL)
			return (1);

		osp->open_stateid = rp->r_deleg_stateid;
		osp->os_delegation = 1;

		if (flag & FREAD)
			osp->os_share_acc_read++;
		if (flag & FWRITE)
			osp->os_share_acc_write++;

		osp->os_share_deny_none++;
		mutex_exit(&osp->os_sync_lock);

		open_stream_rele(osp, rp);

		mutex_enter(&oop->oo_lock);
		oop->oo_just_created = NFS4_PERM_CREATED;
		mutex_exit(&oop->oo_lock);

		ASSERT(rsp != NULL);
		if (rsp->rs_sp != NULL) {
			mutex_enter(&rsp->rs_sp->s_lock);
			nfs4_inc_state_ref_count_nolock(rsp->rs_sp,
			    VTOMI4(vp));
			mutex_exit(&rsp->rs_sp->s_lock);
		}
#ifdef	DEBUG
		bypass_otw[1]++;
#endif

		*errorp = 0;
		return (0);
	}

	return (1);
}

static open_delegation_type4
get_dtype(rnode4_t *rp)
{
	open_delegation_type4 dt;

	mutex_enter(&rp->r_statev4_lock);
	ASSERT(!rp->r_deleg_return_inprog);
	if (rp->r_deleg_return_pending)
		dt = OPEN_DELEGATE_NONE;
	else
		dt = rp->r_deleg_type;
	mutex_exit(&rp->r_statev4_lock);

	return (dt);
}

/*
 * Fill in *locker with the lock state arguments for a LOCK call.  If
 * lop->lo_just_created == NFS4_JUST_CREATED, oop and osp must be non-NULL.
 * Caller must already hold the necessary seqid sync lock(s).
 */

void
nfs4_setup_lock_args(nfs4_lock_owner_t *lop, nfs4_open_owner_t *oop,
    nfs4_open_stream_t *osp, clientid4 clientid, locker4 *locker)
{
	ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE);
	if (lop->lo_just_created == NFS4_JUST_CREATED) {
		/* this is a new lock request */
		open_to_lock_owner4 *nown;

		ASSERT(oop != NULL);
		ASSERT(osp != NULL);

		locker->new_lock_owner = TRUE;
		nown = &locker->locker4_u.open_owner;
		nown->open_seqid = nfs4_get_open_seqid(oop) + 1;
		mutex_enter(&osp->os_sync_lock);
		nown->open_stateid = osp->open_stateid;
		mutex_exit(&osp->os_sync_lock);
		nown->lock_seqid = lop->lock_seqid; /* initial, so no +1 */

		nown->lock_owner.clientid = clientid;
		nown->lock_owner.owner_len = sizeof (lop->lock_owner_name);
		nown->lock_owner.owner_val =
		    kmem_alloc(nown->lock_owner.owner_len, KM_SLEEP);
		bcopy(&lop->lock_owner_name, nown->lock_owner.owner_val,
		    nown->lock_owner.owner_len);
	} else {
		exist_lock_owner4 *eown;
		/* have an existing lock owner */

		locker->new_lock_owner = FALSE;
		eown = &locker->locker4_u.lock_owner;
		mutex_enter(&lop->lo_lock);
		eown->lock_stateid = lop->lock_stateid;
		mutex_exit(&lop->lo_lock);
		eown->lock_seqid = lop->lock_seqid + 1;
	}
}

/*
 * This starts our use of the lock owner's lock seqid by setting
 * the lo_flags to NFS4_LOCK_SEQID_INUSE.  We will wait (forever)
 * with a cv_wait() until we are woken up.
 *
 * Return values:
 * 0		no problems
 * EAGAIN	caller should retry (like a recovery retry)
 */
int
nfs4_start_lock_seqid_sync(nfs4_lock_owner_t *lop, mntinfo4_t *mi)
{
	int error = 0;
#ifdef DEBUG
	static int ops = 0;		/* fault injection */
#endif

#ifdef DEBUG
	if (seqid_sync_faults && curthread != mi->mi_recovthread &&
	    ++ops % 7 == 0)
		return (EAGAIN);
#endif

	mutex_enter(&mi->mi_lock);
	if ((mi->mi_flags & MI4_RECOV_ACTIV) &&
	    curthread != mi->mi_recovthread)
		error = EAGAIN;
	mutex_exit(&mi->mi_lock);
	if (error != 0)
		goto done;

	mutex_enter(&lop->lo_lock);

	ASSERT(lop->lo_seqid_holder != curthread);
	while (lop->lo_flags & NFS4_LOCK_SEQID_INUSE) {
		NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE,
		    "nfs4_start_lock_seqid_sync: waiting on cv"));

		cv_wait(&lop->lo_cv_seqid_sync, &lop->lo_lock);
	}
	NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE, "nfs4_start_lock_seqid_sync: "
	    "NFS4_LOCK_SEQID_INUSE"));

	lop->lo_flags |= NFS4_LOCK_SEQID_INUSE;
	lop->lo_seqid_holder = curthread;
	mutex_exit(&lop->lo_lock);

	mutex_enter(&mi->mi_lock);
	if ((mi->mi_flags & MI4_RECOV_ACTIV) &&
	    curthread != mi->mi_recovthread)
		error = EAGAIN;
	mutex_exit(&mi->mi_lock);

	if (error == EAGAIN)
		nfs4_end_lock_seqid_sync(lop);

	NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE,
	    "nfs4_start_lock_seqid_sync: error=%d", error));

done:
	return (error);
}

/*
 * This ends our use of the lock owner's lock seqid by setting
 * the appropiate flags and issuing a cv_signal to wake up another
 * thread waiting to use the lock seqid.
 */
void
nfs4_end_lock_seqid_sync(nfs4_lock_owner_t *lop)
{
	mutex_enter(&lop->lo_lock);
	ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE);
	ASSERT(lop->lo_seqid_holder == curthread);
	lop->lo_flags &= ~NFS4_LOCK_SEQID_INUSE;
	lop->lo_seqid_holder = NULL;
	cv_broadcast(&lop->lo_cv_seqid_sync);
	mutex_exit(&lop->lo_lock);
}

/*
 * Returns a reference to a lock owner via lopp, which has its lock seqid
 * synchronization started.
 * If the lock owner is in the 'just_created' state, then we return its open
 * owner and open stream and start the open seqid synchronization.
 *
 * Return value:
 * NFS4_OK		no problems
 * NFS4ERR_DELAY	there is lost state to recover; caller should retry
 * NFS4ERR_IO		no open stream
 */
nfsstat4
nfs4_find_or_create_lock_owner(pid_t pid, rnode4_t *rp, cred_t *cr,
    nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp,
    nfs4_lock_owner_t **lopp)
{
	nfs4_lock_owner_t *lop, *next_lop;
	mntinfo4_t *mi;
	int error = 0;
	nfsstat4 stat;

	mi = VTOMI4(RTOV4(rp));

	mutex_enter(&rp->r_statev4_lock);

	lop = rp->r_lo_head.lo_next_rnode;
	while (lop != &rp->r_lo_head) {
		mutex_enter(&lop->lo_lock);
		if (lop->lo_pid == pid && lop->lo_valid != 0) {
			/* Found a matching lock owner */
			NFS4_DEBUG(nfs4_client_state_debug,
			    (CE_NOTE, "nfs4_find_or_create_lock_owner: "
			    "got a match"));
			lop->lo_ref_count++;
			break;
		}
		next_lop = lop->lo_next_rnode;
		mutex_exit(&lop->lo_lock);
		lop = next_lop;
	}

	if (lop == &rp->r_lo_head) {
		/* create temporary lock owner */
		lop = create_lock_owner(rp, pid);
	}
	mutex_exit(&rp->r_statev4_lock);

	/* Have a locked down lock owner struct now */
	if (lop->lo_just_created != NFS4_JUST_CREATED) {
		/* This is an existing lock owner */
		*oopp = NULL;
		*ospp = NULL;
	} else {
		/* Lock owner doesn't exist yet */

		/* First grab open owner seqid synchronization */
		mutex_exit(&lop->lo_lock);
		*oopp = find_open_owner(cr, NFS4_PERM_CREATED, mi);
		if (*oopp == NULL)
			goto kill_new_lop;
		error = nfs4_start_open_seqid_sync(*oopp, mi);
		if (error == EAGAIN) {
			stat = NFS4ERR_DELAY;
			goto failed;
		}
		*ospp = find_open_stream(*oopp, rp);
		if (*ospp == NULL) {
			nfs4_end_open_seqid_sync(*oopp);
			goto kill_new_lop;
		}
		if ((*ospp)->os_failed_reopen) {
			mutex_exit(&(*ospp)->os_sync_lock);
			NFS4_DEBUG((nfs4_open_stream_debug ||
			    nfs4_client_lock_debug), (CE_NOTE,
			    "nfs4_find_or_create_lock_owner: os_failed_reopen;"
			    "osp %p, cr %p, rp %s", (void *)(*ospp),
			    (void *)cr, rnode4info(rp)));
			nfs4_end_open_seqid_sync(*oopp);
			stat = NFS4ERR_IO;
			goto failed;
		}
		mutex_exit(&(*ospp)->os_sync_lock);

		/*
		 * Now see if the lock owner has become permanent while we
		 * had released our lock.
		 */
		mutex_enter(&lop->lo_lock);
		if (lop->lo_just_created != NFS4_JUST_CREATED) {
			nfs4_end_open_seqid_sync(*oopp);
			open_stream_rele(*ospp, rp);
			open_owner_rele(*oopp);
			*oopp = NULL;
			*ospp = NULL;
		}
	}
	mutex_exit(&lop->lo_lock);

	error = nfs4_start_lock_seqid_sync(lop, mi);
	if (error == EAGAIN) {
		if (*oopp != NULL)
			nfs4_end_open_seqid_sync(*oopp);
		stat = NFS4ERR_DELAY;
		goto failed;
	}
	ASSERT(error == 0);

	*lopp = lop;
	return (NFS4_OK);

kill_new_lop:
	/*
	 * A previous CLOSE was attempted but got EINTR, but the application
	 * continued to use the unspecified state file descriptor.  But now the
	 * open stream is gone (which could also destroy the open owner), hence
	 * we can no longer continue.  The calling function should return EIO
	 * to the application.
	 */
	NFS4_DEBUG(nfs4_lost_rqst_debug || nfs4_client_lock_debug,
	    (CE_NOTE, "nfs4_find_or_create_lock_owner: destroy newly created "
	    "lop %p, oop %p, osp %p", (void *)lop, (void *)(*oopp),
	    (void *)(*ospp)));

	nfs4_rnode_remove_lock_owner(rp, lop);
	stat = NFS4ERR_IO;

failed:
	lock_owner_rele(lop);
	if (*oopp) {
		open_owner_rele(*oopp);
		*oopp = NULL;
	}
	if (*ospp) {
		open_stream_rele(*ospp, rp);
		*ospp = NULL;
	}
	return (stat);
}

/*
 * This function grabs a recently freed open owner off of the freed open
 * owner list if there is a match on the cred 'cr'.  It returns NULL if no
 * such match is found.  It will set the 'oo_ref_count' and 'oo_valid' back
 * to both 1 (sane values) in the case a match is found.
 */
static nfs4_open_owner_t *
find_freed_open_owner(cred_t *cr, nfs4_oo_hash_bucket_t *bucketp,
    mntinfo4_t *mi)
{
	nfs4_open_owner_t		*foop;

	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
	    "find_freed_open_owner: cred %p", (void*)cr));

	ASSERT(mutex_owned(&mi->mi_lock));
	ASSERT(mutex_owned(&bucketp->b_lock));

	/* got hash bucket, search through freed open owners */
	for (foop = list_head(&mi->mi_foo_list); foop != NULL;
	    foop = list_next(&mi->mi_foo_list, foop)) {
		if (!crcmp(foop->oo_cred, cr)) {
			NFS4_DEBUG(nfs4_client_foo_debug, (CE_NOTE,
			    "find_freed_open_owner: got a match open owner "
			    "%p", (void *)foop));
			foop->oo_ref_count = 1;
			foop->oo_valid = 1;
			list_remove(&mi->mi_foo_list, foop);
			mi->mi_foo_num--;

			/* now add the struct into the cred hash table */
			list_insert_head(&bucketp->b_oo_hash_list, foop);
			return (foop);
		}
	}

	return (NULL);
}

/*
 * Insert the newly freed 'oop' into the mi's freed oop list,
 * always at the head of the list.  If we've already reached
 * our maximum allowed number of freed open owners (mi_foo_max),
 * then remove the LRU open owner on the list (namely the tail).
 */
static void
nfs4_free_open_owner(nfs4_open_owner_t *oop, mntinfo4_t *mi)
{
	nfs4_open_owner_t *lru_foop;

	if (mi->mi_foo_num < mi->mi_foo_max) {
		NFS4_DEBUG(nfs4_client_foo_debug, (CE_NOTE,
		    "nfs4_free_open_owner: num free %d, max free %d, "
		    "insert open owner %p for mntinfo4 %p",
		    mi->mi_foo_num, mi->mi_foo_max, (void *)oop,
		    (void *)mi));
		list_insert_head(&mi->mi_foo_list, oop);
		mi->mi_foo_num++;
		return;
	}

	/* need to replace a freed open owner */

	lru_foop = list_tail(&mi->mi_foo_list);

	NFS4_DEBUG(nfs4_client_foo_debug, (CE_NOTE,
	    "nfs4_free_open_owner: destroy %p, insert %p",
	    (void *)lru_foop, (void *)oop));

	list_remove(&mi->mi_foo_list, lru_foop);
	nfs4_destroy_open_owner(lru_foop);

	/* head always has latest freed oop */
	list_insert_head(&mi->mi_foo_list, oop);
}

void
nfs4_destroy_open_owner(nfs4_open_owner_t *oop)
{
	ASSERT(oop != NULL);

	crfree(oop->oo_cred);
	if (oop->oo_cred_otw)
		crfree(oop->oo_cred_otw);
	mutex_destroy(&oop->oo_lock);
	cv_destroy(&oop->oo_cv_seqid_sync);
	kmem_free(oop, sizeof (*oop));
}

seqid4
nfs4_get_open_seqid(nfs4_open_owner_t *oop)
{
	ASSERT(oop->oo_seqid_inuse);
	return (oop->oo_seqid);
}

/*
 * This set's the open seqid for a <open owner/ mntinfo4> pair.
 */
void
nfs4_set_open_seqid(seqid4 seqid, nfs4_open_owner_t *oop,
    nfs4_tag_type_t tag_type)
{
	ASSERT(oop->oo_seqid_inuse);
	oop->oo_seqid = seqid;
	oop->oo_last_good_seqid = seqid;
	oop->oo_last_good_op = tag_type;
}

/*
 * This bumps the current open seqid for the open owner 'oop'.
 */
void
nfs4_get_and_set_next_open_seqid(nfs4_open_owner_t *oop,
    nfs4_tag_type_t tag_type)
{
	ASSERT(oop->oo_seqid_inuse);
	oop->oo_seqid++;
	oop->oo_last_good_seqid = oop->oo_seqid;
	oop->oo_last_good_op = tag_type;
}

/*
 * If no open owner was provided, this function takes the cred to find an
 * open owner within the given mntinfo4_t.  Either way we return the
 * open owner's OTW credential if it exists; otherwise returns the
 * supplied 'cr'.
 *
 * A hold is put on the returned credential, and it is up to the caller
 * to free the cred.
 */
cred_t *
nfs4_get_otw_cred(cred_t *cr, mntinfo4_t *mi, nfs4_open_owner_t *provided_oop)
{
	cred_t *ret_cr;
	nfs4_open_owner_t *oop = provided_oop;

	if (oop == NULL)
		oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
	if (oop != NULL) {
		mutex_enter(&oop->oo_lock);
		if (oop->oo_cred_otw)
			ret_cr = oop->oo_cred_otw;
		else
			ret_cr = cr;
		crhold(ret_cr);
		mutex_exit(&oop->oo_lock);
		if (provided_oop == NULL)
			open_owner_rele(oop);
	} else {
		ret_cr = cr;
		crhold(ret_cr);
	}
	return (ret_cr);
}

/*
 * Retrieves the next open stream in the rnode's list if an open stream
 * is provided; otherwise gets the first open stream in the list.
 * The open owner for that open stream is then retrieved, and if its
 * oo_cred_otw exists then it is returned; otherwise the provided 'cr'
 * is returned.  *osp is set to the 'found' open stream.
 *
 * Note: we don't set *osp to the open stream retrieved via the
 * optimized check since that won't necessarily be at the beginning
 * of the rnode list, and if that osp doesn't work we'd like to
 * check _all_ open streams (starting from the beginning of the
 * rnode list).
 */
cred_t *
nfs4_get_otw_cred_by_osp(rnode4_t *rp, cred_t *cr,
    nfs4_open_stream_t **osp, bool_t *first_time, bool_t *last_time)
{
	nfs4_open_stream_t *next_osp = NULL;
	cred_t *ret_cr;

	ASSERT(cr != NULL);
	/*
	 * As an optimization, try to find the open owner
	 * for the cred provided since that's most likely
	 * to work.
	 */
	if (*first_time) {
		nfs4_open_owner_t *oop;

		oop = find_open_owner(cr, NFS4_PERM_CREATED, VTOMI4(RTOV4(rp)));
		if (oop) {
			next_osp = find_open_stream(oop, rp);
			if (next_osp)
				mutex_exit(&next_osp->os_sync_lock);
			open_owner_rele(oop);
		}
	}
	if (next_osp == NULL) {
		int delay_rele = 0;
		*first_time = FALSE;

		/* return the next open stream for this rnode */
		mutex_enter(&rp->r_os_lock);
		/* Now, no one can add or delete to rp's open streams list */

		if (*osp) {
			next_osp = list_next(&rp->r_open_streams, *osp);
			/*
			 * Delay the rele of *osp until after we drop
			 * r_os_lock to not deadlock with oo_lock
			 * via an open_stream_rele()->open_owner_rele().
			 */
			delay_rele = 1;
		} else {
			next_osp = list_head(&rp->r_open_streams);
		}
		if (next_osp) {
			nfs4_open_stream_t *tmp_osp;

			/* find the next valid open stream */
			mutex_enter(&next_osp->os_sync_lock);
			while (next_osp && !next_osp->os_valid) {
				tmp_osp =
				    list_next(&rp->r_open_streams, next_osp);
				mutex_exit(&next_osp->os_sync_lock);
				next_osp = tmp_osp;
				if (next_osp)
					mutex_enter(&next_osp->os_sync_lock);
			}
			if (next_osp) {
				next_osp->os_ref_count++;
				mutex_exit(&next_osp->os_sync_lock);
			}
		}
		mutex_exit(&rp->r_os_lock);
		if (delay_rele)
			open_stream_rele(*osp, rp);
	}

	if (next_osp) {
		nfs4_open_owner_t *oop;

		oop = next_osp->os_open_owner;
		mutex_enter(&oop->oo_lock);
		if (oop->oo_cred_otw)
			ret_cr = oop->oo_cred_otw;
		else
			ret_cr = cr;
		crhold(ret_cr);
		mutex_exit(&oop->oo_lock);
		if (*first_time) {
			open_stream_rele(next_osp, rp);
			*osp = NULL;
		} else
			*osp = next_osp;
	} else {
		/* just return the cred provided to us */
		*last_time = TRUE;
		*osp = NULL;
		ret_cr = cr;
		crhold(ret_cr);
	}

	*first_time = FALSE;
	return (ret_cr);
}

void
nfs4_init_stateid_types(nfs4_stateid_types_t *sid_tp)
{
	bzero(&sid_tp->d_sid, sizeof (stateid4));
	bzero(&sid_tp->l_sid, sizeof (stateid4));
	bzero(&sid_tp->o_sid, sizeof (stateid4));
	sid_tp->cur_sid_type = NO_SID;
}

void
nfs4_save_stateid(stateid4 *s1, nfs4_stateid_types_t *sid_tp)
{
	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
	    "nfs4_save_stateid: saved %s stateid",
	    sid_tp->cur_sid_type == DEL_SID ? "delegation" :
	    sid_tp->cur_sid_type == LOCK_SID ? "lock" :
	    sid_tp->cur_sid_type == OPEN_SID ? "open" : "special"));

	switch (sid_tp->cur_sid_type) {
	case DEL_SID:
		sid_tp->d_sid = *s1;
		break;
	case LOCK_SID:
		sid_tp->l_sid = *s1;
		break;
	case OPEN_SID:
		sid_tp->o_sid = *s1;
		break;
	case SPEC_SID:
	default:
		cmn_err(CE_PANIC, "nfs4_save_stateid: illegal "
		    "stateid type %d", sid_tp->cur_sid_type);
	}
}

/*
 * We got NFS4ERR_BAD_SEQID.  Setup some arguments to pass to recovery.
 * Caller is responsible for freeing.
 */
nfs4_bseqid_entry_t *
nfs4_create_bseqid_entry(nfs4_open_owner_t *oop, nfs4_lock_owner_t *lop,
    vnode_t *vp, pid_t pid, nfs4_tag_type_t tag, seqid4 seqid)
{
	nfs4_bseqid_entry_t	*bsep;

	bsep = kmem_alloc(sizeof (*bsep), KM_SLEEP);
	bsep->bs_oop = oop;
	bsep->bs_lop = lop;
	bsep->bs_vp = vp;
	bsep->bs_pid = pid;
	bsep->bs_tag = tag;
	bsep->bs_seqid = seqid;

	return (bsep);
}

void
nfs4open_dg_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
    nfs4_open_owner_t *oop, nfs4_open_stream_t *osp, cred_t *cr,
    vnode_t *vp, int access_close, int deny_close)
{
	lost_rqstp->lr_putfirst = FALSE;

	ASSERT(vp != NULL);
	if (error == ETIMEDOUT || error == EINTR ||
	    NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) {
		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
		    "nfs4open_dg_save_lost_rqst: error %d", error));

		lost_rqstp->lr_op = OP_OPEN_DOWNGRADE;
		/*
		 * The vp is held and rele'd via the recovery code.
		 * See nfs4_save_lost_rqst.
		 */
		lost_rqstp->lr_vp = vp;
		lost_rqstp->lr_dvp = NULL;
		lost_rqstp->lr_oop = oop;
		lost_rqstp->lr_osp = osp;
		lost_rqstp->lr_lop = NULL;
		lost_rqstp->lr_cr = cr;
		lost_rqstp->lr_flk = NULL;
		lost_rqstp->lr_dg_acc = access_close;
		lost_rqstp->lr_dg_deny = deny_close;
		lost_rqstp->lr_putfirst = FALSE;
	} else {
		lost_rqstp->lr_op = 0;
	}
}

/*
 * Change the access and deny bits of an OPEN.
 * If recovery is needed, *recov_credpp is set to the cred used OTW,
 * a hold is placed on it, and *recov_seqidp is set to the seqid used OTW.
 */
void
nfs4_open_downgrade(int access_close, int deny_close, nfs4_open_owner_t *oop,
    nfs4_open_stream_t *osp, vnode_t *vp, cred_t *cr, nfs4_lost_rqst_t *lrp,
    nfs4_error_t *ep, cred_t **recov_credpp, seqid4 *recov_seqidp)
{
	mntinfo4_t		*mi;
	int			downgrade_acc, downgrade_deny;
	int			new_acc, new_deny;
	COMPOUND4args_clnt	args;
	COMPOUND4res_clnt	res;
	OPEN_DOWNGRADE4res	*odg_res;
	nfs_argop4		argop[3];
	nfs_resop4		*resop;
	rnode4_t		*rp;
	bool_t			needrecov = FALSE;
	int			doqueue = 1;
	seqid4			seqid = 0;
	cred_t			*cred_otw;
	hrtime_t		t;

	ASSERT(mutex_owned(&osp->os_sync_lock));
#if DEBUG
	mutex_enter(&oop->oo_lock);
	ASSERT(oop->oo_seqid_inuse);
	mutex_exit(&oop->oo_lock);
#endif


	if (access_close == 0 && deny_close == 0) {
		nfs4_error_zinit(ep);
		return;
	}

	cred_otw = nfs4_get_otw_cred(cr, VTOMI4(vp), oop);

cred_retry:
	nfs4_error_zinit(ep);
	downgrade_acc = 0;
	downgrade_deny = 0;
	mi = VTOMI4(vp);
	rp = VTOR4(vp);

	/*
	 * Check to see if the open stream got closed before we go OTW,
	 * now that we have acquired the 'os_sync_lock'.
	 */
	if (!osp->os_valid) {
		NFS4_DEBUG(nfs4_client_open_dg, (CE_NOTE, "nfs4_open_downgrade:"
		    " open stream has already been closed, return success"));
		/* error has already been set */
		goto no_args_out;
	}

	/* If the file failed recovery, just quit. */
	mutex_enter(&rp->r_statelock);
	if (rp->r_flags & R4RECOVERR) {
		mutex_exit(&rp->r_statelock);
		ep->error = EIO;
		goto no_args_out;
	}
	mutex_exit(&rp->r_statelock);

	seqid = nfs4_get_open_seqid(oop) + 1;

	NFS4_DEBUG(nfs4_client_open_dg, (CE_NOTE, "nfs4_open_downgrade:"
	    "access_close %d, acc_read %"PRIu64" acc_write %"PRIu64"",
	    access_close, osp->os_share_acc_read, osp->os_share_acc_write));

	/* If we're closing the last READ, need to downgrade */
	if ((access_close & FREAD) && (osp->os_share_acc_read == 1))
		downgrade_acc |= OPEN4_SHARE_ACCESS_READ;

	/* if we're closing the last WRITE, need to downgrade */
	if ((access_close & FWRITE) && (osp->os_share_acc_write == 1))
		downgrade_acc |= OPEN4_SHARE_ACCESS_WRITE;

	downgrade_deny = OPEN4_SHARE_DENY_NONE;

	new_acc = 0;
	new_deny = 0;

	/* set our new access and deny share bits */
	if ((osp->os_share_acc_read > 0) &&
	    !(downgrade_acc & OPEN4_SHARE_ACCESS_READ))
		new_acc |= OPEN4_SHARE_ACCESS_READ;
	if ((osp->os_share_acc_write > 0) &&
	    !(downgrade_acc & OPEN4_SHARE_ACCESS_WRITE))
		new_acc |= OPEN4_SHARE_ACCESS_WRITE;

	new_deny = OPEN4_SHARE_DENY_NONE;

	NFS4_DEBUG(nfs4_client_open_dg, (CE_NOTE, "nfs4_open_downgrade:"
	    "downgrade acc 0x%x deny 0x%x", downgrade_acc, downgrade_deny));
	NFS4_DEBUG(nfs4_client_open_dg, (CE_NOTE, "nfs4_open_downgrade:"
	    "new acc 0x%x deny 0x%x", new_acc, new_deny));

	/*
	 * Check to see if we aren't actually doing any downgrade or
	 * if this is the last 'close' but the file is still mmapped.
	 * Skip this if this a lost request resend so we don't decrement
	 * the osp's share counts more than once.
	 */
	if (!lrp &&
	    ((downgrade_acc == 0 && downgrade_deny == 0) ||
	    (new_acc == 0 && new_deny == 0))) {
		/*
		 * No downgrade to do, but still need to
		 * update osp's os_share_* counts.
		 */
		NFS4_DEBUG(nfs4_client_open_dg, (CE_NOTE,
		    "nfs4_open_downgrade: just lower the osp's count by %s",
		    (access_close & FREAD) && (access_close & FWRITE) ?
		    "read and write" : (access_close & FREAD) ? "read" :
		    (access_close & FWRITE) ? "write" : "bogus"));
		if (access_close & FREAD)
			osp->os_share_acc_read--;
		if (access_close & FWRITE)
			osp->os_share_acc_write--;
		osp->os_share_deny_none--;
		nfs4_error_zinit(ep);

		goto no_args_out;
	}

	if (osp->os_orig_oo_name != oop->oo_name) {
		ep->error = EIO;
		goto no_args_out;
	}

	/* setup the COMPOUND args */
	if (lrp)
		args.ctag = TAG_OPEN_DG_LOST;
	else
		args.ctag = TAG_OPEN_DG;

	args.array_len = 3;
	args.array = argop;

	/* putfh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	argop[1].argop = OP_GETATTR;
	argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[1].nfs_argop4_u.opgetattr.mi = mi;

	ASSERT(mutex_owned(&osp->os_sync_lock));
	ASSERT(osp->os_delegation == FALSE);

	/* open downgrade */
	argop[2].argop = OP_OPEN_DOWNGRADE;
	argop[2].nfs_argop4_u.opopen_downgrade.open_stateid = osp->open_stateid;
	argop[2].nfs_argop4_u.opopen_downgrade.share_access = new_acc;
	argop[2].nfs_argop4_u.opopen_downgrade.share_deny = new_deny;
	argop[2].nfs_argop4_u.opopen_downgrade.seqid = seqid;

	t = gethrtime();

	rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep);

	if (ep->error == 0 && nfs4_need_to_bump_seqid(&res))
		nfs4_set_open_seqid(seqid, oop, args.ctag);

	if ((ep->error == EACCES ||
	    (ep->error == 0 && res.status == NFS4ERR_ACCESS)) &&
	    cred_otw != cr) {
		crfree(cred_otw);
		cred_otw = cr;
		crhold(cred_otw);
		if (!ep->error)
			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		goto cred_retry;
	}

	needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp);

	if (needrecov && recov_credpp) {
		*recov_credpp = cred_otw;
		crhold(*recov_credpp);
		if (recov_seqidp)
			*recov_seqidp = seqid;
	}

	if (!ep->error && !res.status) {
		/* get the open downgrade results */
		resop = &res.array[2];
		odg_res = &resop->nfs_resop4_u.opopen_downgrade;

		osp->open_stateid = odg_res->open_stateid;

		/* set the open streams new access/deny bits */
		if (access_close & FREAD)
			osp->os_share_acc_read--;
		if (access_close & FWRITE)
			osp->os_share_acc_write--;
		osp->os_share_deny_none--;
		osp->os_dc_openacc = new_acc;

		nfs4_attr_cache(vp,
		    &res.array[1].nfs_resop4_u.opgetattr.ga_res,
		    t, cred_otw, TRUE, NULL);
	}

	if (!ep->error)
		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

no_args_out:
	crfree(cred_otw);
}

/*
 * If an OPEN request gets ETIMEDOUT or EINTR (that includes bailing out
 * because the filesystem was forcibly unmounted) then we don't know if we
 * potentially left state dangling on the server, therefore the recovery
 * framework makes this call to resend the OPEN request and then undo it.
 */
void
nfs4_resend_open_otw(vnode_t **vpp, nfs4_lost_rqst_t *resend_rqstp,
    nfs4_error_t *ep)
{
	COMPOUND4args_clnt	args;
	COMPOUND4res_clnt	res;
	nfs_argop4		argop[4];
	GETFH4res		*gf_res = NULL;
	OPEN4cargs		*open_args;
	OPEN4res		*op_res;
	char			*destcfp;
	int			destclen;
	nfs4_ga_res_t		*garp;
	vnode_t			*dvp = NULL, *vp = NULL;
	rnode4_t		*rp = NULL, *drp = NULL;
	cred_t			*cr = NULL;
	seqid4			seqid;
	nfs4_open_owner_t	*oop = NULL;
	nfs4_open_stream_t	*osp = NULL;
	component4		*srcfp;
	open_claim_type4	claim;
	mntinfo4_t		*mi;
	int			doqueue = 1;
	bool_t			retry_open = FALSE;
	int			created_osp = 0;
	hrtime_t		t;
	char 			*failed_msg = "";
	int			fh_different;
	int			reopen = 0;

	nfs4_error_zinit(ep);

	cr = resend_rqstp->lr_cr;
	dvp = resend_rqstp->lr_dvp;

	vp = *vpp;
	if (vp) {
		ASSERT(nfs4_consistent_type(vp));
		rp = VTOR4(vp);
	}

	if (rp) {
		/* If the file failed recovery, just quit. */
		mutex_enter(&rp->r_statelock);
		if (rp->r_flags & R4RECOVERR) {
			mutex_exit(&rp->r_statelock);
			ep->error = EIO;
			return;
		}
		mutex_exit(&rp->r_statelock);
	}

	if (dvp) {
		drp = VTOR4(dvp);
		/* If the parent directory failed recovery, just quit. */
		mutex_enter(&drp->r_statelock);
		if (drp->r_flags & R4RECOVERR) {
			mutex_exit(&drp->r_statelock);
			ep->error = EIO;
			return;
		}
		mutex_exit(&drp->r_statelock);
	} else
		reopen = 1;	/* NULL dvp means this is a reopen */

	claim = resend_rqstp->lr_oclaim;
	ASSERT(claim == CLAIM_NULL || claim == CLAIM_DELEGATE_CUR);

	args.ctag = TAG_OPEN_LOST;
	args.array_len = 4;
	args.array = argop;

	argop[0].argop = OP_CPUTFH;
	if (reopen) {
		ASSERT(vp != NULL);

		mi = VTOMI4(vp);
		/*
		 * if this is a file mount then
		 * use the mntinfo parentfh
		 */
		argop[0].nfs_argop4_u.opcputfh.sfh =
		    (vp->v_flag & VROOT) ? mi->mi_srvparentfh :
		    VTOSV(vp)->sv_dfh;
		args.ctag = TAG_REOPEN_LOST;
	} else {
		argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh;
		mi = VTOMI4(dvp);
	}

	argop[1].argop = OP_COPEN;
	open_args = &argop[1].nfs_argop4_u.opcopen;
	open_args->claim = claim;

	/*
	 * If we sent over a OPEN with CREATE then the only
	 * thing we care about is to not leave dangling state
	 * on the server, not whether the file we potentially
	 * created remains on the server.  So even though the
	 * lost open request specified a CREATE, we only wish
	 * to do a non-CREATE OPEN.
	 */
	open_args->opentype = OPEN4_NOCREATE;

	srcfp = &resend_rqstp->lr_ofile;
	destclen = srcfp->utf8string_len;
	destcfp = kmem_alloc(destclen + 1, KM_SLEEP);
	bcopy(srcfp->utf8string_val, destcfp, destclen);
	destcfp[destclen] = '\0';
	if (claim == CLAIM_DELEGATE_CUR) {
		open_args->open_claim4_u.delegate_cur_info.delegate_stateid =
		    resend_rqstp->lr_ostateid;
		open_args->open_claim4_u.delegate_cur_info.cfile = destcfp;
	} else {
		open_args->open_claim4_u.cfile = destcfp;
	}

	open_args->share_access = resend_rqstp->lr_oacc;
	open_args->share_deny = resend_rqstp->lr_odeny;
	oop = resend_rqstp->lr_oop;
	ASSERT(oop != NULL);

	open_args->owner.clientid = mi2clientid(mi);
	/* this length never changes */
	open_args->owner.owner_len = sizeof (oop->oo_name);
	open_args->owner.owner_val =
	    kmem_alloc(open_args->owner.owner_len, KM_SLEEP);

	ep->error = nfs4_start_open_seqid_sync(oop, mi);
	ASSERT(ep->error == 0);		/* recov thread always succeeds */
	/*
	 * We can get away with not saving the seqid upon detection
	 * of a lost request, and now just use the open owner's current
	 * seqid since we only allow one op OTW per seqid and lost
	 * requests are saved FIFO.
	 */
	seqid = nfs4_get_open_seqid(oop) + 1;
	open_args->seqid = seqid;

	bcopy(&oop->oo_name, open_args->owner.owner_val,
	    open_args->owner.owner_len);

	/* getfh */
	argop[2].argop = OP_GETFH;

	/* Construct the getattr part of the compound */
	argop[3].argop = OP_GETATTR;
	argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[3].nfs_argop4_u.opgetattr.mi = mi;

	res.array = NULL;

	t = gethrtime();

	rfs4call(mi, &args, &res, cr, &doqueue, 0, ep);

	if (ep->error == 0 && nfs4_need_to_bump_seqid(&res))
		nfs4_set_open_seqid(seqid, oop, args.ctag);

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
	    "nfs4_resend_open_otw: error %d stat %d", ep->error, res.status));

	if (ep->error || res.status)
		goto err_out;

	op_res = &res.array[1].nfs_resop4_u.opopen;
	gf_res = &res.array[2].nfs_resop4_u.opgetfh;
	garp = &res.array[3].nfs_resop4_u.opgetattr.ga_res;

	if (!vp) {
		int rnode_err = 0;
		nfs4_sharedfh_t *sfh;

		/*
		 * If we can't decode all the attributes they are not usable,
		 * just make the vnode.
		 */

		sfh = sfh4_get(&gf_res->object, VTOMI4(dvp));
		*vpp = makenfs4node(sfh, garp, dvp->v_vfsp, t, cr, dvp,
		    fn_get(VTOSV(dvp)->sv_name,
		    open_args->open_claim4_u.cfile, sfh));
		sfh4_rele(&sfh);
		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
		    "nfs4_resend_open_otw: made vp %p for file %s",
		    (void *)(*vpp), open_args->open_claim4_u.cfile));

		if (ep->error)
			PURGE_ATTRCACHE4(*vpp);

		/*
		 * For the newly created *vpp case, make sure the rnode
		 * isn't bad before using it.
		 */
		mutex_enter(&(VTOR4(*vpp))->r_statelock);
		if (VTOR4(*vpp)->r_flags & R4RECOVERR)
			rnode_err = EIO;
		mutex_exit(&(VTOR4(*vpp))->r_statelock);

		if (rnode_err) {
			NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
			    "nfs4_resend_open_otw: rp %p is bad",
			    (void *)VTOR4(*vpp)));
			ep->error = rnode_err;
			goto err_out;
		}

		vp = *vpp;
		rp = VTOR4(vp);
	}

	if (reopen) {
		/*
		 * Check if the path we reopened really is the same
		 * file. We could end up in a situation were the file
		 * was removed and a new file created with the same name.
		 */
		(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
		fh_different =
		    (nfs4cmpfh(&rp->r_fh->sfh_fh, &gf_res->object) != 0);
		if (fh_different) {
			if (mi->mi_fh_expire_type == FH4_PERSISTENT ||
			    mi->mi_fh_expire_type & FH4_NOEXPIRE_WITH_OPEN) {
				/* Oops, we don't have the same file */
				if (mi->mi_fh_expire_type == FH4_PERSISTENT)
					failed_msg =
					    "Couldn't reopen: Persistant "
					    "file handle changed";
				else
					failed_msg =
					    "Couldn't reopen: Volatile "
					    "(no expire on open) file handle "
					    "changed";

				nfs4_end_open_seqid_sync(oop);
				kmem_free(destcfp, destclen + 1);
				nfs4args_copen_free(open_args);
				xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
				nfs_rw_exit(&mi->mi_fh_lock);
				nfs4_fail_recov(vp, failed_msg, ep->error,
				    ep->stat);
				return;
			} else {
				/*
				 * We have volatile file handles that don't
				 * compare.  If the fids are the same then we
				 * assume that the file handle expired but the
				 * renode still refers to the same file object.
				 *
				 * First check that we have fids or not.
				 * If we don't we have a dumb server so we will
				 * just assume every thing is ok for now.
				 */
				if (!ep->error &&
				    garp->n4g_va.va_mask & AT_NODEID &&
				    rp->r_attr.va_mask & AT_NODEID &&
				    rp->r_attr.va_nodeid !=
				    garp->n4g_va.va_nodeid) {
					/*
					 * We have fids, but they don't
					 * compare. So kill the file.
					 */
					failed_msg =
					    "Couldn't reopen: file handle "
					    "changed due to mismatched fids";
					nfs4_end_open_seqid_sync(oop);
					kmem_free(destcfp, destclen + 1);
					nfs4args_copen_free(open_args);
					xdr_free(xdr_COMPOUND4res_clnt,
					    (caddr_t)&res);
					nfs_rw_exit(&mi->mi_fh_lock);
					nfs4_fail_recov(vp, failed_msg,
					    ep->error, ep->stat);
					return;
				} else {
					/*
					 * We have volatile file handles that
					 * refers to the same file (at least
					 * they have the same fid) or we don't
					 * have fids so we can't tell. :(. We'll
					 * be a kind and accepting client so
					 * we'll update the rnode's file
					 * handle with the otw handle.
					 *
					 * We need to drop mi->mi_fh_lock since
					 * sh4_update acquires it. Since there
					 * is only one recovery thread there is
					 * no race.
					 */
					nfs_rw_exit(&mi->mi_fh_lock);
					sfh4_update(rp->r_fh, &gf_res->object);
				}
			}
		} else {
			nfs_rw_exit(&mi->mi_fh_lock);
		}
	}

	ASSERT(nfs4_consistent_type(vp));

	if (op_res->rflags & OPEN4_RESULT_CONFIRM)
		nfs4open_confirm(vp, &seqid, &op_res->stateid, cr, TRUE,
		    &retry_open, oop, TRUE, ep, NULL);
	if (ep->error || ep->stat) {
		nfs4_end_open_seqid_sync(oop);
		kmem_free(destcfp, destclen + 1);
		nfs4args_copen_free(open_args);
		if (!ep->error)
			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		return;
	}

	if (reopen) {
		/*
		 * Doing a reopen here so the osp should already exist.
		 * If not, something changed or went very wrong.
		 *
		 * returns with 'os_sync_lock' held
		 */
		osp = find_open_stream(oop, rp);
		if (!osp) {
			NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
			    "nfs4_resend_open_otw: couldn't find osp"));
			ep->error = EINVAL;
			goto err_out;
		}
		osp->os_open_ref_count++;
	} else {
		mutex_enter(&oop->oo_lock);
		oop->oo_just_created = NFS4_PERM_CREATED;
		mutex_exit(&oop->oo_lock);

		/* returns with 'os_sync_lock' held */
		osp = find_or_create_open_stream(oop, rp, &created_osp);
		if (!osp) {
			NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
			    "nfs4_resend_open_otw: couldn't create osp"));
			ep->error = EINVAL;
			goto err_out;
		}
	}

	osp->open_stateid = op_res->stateid;
	osp->os_delegation = FALSE;
	/*
	 * Need to reset this bitfield for the possible case where we were
	 * going to OTW CLOSE the file, got a non-recoverable error, and before
	 * we could retry the CLOSE, OPENed the file again.
	 */
	ASSERT(osp->os_open_owner->oo_seqid_inuse);
	osp->os_final_close = 0;
	osp->os_force_close = 0;

	if (!reopen) {
		if (open_args->share_access & OPEN4_SHARE_ACCESS_READ)
			osp->os_share_acc_read++;
		if (open_args->share_access & OPEN4_SHARE_ACCESS_WRITE)
			osp->os_share_acc_write++;
		osp->os_share_deny_none++;
	}

	mutex_exit(&osp->os_sync_lock);
	if (created_osp)
		nfs4_inc_state_ref_count(mi);
	open_stream_rele(osp, rp);

	nfs4_end_open_seqid_sync(oop);

	/* accept delegation, if any */
	nfs4_delegation_accept(rp, claim, op_res, garp, cr);

	kmem_free(destcfp, destclen + 1);
	nfs4args_copen_free(open_args);

	if (claim == CLAIM_DELEGATE_CUR)
		nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
	else
		PURGE_ATTRCACHE4(vp);

	xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

	ASSERT(nfs4_consistent_type(vp));

	return;

err_out:
	nfs4_end_open_seqid_sync(oop);
	kmem_free(destcfp, destclen + 1);
	nfs4args_copen_free(open_args);
	if (!ep->error)
		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
}