/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 *
 * Inter-Domain Network
 *
 * Shared Memory Region (SMR) supporting code.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/machparam.h>
#include <sys/debug.h>
#include <sys/cpuvar.h>
#include <sys/kmem.h>
#include <sys/mutex.h>
#include <sys/rwlock.h>
#include <sys/systm.h>
#include <sys/machlock.h>
#include <sys/membar.h>
#include <sys/mman.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/hat_sfmmu.h>
#include <sys/vm_machparam.h>
#include <sys/x_call.h>

#include <sys/idn.h>

#ifdef DEBUG
#define	DIOCHECK(domid) \
{ \
	int	_dio; \
	if ((_dio = idn_domain[domid].dio) < 0) { \
		cmn_err(CE_WARN, \
			">>>>> file %s, line %d: domain %d, dio = %d", \
			__FILE__, __LINE__, (domid), _dio); \
	} \
}
#else
#define	DIOCHECK(domid)
#endif /* DEBUG */

static int	smr_slab_alloc_local(int domid, smr_slab_t **spp);
static int	smr_slab_alloc_remote(int domid, smr_slab_t **spp);
static void	smr_slab_free_local(int domid, smr_slab_t *sp);
static void	smr_slab_free_remote(int domid, smr_slab_t *sp);
static int 	smr_slabwaiter_register(int domid);
static int 	smr_slabwaiter_unregister(int domid, smr_slab_t **spp);
static int 	smr_slaballoc_wait(int domid, smr_slab_t **spp);
static smr_slab_t 	*smr_slab_reserve(int domid);
static void 	smr_slab_unreserve(int domid, smr_slab_t *sp);
static void	smr_slab_reap_global();

/*
 * Can only be called by the master.  Allocate a slab from the
 * local pool representing the SMR, on behalf of the given
 * domain.  Slab is either being requested for use by the
 * local domain (i.e. domid == idn.localid), or it's being
 * allocated to give to a remote domain which requested one.
 * In the base of allocating on behalf of a remote domain,
 * smr_slab_t structure is used simply to manage ownership.
 *
 * Returns:	smr_slaballoc_wait
 * 		(EINVAL, ETIMEDOUT)
 *		smr_slabwatier_unregister
 *		(0, EINVAL, EBUSY, ENOMEM)
 *		ENOLCK
 */
static int
smr_slab_alloc_local(int domid, smr_slab_t **spp)
{
	int		serrno = 0;
	int		nwait;
	smr_slab_t	*sp;
	idn_domain_t	*dp;


	/*
	 * Only the master can make local allocations.
	 */
	ASSERT(IDN_GET_MASTERID() != IDN_NIL_DOMID);
	ASSERT(idn.localid == IDN_GET_MASTERID());

	*spp = NULL;

	dp = &idn_domain[domid];
	ASSERT(DSLAB_READ_HELD(domid));
	ASSERT(dp->dslab_state == DSLAB_STATE_LOCAL);

	/*
	 * Register myself with the waiting list.
	 */
	nwait = smr_slabwaiter_register(domid);

	if (nwait > 1) {
		/*
		 * XXX - old comment?
		 * Need to drop the read lock _after_ registering
		 * ourselves with the potential wait list for this allocation.
		 * Although this allocation is not a remote one, we could
		 * still have multiple threads on the master trying to
		 * satisfy (allocate) request on behalf of a remote domain.
		 */
		/*
		 * Somebody is already in the process of satisfying
		 * the allocation request for this respective
		 * domain.  All we need to do is wait and let
		 * it happen.
		 */
		serrno = smr_slaballoc_wait(domid, spp);
		return (serrno);
	}
	/*
	 * I'm the original slab requester for this domain.  It's local
	 * so go ahead and do the job.
	 */

	if ((sp = smr_slab_reserve(domid)) == NULL)
		serrno = ENOMEM;

	/*
	 * Allocation may have failed.  In either case we've
	 * got to do the put to at least wake potential waiters up.
	 */
	if (!serrno) {
		if (DSLAB_LOCK_TRYUPGRADE(domid) == 0) {
			DSLAB_UNLOCK(domid);
			DSLAB_LOCK_EXCL(domid);
		}
	}

	(void) smr_slaballoc_put(domid, sp, 0, serrno);

	/*
	 * If serrno is ENOLCK here, then we must have failed
	 * on the upgrade above, so lock already dropped.
	 */
	if (serrno != ENOLCK) {
		/*
		 * Need to drop since reaping may be recursive?
		 */
		DSLAB_UNLOCK(domid);
	}

	/*
	 * Since we were the original requester but never went
	 * to sleep, we need to directly unregister ourselves
	 * from the waiting list.
	 */
	serrno = smr_slabwaiter_unregister(domid, spp);

	/*
	 * Now that we've satisfied the request, let's check if any
	 * reaping is necessary.  Only the master does this and only
	 * when allocating slabs, an infrequent event :-o
	 */
	smr_slab_reap_global();

	ASSERT((serrno == 0) ? (*spp != NULL) : (*spp == NULL));

	DSLAB_LOCK_SHARED(domid);

	return (serrno);
}

/*
 * Can only be called by a slave on behalf of himself.  Need to
 * make a request to the master to allocate a slab of SMR buffers
 * for the local domain.
 *
 * Returns:	smr_slaballoc_wait
 *		(0, EINVAL, EBUSY, ENOMEM)
 *		ENOLCK
 *		ECANCELED
 */
static int
smr_slab_alloc_remote(int domid, smr_slab_t **spp)
{
	int		nwait;
	int		serrno = 0;
	int		bailout = 0;
	int		masterid;
	idn_domain_t	*dp, *mdp = NULL;
	procname_t	proc = "smr_slab_alloc_remote";

	/*
	 * Only slaves make remote allocations.
	 */
	ASSERT(idn.localid != IDN_GET_MASTERID());
	ASSERT(domid == idn.localid);
	ASSERT(IDN_GET_MASTERID() != IDN_NIL_DOMID);

	*spp = NULL;

	dp = &idn_domain[domid];
	ASSERT(DSLAB_READ_HELD(domid));
	ASSERT(dp->dslab_state == DSLAB_STATE_REMOTE);

	/*
	 * Register myself with the slaballoc waiting list.
	 * Note that only allow one outstanding allocation
	 * request for the given domain.  Other callers which
	 * detect a slab is needed simply get stuck on the
	 * waiting list waiting for the original caller to
	 * get the job done.
	 * The waiter_register routine will allocate the necessary
	 * slab structure which will ultimately be inserted in
	 * the domain's slab list via smr_slaballoc_put().
	 */
	nwait = smr_slabwaiter_register(domid);

	/*
	 * Make sure we have a connection with the master
	 * before we wait around for nothing and send a
	 * command off to nowhere.
	 * First do a quick (no lock) check for global okayness.
	 */
	if ((idn.state != IDNGS_ONLINE) ||
	    ((masterid = IDN_GET_MASTERID()) == IDN_NIL_DOMID)) {
		bailout = 1;
		serrno = ECANCELED;
	}
	/*
	 * We need to drop our read lock _before_ acquiring the
	 * slaballoc waiter lock.  This is necessary because the
	 * thread that receives the slab alloc response and fills
	 * in the slab structure will need to grab the domain write
	 * lock while holding onto the slaballoc waiter lock.
	 * Potentially could deadlock if we didn't drop our domain
	 * lock before.  Plus, we've registered.
	 *
	 * 4093209 - Note also that we do this _after_ the check for
	 *	idn.masterid where we grab the READER global
	 *	lock.  This is to prevent somebody from
	 *	changing our state after we drop the drwlock.
	 *	A deadlock can occur when shutting down a
	 *	domain we're holding the
	 */

	if (!bailout) {
		mdp = &idn_domain[masterid];
		/*
		 * Global state is okay.  Let's double check the
		 * state of our actual target domain.
		 */
		if (mdp->dstate != IDNDS_CONNECTED) {
			bailout = 1;
			serrno = ECANCELED;
		} else if (IDN_DLOCK_TRY_SHARED(masterid)) {
			if (mdp->dstate != IDNDS_CONNECTED) {
				bailout = 1;
				serrno = ECANCELED;
				IDN_DUNLOCK(masterid);
			} else if (nwait != 1) {
				IDN_DUNLOCK(masterid);
			}
			/*
			 * Note that keep the drwlock(read) for
			 * the target (master) domain if it appears
			 * we're the lucky one to send the command.
			 * We hold onto the lock until we've actually
			 * sent the command out.
			 * We don't reach this place unless it
			 * appears everything is kosher with
			 * the target (master) domain.
			 */
		} else {
			bailout = 1;
			serrno = ENOLCK;
		}
	}

	if (bailout) {
		ASSERT(serrno);
		/*
		 * Gotta bail.  Abort operation.  Error result
		 * will be picked up when we attempt to wait.
		 */
		PR_SMR("%s: BAILING OUT on behalf domain %d "
		    "(err=%d, gs=%s, ms=%s)\n",
		    proc, domid, serrno, idngs_str[idn.state],
		    (masterid == IDN_NIL_DOMID)
		    ? "unknown" : idnds_str[idn_domain[masterid].dstate]);
		(void) smr_slabwaiter_abort(domid, serrno);

	} else if (nwait == 1) {
		/*
		 * We are the original requester.  Initiate the
		 * actual request to the master.
		 */
		idn_send_cmd(masterid, IDNCMD_SLABALLOC, IDN_SLAB_SIZE, 0, 0);
		ASSERT(mdp);
		IDN_DUNLOCK(masterid);
	}

	/*
	 * Wait here for response.  Once awakened func returns
	 * with slab structure possibly filled with gifts!
	 */
	serrno = smr_slaballoc_wait(domid, spp);

	return (serrno);
}

/*
 * Allocate a slab from the Master on behalf
 * of the given domain.  Note that master uses
 * this function to allocate slabs on behalf of
 * remote domains also.
 * Entered with drwlock held.
 * Leaves with drwlock dropped.
 * Returns:	EDQUOT
 *		EINVAL
 *		ENOLCK
 *		smr_slab_alloc_local
 *		smr_slab_alloc_remote
 *		(0, EINVAL, EBUSY, ENOMEM)
 */
int
smr_slab_alloc(int domid, smr_slab_t **spp)
{
	int		serrno = 0;
	idn_domain_t	*dp;
	procname_t	proc = "smr_slab_alloc";


	dp = &idn_domain[domid];

	ASSERT(DSLAB_READ_HELD(domid));
	ASSERT(dp->dslab_state != DSLAB_STATE_UNKNOWN);

	*spp = NULL;

	switch (dp->dslab_state) {
	case DSLAB_STATE_UNKNOWN:
		cmn_err(CE_WARN,
		    "IDN: 300: no slab allocations without a master");
		serrno = EINVAL;
		break;

	case DSLAB_STATE_LOCAL:
		/*
		 * If I'm the master, then get a slab
		 * from the local SMR pool, but only
		 * if the number of allocated slabs has
		 * not been exceeded.
		 */
		if (((int)dp->dnslabs < IDN_SLAB_MAXPERDOMAIN) ||
		    !IDN_SLAB_MAXPERDOMAIN)
			serrno = smr_slab_alloc_local(domid, spp);
		else
			serrno = EDQUOT;
		break;

	case DSLAB_STATE_REMOTE:
		/*
		 * Have to make a remote request.
		 * In order to prevent overwhelming the master
		 * with a bunch of requests that he won't be able
		 * to handle we do a check to see if we're still
		 * under quota.  Note that the limit is known
		 * apriori based on the SMR/NWR size and
		 * IDN_SLAB_MINTOTAL.  Domains must have the same
		 * size SMR/NWR, however they can have different
		 * IDN_SLAB_MINTOTAL.  Thus a domain could throttle
		 * itself however it wishes.
		 */
		if (((int)dp->dnslabs < IDN_SLAB_MAXPERDOMAIN) ||
		    !IDN_SLAB_MAXPERDOMAIN)
			serrno = smr_slab_alloc_remote(domid, spp);
		else
			serrno = EDQUOT;
		break;

	default:
		cmn_err(CE_WARN,
		    "IDN: 301: (ALLOC) unknown slab state (%d) "
		    "for domain %d", dp->dslab_state, domid);
		serrno = EINVAL;
		break;
	}

	if (*spp == NULL) {
		PR_SMR("%s: failed to allocate %s slab [serrno = %d]\n",
		    proc, (idn.localid == IDN_GET_MASTERID()) ?
		    "local" : "remote", serrno);
	}

	if (serrno) {
		IDN_GKSTAT_GLOBAL_EVENT(gk_slabfail, gk_slabfail_last);
	}

	return (serrno);
}

static void
smr_slab_free_local(int domid, smr_slab_t *sp)
{
	int	rv;

	/*
	 * Do a slaballoc_put just in case there may have
	 * been waiters for slabs for this respective domain
	 * before we unreserve this slab.
	 */
	rv = smr_slaballoc_put(domid, sp, 0, 0);

	if (rv == -1) {
		/*
		 * Put failed.  Must not have been any waiters.
		 * Go ahead and unreserve the space.
		 */
		smr_slab_unreserve(domid, sp);
	}
}

static void
smr_slab_free_remote(int domid, smr_slab_t *sp)
{
	smr_offset_t	slab_offset;
	int		slab_size;
	int		rv;
	int		masterid;

	ASSERT(domid == idn.localid);
	ASSERT(idn.localid != IDN_GET_MASTERID());
	ASSERT(DSLAB_WRITE_HELD(domid));
	ASSERT(idn_domain[domid].dslab_state == DSLAB_STATE_REMOTE);

	masterid = IDN_GET_MASTERID();

	ASSERT(masterid != IDN_NIL_DOMID);

	slab_offset = IDN_ADDR2OFFSET(sp->sl_start);
	slab_size   = (int)(sp->sl_end - sp->sl_start);

	/*
	 * Do a slaballoc_put just in case there may have
	 * been waiters for slabs for this domain before
	 * returning back to the master.
	 */
	rv = smr_slaballoc_put(domid, sp, 0, 0);

	if ((rv == -1) && (masterid != IDN_NIL_DOMID)) {
		/*
		 * Put failed.  No waiters so free the local data
		 * structure ship the SMR range off to the master.
		 */
		smr_free_buflist(sp);
		FREESTRUCT(sp, smr_slab_t, 1);

		IDN_DLOCK_SHARED(masterid);
		idn_send_cmd(masterid, IDNCMD_SLABFREE, slab_offset, slab_size,
		    0);
		IDN_DUNLOCK(masterid);
	}
}

/*
 * Free up the list of slabs passed
 */
void
smr_slab_free(int domid, smr_slab_t *sp)
{
	smr_slab_t	*nsp = NULL;

	ASSERT(DSLAB_WRITE_HELD(domid));

	if (sp == NULL)
		return;

	ASSERT(IDN_GET_MASTERID() != IDN_NIL_DOMID);

	switch (idn_domain[domid].dslab_state) {
	case DSLAB_STATE_UNKNOWN:
		cmn_err(CE_WARN, "IDN: 302: no slab free without a master");
		break;

	case DSLAB_STATE_LOCAL:
		/*
		 * If I'm the master then put the slabs
		 * back to the local SMR pool.
		 */
		for (; sp; sp = nsp) {
			nsp = sp->sl_next;
			smr_slab_free_local(domid, sp);
		}
		break;

	case DSLAB_STATE_REMOTE:
		/*
		 * If the domid is my own then I'm freeing
		 * a slab back to the Master.
		 */
		for (; sp; sp = nsp) {
			nsp = sp->sl_next;
			smr_slab_free_remote(domid, sp);
		}
		break;

	default:
		cmn_err(CE_WARN,
		    "IDN: 301: (FREE) unknown slab state (%d) for domain %d",
		    idn_domain[domid].dslab_state, domid);
		break;
	}
}

/*
 * Free up the list of slab data structures ONLY.
 * This is called during a fatal shutdown of the master
 * where we need to garbage collect the locally allocated
 * data structures used to manage slabs allocated to the
 * local domain.  Should never be called by a master since
 * the master can do a regular smr_slab_free.
 */
void
smr_slab_garbage_collection(smr_slab_t *sp)
{
	smr_slab_t	*nsp;

	ASSERT(idn_domain[idn.localid].dvote.v.master == 0);

	if (sp == NULL)
		return;
	/*
	 * Since this is only ever called by a slave,
	 * the slab structure size always contains a buflist.
	 */
	for (; sp; sp = nsp) {
		nsp = sp->sl_next;
		smr_free_buflist(sp);
		FREESTRUCT(sp, smr_slab_t, 1);
	}
}

/*
 * Allocate a SMR buffer on behalf of the local domain
 * which is ultimately targeted for the given domain.
 *
 * IMPORTANT: This routine is going to drop the domain rwlock (drwlock)
 *	      for the domain on whose behalf the request is being
 *	      made.  This routine canNOT block on trying to
 *	      reacquire the drwlock.  If he does block then somebody
 *	      must have the write lock on the domain which most likely
 *	      means the domain is going south anyway, so just bail on
 *	      this buffer.  Higher levels will retry if needed.
 *
 * XXX - Support larger than IDN_SMR_BUFSIZE allocations?
 *
 * Returns:	A negative return value indicates lock lost on domid.
 *		EINVAL, ENOLINK, ENOLCK(internal)
 *		smr_slaballoc_wait
 * 		(EINVAL, ETIMEDOUT)
 *		smr_slabwatier_unregister
 *		(0, EINVAL, EBUSY, ENOMEM)
 */
int
smr_buf_alloc(int domid, uint_t len, caddr_t *bufpp)
{
	register idn_domain_t	*dp, *ldp;
	smr_slab_t	*sp;
	caddr_t		bufp = NULL;
	int		serrno;
	procname_t	proc = "smr_buf_alloc";

	dp = &idn_domain[domid];
	/*
	 * Local domain can only allocate on behalf of
	 * itself if this is a priviledged call and the
	 * caller is the master.
	 */
	ASSERT((domid != idn.localid) && (domid != IDN_NIL_DOMID));

	*bufpp = NULL;

	if (len > IDN_DATA_SIZE) {
		cmn_err(CE_WARN,
		    "IDN: 303: buffer len %d > IDN_DATA_SIZE (%lu)",
		    len, IDN_DATA_SIZE);
		IDN_GKSTAT_GLOBAL_EVENT(gk_buffail, gk_buffail_last);
		return (EINVAL);
	}

	/*
	 * Need to go to my local slab list to find
	 * a buffer.
	 */
	ldp = &idn_domain[idn.localid];
	/*
	 * Now we loop trying to locate a buffer out of our
	 * slabs.  We continue this until either we find a
	 * buffer or we're unable to allocate a slab.  Note
	 * that new slabs are allocated to the front.
	 */
	DSLAB_LOCK_SHARED(idn.localid);
	sp = ldp->dslab;
	do {
		int	spl, all_empty;

		if (sp == NULL) {
			if ((serrno = smr_slab_alloc(idn.localid, &sp)) != 0) {
				PR_SMR("%s:%d: failed to allocate "
				    "slab [serrno = %d]",
				    proc, domid, serrno);
				DSLAB_UNLOCK(idn.localid);
				IDN_GKSTAT_GLOBAL_EVENT(gk_buffail,
				    gk_buffail_last);
				return (serrno);
			}
			/*
			 * Of course, the world may have changed while
			 * we dropped the lock.  Better make sure we're
			 * still established.
			 */
			if (dp->dstate != IDNDS_CONNECTED) {
				PR_SMR("%s:%d: state changed during slab "
				    "alloc (dstate = %s)\n",
				    proc, domid, idnds_str[dp->dstate]);
				DSLAB_UNLOCK(idn.localid);
				IDN_GKSTAT_GLOBAL_EVENT(gk_buffail,
				    gk_buffail_last);
				return (ENOLINK);
			}
			/*
			 * We were able to allocate a slab.  Should
			 * be at the front of the list, spin again.
			 */
			sp = ldp->dslab;
		}
		/*
		 * If we have reached here then we have a slab!
		 * Hopefully there are free bufs there :-o
		 */
		spl = splhi();
		all_empty = 1;
		for (; sp && !bufp; sp = sp->sl_next) {
			smr_slabbuf_t	*bp;

			if (sp->sl_free == NULL)
				continue;

			if (!lock_try(&sp->sl_lock)) {
				all_empty = 0;
				continue;
			}

			if ((bp = sp->sl_free) == NULL) {
				lock_clear(&sp->sl_lock);
				continue;
			}

			sp->sl_free = bp->sb_next;
			bp->sb_next = sp->sl_inuse;
			sp->sl_inuse = bp;
			/*
			 * Found a free buffer.
			 */
			bp->sb_domid = domid;
			bufp = bp->sb_bufp;
			lock_clear(&sp->sl_lock);
		}
		splx(spl);

		if (!all_empty && !bufp) {
			/*
			 * If we still haven't found a buffer, but
			 * there's still possibly a buffer available,
			 * then try again.  Only if we're absolutely
			 * sure all slabs are empty do we attempt
			 * to allocate a new one.
			 */
			sp = ldp->dslab;
		}
	} while (bufp == NULL);

	*bufpp = bufp;

	ATOMIC_INC(dp->dio);

	DSLAB_UNLOCK(idn.localid);

	return (0);
}

/*
 * Free a buffer allocated to the local domain back to
 * its respective slab.  Slabs are freed via the slab-reap command.
 * XXX - Support larger than IDN_SMR_BUFSIZE allocations?
 */
int
smr_buf_free(int domid, caddr_t bufp, uint_t len)
{
	register smr_slab_t	*sp;
	smr_slabbuf_t		*bp, **bpp;
	idn_domain_t		*ldp;
	int		buffreed;
	int		lockheld = (len == (uint_t)-1);

	/*
	 * We should never be free'ing a buffer on
	 * behalf of ourselves as we are never the
	 * target for allocated SMR buffers.
	 */
	ASSERT(domid != idn.localid);

	sp = NULL;
	buffreed = 0;
	ldp = &idn_domain[idn.localid];

	DSLAB_LOCK_SHARED(idn.localid);

	if (((uintptr_t)bufp & (IDN_SMR_BUFSIZE-1)) &&
	    (IDN_ADDR2OFFSET(bufp) % IDN_SMR_BUFSIZE)) {
		cmn_err(CE_WARN,
		    "IDN: 304: buffer (0x%p) from domain %d not on a "
		    "%d boundary", (void *)bufp, domid, IDN_SMR_BUFSIZE);
		goto bfdone;
	}
	if (!lockheld && (len > IDN_DATA_SIZE)) {
		cmn_err(CE_WARN,
		    "IDN: 305: buffer length (%d) from domain %d greater "
		    "than IDN_DATA_SIZE (%lu)",
		    len, domid, IDN_DATA_SIZE);
		goto bfdone;
	}

	for (sp = ldp->dslab; sp; sp = sp->sl_next)
		if ((bufp >= sp->sl_start) && (bufp < sp->sl_end))
			break;

	if (sp) {
		int spl;

		spl = splhi();
		while (!lock_try(&sp->sl_lock))
			;
		bpp = &sp->sl_inuse;
		for (bp = *bpp; bp; bp = *bpp) {
			if (bp->sb_bufp == bufp)
				break;
			bpp = &bp->sb_next;
		}
		if (bp) {
			ASSERT(bp->sb_domid == domid);
			buffreed++;
			bp->sb_domid = IDN_NIL_DOMID;
			*bpp = bp->sb_next;
			bp->sb_next = sp->sl_free;
			sp->sl_free = bp;
		}
		lock_clear(&sp->sl_lock);
		splx(spl);
	}
bfdone:
	if (buffreed) {
		ATOMIC_DEC(idn_domain[domid].dio);
		DIOCHECK(domid);
	} else {
		cmn_err(CE_WARN,
		    "IDN: 306: unknown buffer (0x%p) from domain %d",
		    (void *)bufp, domid);
		ATOMIC_INC(idn_domain[domid].dioerr);
	}

	DSLAB_UNLOCK(idn.localid);

	return (sp ? 0 : -1);
}

/*
 * Alternative interface to smr_buf_free, but with local drwlock
 * held.
 */
/* ARGSUSED2 */
int
smr_buf_free_locked(int domid, caddr_t bufp, uint_t len)
{
	return (smr_buf_free(domid, bufp, (uint_t)-1));
}

/*
 * Free any and all buffers associated with the given domain.
 * Assumption is that domain is dead and buffers are not in use.
 * Returns:	Number of buffers freed.
 *		-1 if error.
 */
int
smr_buf_free_all(int domid)
{
	register smr_slab_t	*sp;
	register smr_slabbuf_t	*bp, **bpp;
	idn_domain_t		*ldp;
	int			nbufsfreed = 0;
	procname_t	proc = "smr_buf_free_all";

	/*
	 * We should never be free'ing buffers on
	 * behalf of ourself
	 */
	ASSERT(domid != idn.localid);

	if (!VALID_DOMAINID(domid)) {
		cmn_err(CE_WARN, "IDN: 307: domain ID (%d) invalid", domid);
		return (-1);
	}

	ldp = &idn_domain[idn.localid];

	/*
	 * We grab the writer lock so that we don't have any
	 * competition during a "free-all" call.
	 * No need to grab individual slab locks when holding
	 * dslab(writer).
	 */
	DSLAB_LOCK_EXCL(idn.localid);

	for (sp = ldp->dslab; sp; sp = sp->sl_next) {
		bpp = &sp->sl_inuse;
		for (bp = *bpp; bp; bp = *bpp) {
			if (bp->sb_domid == domid) {
				bp->sb_domid = IDN_NIL_DOMID;
				*bpp = bp->sb_next;
				bp->sb_next = sp->sl_free;
				sp->sl_free = bp;
				nbufsfreed++;
			} else {
				bpp = &bp->sb_next;
			}
		}
	}

	if (nbufsfreed > 0) {
		ATOMIC_SUB(idn_domain[domid].dio, nbufsfreed);
		idn_domain[domid].dioerr = 0;
		DIOCHECK(domid);
	}

	DSLAB_UNLOCK(idn.localid);

	PR_SMR("%s: freed %d buffers for domain %d\n", proc, nbufsfreed, domid);

	return (nbufsfreed);
}

int
smr_buf_reclaim(int domid, int nbufs)
{
	int		num_reclaimed = 0;
	idn_domain_t	*ldp, *dp;
	procname_t	proc = "smr_buf_reclaim";

	ldp = &idn_domain[idn.localid];
	dp  = &idn_domain[domid];

	ASSERT(domid != idn.localid);

	if (ATOMIC_CAS(&dp->dreclaim_inprogress, 0, 1)) {
		/*
		 * Reclaim is already in progress, don't
		 * bother.
		 */
		PR_DATA("%s: reclaim already in progress\n", proc);
		return (0);
	}

	PR_SMR("%s: requested %d buffers from domain %d\n", proc, nbufs, domid);

	if (dp->dio && nbufs) {
		register smr_slab_t	*sp;
		int spl;

		DSLAB_LOCK_SHARED(idn.localid);
		spl = splhi();
		for (sp = ldp->dslab; sp && nbufs; sp = sp->sl_next) {
			register smr_slabbuf_t	*bp, **bpp;

			if (sp->sl_inuse == NULL)
				continue;

			if (!lock_try(&sp->sl_lock))
				continue;

			if (sp->sl_inuse == NULL) {
				lock_clear(&sp->sl_lock);
				continue;
			}

			bpp = &sp->sl_inuse;
			for (bp = *bpp; bp && nbufs; bp = *bpp) {
				if (bp->sb_domid == domid) {
					/*
					 * Buffer no longer in use,
					 * reclaim it.
					 */
					bp->sb_domid = IDN_NIL_DOMID;
					*bpp = bp->sb_next;
					bp->sb_next = sp->sl_free;
					sp->sl_free = bp;
					num_reclaimed++;
					nbufs--;
				} else {
					bpp = &bp->sb_next;
				}
			}
			lock_clear(&sp->sl_lock);
		}
		splx(spl);

		if (num_reclaimed > 0) {
			ATOMIC_SUB(dp->dio, num_reclaimed);
			DIOCHECK(domid);
		}
		DSLAB_UNLOCK(idn.localid);
	}

	PR_SMR("%s: reclaimed %d buffers from domain %d\n",
	    proc, num_reclaimed, domid);

	return (num_reclaimed);
}

/*
 * Returns 1	If any buffers are locked for the given slab.
 *	   0	If all buffers are free for the given slab.
 *
 * The caller is assumed to have the slab protected so that no
 * new allocations are attempted from it.  Also, this is only
 * valid to be called with respect to slabs that were allocated
 * on behalf of the local domain, i.e. the master is not expected
 * to call this function with (slave) slab "representatives".
 */
int
smr_slab_busy(smr_slab_t *sp)
{
	return ((sp && sp->sl_inuse) ? 1 : 0);
}

int
smr_slabwaiter_init()
{
	register int		i;
	struct slabwaiter	*wp;

	if (idn.slabwaiter != NULL)
		return (0);

	/*
	 * Initialize the slab waiting area for MAX_DOMAINS.
	 */
	idn.slabwaiter = GETSTRUCT(struct slabwaiter, MAX_DOMAINS);
	wp = idn.slabwaiter;
	for (i = 0; i < MAX_DOMAINS; wp++, i++) {
		wp->w_closed = 0;
		mutex_init(&wp->w_mutex, NULL, MUTEX_DEFAULT, NULL);
		cv_init(&wp->w_cv, NULL, CV_DEFAULT, NULL);
	}

	return (0);
}

void
smr_slabwaiter_deinit()
{
	register int		i;
	struct slabwaiter	*wp;

	if ((wp = idn.slabwaiter) == NULL)
		return;

	for (i = 0; i < MAX_DOMAINS; wp++, i++) {
		ASSERT(wp->w_nwaiters == 0);
		ASSERT(wp->w_sp == NULL);
		cv_destroy(&wp->w_cv);
		mutex_destroy(&wp->w_mutex);
	}

	FREESTRUCT(idn.slabwaiter, struct slabwaiter, MAX_DOMAINS);
	idn.slabwaiter = NULL;
}

void
smr_slabwaiter_open(domainset_t domset)
{
	int			d;
	struct slabwaiter	*wp;

	if ((domset == 0) || !idn.slabwaiter)
		return;

	wp = idn.slabwaiter;

	for (d = 0; d < MAX_DOMAINS; wp++, d++) {
		if (!DOMAIN_IN_SET(domset, d))
			continue;
		mutex_enter(&wp->w_mutex);
		wp->w_closed = 0;
		mutex_exit(&wp->w_mutex);
	}
}

void
smr_slabwaiter_close(domainset_t domset)
{
	int			d;
	struct slabwaiter	*wp;

	if ((domset == 0) || !idn.slabwaiter)
		return;

	wp = idn.slabwaiter;

	for (d = 0; d < MAX_DOMAINS; wp++, d++) {
		if (!DOMAIN_IN_SET(domset, d))
			continue;
		mutex_enter(&wp->w_mutex);
		wp->w_closed = 1;
		cv_broadcast(&wp->w_cv);
		mutex_exit(&wp->w_mutex);
	}
}

/*
 * Register the caller with the waiting list for the
 * given domain.
 *
 * Protocol:
 *	1st Local requester:	register -> alloc ->
 *						put(wakeup|xdc) -> unregister
 *	Nth Local requester:	register -> wait
 *	1st Remote requester:	register -> xdc -> wait
 *	Nth Remote requester:	register -> wait
 *
 *	Remote Responder:	local alloc -> put(xdc)
 *	Local Handler:		xdc -> put(wakeup)
 *
 * E.g. A standard slave allocation request:
 *	slave			master
 *	-----			------
 *	idn_slab_alloc(remote)
 *	- register
 *	- xdc		->	idn_handler
 *	- wait			...
 *				idn_slab_alloc(local)
 *				- register
 *				- alloc
 *				- put
 *				  . wakeup [local]
 *				- unregister
 *	idn_handler    	<-	- xdc
 *	- put       		DONE
 *	  . wakeup [local]
 *	    |
 *	    V
 *      - wait
 *	  . unregister
 *	DONE
 */
static int
smr_slabwaiter_register(int domid)
{
	struct slabwaiter	*wp;
	int		nwait;
	procname_t	proc = "smr_slabwaiter_register";


	ASSERT(domid != IDN_NIL_DOMID);

	ASSERT(DSLAB_READ_HELD(domid));

	wp = &idn.slabwaiter[domid];

	ASSERT(MUTEX_NOT_HELD(&wp->w_mutex));

	mutex_enter(&wp->w_mutex);

	nwait = ++(wp->w_nwaiters);
	ASSERT(nwait > 0);

	PR_SMR("%s: domain = %d, (new)nwaiters = %d\n", proc, domid, nwait);

	if (nwait > 1) {
		/*
		 * There are already waiters for slab allocations
		 * with respect to this domain.
		 */
		PR_SMR("%s: existing waiters for slabs for domain %d\n",
		    proc, domid);
		mutex_exit(&wp->w_mutex);

		return (nwait);
	}
	PR_SMR("%s: initial waiter for slabs for domain %d\n", proc, domid);
	/*
	 * We are the first requester of a slab allocation for this
	 * respective domain.  Need to prep waiting area for
	 * subsequent arrival of a slab.
	 */
	wp->w_sp = NULL;
	wp->w_done = 0;
	wp->w_serrno = 0;

	mutex_exit(&wp->w_mutex);

	return (nwait);
}

/*
 * It is assumed that the caller had previously registered,
 * but wakeup did not occur due to caller never waiting.
 * Thus, slaballoc mutex is still held by caller.
 *
 * Returns:	0
 *		EINVAL
 *		EBUSY
 *		w_serrno (smr_slaballoc_put)
 *		(0, ENOLCK, ENOMEM, EDQUOT, EBUSY, ECANCELED)
 */
static int
smr_slabwaiter_unregister(int domid, smr_slab_t **spp)
{
	struct slabwaiter	*wp;
	int		serrno = 0;
	procname_t	proc = "smr_slabwaiter_unregister";


	ASSERT(domid != IDN_NIL_DOMID);

	wp = &idn.slabwaiter[domid];

	mutex_enter(&wp->w_mutex);

	PR_SMR("%s: domain = %d, nwaiters = %d\n", proc, domid, wp->w_nwaiters);

	if (wp->w_nwaiters <= 0) {
		/*
		 * Hmmm...nobody is registered!
		 */
		PR_SMR("%s: NO WAITERS (domid = %d)\n", proc, domid);
		mutex_exit(&wp->w_mutex);
		return (EINVAL);
	}
	(wp->w_nwaiters)--;
	/*
	 * Is our present under the tree?
	 */
	if (!wp->w_done) {
		/*
		 * Bummer...no presents.  Let the caller know
		 * via a null slab pointer.
		 * Note that we don't clean up immediately since
		 * message might still come in for other waiters.
		 * Thus, late sleepers may still get a chance.
		 */
		PR_SMR("%s: bummer no slab allocated for domain %d\n",
		    proc, domid);
		ASSERT(wp->w_sp == NULL);
		(*spp) = NULL;
		serrno = wp->w_closed ? ECANCELED : EBUSY;

	} else {
		(*spp) = wp->w_sp;
		serrno = wp->w_serrno;

#ifdef DEBUG
		if (serrno == 0) {
			register smr_slab_t	*sp;

			ASSERT(wp->w_sp);
			PR_SMR("%s: allocation succeeded (domain %d)\n",
			    proc, domid);

			DSLAB_LOCK_SHARED(domid);
			for (sp = idn_domain[domid].dslab; sp; sp = sp->sl_next)
				if (sp == wp->w_sp)
					break;
			if (sp == NULL)
				cmn_err(CE_WARN,
				    "%s:%d: slab ptr = NULL",
				    proc, domid);
			DSLAB_UNLOCK(domid);
		} else {
			PR_SMR("%s: allocation failed (domain %d) "
			    "[serrno = %d]\n", proc, domid, serrno);
		}
#endif /* DEBUG */
	}
	if (wp->w_nwaiters == 0) {
		/*
		 * Last one turns out the lights.
		 */
		PR_SMR("%s: domain %d last waiter, turning out lights\n",
		    proc, domid);
		wp->w_sp = NULL;
		wp->w_done = 0;
		wp->w_serrno = 0;
	}
	mutex_exit(&wp->w_mutex);

	return (serrno);
}

/*
 * Called to abort any slaballoc requests on behalf of the
 * given domain.
 */
int
smr_slabwaiter_abort(int domid, int serrno)
{
	ASSERT(serrno != 0);

	return (smr_slaballoc_put(domid, NULL, 0, serrno));
}

/*
 * Put ourselves into a timedwait waiting for slab to be
 * allocated.
 * Returns with slaballoc mutex dropped.
 *
 * Returns:	EINVAL
 *		ETIMEDOUT
 *		smr_slabwatier_unregister
 *		(0, EINVAL, EBUSY, ENOMEM)
 */
static int
smr_slaballoc_wait(int domid, smr_slab_t **spp)
{
	struct slabwaiter	*wp;
	int			serrno = 0, serrno_unreg;
	procname_t		proc = "smr_slaballoc_wait";


	wp = &idn.slabwaiter[domid];

	ASSERT(MUTEX_NOT_HELD(&wp->w_mutex));

	mutex_enter(&wp->w_mutex);

	PR_SMR("%s: domain = %d, nwaiters = %d, wsp = 0x%p\n",
	    proc, domid, wp->w_nwaiters, (void *)wp->w_sp);

	if (wp->w_nwaiters <= 0) {
		/*
		 * Hmmm...no waiters registered.
		 */
		PR_SMR("%s: domain %d, no waiters!\n", proc, domid);
		mutex_exit(&wp->w_mutex);
		return (EINVAL);
	}
	ASSERT(DSLAB_READ_HELD(domid));
	DSLAB_UNLOCK(domid);

	if (!wp->w_done && !wp->w_closed) {
		int	rv;

		/*
		 * Only wait if data hasn't arrived yet.
		 */
		PR_SMR("%s: domain %d, going to sleep...\n", proc, domid);

		rv = cv_reltimedwait_sig(&wp->w_cv, &wp->w_mutex,
		    IDN_SLABALLOC_WAITTIME, TR_CLOCK_TICK);
		if (rv == -1)
			serrno = ETIMEDOUT;

		PR_SMR("%s: domain %d, awakened (reason = %s)\n",
		    proc, domid, (rv == -1) ? "TIMEOUT" : "SIGNALED");
	}
	/*
	 * We've awakened or request already filled!
	 * Unregister ourselves.
	 */
	mutex_exit(&wp->w_mutex);

	/*
	 * Any gifts will be entered into spp.
	 */
	serrno_unreg = smr_slabwaiter_unregister(domid, spp);

	/*
	 * Leave with reader lock on dslab_lock.
	 */
	DSLAB_LOCK_SHARED(domid);

	if ((serrno_unreg == EBUSY) && (serrno == ETIMEDOUT))
		return (serrno);
	else
		return (serrno_unreg);
}

/*
 * A SMR slab was allocated on behalf of the given domain.
 * Wakeup anybody that may have been waiting for the allocation.
 * Note that if the domain is a remote one, i.e. master is allocating
 * on behalf of a slave, it's up to the caller to transmit the
 * allocation response to that domain.
 * The force flag indicates that we want to install the slab for
 * the given user regardless of whether there are waiters or not.
 * This is used primarily in situations where a slave may have timed
 * out before the response actually arrived.  In this situation we
 * don't want to send slab back to the master after we went through
 * the trouble of allocating one.  Master is _not_ allowed to do this
 * for remote domains.
 *
 * Returns:	-1	Non-registered waiter or waiting area garbaged.
 *		0	Successfully performed operation.
 */
int
smr_slaballoc_put(int domid, smr_slab_t *sp, int forceflag, int serrno)
{
	idn_domain_t		*dp;
	struct slabwaiter	*wp;
	procname_t		proc = "smr_slaballoc_put";


	dp = &idn_domain[domid];

	ASSERT(!serrno ? DSLAB_WRITE_HELD(domid) : 1);

	if (domid == IDN_NIL_DOMID)
		return (-1);

	ASSERT(serrno ? (sp == NULL) : (sp != NULL));

	wp = &idn.slabwaiter[domid];

	mutex_enter(&wp->w_mutex);

	PR_SMR("%s: domain = %d, bufp = 0x%p, ebufp = 0x%p, "
	    "(f = %d, se = %d)\n", proc, domid,
	    (sp ? (void *)sp->sl_start : 0),
	    (sp ? (void *)sp->sl_end : 0), forceflag, serrno);

	if (wp->w_nwaiters <= 0) {
		/*
		 * There are no waiters!!  Must have timed out
		 * and left.  Oh well...
		 */
		PR_SMR("%s: no slaballoc waiters found for domain %d\n",
		    proc, domid);
		if (!forceflag || serrno || !sp) {
			/*
			 * No waiters and caller doesn't want to force it.
			 */
			mutex_exit(&wp->w_mutex);
			return (-1);
		}
		PR_SMR("%s: forcing slab onto domain %d\n", proc, domid);
		ASSERT(domid == idn.localid);
		ASSERT(wp->w_sp == NULL);
		wp->w_done = 0;
		/*
		 * Now we fall through and let it be added in the
		 * regular manor.
		 */
	}
	if (wp->w_done) {
		/*
		 * There's at least one waiter so there has
		 * to be a slab structure waiting for us.
		 * If everything is going smoothly, there should only
		 * be one guy coming through the path of inserting
		 * an error or good slab.  However, if a disconnect was
		 * detected, you may get several guys coming through
		 * trying to let everybody know.
		 */
		ASSERT(wp->w_serrno ?
		    (wp->w_sp == NULL) : (wp->w_sp != NULL));

		cv_broadcast(&wp->w_cv);
		mutex_exit(&wp->w_mutex);

		return (-1);
	}
	if (serrno != 0) {
		/*
		 * Bummer...allocation failed.  This call is simply
		 * to wake up the sleepers and let them know.
		 */
		PR_SMR("%s: slaballoc failed for domain %d\n", proc, domid);
		wp->w_serrno = serrno;
		wp->w_done = 1;
		cv_broadcast(&wp->w_cv);
		mutex_exit(&wp->w_mutex);

		return (0);
	}
	PR_SMR("%s: putting slab into struct (domid=%d, localid=%d)\n",
	    proc, domid, idn.localid);
	/*
	 * Prep the slab structure.
	 */

	if (domid == idn.localid) {
		/*
		 * Allocation was indeed for me.
		 * Slab may or may not be locked when
		 * we reach.  Normally they will be locked
		 * if we're being called on behalf of a
		 * free, and not locked if on behalf of
		 * a new allocation request.
		 */
		lock_clear(&sp->sl_lock);
		smr_alloc_buflist(sp);
#ifdef DEBUG
	} else {
		uint_t	rv;
		/*
		 * Slab was not allocated on my behalf.  Must be
		 * a master request on behalf of some other domain.
		 * Prep appropriately.  Slab should have been locked
		 * by smr_slab_reserve.
		 */
		rv = lock_try(&sp->sl_lock);
		ASSERT(!rv);
		ASSERT(sp->sl_domid == (short)domid);
#endif /* DEBUG */
	}

	/*
	 * Slab is ready to go.  Insert it into the domain's
	 * slab list so once we wake everybody up they'll find it.
	 * You better have write lock if you're putting treasures
	 * there.
	 */
	ASSERT(DSLAB_WRITE_HELD(domid));

	sp->sl_next = dp->dslab;
	dp->dslab  = sp;
	dp->dnslabs++;

	/*
	 * It's possible to fall through here without waiters.
	 * This is a case where forceflag was set.
	 */
	if (wp->w_nwaiters > 0) {
		wp->w_sp = sp;
		wp->w_serrno = serrno;
		wp->w_done = 1;
		cv_broadcast(&wp->w_cv);
	} else {
		ASSERT(forceflag);
		wp->w_sp = NULL;
		wp->w_serrno = 0;
		wp->w_done = 0;
	}
	mutex_exit(&wp->w_mutex);

	return (0);
}

/*
 * Get the slab representing [bufp,ebufp] from the respective
 * domain's pool if all the buffers are free.  Remove them from
 * the domain's list and return it.
 * If bufp == NULL, then return however many free ones you
 * can find.
 * List of slabs are returned locked (sl_lock).
 * XXX - Need minimum limit to make sure we don't free up _all_
 *	 of our slabs!  However, during a shutdown we will need
 *	 method to free them all up regardless of locking.
 */
smr_slab_t *
smr_slaballoc_get(int domid, caddr_t bufp, caddr_t ebufp)
{
	idn_domain_t	*dp;
	smr_slab_t	*retsp, *sp, **psp;
	int		foundit, islocal = 0;
	int		nslabs;
	procname_t	proc = "smr_slaballoc_get";

	PR_SMR("%s: getting slab for domain %d [bufp=0x%p, ebufp=0x%p]\n",
	    proc, domid, (void *)bufp, (void *)ebufp);

	dp = &idn_domain[domid];

	ASSERT(DSLAB_WRITE_HELD(domid));

	if ((sp = dp->dslab) == NULL) {
		PR_SMR("%s: oops, no slabs for domain %d\n", proc, domid);
		return (NULL);
	}
	/*
	 * If domid is myself then I'm trying to get a slab out
	 * of my local pool.  Otherwise, I'm the master and
	 * I'm trying to get the slab representative from the
	 * global pool.
	 */
	if (domid == idn.localid)
		islocal = 1;

	if (bufp != NULL) {
		nslabs = -1;
	} else {
		nslabs = *(int *)ebufp;
		if (nslabs == 0) {
			PR_SMR("%s: requested nslabs (%d) <= 0\n",
			    proc, nslabs);
			return (NULL);
		} else if (nslabs < 0) {
			/*
			 * Caller wants them all!
			 */
			nslabs = (int)dp->dnslabs;
		}
	}

	retsp = NULL;
	foundit = 0;
	for (psp = &dp->dslab; sp; sp = *psp) {
		int	isbusy;

		if (bufp && (sp->sl_start != bufp)) {
			psp = &sp->sl_next;
			continue;
		}

		if (bufp && (ebufp > sp->sl_end)) {
			PR_SMR("%s: bufp/ebufp (0x%p/0x%p) "
			    "expected (0x%p/0x%p)\n", proc, (void *)bufp,
			    (void *)ebufp, (void *)sp->sl_start,
			    (void *)sp->sl_end);
			ASSERT(0);
		}
		/*
		 * We found the desired slab.  Make sure
		 * it's free.
		 */
		foundit++;
		isbusy = 0;
		if (islocal) {
			int spl;

			/*
			 * Some of the buffers in the slab
			 * are still in use.  Unlock the
			 * buffers we locked and bail out.
			 */
			spl = splhi();
			if (!lock_try(&sp->sl_lock)) {
				isbusy = 1;
				foundit--;
			} else if (sp->sl_inuse) {
				lock_clear(&sp->sl_lock);
				isbusy = 1;
				foundit--;
			}
			splx(spl);
		} else {
			/*
			 * If not local, then I'm the master getting
			 * a slab from one of the slaves.  In this case,
			 * their slab structs will always be locked.
			 */
			ASSERT(!lock_try(&sp->sl_lock));
		}
		if (!isbusy) {
			/*
			 * Delete the entry from the list and slap
			 * it onto our return list.
			 */
			*psp = sp->sl_next;
			sp->sl_next = retsp;
			retsp = sp;
		} else {
			psp = &sp->sl_next;
		}
		/*
		 * If bufp == NULL (alternate interface) and we haven't
		 * found the desired number of slabs yet, keep looking.
		 */
		if (bufp || (foundit == nslabs))
			break;
	}
	dp->dnslabs -= (short)foundit;

	if (foundit) {
		PR_SMR("%s: found %d free slabs (domid = %d)\n", proc, foundit,
		    domid);
	} else {
		PR_SMR("%s: no free slabs found (domid = %d)\n", proc, domid);
	}

	/*
	 * If this is the alternate interface, need to return
	 * the number of slabs found in the ebufp parameter.
	 */
	if (bufp == NULL)
		*(int *)ebufp = foundit;

	return (retsp);
}

/*
 * Wrapper to hide alternate interface to smr_slaballoc_get()
 */
smr_slab_t *
smr_slaballoc_get_n(int domid, int *nslabs)
{
	smr_slab_t	*sp;

	ASSERT(DSLAB_WRITE_HELD(domid));

	sp = smr_slaballoc_get(domid, NULL, (caddr_t)nslabs);

	return (sp);
}

/*
 * Only called by master.  Initialize slab pool based on local SMR.
 * Returns number of slabs initialized.
 * reserved_size = Length of area at the front of the NWR portion
 *		   of the SMR to reserve and not make available for
 *		   slab allocations.  Must be a IDN_SMR_BUFSIZE multiple.
 * reserved_area = Pointer to reserved area, if any.
 */
int
smr_slabpool_init(size_t reserved_size, caddr_t *reserved_area)
{
	size_t			nwr_available;
	int			minperpool, ntotslabs, nxslabs, nslabs;
	register int		p, pp;
	register caddr_t	bufp;
	register smr_slab_t	*sp;

	ASSERT(IDN_GLOCK_IS_EXCL());
	ASSERT(IDN_GET_MASTERID() != IDN_NIL_DOMID);

	*reserved_area = NULL;

	nwr_available = MB2B(IDN_NWR_SIZE) - reserved_size;

	if ((idn.localid != IDN_GET_MASTERID()) ||
	    (nwr_available < IDN_SLAB_SIZE) ||
	    (idn.slabpool != NULL) ||
	    ((reserved_size != 0) && (reserved_size & (IDN_SMR_BUFSIZE-1)))) {
		return (-1);
	}

	idn.slabpool = GETSTRUCT(struct slabpool, 1);
	idn.slabpool->ntotslabs = ntotslabs = nwr_available / IDN_SLAB_SIZE;
	ASSERT(ntotslabs > 0);
	minperpool = (ntotslabs < IDN_SLAB_MINPERPOOL) ?
	    1 : IDN_SLAB_MINPERPOOL;
	idn.slabpool->npools = (ntotslabs + (minperpool - 1)) / minperpool;

	if ((idn.slabpool->npools & 1) == 0) {
		/*
		 * npools needs to be odd for hashing algorithm.
		 */
		idn.slabpool->npools++;
	}
	ASSERT(idn.slabpool->npools > 0);
	minperpool = (ntotslabs < idn.slabpool->npools) ?
	    1 : (ntotslabs / idn.slabpool->npools);

	/*
	 * Calculate the number of extra slabs that will need to
	 * be alloted to the pools.  This number will be less than
	 * npools.  Only one extra slab is allocated to each pool
	 * until we have assigned all the extra slabs.
	 */
	if (ntotslabs > (idn.slabpool->npools * minperpool))
		nxslabs = ntotslabs - (idn.slabpool->npools * minperpool);
	else
		nxslabs = 0;
	ASSERT((nxslabs >= 0) && (nxslabs < idn.slabpool->npools));

	idn.slabpool->pool = GETSTRUCT(struct smr_slabtbl,
	    idn.slabpool->npools);
	sp = GETSTRUCT(smr_slab_t, idn.slabpool->ntotslabs);

	idn.slabpool->savep = sp;
	bufp = idn.smr.vaddr + reserved_size;

	for (p = nslabs = 0;
	    (p < idn.slabpool->npools) && (ntotslabs > 0);
	    p++, ntotslabs -= nslabs) {

		nslabs = (ntotslabs < minperpool) ? ntotslabs : minperpool;
		if (nxslabs > 0) {
			nslabs++;
			nxslabs--;
		}
		idn.slabpool->pool[p].sarray = sp;
		for (pp = 0; pp < nslabs; pp++) {

			sp->sl_next  = NULL;
			sp->sl_start = bufp;
			sp->sl_end   = bufp = sp->sl_start + IDN_SLAB_SIZE;
			sp->sl_lock  = 0;
			sp->sl_domid = (short)IDN_NIL_DOMID;

			sp++;
		}
		idn.slabpool->pool[p].nfree   = nslabs;
		idn.slabpool->pool[p].nslabs  = nslabs;
	}
	ASSERT((ntotslabs == 0) && (nxslabs == 0));
	/*
	 * We should be at the end of the SMR at this point.
	 */
	ASSERT(bufp == (idn.smr.vaddr + reserved_size
	    + (idn.slabpool->ntotslabs * IDN_SLAB_SIZE)));

	if (reserved_size != 0)
		*reserved_area = idn.smr.vaddr;

	return (0);
}

void
smr_slabpool_deinit()
{
	if (idn.slabpool == NULL)
		return;

	FREESTRUCT(idn.slabpool->savep, smr_slab_t, idn.slabpool->ntotslabs);
	FREESTRUCT(idn.slabpool->pool, struct smr_slabtbl,
	    idn.slabpool->npools);
	FREESTRUCT(idn.slabpool, struct slabpool, 1);

	idn.slabpool = NULL;
}

void
smr_alloc_buflist(smr_slab_t *sp)
{
	int		n, nbufs;
	caddr_t		sbufp;
	smr_slabbuf_t	*hp, *bp;

	if (sp->sl_head)
		return;

	nbufs = (sp->sl_end - sp->sl_start) / IDN_SMR_BUFSIZE;
	ASSERT(nbufs > 0);
	if (nbufs <= 0) {
		sp->sl_head = sp->sl_free = sp->sl_inuse = NULL;
		return;
	}

	hp = GETSTRUCT(smr_slabbuf_t, nbufs);

	sbufp = sp->sl_start;
	for (n = 0, bp = hp; n < nbufs; bp++, n++) {
		bp->sb_bufp = sbufp;
		bp->sb_domid = IDN_NIL_DOMID;
		bp->sb_next = bp + 1;
		sbufp += IDN_SMR_BUFSIZE;
	}
	(--bp)->sb_next = NULL;

	sp->sl_head = sp->sl_free = hp;
	sp->sl_inuse = NULL;
}

void
smr_free_buflist(smr_slab_t *sp)
{
	int	nbufs;

	if (sp->sl_head == NULL)
		return;

	nbufs = (sp->sl_end - sp->sl_start) / IDN_SMR_BUFSIZE;

	FREESTRUCT(sp->sl_head, smr_slabbuf_t, nbufs);

	sp->sl_head = sp->sl_free = sp->sl_inuse = NULL;
}

/*
 * Returns:	0 Successfully located a slab.
 *	       -1 Failure.
 */
static smr_slab_t *
smr_slab_reserve(int domid)
{
	register int		p, nextp, s, nexts;
	register smr_slab_t	*spa;
	int			startp, starts;
	int			foundone = 0;
	int			spl;
	procname_t		proc = "smr_slab_reserve";

	p = startp = SMR_SLABPOOL_HASH(domid);
	nextp = -1;

	spl = splhi();
	while ((nextp != startp) && !foundone) {

		s = starts = SMR_SLAB_HASH(p, domid);
		nexts = -1;
		spa = &(idn.slabpool->pool[p].sarray[0]);

		while ((nexts != starts) && !foundone) {
			if (lock_try(&spa[s].sl_lock)) {
				foundone = 1;
				break;
			}
			nexts = SMR_SLAB_HASHSTEP(p, s);
			s = nexts;
		}
		if (foundone)
			break;
		nextp = SMR_SLABPOOL_HASHSTEP(p);
		p = nextp;
	}
	splx(spl);

	if (foundone) {
		ASSERT((&spa[s] >= idn.slabpool->savep) &&
		    (&spa[s] < (idn.slabpool->savep +
		    idn.slabpool->ntotslabs)));

		spa[s].sl_domid = (short)domid;

		ATOMIC_DEC(idn.slabpool->pool[p].nfree);

		if (domid == idn.localid) {
			smr_slab_t	*nsp;
			/*
			 * Caller is actually reserving a slab for
			 * themself which means they'll need the full
			 * slab structure to represent all of the I/O
			 * buffers.  The "spa" is just a representative
			 * and doesn't contain the space to manage the
			 * individual buffers.  Need to alloc a full-size
			 * struct.
			 * Note that this results in the returning
			 * smr_slab_t structure being unlocked.
			 */
			ASSERT(idn.localid == IDN_GET_MASTERID());
			nsp = GETSTRUCT(smr_slab_t, 1);
			nsp->sl_start = spa[s].sl_start;
			nsp->sl_end   = spa[s].sl_end;
			smr_alloc_buflist(nsp);
			spa = nsp;
			PR_SMR("%s: allocated full slab struct for domain %d\n",
			    proc, domid);
		} else {
			/*
			 * Slab structure gets returned locked.
			 */
			spa += s;
		}

		PR_SMR("%s: allocated slab 0x%p (start=0x%p, size=%lu) for "
		    "domain %d\n", proc, (void *)spa, (void *)spa->sl_start,
		    spa->sl_end - spa->sl_start, domid);
	} else {
		PR_SMR("%s: FAILED to allocate for domain %d\n",
		    proc, domid);
		spa = NULL;
	}

	return (spa);
}

static void
smr_slab_unreserve(int domid, smr_slab_t *sp)
{
	register int		p, nextp, s, nexts;
	register smr_slab_t	*spa;
	int			foundit = 0;
	int			startp, starts;
	caddr_t			bufp;
	procname_t		proc = "smr_slab_unreserve";

	bufp = sp->sl_start;
	p = startp = SMR_SLABPOOL_HASH(domid);
	nextp = -1;

	while ((nextp != startp) && !foundit) {

		s = starts = SMR_SLAB_HASH(p, domid);
		nexts = -1;
		spa = &(idn.slabpool->pool[p].sarray[0]);

		while ((nexts != starts) && !foundit) {
			if (spa[s].sl_start == bufp) {
				foundit = 1;
				break;
			}
			nexts = SMR_SLAB_HASHSTEP(p, s);
			s = nexts;
		}
		if (foundit)
			break;
		nextp = SMR_SLABPOOL_HASHSTEP(p);
		p = nextp;
	}
	if (foundit) {
		ASSERT((&spa[s] >= idn.slabpool->savep) &&
		    (&spa[s] < (idn.slabpool->savep +
		    idn.slabpool->ntotslabs)));
		ASSERT(!lock_try(&spa[s].sl_lock));
		ASSERT(spa[s].sl_domid == (short)domid);

		spa[s].sl_next = NULL;
		spa[s].sl_domid = (short)IDN_NIL_DOMID;
		lock_clear(&spa[s].sl_lock);

		ATOMIC_INC(idn.slabpool->pool[p].nfree);

		PR_SMR("%s: freed (bufp=0x%p) for domain %d\n",
		    proc, (void *)bufp, domid);

		if (domid == idn.localid) {
			/*
			 * Caller is actually unreserving a slab of their
			 * own.  Note that only the master calls this
			 * routine.  Since the master's local slab
			 * structures do not get entered into the global
			 * "representative" pool, we need to free up the
			 * data structure that was passed in.
			 */
			ASSERT(idn.localid == IDN_GET_MASTERID());
			ASSERT(sp != &spa[s]);

			smr_free_buflist(sp);
			FREESTRUCT(sp, smr_slab_t, 1);
		} else {
			ASSERT(sp == &spa[s]);
		}
	} else {
		/*
		 * Couldn't find slab entry for given buf!
		 */
		PR_SMR("%s: FAILED to free (bufp=0x%p) for domain %d\n",
		    proc, (void *)bufp, domid);
	}
}

/*
 * The Reap Protocol:
 *	master				   slave
 *	------				   -----
 *	smr_slab_reap_global
 *	- idn_broadcast_cmd(SLABREAP) ->   idn_recv_cmd(SLABREAP)
 *	  . idn_local_cmd(SLABREAP)        - idn_recv_slabreap_req
 *	    - smr_slab_reap	             . smr_slab_reap
 *	      . smr_slaballoc_get_n            - smr_slaballoc_get_n
 *	      . smr_slab_free		       - smr_slab_free
 *		- smr_slab_free_local		 . smr_slab_free_remote
 *		  . smr_slab_unreserve
 *				      <-	   - idn_send_cmd(SLABFREE)
 *	idn_recv_cmd(SLABFREE)
 *	- idn_recv_slabfree_req
 *	  . smr_slaballoc_get
 *	  . smr_slab_free
 *	    - smr_slab_free_local
 *	      . smr_slab_unreserve
 *        . idn_send_slabfree_resp    ->   idn_recv_cmd(SLABFREE | ack)
 *					   - idn_recv_slabfree_resp
 *
 *	idn_recv_cmd(SLABREAP | ack)  <-     . idn_send_slabreap_resp
 *	- idn_recv_slabreap_resp	   DONE
 *	DONE
 *
 * Check available slabs and if we're below the threshold, kick
 * off reaping to all remote domains.  There is no guarantee remote
 * domains will be able to free up any.
 */
static void
smr_slab_reap_global()
{
	register int	p, npools;
	register int	total_free = 0;
	register struct smr_slabtbl	*tblp;
	static clock_t	reap_last = 0;
	procname_t	proc = "smr_slab_reap_global";
	clock_t		now;

	ASSERT(IDN_GET_MASTERID() != IDN_NIL_DOMID);

	DSLAB_LOCK_SHARED(idn.localid);
	if (idn_domain[idn.localid].dslab_state != DSLAB_STATE_LOCAL) {
		PR_SMR("%s: only allowed by master (%d)\n",
		    proc, IDN_GET_MASTERID());
		DSLAB_UNLOCK(idn.localid);
		return;
	}
	DSLAB_UNLOCK(idn.localid);

	now = ddi_get_lbolt();
	if ((now > 0) && (now > reap_last) &&
	    ((now - reap_last) < IDN_REAP_INTERVAL))
		return;

	reap_last = now;

	ASSERT(idn.slabpool);

	npools = idn.slabpool->npools;
	tblp   = idn.slabpool->pool;

	for (p = 0; p < npools; tblp++, p++)
		total_free += tblp->nfree;

	if (total_free <= IDN_SLAB_THRESHOLD) {
		int	diff, reap_per_domain;

		PR_SMR("%s: kicking off reaping "
		    "(total_free = %d, min = %d)\n",
		    proc, total_free, IDN_SLAB_THRESHOLD);

		diff = IDN_SLAB_THRESHOLD - total_free;
		reap_per_domain = (diff < idn.ndomains) ?
		    1 : (diff / idn.ndomains);

		idn_broadcast_cmd(IDNCMD_SLABREAP, reap_per_domain, 0, 0);
	}
}

void
smr_slab_reap(int domid, int *nslabs)
{
	register int	d;
	int		nreclaimed;
	smr_slab_t	*sp;
	domainset_t	reapset;
	procname_t	proc = "smr_slab_reap";

	/*
	 * Should only be called on behalf of local
	 * domain.
	 */
	if (domid != idn.localid) {
		PR_SMR("%s: called by domain %d, should only be local (%d)\n",
		    proc, domid, idn.localid);
		ASSERT(0);
		return;
	}
	/*
	 * Try and reclaim some buffers so we can possibly
	 * free up some slabs.
	 */
	reapset = idn.domset.ds_connected;

	IDN_GKSTAT_GLOBAL_EVENT(gk_reaps, gk_reap_last);

	nreclaimed = 0;
	for (d = 0; d < MAX_DOMAINS; d++) {
		int		nr;
		idn_domain_t	*dp;

		if (!DOMAIN_IN_SET(reapset, d))
			continue;

		IDN_DLOCK_SHARED(d);

		dp = &idn_domain[d];
		if ((d == idn.localid) || (dp->dcpu < 0)) {
			IDN_DUNLOCK(d);
			continue;
		}
		/*
		 * Clean up any dead I/O errors if possible.
		 */
		if (dp->dioerr > 0) {
			idn_domain_t	*ldp;
			register int	cnt;
			register smr_slabbuf_t	*bp;
			/*
			 * We need to grab the writer lock to prevent
			 * anybody from allocating buffers while we
			 * traverse the slabs outstanding.
			 */
			cnt = 0;
			ldp = &idn_domain[idn.localid];
			IDN_DLOCK_EXCL(idn.localid);
			DSLAB_LOCK_EXCL(idn.localid);
			for (sp = ldp->dslab; sp; sp = sp->sl_next)
				for (bp = sp->sl_inuse; bp; bp = bp->sb_next)
					if (bp->sb_domid == d)
						cnt++;
			DSLAB_UNLOCK(idn.localid);
			ASSERT((dp->dio + dp->dioerr) >= cnt);
			dp->dio = cnt;
			dp->dioerr = 0;
			IDN_DUNLOCK(idn.localid);
		}
		if ((dp->dstate == IDNDS_CONNECTED) &&
		    ((nr = idn_reclaim_mboxdata(d, 0, -1)) > 0))
			nreclaimed += nr;

		IDN_DUNLOCK(d);
	}

	DSLAB_LOCK_EXCL(domid);
	sp = smr_slaballoc_get_n(domid, nslabs);
	if (sp) {
		IDN_GKSTAT_ADD(gk_reap_count, (ulong_t)(*nslabs));
		smr_slab_free(domid, sp);
	}
	DSLAB_UNLOCK(domid);
}

/*
 * ---------------------------------------------------------------------
 * Remap the (IDN) shared memory region to a new physical address.
 * Caller is expected to have performed a ecache flush if needed.
 * ---------------------------------------------------------------------
 */
void
smr_remap(struct as *as, register caddr_t vaddr,
		register pfn_t new_pfn, uint_t mblen)
{
	tte_t		tte;
	size_t		blen;
	pgcnt_t		p, npgs;
	procname_t	proc = "smr_remap";

	if (va_to_pfn(vaddr) == new_pfn) {
		PR_REMAP("%s: vaddr (0x%p) already mapped to pfn (0x%lx)\n",
		    proc, (void *)vaddr, new_pfn);
		return;
	}

	blen = MB2B(mblen);
	npgs = btopr(blen);
	ASSERT(npgs != 0);

	PR_REMAP("%s: va = 0x%p, pfn = 0x%lx, npgs = %ld, mb = %d MB (%ld)\n",
	    proc, (void *)vaddr, new_pfn, npgs, mblen, blen);

	/*
	 * Unmap the SMR virtual address from it's current
	 * mapping.
	 */
	hat_unload(as->a_hat, vaddr, blen, HAT_UNLOAD_UNLOCK);

	if (new_pfn == PFN_INVALID)
		return;

	/*
	 * Map the SMR to the new physical address space,
	 * presumably a remote pfn.  Cannot use hat_devload
	 * because it will think pfn represents non-memory,
	 * i.e. space since it may beyond his physmax.
	 */
	for (p = 0; p < npgs; p++) {
		sfmmu_memtte(&tte, new_pfn, PROT_READ | PROT_WRITE | HAT_NOSYNC,
		    TTE8K);
		sfmmu_tteload(as->a_hat, &tte, vaddr, NULL, HAT_LOAD_LOCK);

		vaddr += MMU_PAGESIZE;
		new_pfn++;
	}

	PR_REMAP("%s: remapped %ld pages (expected %ld)\n",
	    proc, npgs, btopr(MB2B(mblen)));
}