/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 *	Device Strategy
 */
#include <sys/dktp/cm.h>
#include <sys/kstat.h>

#include <sys/dktp/quetypes.h>
#include <sys/dktp/queue.h>
#include <sys/dktp/tgcom.h>
#include <sys/dktp/fctypes.h>
#include <sys/dktp/flowctrl.h>
#include <sys/param.h>
#include <vm/page.h>
#include <sys/modctl.h>

/*
 *	Object Management
 */

static struct buf *qmerge_nextbp(struct que_data *qfp, struct buf *bp_merge,
    int *can_merge);

static struct modlmisc modlmisc = {
	&mod_miscops,	/* Type of module */
	"Device Strategy Objects"
};

static struct modlinkage modlinkage = {
	MODREV_1,
	&modlmisc,
	NULL
};

int
_init(void)
{
	return (mod_install(&modlinkage));
}

int
_fini(void)
{
	return (mod_remove(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}


/*
 *	Common Flow Control functions
 */

/*
 * Local static data
 */
#ifdef	FLC_DEBUG
#define	DENT	0x0001
#define	DERR	0x0002
#define	DIO	0x0004
static	int	flc_debug = DENT|DERR|DIO;

#include <sys/thread.h>
static 	int	flc_malloc_intr = 0;
#endif	/* FLC_DEBUG */

static	int	flc_kstat = 1;

static struct flc_obj *fc_create(struct flc_objops *fcopsp);
static int fc_init(opaque_t queuep, opaque_t tgcom_objp, opaque_t que_objp,
    void *lkarg);
static int fc_free(struct flc_obj *flcobjp);
static int fc_start_kstat(opaque_t queuep, char *devtype, int instance);
static int fc_stop_kstat(opaque_t queuep);

static struct flc_obj *
fc_create(struct flc_objops *fcopsp)
{
	struct	flc_obj *flcobjp;
	struct	fc_data *fcdp;

	flcobjp = kmem_zalloc((sizeof (*flcobjp) + sizeof (*fcdp)), KM_NOSLEEP);
	if (!flcobjp)
		return (NULL);

	fcdp = (struct fc_data *)(flcobjp+1);
	flcobjp->flc_data = (opaque_t)fcdp;
	flcobjp->flc_ops  = fcopsp;

	return (flcobjp);
}
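
/*
 * fc_create() co-allocates the object header and its private data in a
 * single kmem_zalloc() so that one kmem_free() in fc_free() releases
 * both.  A sketch of the resulting layout:
 *
 *	+------------------+ <- flcobjp
 *	| struct flc_obj   |	flc_data points just past the header
 *	+------------------+ <- flcobjp + 1
 *	| struct fc_data   |
 *	+------------------+
 *
 * que_create(), duplx_create() and qmerge_create() below use the same
 * trick.
 */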

static int dmult_maxcnt = DMULT_MAXCNT;

static int
fc_init(opaque_t queuep, opaque_t tgcom_objp, opaque_t que_objp, void *lkarg)
{
	struct fc_data *fcdp = (struct fc_data *)queuep;

	mutex_init(&fcdp->ds_mutex, NULL, MUTEX_DRIVER, lkarg);

	fcdp->ds_queobjp   = que_objp;
	fcdp->ds_tgcomobjp = tgcom_objp;
	fcdp->ds_waitcnt   = dmult_maxcnt;

	QUE_INIT(que_objp, lkarg);
	TGCOM_INIT(tgcom_objp);
	return (DDI_SUCCESS);
}

static int
fc_free(struct flc_obj *flcobjp)
{
	struct fc_data *fcdp;

	fcdp = (struct fc_data *)flcobjp->flc_data;
	if (fcdp->ds_queobjp)
		QUE_FREE(fcdp->ds_queobjp);
	if (fcdp->ds_tgcomobjp) {
		TGCOM_FREE(fcdp->ds_tgcomobjp);
		mutex_destroy(&fcdp->ds_mutex);
	}
	kmem_free(flcobjp, (sizeof (*flcobjp) + sizeof (*fcdp)));
	return (0);
}

/*ARGSUSED*/
static int
fc_start_kstat(opaque_t queuep, char *devtype, int instance)
{
	struct fc_data *fcdp = (struct fc_data *)queuep;
	if (!flc_kstat)
		return (0);

	if (!fcdp->ds_kstat) {
		if ((fcdp->ds_kstat = kstat_create("cmdk", instance, NULL,
		    "disk", KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) !=
		    NULL) {
			kstat_install(fcdp->ds_kstat);
		}
	}
	return (0);
}

static int
fc_stop_kstat(opaque_t queuep)
{
	struct fc_data *fcdp = (struct fc_data *)queuep;

	if (fcdp->ds_kstat) {
		kstat_delete(fcdp->ds_kstat);
		fcdp->ds_kstat = NULL;
	}
	return (0);
}
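
/*
 * The kstat accounting above and in the enque/deque routines below
 * follows the standard wait queue / run queue model of kstat_queue(9F):
 *
 *	kstat_waitq_enter()	request queued by an enque routine
 *	kstat_waitq_to_runq()	request handed to the target driver
 *	kstat_runq_exit()	request completed in a deque routine
 *
 * A minimal sketch of the completion-side update performed by
 * dsngl_deque() and dmult_deque(), with the byte counts taken from
 * b_bcount - b_resid:
 *
 *	kstat_io_t *kip = KSTAT_IO_PTR(fcdp->ds_kstat);
 *
 *	if (bp->b_flags & B_READ) {
 *		kip->reads++;
 *		kip->nread += bp->b_bcount - bp->b_resid;
 *	} else {
 *		kip->writes++;
 *		kip->nwritten += bp->b_bcount - bp->b_resid;
 *	}
 *	kstat_runq_exit(kip);
 */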


/*
 *	Single Command per Device
 */
/*
 * Local Function Prototypes
 */
static int dsngl_restart(struct fc_data *);

static int dsngl_enque(opaque_t, struct buf *);
static int dsngl_deque(opaque_t, struct buf *);

struct 	flc_objops dsngl_ops = {
	fc_init,
	fc_free,
	dsngl_enque,
	dsngl_deque,
	fc_start_kstat,
	fc_stop_kstat,
	0, 0
};

struct flc_obj *
dsngl_create()
{
	return (fc_create((struct flc_objops *)&dsngl_ops));
}

static int
dsngl_enque(opaque_t queuep, struct buf *in_bp)
{
	struct fc_data *dsnglp = (struct fc_data *)queuep;
	opaque_t tgcom_objp;
	opaque_t que_objp;

	que_objp   = dsnglp->ds_queobjp;
	tgcom_objp = dsnglp->ds_tgcomobjp;

	if (!in_bp)
		return (0);
	mutex_enter(&dsnglp->ds_mutex);
	if (dsnglp->ds_bp || dsnglp->ds_outcnt) {
		QUE_ADD(que_objp, in_bp);
		if (dsnglp->ds_kstat) {
			kstat_waitq_enter(KSTAT_IO_PTR(dsnglp->ds_kstat));
		}
		mutex_exit(&dsnglp->ds_mutex);
		return (0);
	}
	if (dsnglp->ds_kstat) {
		kstat_waitq_enter(KSTAT_IO_PTR(dsnglp->ds_kstat));
	}
	if (TGCOM_PKT(tgcom_objp, in_bp, dsngl_restart,
		(caddr_t)dsnglp) != DDI_SUCCESS) {

		dsnglp->ds_bp = in_bp;
		mutex_exit(&dsnglp->ds_mutex);
		return (0);
	}
	dsnglp->ds_outcnt++;
	if (dsnglp->ds_kstat)
		kstat_waitq_to_runq(KSTAT_IO_PTR(dsnglp->ds_kstat));
	mutex_exit(&dsnglp->ds_mutex);
	TGCOM_TRANSPORT(tgcom_objp, in_bp);
	return (0);
}

static int
dsngl_deque(opaque_t queuep, struct buf *in_bp)
{
	struct fc_data *dsnglp = (struct fc_data *)queuep;
	opaque_t tgcom_objp;
	opaque_t que_objp;
	struct	 buf *bp;

	que_objp   = dsnglp->ds_queobjp;
	tgcom_objp = dsnglp->ds_tgcomobjp;

	mutex_enter(&dsnglp->ds_mutex);
	if (in_bp) {
		dsnglp->ds_outcnt--;
		if (dsnglp->ds_kstat) {
			if (in_bp->b_flags & B_READ) {
				KSTAT_IO_PTR(dsnglp->ds_kstat)->reads++;
				KSTAT_IO_PTR(dsnglp->ds_kstat)->nread +=
				    (in_bp->b_bcount - in_bp->b_resid);
			} else {
				KSTAT_IO_PTR(dsnglp->ds_kstat)->writes++;
				KSTAT_IO_PTR(dsnglp->ds_kstat)->nwritten +=
				    (in_bp->b_bcount - in_bp->b_resid);
			}
			kstat_runq_exit(KSTAT_IO_PTR(dsnglp->ds_kstat));
		}
	}
	for (;;) {
		if (!dsnglp->ds_bp)
			dsnglp->ds_bp = QUE_DEL(que_objp);
		if (!dsnglp->ds_bp ||
		    (TGCOM_PKT(tgcom_objp, dsnglp->ds_bp, dsngl_restart,
		    (caddr_t)dsnglp) != DDI_SUCCESS) ||
		    dsnglp->ds_outcnt) {
			mutex_exit(&dsnglp->ds_mutex);
			return (0);
		}
		dsnglp->ds_outcnt++;
		bp = dsnglp->ds_bp;
		dsnglp->ds_bp = QUE_DEL(que_objp);
		if (dsnglp->ds_kstat)
			kstat_waitq_to_runq(KSTAT_IO_PTR(dsnglp->ds_kstat));
		mutex_exit(&dsnglp->ds_mutex);

		TGCOM_TRANSPORT(tgcom_objp, bp);

		if (!mutex_tryenter(&dsnglp->ds_mutex))
			return (0);
	}
}
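
/*
 * The mutex_tryenter() at the bottom of the loop above is a deliberate
 * back-off: ds_mutex is dropped around TGCOM_TRANSPORT(), and the loop
 * only keeps draining the queue if the lock can be re-acquired without
 * blocking.  If the lock is contended, whoever holds it is already in
 * this enque/deque path and will start any remaining queued requests,
 * so returning here avoids blocking and duplicated scans.  The same
 * idiom appears in dmult_deque(), duplx_enque() and duplx_deque().
 */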

static int
dsngl_restart(struct fc_data *dsnglp)
{
	(void) dsngl_deque(dsnglp, NULL);
	return (-1);
}


/*
 *	Multiple Commands per Device
 */
/*
 * Local Function Prototypes
 */
static int dmult_restart(struct fc_data *);

static int dmult_enque(opaque_t, struct buf *);
static int dmult_deque(opaque_t, struct buf *);

struct 	flc_objops dmult_ops = {
	fc_init,
	fc_free,
	dmult_enque,
	dmult_deque,
	fc_start_kstat,
	fc_stop_kstat,
	0, 0
};

struct flc_obj *
dmult_create()
{
	return (fc_create((struct flc_objops *)&dmult_ops));

}


/*
 * The object management functions QUE_ADD() and QUE_DEL() do not
 * acquire locks; they depend on dmult_enque() and dmult_deque() to do
 * all locking.  If this changes, we have to grab locks in
 * qmerge_add() and qmerge_del().
 */
static int
dmult_enque(opaque_t queuep, struct buf *in_bp)
{
	struct fc_data *dmultp = (struct fc_data *)queuep;
	opaque_t tgcom_objp;
	opaque_t que_objp;

	que_objp   = dmultp->ds_queobjp;
	tgcom_objp = dmultp->ds_tgcomobjp;

	if (!in_bp)
		return (0);
	mutex_enter(&dmultp->ds_mutex);
	if ((dmultp->ds_outcnt >= dmultp->ds_waitcnt) || dmultp->ds_bp) {
		QUE_ADD(que_objp, in_bp);
		if (dmultp->ds_kstat) {
			kstat_waitq_enter(KSTAT_IO_PTR(dmultp->ds_kstat));
		}
		mutex_exit(&dmultp->ds_mutex);
		return (0);
	}
	if (dmultp->ds_kstat) {
		kstat_waitq_enter(KSTAT_IO_PTR(dmultp->ds_kstat));
	}

	if (TGCOM_PKT(tgcom_objp, in_bp, dmult_restart,
		(caddr_t)dmultp) != DDI_SUCCESS) {

		dmultp->ds_bp = in_bp;
		mutex_exit(&dmultp->ds_mutex);
		return (0);
	}
	dmultp->ds_outcnt++;
	if (dmultp->ds_kstat)
		kstat_waitq_to_runq(KSTAT_IO_PTR(dmultp->ds_kstat));
	mutex_exit(&dmultp->ds_mutex);

	TGCOM_TRANSPORT(tgcom_objp, in_bp);
	return (0);
}

static int
dmult_deque(opaque_t queuep, struct buf *in_bp)
{
	struct fc_data *dmultp = (struct fc_data *)queuep;
	opaque_t tgcom_objp;
	opaque_t que_objp;
	struct	 buf *bp;

	que_objp = dmultp->ds_queobjp;
	tgcom_objp = dmultp->ds_tgcomobjp;

	mutex_enter(&dmultp->ds_mutex);
	if (in_bp) {
		dmultp->ds_outcnt--;
		if (dmultp->ds_kstat) {
			if (in_bp->b_flags & B_READ) {
				KSTAT_IO_PTR(dmultp->ds_kstat)->reads++;
				KSTAT_IO_PTR(dmultp->ds_kstat)->nread +=
				    (in_bp->b_bcount - in_bp->b_resid);
			} else {
				KSTAT_IO_PTR(dmultp->ds_kstat)->writes++;
				KSTAT_IO_PTR(dmultp->ds_kstat)->nwritten +=
				    (in_bp->b_bcount - in_bp->b_resid);
			}
			kstat_runq_exit(KSTAT_IO_PTR(dmultp->ds_kstat));
		}
	}

	for (;;) {

#ifdef	FLC_DEBUG
		if ((curthread->t_intr) && (!dmultp->ds_bp) &&
		    (!dmultp->ds_outcnt))
			flc_malloc_intr++;
#endif

		if (!dmultp->ds_bp)
			dmultp->ds_bp = QUE_DEL(que_objp);
		if (!dmultp->ds_bp ||
		    (TGCOM_PKT(tgcom_objp, dmultp->ds_bp, dmult_restart,
		    (caddr_t)dmultp) != DDI_SUCCESS) ||
		    (dmultp->ds_outcnt >= dmultp->ds_waitcnt)) {
			mutex_exit(&dmultp->ds_mutex);
			return (0);
		}
		dmultp->ds_outcnt++;
		bp = dmultp->ds_bp;
		dmultp->ds_bp = QUE_DEL(que_objp);

		if (dmultp->ds_kstat)
			kstat_waitq_to_runq(KSTAT_IO_PTR(dmultp->ds_kstat));

		mutex_exit(&dmultp->ds_mutex);

		TGCOM_TRANSPORT(tgcom_objp, bp);

		if (!mutex_tryenter(&dmultp->ds_mutex))
			return (0);
	}
}

static int
dmult_restart(struct fc_data *dmultp)
{
	(void) dmult_deque(dmultp, NULL);
	return (-1);
}

/*
 *	Duplexed Commands per Device: Read Queue and Write Queue
 */
/*
 * Local Function Prototypes
 */
static int duplx_restart(struct duplx_data *);

static int duplx_init(opaque_t queuep, opaque_t tgcom_objp, opaque_t que_objp,
    void *lkarg);
static int duplx_free(struct flc_obj *flcobjp);
static int duplx_enque(opaque_t queuep, struct buf *bp);
static int duplx_deque(opaque_t queuep, struct buf *bp);

struct 	flc_objops duplx_ops = {
	duplx_init,
	duplx_free,
	duplx_enque,
	duplx_deque,
	fc_start_kstat,
	fc_stop_kstat,
	0, 0
};

struct flc_obj *
duplx_create()
{
	struct	flc_obj *flcobjp;
	struct	duplx_data *fcdp;

	flcobjp = kmem_zalloc((sizeof (*flcobjp) + sizeof (*fcdp)), KM_NOSLEEP);
	if (!flcobjp)
		return (NULL);

	fcdp = (struct duplx_data *)(flcobjp+1);
	flcobjp->flc_data = (opaque_t)fcdp;
	flcobjp->flc_ops  = &duplx_ops;

	if (!(fcdp->ds_writeq.fc_qobjp = qfifo_create())) {
		kmem_free(flcobjp, (sizeof (*flcobjp) + sizeof (*fcdp)));
		return (NULL);
	}
	return (flcobjp);
}

static int
duplx_free(struct flc_obj *flcobjp)
{
	struct duplx_data *fcdp;

	fcdp = (struct duplx_data *)flcobjp->flc_data;
	if (fcdp->ds_writeq.fc_qobjp) {
		QUE_FREE(fcdp->ds_writeq.fc_qobjp);
	}
	if (fcdp->ds_readq.fc_qobjp)
		QUE_FREE(fcdp->ds_readq.fc_qobjp);
	if (fcdp->ds_tgcomobjp) {
		TGCOM_FREE(fcdp->ds_tgcomobjp);
		mutex_destroy(&fcdp->ds_mutex);
	}
	kmem_free(flcobjp, (sizeof (*flcobjp) + sizeof (*fcdp)));
	return (0);
}

static int
duplx_init(opaque_t queuep, opaque_t tgcom_objp, opaque_t que_objp, void *lkarg)
{
	struct duplx_data *fcdp = (struct duplx_data *)queuep;
	fcdp->ds_tgcomobjp = tgcom_objp;
	fcdp->ds_readq.fc_qobjp = que_objp;

	QUE_INIT(que_objp, lkarg);
	QUE_INIT(fcdp->ds_writeq.fc_qobjp, lkarg);
	TGCOM_INIT(tgcom_objp);

	mutex_init(&fcdp->ds_mutex, NULL, MUTEX_DRIVER, lkarg);

	fcdp->ds_writeq.fc_maxcnt = DUPLX_MAXCNT;
	fcdp->ds_readq.fc_maxcnt  = DUPLX_MAXCNT;

	/* queues point to each other for round robin */
	fcdp->ds_readq.next = &fcdp->ds_writeq;
	fcdp->ds_writeq.next = &fcdp->ds_readq;

	return (DDI_SUCCESS);
}
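
/*
 * With the two queues linked through their next pointers, the scan
 * loops in duplx_enque() and duplx_deque() round-robin between reads
 * and writes simply by following activeq->next.  An illustrative walk,
 * assuming both queues are non-empty and below DUPLX_MAXCNT:
 *
 *	activeq = &fcdp->ds_readq;	start one read
 *	activeq = activeq->next;	now &ds_writeq: start one write
 *	activeq = activeq->next;	back to &ds_readq, and so on
 */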

static int
duplx_enque(opaque_t queuep, struct buf *in_bp)
{
	struct duplx_data *duplxp = (struct duplx_data *)queuep;
	opaque_t tgcom_objp;
	struct fc_que *activeq;
	struct buf *bp;

	mutex_enter(&duplxp->ds_mutex);
	if (in_bp) {
		if (duplxp->ds_kstat) {
			kstat_waitq_enter(KSTAT_IO_PTR(duplxp->ds_kstat));
		}
		if (in_bp->b_flags & B_READ)
			activeq = &duplxp->ds_readq;
		else
			activeq = &duplxp->ds_writeq;

		QUE_ADD(activeq->fc_qobjp, in_bp);
	} else {
		activeq = &duplxp->ds_readq;
	}

	tgcom_objp = duplxp->ds_tgcomobjp;

	for (;;) {
		if (!activeq->fc_bp)
			activeq->fc_bp = QUE_DEL(activeq->fc_qobjp);
		if (!activeq->fc_bp ||
		    (TGCOM_PKT(tgcom_objp, activeq->fc_bp, duplx_restart,
		    (caddr_t)duplxp) != DDI_SUCCESS) ||
		    (activeq->fc_outcnt >= activeq->fc_maxcnt)) {

			/* switch read/write queues */
			activeq = activeq->next;
			if (!activeq->fc_bp)
				activeq->fc_bp = QUE_DEL(activeq->fc_qobjp);
			if (!activeq->fc_bp ||
			    (TGCOM_PKT(tgcom_objp, activeq->fc_bp,
			    duplx_restart, (caddr_t)duplxp) != DDI_SUCCESS) ||
			    (activeq->fc_outcnt >= activeq->fc_maxcnt)) {
				mutex_exit(&duplxp->ds_mutex);
				return (0);
			}
		}

		activeq->fc_outcnt++;
		bp = activeq->fc_bp;
		activeq->fc_bp = NULL;

		if (duplxp->ds_kstat)
			kstat_waitq_to_runq(KSTAT_IO_PTR(duplxp->ds_kstat));
		mutex_exit(&duplxp->ds_mutex);

		TGCOM_TRANSPORT(tgcom_objp, bp);

		if (!mutex_tryenter(&duplxp->ds_mutex))
			return (0);

		activeq = activeq->next;
	}
}

static int
duplx_deque(opaque_t queuep, struct buf *in_bp)
{
	struct duplx_data *duplxp = (struct duplx_data *)queuep;
	opaque_t tgcom_objp;
	struct fc_que *activeq;
	struct buf *bp;

	mutex_enter(&duplxp->ds_mutex);

	tgcom_objp = duplxp->ds_tgcomobjp;

	if (in_bp->b_flags & B_READ)
		activeq = &duplxp->ds_readq;
	else
		activeq = &duplxp->ds_writeq;
	activeq->fc_outcnt--;

	if (duplxp->ds_kstat) {
		if (in_bp->b_flags & B_READ) {
			KSTAT_IO_PTR(duplxp->ds_kstat)->reads++;
			KSTAT_IO_PTR(duplxp->ds_kstat)->nread +=
			    (in_bp->b_bcount - in_bp->b_resid);
		} else {
			KSTAT_IO_PTR(duplxp->ds_kstat)->writes++;
			KSTAT_IO_PTR(duplxp->ds_kstat)->nwritten +=
			    (in_bp->b_bcount - in_bp->b_resid);
		}
		kstat_runq_exit(KSTAT_IO_PTR(duplxp->ds_kstat));
	}

	for (;;) {

		/* if needed, try to pull request off a queue */
		if (!activeq->fc_bp)
			activeq->fc_bp = QUE_DEL(activeq->fc_qobjp);

		if (!activeq->fc_bp ||
		    (TGCOM_PKT(tgcom_objp, activeq->fc_bp, duplx_restart,
		    (caddr_t)duplxp) != DDI_SUCCESS) ||
		    (activeq->fc_outcnt >= activeq->fc_maxcnt)) {

			activeq = activeq->next;
			if (!activeq->fc_bp)
				activeq->fc_bp = QUE_DEL(activeq->fc_qobjp);

			if (!activeq->fc_bp ||
			    (TGCOM_PKT(tgcom_objp, activeq->fc_bp,
			    duplx_restart, (caddr_t)duplxp) != DDI_SUCCESS) ||
			    (activeq->fc_outcnt >= activeq->fc_maxcnt)) {
				mutex_exit(&duplxp->ds_mutex);
				return (0);
			}
		}

		activeq->fc_outcnt++;
		bp = activeq->fc_bp;
		activeq->fc_bp = NULL;

		if (duplxp->ds_kstat)
			kstat_waitq_to_runq(KSTAT_IO_PTR(duplxp->ds_kstat));

		mutex_exit(&duplxp->ds_mutex);

		TGCOM_TRANSPORT(tgcom_objp, bp);

		if (!mutex_tryenter(&duplxp->ds_mutex))
			return (0);

		activeq = activeq->next;
	}
}

static int
duplx_restart(struct duplx_data *duplxp)
{
	(void) duplx_enque(duplxp, NULL);
	return (-1);
}

/*
 *	Tagged queueing flow control
 */
/*
 * Local Function Prototypes
 */

struct 	flc_objops adapt_ops = {
	fc_init,
	fc_free,
	dmult_enque,
	dmult_deque,
	fc_start_kstat,
	fc_stop_kstat,
	0, 0
};

struct flc_obj *
adapt_create()
{
	return (fc_create((struct flc_objops *)&adapt_ops));

}

/*
 *	Common Queue functions
 */

/*
 * 	Local static data
 */
#ifdef	Q_DEBUG
#define	DENT	0x0001
#define	DERR	0x0002
#define	DIO	0x0004
static	int	que_debug = DENT|DERR|DIO;

#endif	/* Q_DEBUG */
/*
 * 	Local Function Prototypes
 */
static struct que_obj *que_create(struct que_objops *qopsp);
static int que_init(struct que_data *qfp, void *lkarg);
static int que_free(struct que_obj *queobjp);
static struct buf *que_del(struct que_data *qfp);

static struct que_obj *
que_create(struct que_objops *qopsp)
{
	struct	que_data *qfp;
	struct	que_obj *queobjp;

	queobjp = kmem_zalloc((sizeof (*queobjp) + sizeof (*qfp)), KM_NOSLEEP);
	if (!queobjp)
		return (NULL);

	queobjp->que_ops = qopsp;
	qfp = (struct que_data *)(queobjp+1);
	queobjp->que_data = (opaque_t)qfp;

	return (queobjp);
}

static int
que_init(struct que_data *qfp, void *lkarg)
{
	mutex_init(&qfp->q_mutex, NULL, MUTEX_DRIVER, lkarg);
	return (DDI_SUCCESS);
}

static int
que_free(struct que_obj *queobjp)
{
	struct	que_data *qfp;

	qfp = (struct que_data *)queobjp->que_data;
	mutex_destroy(&qfp->q_mutex);
	kmem_free(queobjp, (sizeof (*queobjp) + sizeof (struct que_data)));
	return (0);
}

static struct buf *
que_del(struct que_data *qfp)
{
	struct buf *bp;

	bp = qfp->q_tab.b_actf;
	if (bp) {
		qfp->q_tab.b_actf = bp->av_forw;
		if (!qfp->q_tab.b_actf)
			qfp->q_tab.b_actl = NULL;
		bp->av_forw = 0;
	}
	return (bp);
}



/*
 *	Qmerge
 * 	Local Function Prototypes
 */
static int qmerge_add(struct que_data *, struct buf *);
static int qmerge_free(struct que_obj *);
static struct buf *qmerge_del(struct que_data *qfp);

struct 	que_objops qmerge_ops = {
	que_init,
	qmerge_free,
	qmerge_add,
	qmerge_del,
	0, 0
};

/* fields in diskhd */
#define	hd_cnt			b_back
#define	hd_private		b_forw
#define	hd_flags		b_flags
#define	hd_sync_next		av_forw
#define	hd_async_next		av_back

#define	hd_sync2async		sync_async_ratio

#define	QNEAR_FORWARD		0x01
#define	QNEAR_BACKWARD		0x02
#define	QNEAR_ASYNCONLY		0x04
#define	QNEAR_ASYNCALSO		0x08

#define	DBLK(bp) ((unsigned long)(bp)->b_private)

#define	BP_LT_BP(a, b) (DBLK(a) < DBLK(b))
#define	BP_GT_BP(a, b) (DBLK(a) > DBLK(b))
#define	BP_LT_HD(a, b) (DBLK(a) < (unsigned long)((b)->hd_private))
#define	BP_GT_HD(a, b) (DBLK(a) > (unsigned long)((b)->hd_private))
#define	QNEAR_ASYNC	(QNEAR_ASYNCONLY|QNEAR_ASYNCALSO)

#define	SYNC2ASYNC(a) ((a)->q_tab.hd_cnt)


/*
 * qmerge implements a two-priority queue: the low priority queue holds
 * ASYNC write requests, while everything else is queued on the high
 * priority sync queue.  Requests on the async queue are merged when
 * possible.
 * By default qmerge2wayscan is 1, selecting an elevator algorithm.
 * Setting this variable to zero has the following side effects:
 * 1. We assume fairness is the number one issue.
 * 2. The next request to be picked indicates the current head position.
 *
 * qmerge_sync2async is the ratio of scans of the high priority
 * sync queue to scans of the low priority async queue.
 *
 * When the qmerge variables have the following values, qmerge
 * degenerates to plain qsort behavior:
 *
 * qmerge1pri = 1, qmerge2wayscan = 0, qmerge_max_merge = 0
 *
 */
static int	qmerge_max_merge = 128 * 1024;
static intptr_t	qmerge_sync2async = 4;
static int	qmerge2wayscan = 1;
static int	qmerge1pri = 0;
static int	qmerge_merge = 0;
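
/*
 * These tunables can be patched at boot via /etc/system; a hedged
 * example, assuming this file is delivered as the "strategy" misc
 * module:
 *
 *	set strategy:qmerge1pri = 1
 *	set strategy:qmerge2wayscan = 0
 *	set strategy:qmerge_max_merge = 0
 *
 * which, per the comment above, reduces qmerge to plain qsort behavior.
 */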

/*
 * 	Local static data
 */
struct que_obj *
qmerge_create()
{
	struct que_data *qfp;
	struct que_obj *queobjp;

	queobjp = kmem_zalloc((sizeof (*queobjp) + sizeof (*qfp)), KM_NOSLEEP);
	if (!queobjp)
		return (NULL);

	queobjp->que_ops = &qmerge_ops;
	qfp = (struct que_data *)(queobjp+1);
	qfp->q_tab.hd_private = 0;
	qfp->q_tab.hd_sync_next = qfp->q_tab.hd_async_next = NULL;
	qfp->q_tab.hd_cnt = (void *)qmerge_sync2async;
	queobjp->que_data = (opaque_t)qfp;

	return (queobjp);
}

static int
qmerge_free(struct que_obj *queobjp)
{
	struct	que_data *qfp;

	qfp = (struct que_data *)queobjp->que_data;
	mutex_destroy(&qfp->q_mutex);
	kmem_free(queobjp, (sizeof (*queobjp) + sizeof (*qfp)));
	return (0);
}

static int
qmerge_can_merge(struct buf *bp1, struct buf *bp2)
{
	const int paw_flags = B_PAGEIO | B_ASYNC | B_WRITE;

	if ((bp1->b_un.b_addr != 0) || (bp2->b_un.b_addr != 0) ||
	    ((bp1->b_flags & (paw_flags | B_REMAPPED)) != paw_flags) ||
	    ((bp2->b_flags & (paw_flags | B_REMAPPED)) != paw_flags) ||
	    (bp1->b_bcount & PAGEOFFSET) || (bp2->b_bcount & PAGEOFFSET) ||
	    (bp1->b_bcount + bp2->b_bcount > qmerge_max_merge))
		return (0);

	if ((DBLK(bp2) + bp2->b_bcount / DEV_BSIZE == DBLK(bp1)) ||
	    (DBLK(bp1) + bp1->b_bcount / DEV_BSIZE == DBLK(bp2)))
		return (1);
	else
		return (0);
}
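
/*
 * Only unmapped (no kernel mapping, not B_REMAPPED), page-aligned,
 * ASYNC page-I/O writes are merge candidates, and the combined size
 * must stay within qmerge_max_merge.  The final test checks that the
 * two requests abut on disk in either order.  A worked example with
 * illustrative values (DEV_BSIZE == 512): two 8K requests at DBLK()
 * 1000 and 1016 are adjacent, since 1000 + 8192 / 512 == 1016;
 * requests at 1000 and 1024 are not.
 */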

static void
qmerge_mergesetup(struct buf *bp_merge, struct buf *bp)
{
	struct	buf *bp1;
	struct	page *pp, *pp_merge, *pp_merge_prev;
	int	forward;

	qmerge_merge++;
	forward = DBLK(bp_merge) < DBLK(bp);

	bp_merge->b_bcount += bp->b_bcount;

	pp = bp->b_pages;
	pp_merge = bp_merge->b_pages;

	pp_merge_prev = pp_merge->p_prev;

	pp_merge->p_prev->p_next = pp;
	pp_merge->p_prev = pp->p_prev;
	pp->p_prev->p_next = pp_merge;
	pp->p_prev = pp_merge_prev;

	bp1 = bp_merge->b_forw;

	bp1->av_back->av_forw = bp;
	bp->av_back = bp1->av_back;
	bp1->av_back = bp;
	bp->av_forw = bp1;

	if (!forward) {
		bp_merge->b_forw = bp;
		bp_merge->b_pages = pp;
		bp_merge->b_private = bp->b_private;
	}
}
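
/*
 * qmerge_mergesetup() grows the merged request in three steps: it
 * extends b_bcount, splices bp's circular page list into bp_merge's
 * page ring, and links bp onto the circular ring of original bufs
 * rooted at bp_merge->b_forw so that qmerge_iodone() can later
 * complete each piece.  When the new buf precedes the merge on disk
 * (!forward), the merge's starting block (b_private), page list and
 * buf ring head are rebased onto bp.
 */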

static void
que_insert(struct que_data *qfp, struct buf *bp)
{
	struct buf	*bp1, *bp_start, *lowest_bp, *highest_bp;
	uintptr_t	highest_blk, lowest_blk;
	struct buf	**async_bpp, **sync_bpp, **bpp;
	struct diskhd	*dp = &qfp->q_tab;

	sync_bpp = &dp->hd_sync_next;
	async_bpp = &dp->hd_async_next;
	/*
	 * The ioctl used by the format utility requires that bp->av_back be
	 * preserved.
	 */
	if (bp->av_back)
		bp->b_error = (intptr_t)bp->av_back;
	if (!qmerge1pri &&
	    ((bp->b_flags & (B_ASYNC|B_READ|B_FREE)) == B_ASYNC)) {
		bpp = &dp->hd_async_next;
	} else {
		bpp = &dp->hd_sync_next;
	}


	if ((bp1 = *bpp) == NULL) {
		*bpp = bp;
		bp->av_forw = bp->av_back = bp;
		if ((bpp == async_bpp) && (*sync_bpp == NULL)) {
			dp->hd_flags |= QNEAR_ASYNCONLY;
		} else if (bpp == sync_bpp) {
			dp->hd_flags &= ~QNEAR_ASYNCONLY;
			if (*async_bpp) {
				dp->hd_flags |= QNEAR_ASYNCALSO;
			}
		}
		return;
	}
	bp_start = bp1;
	if (DBLK(bp) < DBLK(bp1)) {
		lowest_blk = DBLK(bp1);
		lowest_bp = bp1;
		do {
			if (DBLK(bp) > DBLK(bp1)) {
				bp->av_forw = bp1->av_forw;
				bp1->av_forw->av_back = bp;
				bp1->av_forw = bp;
				bp->av_back = bp1;

				if (((bpp == async_bpp) &&
				    (dp->hd_flags & QNEAR_ASYNC)) ||
				    (bpp == sync_bpp)) {
					if (!(dp->hd_flags & QNEAR_BACKWARD) &&
					    BP_GT_HD(bp, dp)) {
						*bpp = bp;
					}
				}
				return;
			} else if (DBLK(bp1) < lowest_blk) {
				lowest_bp = bp1;
				lowest_blk = DBLK(bp1);
			}
		} while ((DBLK(bp1->av_back) < DBLK(bp1)) &&
		    ((bp1 = bp1->av_back) != bp_start));
		bp->av_forw = lowest_bp;
		lowest_bp->av_back->av_forw = bp;
		bp->av_back = lowest_bp->av_back;
		lowest_bp->av_back = bp;
		if ((bpp == async_bpp) && !(dp->hd_flags & QNEAR_ASYNC)) {
			*bpp = bp;
		} else if (!(dp->hd_flags & QNEAR_BACKWARD) &&
		    BP_GT_HD(bp, dp)) {
			*bpp = bp;
		}
	} else {
		highest_blk = DBLK(bp1);
		highest_bp = bp1;
		do {
			if (DBLK(bp) < DBLK(bp1)) {
				bp->av_forw = bp1;
				bp1->av_back->av_forw = bp;
				bp->av_back = bp1->av_back;
				bp1->av_back = bp;
				if (((bpp == async_bpp) &&
				    (dp->hd_flags & QNEAR_ASYNC)) ||
				    (bpp == sync_bpp)) {
					if ((dp->hd_flags & QNEAR_BACKWARD) &&
					    BP_LT_HD(bp, dp)) {
						*bpp = bp;
					}
				}
				return;
			} else if (DBLK(bp1) > highest_blk) {
				highest_bp = bp1;
				highest_blk = DBLK(bp1);
			}
		} while ((DBLK(bp1->av_forw) > DBLK(bp1)) &&
		    ((bp1 = bp1->av_forw) != bp_start));
		bp->av_back = highest_bp;
		highest_bp->av_forw->av_back = bp;
		bp->av_forw = highest_bp->av_forw;
		highest_bp->av_forw = bp;

		if (((bpp == sync_bpp) ||
		    ((bpp == async_bpp) && (dp->hd_flags & QNEAR_ASYNC))) &&
		    (dp->hd_flags & QNEAR_BACKWARD) && (BP_LT_HD(bp, dp)))
			*bpp = bp;
	}
}

/*
 * dmult_enque() holds the dmultp->ds_mutex lock, so we don't grab a
 * lock here.  If dmult_enque() changes, we will have to revisit this
 * function.
 */
static int
qmerge_add(struct que_data *qfp, struct buf *bp)
{

	que_insert(qfp, bp);
	return (++qfp->q_cnt);
}

static int
qmerge_iodone(struct buf *bp)
{
	struct buf *bp1;
	struct	page *pp, *pp1, *tmp_pp;

	if (bp->b_flags & B_REMAPPED)
		bp_mapout(bp);

	bp1 = bp->b_forw;
	do {
		bp->b_forw = bp1->av_forw;
		bp1->av_forw->av_back = bp1->av_back;
		bp1->av_back->av_forw = bp1->av_forw;
		pp = (page_t *)bp1->b_pages;
		pp1 = bp->b_forw->b_pages;

		tmp_pp = pp->p_prev;
		pp->p_prev = pp1->p_prev;
		pp->p_prev->p_next = pp;

		pp1->p_prev = tmp_pp;
		pp1->p_prev->p_next = pp1;

		if (bp->b_flags & B_ERROR) {
			bp1->b_error = bp->b_error;
			bp1->b_flags |= B_ERROR;
		}

		biodone(bp1);
	} while ((bp1 = bp->b_forw) != bp->b_forw->av_forw);

	biodone(bp1);
	kmem_free(bp, sizeof (*bp));
	return (0);
}
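
/*
 * Teardown is the inverse of qmerge_mergesetup(): each constituent buf
 * is unlinked from the ring at bp->b_forw, its page sublist is carved
 * back out of the merged page ring, and it is biodone()'d
 * individually, propagating any B_ERROR from the merged transfer.  The
 * loop exits when a single buf remains, which the final biodone()
 * completes before the cloned merge buf itself is freed.
 */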




static struct buf *
qmerge_nextbp(struct que_data *qfp, struct buf *bp_merge, int *can_merge)
{
	intptr_t	private, cnt;
	int		flags;
	struct		buf *sync_bp, *async_bp, *bp;
	struct		buf **sync_bpp, **async_bpp, **bpp;
	struct		diskhd *dp = &qfp->q_tab;

	if (qfp->q_cnt == 0) {
		return (NULL);
	}
	flags = qfp->q_tab.hd_flags;
	sync_bpp = &qfp->q_tab.hd_sync_next;
	async_bpp = &qfp->q_tab.hd_async_next;

begin_nextbp:
	if (flags & QNEAR_ASYNCONLY) {
		bp = *async_bpp;
		private = DBLK(bp);
		if (bp_merge && !qmerge_can_merge(bp, bp_merge)) {
			return (NULL);
		} else if (bp->av_forw == bp) {
			bp->av_forw = bp->av_back = NULL;
			flags &= ~(QNEAR_ASYNCONLY | QNEAR_BACKWARD);
			private = 0;
		} else if (flags & QNEAR_BACKWARD) {
			if (DBLK(bp) < DBLK(bp->av_back)) {
				flags &= ~QNEAR_BACKWARD;
				private = 0;
			}
		} else if (DBLK(bp) > DBLK(bp->av_forw)) {
			if (qmerge2wayscan) {
				flags |= QNEAR_BACKWARD;
			} else {
				private = 0;
			}
		} else if (qmerge2wayscan == 0) {
			private = DBLK(bp->av_forw);
		}
		bpp = async_bpp;

	} else if (flags & QNEAR_ASYNCALSO) {
		sync_bp = *sync_bpp;
		async_bp = *async_bpp;
		if (flags & QNEAR_BACKWARD) {
			if (BP_GT_HD(sync_bp, dp) && BP_GT_HD(async_bp, dp)) {
				flags &= ~(QNEAR_BACKWARD|QNEAR_ASYNCALSO);
				*sync_bpp = sync_bp->av_forw;
				*async_bpp = async_bp->av_forw;
				SYNC2ASYNC(qfp) = (void *)qmerge_sync2async;
				qfp->q_tab.hd_private = 0;
				goto begin_nextbp;
			}
			if (BP_LT_HD(async_bp, dp) && BP_LT_HD(sync_bp, dp)) {
				if (BP_GT_BP(async_bp, sync_bp)) {
					bpp = async_bpp;
					bp = *async_bpp;
				} else {
					bpp = sync_bpp;
					bp = *sync_bpp;
				}
			} else if (BP_LT_HD(async_bp, dp)) {
				bpp = async_bpp;
				bp = *async_bpp;
			} else {
				bpp = sync_bpp;
				bp = *sync_bpp;
			}
		} else {
			if (BP_LT_HD(sync_bp, dp) && BP_LT_HD(async_bp, dp)) {
				if (qmerge2wayscan) {
					flags |= QNEAR_BACKWARD;
					*sync_bpp = sync_bp->av_back;
					*async_bpp = async_bp->av_back;
					goto begin_nextbp;
				} else {
					flags &= ~QNEAR_ASYNCALSO;
					SYNC2ASYNC(qfp) =
						(void *)qmerge_sync2async;
					qfp->q_tab.hd_private = 0;
					goto begin_nextbp;
				}
			}
			if (BP_GT_HD(async_bp, dp) && BP_GT_HD(sync_bp, dp)) {
				if (BP_LT_BP(async_bp, sync_bp)) {
					bpp = async_bpp;
					bp = *async_bpp;
				} else {
					bpp = sync_bpp;
					bp = *sync_bpp;
				}
			} else if (BP_GT_HD(async_bp, dp)) {
				bpp = async_bpp;
				bp = *async_bpp;
			} else {
				bpp = sync_bpp;
				bp = *sync_bpp;
			}
		}
		if (bp_merge && !qmerge_can_merge(bp, bp_merge)) {
			return (NULL);
		} else if (bp->av_forw == bp) {
			bp->av_forw = bp->av_back = NULL;
			flags &= ~QNEAR_ASYNCALSO;
			if (bpp == async_bpp) {
				SYNC2ASYNC(qfp) = (void *)qmerge_sync2async;
			} else {
				flags |= QNEAR_ASYNCONLY;
			}
		}
		private = DBLK(bp);
	} else {
		bp = *sync_bpp;
		private = DBLK(bp);
		if (bp_merge && !qmerge_can_merge(bp, bp_merge)) {
			return (NULL);
		} else if (bp->av_forw == bp) {
			private = 0;
			SYNC2ASYNC(qfp) = (void *)qmerge_sync2async;
			bp->av_forw = bp->av_back = NULL;
			flags &= ~QNEAR_BACKWARD;
			if (*async_bpp)
				flags |= QNEAR_ASYNCONLY;
		} else if (flags & QNEAR_BACKWARD) {
			if (DBLK(bp) < DBLK(bp->av_back)) {
				flags &= ~QNEAR_BACKWARD;
				cnt = (intptr_t)SYNC2ASYNC(qfp);
				if (cnt > 0) {
					cnt--;
					SYNC2ASYNC(qfp) = (void *)cnt;
				} else {
					if (*async_bpp)
						flags |= QNEAR_ASYNCALSO;
					SYNC2ASYNC(qfp) =
						(void *)qmerge_sync2async;
				}
				private = 0;
			}
		} else if (DBLK(bp) > DBLK(bp->av_forw)) {
			private = 0;
			if (qmerge2wayscan) {
				flags |= QNEAR_BACKWARD;
				private = DBLK(bp);
			} else {
				cnt = (intptr_t)SYNC2ASYNC(qfp);
				if (cnt > 0) {
					cnt--;
					SYNC2ASYNC(qfp) = (void *)cnt;
				} else {
					if (*async_bpp)
						flags |= QNEAR_ASYNCALSO;
					SYNC2ASYNC(qfp) =
						(void *)qmerge_sync2async;
				}
			}
		} else if (qmerge2wayscan == 0) {
			private = DBLK(bp->av_forw);
		}
		bpp = sync_bpp;
	}

	if (bp->av_forw) {
		*can_merge = !(bp->b_flags & B_READ);
		if (flags & QNEAR_BACKWARD) {
			*bpp = bp->av_back;
			if ((DBLK(bp->av_back) +
			    bp->av_back->b_bcount / DEV_BSIZE) != DBLK(bp))
				*can_merge = 0;
		} else {
			*bpp = bp->av_forw;
			if ((DBLK(bp) + bp->b_bcount / DEV_BSIZE) !=
			    DBLK(bp->av_forw))
				*can_merge = 0;
		}
		bp->av_forw->av_back = bp->av_back;
		bp->av_back->av_forw = bp->av_forw;
		bp->av_forw = bp->av_back = NULL;
	} else {
		*bpp = NULL;
		*can_merge = 0;
	}
	qfp->q_tab.hd_private = (void *)private;
	qfp->q_cnt--;
	qfp->q_tab.hd_flags = flags;
	if (bp->b_error) {
		bp->av_back = (void *)(intptr_t)bp->b_error;
		bp->b_error = 0;
	}
	return (bp);
}

static struct buf *
qmerge_del(struct que_data *qfp)
{
	struct	buf *bp, *next_bp, *bp_merge;
	int	alloc_mergebp, merge;

	if (qfp->q_cnt == 0) {
		return (NULL);
	}

	bp_merge = bp = qmerge_nextbp(qfp, NULL, &merge);
	alloc_mergebp = 1;
	while (merge && (next_bp = qmerge_nextbp(qfp, bp_merge, &merge))) {
		if (alloc_mergebp) {
			bp_merge = kmem_alloc(sizeof (*bp_merge), KM_NOSLEEP);
			if (bp_merge == NULL) {
				return (bp);
			}
			bcopy(bp, bp_merge, sizeof (*bp_merge));
			bp_merge->b_iodone = qmerge_iodone;
			bp_merge->b_forw = bp;
			bp_merge->b_back = (struct buf *)qfp;
			bp->av_forw = bp->av_back = bp;
			alloc_mergebp = 0;
		}
		qmerge_mergesetup(bp_merge, next_bp);
	}
	return (bp_merge);
}
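
/*
 * Lifecycle of a merged request, as a sketch of the normal path:
 *
 *	qmerge_del()		pulls an initial buf; while
 *				qmerge_nextbp() keeps returning
 *				mergeable neighbors, it clones the
 *				first buf into bp_merge and accretes
 *				the rest via qmerge_mergesetup()
 *	TGCOM_TRANSPORT()	the caller issues one merged transfer
 *	biodone(bp_merge)	fires b_iodone == qmerge_iodone(),
 *				which completes each original buf and
 *				frees the clone
 *
 * If the KM_NOSLEEP clone allocation fails, the unmerged first buf is
 * returned as-is.
 */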


/*
 *	FIFO Queue functions
 */
/*
 * 	Local Function Prototypes
 */
static int qfifo_add(struct que_data *, struct buf *);

struct 	que_objops qfifo_ops = {
	que_init,
	que_free,
	qfifo_add,
	que_del,
	0, 0
};

/*
 * 	Local static data
 */
struct que_obj *
qfifo_create()
{
	return (que_create((struct que_objops *)&qfifo_ops));
}

static int
qfifo_add(struct que_data *qfp, struct buf *bp)
{

	if (!qfp->q_tab.b_actf)
		qfp->q_tab.b_actf = bp;
	else
		qfp->q_tab.b_actl->av_forw = bp;
	qfp->q_tab.b_actl = bp;
	bp->av_forw = NULL;
	return (0);
}

/*
 *	One-Way-Scan Queue functions
 */
/*
 * 	Local Function Prototypes
 */
static int qsort_add(struct que_data *, struct buf *);
static struct buf *qsort_del(struct que_data *);
static void oneway_scan_binary(struct diskhd *dp, struct buf *bp);

struct 	que_objops qsort_ops = {
	que_init,
	que_free,
	qsort_add,
	qsort_del,
	0, 0
};

/*
 * 	Local static data
 */
struct que_obj *
qsort_create()
{
	return (que_create((struct que_objops *)&qsort_ops));
}

static int
qsort_add(struct que_data *qfp, struct buf *bp)
{
	qfp->q_cnt++;
	oneway_scan_binary(&qfp->q_tab, bp);
	return (0);
}


#define	b_pasf	b_forw
#define	b_pasl	b_back
static void
oneway_scan_binary(struct diskhd *dp, struct buf *bp)
{
	struct buf *ap;

	ap = dp->b_actf;
	if (ap == NULL) {
		dp->b_actf = bp;
		bp->av_forw = NULL;
		return;
	}
	if (DBLK(bp) < DBLK(ap)) {
		ap = dp->b_pasf;
		if ((ap == NULL) || (DBLK(bp) < DBLK(ap))) {
			dp->b_pasf = bp;
			bp->av_forw = ap;
			return;
		}
	}
	while (ap->av_forw) {
		if (DBLK(bp) < DBLK(ap->av_forw))
			break;
		ap = ap->av_forw;
	}
	bp->av_forw = ap->av_forw;
	ap->av_forw = bp;
}
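
/*
 * This implements one-way scan ordering: requests at or beyond the
 * first active request stay on the active list (b_actf) in ascending
 * block order, requests that would fall behind it collect on the
 * passive list (b_pasf), also ascending, and qsort_del() promotes the
 * passive list once the active list drains.  Illustrative example:
 * with blocks 100 and 300 on the active list, a new request for block
 * 50 goes to the passive list and is serviced only after 300
 * completes.
 */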

static struct buf *
qsort_del(struct que_data *qfp)
{
	struct buf *bp;

	if (qfp->q_cnt == 0) {
		return (NULL);
	}
	qfp->q_cnt--;
	bp = qfp->q_tab.b_actf;
	qfp->q_tab.b_actf = bp->av_forw;
	bp->av_forw = 0;
	if (!qfp->q_tab.b_actf && qfp->q_tab.b_pasf) {
		qfp->q_tab.b_actf = qfp->q_tab.b_pasf;
		qfp->q_tab.b_pasf = NULL;
	}
	return (bp);
}

/*
 *	Tagged queueing
 */
/*
 * 	Local Function Prototypes
 */

struct 	que_objops qtag_ops = {
	que_init,
	que_free,
	qsort_add,
	qsort_del,
	0, 0
};

/*
 * 	Local static data
 */
struct que_obj *
qtag_create()
{
	return (que_create((struct que_objops *)&qtag_ops));
}