/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/scsi/scsi.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/thread.h>
#include <sys/var.h>

#include "sd_xbuf.h"

/*
 * xbuf.c: buf(9S) extension facility.
 *
 * The buf(9S) extension facility is intended to allow block drivers to
 * allocate additional memory that is associated with a particular buf(9S)
 * struct.  It is further intended to help in addressing the usual set of
 * problems associated with such allocations, in particular those involving
 * recovery from allocation failures, especially in code paths that the
 * system relies on to free memory.
 *
 * CAVEAT: Currently this code is completely private to the sd driver and in
 * NO WAY constitutes a public or supported interface of any kind. It is
 * envisioned that this may one day migrate into the Solaris DDI, but until
 * that time this ought to be considered completely unstable and is subject
 * to change without notice. This code may NOT in any way be utilized by
 * ANY code outside the sd driver.
 */
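
/*
 * A minimal usage sketch, for orientation only.  The driver-side names
 * below (my_xbuf_t, mydrv_iostart, the limit variables) are hypothetical;
 * sd is the only real consumer of this facility.
 *
 *	At attach(9E) time, create the attribute handle:
 *
 *		xap = ddi_xbuf_attr_create(sizeof (my_xbuf_t),
 *		    mydrv_iostart, arg, active_limit, reserve_limit,
 *		    major, 0);
 *
 *	In the driver's strategy(9E) routine, enqueue each buf:
 *
 *		return (ddi_xbuf_qstrategy(bp, xap));
 *
 *	The framework then calls mydrv_iostart(bp, xp, arg) with the buf
 *	and its newly allocated xbuf extension.  In the completion path,
 *	release the xbuf immediately before biodone(9F); the return value
 *	indicates whether the original (pre-breakup) request is complete:
 *
 *		(void) ddi_xbuf_done(bp, xap);
 *		biodone(bp);
 *
 *	At detach(9E) time, tear down the handle:
 *
 *		ddi_xbuf_attr_destroy(xap);
 */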


static int xbuf_iostart(ddi_xbuf_attr_t xap);
static void xbuf_dispatch(ddi_xbuf_attr_t xap);
static void xbuf_restart_callback(void *arg);
static int xbuf_brk_done(struct buf *bp);


/*
 * Note: Should this be exposed to the caller, i.e., do we want to give
 * the caller the flexibility of specifying the parameters for the
 * thread pool?
 * Note: these values are just estimates at this time, based upon what
 * seems reasonable for the sd driver. It may be preferable to make these
 * parameters self-scaling in a real (future) implementation.
 */
#define	XBUF_TQ_MINALLOC	64
#define	XBUF_TQ_MAXALLOC	512
#define	XBUF_DISPATCH_DELAY	(drv_usectohz(50000))	/* 50 msec */

static taskq_t *xbuf_tq = NULL;
static int xbuf_attr_tq_minalloc = XBUF_TQ_MINALLOC;
static int xbuf_attr_tq_maxalloc = XBUF_TQ_MAXALLOC;

static kmutex_t	xbuf_mutex = { 0 };
static uint32_t	xbuf_refcount = 0;

/*
 * Private wrapper for buf cloned via ddi_xbuf_qstrategy()
 */
struct xbuf_brk {
	kmutex_t mutex;
	struct buf *bp0;
	uint8_t nbufs;	/* number of bufs allocated */
	uint8_t active; /* number of active xfers */

	size_t brksize;	/* break size used for this buf */
	int brkblk;

	/* xfer position */
	off_t off;
	off_t noff;
	daddr_t blkno;
};

_NOTE(DATA_READABLE_WITHOUT_LOCK(xbuf_brk::off))
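
/*
 * Overview of the breakup path, as implemented below: when a break size
 * has been set via ddi_xbuf_attr_setup_brk(), ddi_xbuf_qstrategy()
 * attaches an xbuf_brk to any buf whose b_bcount exceeds xa_brksize.
 * xbuf_iostart() then issues the request as a series of bioclone(9F)
 * segments, each with b_iodone set to xbuf_brk_done() and a back pointer
 * to the xbuf_brk stashed in b_clone_private; the original buf remains
 * at the head of the queue until its final segment has been cloned.
 * xbuf_brk_done() propagates any error back to the original buf, frees
 * each clone, and calls biodone(9F) on the original once the last
 * segment completes.
 */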

/*
 * Hack needed in the prototype so buf breakup will work.
 * Here we can rely on the sd code not changing the value in
 * b_forw.
 */
#define	b_clone_private b_forw


/* ARGSUSED */
DDII ddi_xbuf_attr_t
ddi_xbuf_attr_create(size_t xsize,
    void (*xa_strategy)(struct buf *bp, ddi_xbuf_t xp, void *attr_arg),
    void *attr_arg, uint32_t active_limit, uint32_t reserve_limit,
    major_t major, int flags)
{
	ddi_xbuf_attr_t	xap;

	xap = kmem_zalloc(sizeof (struct __ddi_xbuf_attr), KM_SLEEP);

	mutex_init(&xap->xa_mutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&xap->xa_reserve_mutex, NULL, MUTEX_DRIVER, NULL);

	/* Future: Allow the caller to specify alignment requirements? */
	xap->xa_allocsize	= max(xsize, sizeof (void *));
	xap->xa_active_limit	= active_limit;
	xap->xa_active_lowater	= xap->xa_active_limit / 2;
	xap->xa_reserve_limit	= reserve_limit;
	xap->xa_strategy	= xa_strategy;
	xap->xa_attr_arg	= attr_arg;

	mutex_enter(&xbuf_mutex);
	if (xbuf_refcount == 0) {
		ASSERT(xbuf_tq == NULL);
		/*
		 * Note: Would be nice if: (1) #threads in the taskq pool (set
		 * to the value of 'ncpus' at the time the taskq is created)
		 * could adjust automatically with DR; (2) the taskq
		 * minalloc/maxalloc counts could be grown/shrunk on the fly.
		 */
		xbuf_tq = taskq_create("xbuf_taskq", ncpus,
		    (v.v_maxsyspri - 2), xbuf_attr_tq_minalloc,
		    xbuf_attr_tq_maxalloc, TASKQ_PREPOPULATE);
	}
	xbuf_refcount++;
	mutex_exit(&xbuf_mutex);

	/* In this prototype we just always use the global system pool. */
	xap->xa_tq = xbuf_tq;

	return (xap);
}


DDII void
ddi_xbuf_attr_destroy(ddi_xbuf_attr_t xap)
{
	ddi_xbuf_t	xp;

	mutex_destroy(&xap->xa_mutex);
	mutex_destroy(&xap->xa_reserve_mutex);

	/* Free any xbufs on the reserve list */
	while (xap->xa_reserve_count != 0) {
		xp = xap->xa_reserve_headp;
		xap->xa_reserve_headp = *((void **)xp);
		xap->xa_reserve_count--;
		kmem_free(xp, xap->xa_allocsize);
	}
	ASSERT(xap->xa_reserve_headp == NULL);

	mutex_enter(&xbuf_mutex);
	ASSERT((xbuf_refcount != 0) && (xbuf_tq != NULL));
	xbuf_refcount--;
	if (xbuf_refcount == 0) {
		taskq_destroy(xbuf_tq);
		xbuf_tq = NULL;
	}
	mutex_exit(&xbuf_mutex);

	kmem_free(xap, sizeof (struct __ddi_xbuf_attr));
}


/* ARGSUSED */
DDII void
ddi_xbuf_attr_register_devinfo(ddi_xbuf_attr_t xbuf_attr, dev_info_t *dip)
{
	/* Currently a no-op in this prototype */
}


/* ARGSUSED */
DDII void
ddi_xbuf_attr_unregister_devinfo(ddi_xbuf_attr_t xbuf_attr, dev_info_t *dip)
{
	/* Currently a no-op in this prototype */
}

DDII int
ddi_xbuf_attr_setup_brk(ddi_xbuf_attr_t xap, size_t size)
{
	if (size < DEV_BSIZE)
		return (0);

	mutex_enter(&xap->xa_mutex);
	xap->xa_brksize = size & ~(DEV_BSIZE - 1);
	mutex_exit(&xap->xa_mutex);
	return (1);
}



/*
 * Enqueue the given buf and attempt to initiate IO.
 * Called from the driver strategy(9E) routine.
 */

DDII int
ddi_xbuf_qstrategy(struct buf *bp, ddi_xbuf_attr_t xap)
{
	ASSERT(xap != NULL);
	ASSERT(!mutex_owned(&xap->xa_mutex));
	ASSERT(!mutex_owned(&xap->xa_reserve_mutex));

	mutex_enter(&xap->xa_mutex);

	ASSERT((bp->b_bcount & (DEV_BSIZE - 1)) == 0);

	/*
	 * Break up the buf if necessary; bp->b_private is temporarily
	 * used to save the xbuf_brk pointer.
	 */
	if (xap->xa_brksize && bp->b_bcount > xap->xa_brksize) {
		struct xbuf_brk *brkp;

		brkp = kmem_zalloc(sizeof (struct xbuf_brk), KM_SLEEP);
		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*brkp))
		mutex_init(&brkp->mutex, NULL, MUTEX_DRIVER, NULL);
		brkp->bp0 = bp;
		brkp->brksize = xap->xa_brksize;
		brkp->brkblk = btodt(xap->xa_brksize);
		brkp->noff = xap->xa_brksize;
		brkp->blkno = bp->b_blkno;
		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*brkp))
		bp->b_private = brkp;
	} else {
		bp->b_private = NULL;
	}

	/* Enqueue buf */
	if (xap->xa_headp == NULL) {
		xap->xa_headp = xap->xa_tailp = bp;
	} else {
		xap->xa_tailp->av_forw = bp;
		xap->xa_tailp = bp;
	}
	bp->av_forw = NULL;

	xap->xa_pending++;
	mutex_exit(&xap->xa_mutex);
	return (xbuf_iostart(xap));
}


/*
 * Drivers call this immediately before calling biodone(9F), to notify the
 * framework that the indicated xbuf is no longer being used by the driver.
 * May be called in interrupt context.
 */

DDII int
ddi_xbuf_done(struct buf *bp, ddi_xbuf_attr_t xap)
{
	ddi_xbuf_t xp;
	int done;

	ASSERT(bp != NULL);
	ASSERT(xap != NULL);
	ASSERT(!mutex_owned(&xap->xa_mutex));
	ASSERT(!mutex_owned(&xap->xa_reserve_mutex));

	xp = ddi_xbuf_get(bp, xap);

	mutex_enter(&xap->xa_mutex);

#ifdef	SDDEBUG
	if (xap->xa_active_limit != 0) {
		ASSERT(xap->xa_active_count > 0);
	}
#endif
	xap->xa_active_count--;

	if (xap->xa_reserve_limit != 0) {
		mutex_enter(&xap->xa_reserve_mutex);
		if (xap->xa_reserve_count < xap->xa_reserve_limit) {
			/* Put this xbuf onto the reserve list & exit */
			*((void **)xp) = xap->xa_reserve_headp;
			xap->xa_reserve_headp = xp;
			xap->xa_reserve_count++;
			mutex_exit(&xap->xa_reserve_mutex);
			goto done;
		}
		mutex_exit(&xap->xa_reserve_mutex);
	}

	kmem_free(xp, xap->xa_allocsize);	/* return it to the system */

done:
	if (bp->b_iodone == xbuf_brk_done) {
		struct xbuf_brk *brkp = (struct xbuf_brk *)bp->b_clone_private;

		brkp->active--;
		if (brkp->active || xap->xa_headp == brkp->bp0) {
			done = 0;
		} else {
			brkp->off = -1;	/* mark bp0 as completed */
			done = 1;
		}
	} else {
		done = 1;
	}

	if ((xap->xa_active_limit == 0) ||
	    (xap->xa_active_count <= xap->xa_active_lowater)) {
		xbuf_dispatch(xap);
	}

	mutex_exit(&xap->xa_mutex);
	return (done);
}

static int
xbuf_brk_done(struct buf *bp)
{
	struct xbuf_brk *brkp = (struct xbuf_brk *)bp->b_clone_private;
	struct buf *bp0 = brkp->bp0;
	int done;

	mutex_enter(&brkp->mutex);
	if (bp->b_flags & B_ERROR && !(bp0->b_flags & B_ERROR)) {
		bp0->b_flags |= B_ERROR;
		bp0->b_error = bp->b_error;
	}
	if (bp->b_resid)
		bp0->b_resid = bp0->b_bcount;

	freerbuf(bp);
	brkp->nbufs--;

	done = (brkp->off == -1 && brkp->nbufs == 0);
	mutex_exit(&brkp->mutex);

	/* All buf segments done */
	if (done) {
		mutex_destroy(&brkp->mutex);
		kmem_free(brkp, sizeof (struct xbuf_brk));
		biodone(bp0);
	}
	return (0);
}

DDII void
ddi_xbuf_dispatch(ddi_xbuf_attr_t xap)
{
	mutex_enter(&xap->xa_mutex);
	if ((xap->xa_active_limit == 0) ||
	    (xap->xa_active_count <= xap->xa_active_lowater)) {
		xbuf_dispatch(xap);
	}
	mutex_exit(&xap->xa_mutex);
}


/*
 * ISSUE: in this prototype we cannot really implement ddi_xbuf_get()
 * unless we explicitly hide the xbuf pointer somewhere in the buf
 * during allocation, and then rely on the driver never changing it.
 * We can probably get away with using b_private for this for now,
 * though it really is kinda gnarly.
 */

/* ARGSUSED */
DDII ddi_xbuf_t
ddi_xbuf_get(struct buf *bp, ddi_xbuf_attr_t xap)
{
	return (bp->b_private);
}


/*
 * Initiate IOs for bufs on the queue.  Called from kernel thread or taskq
 * thread context. May execute concurrently for the same ddi_xbuf_attr_t.
 */

static int
xbuf_iostart(ddi_xbuf_attr_t xap)
{
	struct buf *bp;
	ddi_xbuf_t xp;

	ASSERT(xap != NULL);
	ASSERT(!mutex_owned(&xap->xa_mutex));
	ASSERT(!mutex_owned(&xap->xa_reserve_mutex));

	/*
	 * For each request on the queue, attempt to allocate the specified
	 * xbuf extension area, and call the driver's iostart() routine.
	 * We process as many requests on the queue as we can, until either
	 * (1) we run out of requests; or
	 * (2) we run out of resources; or
	 * (3) we reach the maximum limit for the given ddi_xbuf_attr_t.
	 */
	for (;;) {
		mutex_enter(&xap->xa_mutex);

		if ((bp = xap->xa_headp) == NULL) {
			break;	/* queue empty */
		}

		if ((xap->xa_active_limit != 0) &&
		    (xap->xa_active_count >= xap->xa_active_limit)) {
			break;	/* allocation limit reached */
		}

		/*
		 * If the reserve_limit is non-zero, work with the
		 * reserve; otherwise always allocate a new struct.
		 */
		if (xap->xa_reserve_limit != 0) {
			/*
			 * Don't penalize EVERY I/O by always allocating a new
			 * struct for the sake of maintaining an untouched
			 * reserve for a pathological condition that may never
			 * happen. Use the reserve entries first; this treats
			 * the reserve like a local pool rather than a reserve
			 * that goes untouched, and it is re-populated whenever
			 * it gets fully depleted, just in case it really is
			 * needed. This is safe because under the pathological
			 * condition, when the system runs out of memory such
			 * that the allocs below fail, the reserve will still
			 * be available whether its entries are saved away on
			 * the queue unused or are in transport somewhere. Thus
			 * progress can still continue, however slowly.
			 */
			mutex_enter(&xap->xa_reserve_mutex);
			if (xap->xa_reserve_count != 0) {
				ASSERT(xap->xa_reserve_headp != NULL);
				/* Grab an xbuf from the reserve */
				xp = xap->xa_reserve_headp;
				xap->xa_reserve_headp = *((void **)xp);
				ASSERT(xap->xa_reserve_count > 0);
				xap->xa_reserve_count--;
			} else {
				/*
				 * Either this is the first time through,
				 * or the reserve has been totally depleted.
				 * Re-populate the reserve (pool). Excess
				 * structs. get released in the done path.
				 */
				while (xap->xa_reserve_count <
				    xap->xa_reserve_limit) {
					xp = kmem_alloc(xap->xa_allocsize,
					    KM_NOSLEEP);
					if (xp == NULL) {
						break;
					}
					*((void **)xp) = xap->xa_reserve_headp;
					xap->xa_reserve_headp = xp;
					xap->xa_reserve_count++;
				}
				/* And one more to use right now. */
				xp = kmem_alloc(xap->xa_allocsize, KM_NOSLEEP);
			}
			mutex_exit(&xap->xa_reserve_mutex);
		} else {
			/*
			 * Try to alloc a new xbuf struct. If this fails, just
			 * exit for now; we'll get back here again either upon
			 * cmd completion or via the timer handler.
			 * Question: what if the allocation attempt for the
			 * very first cmd fails? There are no outstanding cmds,
			 * so how do we get back here?
			 * We should look at un_ncmds_in_transport: if it is
			 * zero, schedule xbuf_restart_callback via the timer,
			 * although that breaks the architecture by bringing
			 * softstate data into this code.
			 */
			xp = kmem_alloc(xap->xa_allocsize, KM_NOSLEEP);
		}
		if (xp == NULL) {
			break; /* Can't process a cmd. right now. */
		}

		/*
		 * Always run the counter. It is used/needed when
		 * xa_active_limit is non-zero, which is the typical (and
		 * currently the only) case.
		 */
		xap->xa_active_count++;

		if (bp->b_private) {
			struct xbuf_brk *brkp = bp->b_private;
			struct buf *bp0 = bp;

			brkp->active++;

			mutex_enter(&brkp->mutex);
			brkp->nbufs++;
			mutex_exit(&brkp->mutex);

			if (brkp->noff < bp0->b_bcount) {
				bp = bioclone(bp0, brkp->off, brkp->brksize,
				    bp0->b_edev, brkp->blkno, xbuf_brk_done,
				    NULL, KM_SLEEP);

				/* update xfer position */
				brkp->off = brkp->noff;
				brkp->noff += brkp->brksize;
				brkp->blkno += brkp->brkblk;
			} else {
				bp = bioclone(bp0, brkp->off,
				    bp0->b_bcount - brkp->off, bp0->b_edev,
				    brkp->blkno, xbuf_brk_done, NULL, KM_SLEEP);

				/* unlink the buf from the list */
				xap->xa_headp = bp0->av_forw;
				bp0->av_forw = NULL;
			}
			bp->b_clone_private = (struct buf *)brkp;
		} else {
			/* unlink the buf from the list */
			xap->xa_headp = bp->av_forw;
			bp->av_forw = NULL;
		}

		/*
		 * Hack needed in the prototype so ddi_xbuf_get() will work.
		 * Here we can rely on the sd code not changing the value in
		 * b_private (in fact it wants it there). See ddi_xbuf_get().
		 */
		bp->b_private = xp;

		/* call the driver's iostart routine */
		mutex_exit(&xap->xa_mutex);
		(*(xap->xa_strategy))(bp, xp, xap->xa_attr_arg);
	}

	ASSERT(xap->xa_pending > 0);
	xap->xa_pending--;
	mutex_exit(&xap->xa_mutex);
	return (0);
}


/*
 * Re-start IO processing if there is anything on the queue, AND if the
 * restart function is not already running/pending for this ddi_xbuf_attr_t.
 */
static void
xbuf_dispatch(ddi_xbuf_attr_t xap)
{
	ASSERT(xap != NULL);
	ASSERT(xap->xa_tq != NULL);
	ASSERT(mutex_owned(&xap->xa_mutex));

	if ((xap->xa_headp != NULL) && (xap->xa_timeid == NULL) &&
	    (xap->xa_pending == 0)) {
		/*
		 * First try to see if we can dispatch the restart function
		 * immediately, in a taskq thread.  If this fails, then
		 * schedule a timeout(9F) callback to try again later.
		 */
		if (taskq_dispatch(xap->xa_tq,
		    (void (*)(void *)) xbuf_iostart, xap, KM_NOSLEEP) ==
		    TASKQID_INVALID) {
			/*
			 * Unable to enqueue the request for the taskq thread,
			 * try again later.  Note that this will keep re-trying
			 * until taskq_dispatch() succeeds.
			 */
			xap->xa_timeid = timeout(xbuf_restart_callback, xap,
			    XBUF_DISPATCH_DELAY);
		} else {
			/*
			 * This indicates that xbuf_iostart() will soon be
			 * run for this ddi_xbuf_attr_t, and we do not need to
			 * schedule another invocation via timeout/taskq.
			 */
			xap->xa_pending++;
		}
	}
}

/* timeout(9F) callback routine for xbuf restart mechanism. */
static void
xbuf_restart_callback(void *arg)
{
	ddi_xbuf_attr_t	xap = arg;

	ASSERT(xap != NULL);
	ASSERT(xap->xa_tq != NULL);
	ASSERT(!mutex_owned(&xap->xa_mutex));

	mutex_enter(&xap->xa_mutex);
	xap->xa_timeid = NULL;
	xbuf_dispatch(xap);
	mutex_exit(&xap->xa_mutex);
}


DDII void
ddi_xbuf_flushq(ddi_xbuf_attr_t xap, int (*funcp)(struct buf *))
{
	struct buf *bp;
	struct buf *next_bp;
	struct buf *prev_bp = NULL;

	ASSERT(xap != NULL);
	ASSERT(xap->xa_tq != NULL);
	ASSERT(!mutex_owned(&xap->xa_mutex));

	mutex_enter(&xap->xa_mutex);

	for (bp = xap->xa_headp; bp != NULL; bp = next_bp) {

		next_bp = bp->av_forw;	/* Save for next iteration */

		/*
		 * If the user-supplied function is non-NULL and returns
		 * FALSE, then just leave the current bp on the queue.
		 */
		if ((funcp != NULL) && (!(*funcp)(bp))) {
			prev_bp = bp;
			continue;
		}

		/* de-queue the bp */
		if (bp == xap->xa_headp) {
			xap->xa_headp = next_bp;
			if (xap->xa_headp == NULL) {
				xap->xa_tailp = NULL;
			}
		} else {
			ASSERT(xap->xa_headp != NULL);
			ASSERT(prev_bp != NULL);
			if (bp == xap->xa_tailp) {
				ASSERT(next_bp == NULL);
				xap->xa_tailp = prev_bp;
			}
			prev_bp->av_forw = next_bp;
		}
		bp->av_forw = NULL;

		/* Add the bp to the flush queue */
		if (xap->xa_flush_headp == NULL) {
			ASSERT(xap->xa_flush_tailp == NULL);
			xap->xa_flush_headp = xap->xa_flush_tailp = bp;
		} else {
			ASSERT(xap->xa_flush_tailp != NULL);
			xap->xa_flush_tailp->av_forw = bp;
			xap->xa_flush_tailp = bp;
		}
	}

	while ((bp = xap->xa_flush_headp) != NULL) {
		xap->xa_flush_headp = bp->av_forw;
		if (xap->xa_flush_headp == NULL) {
			xap->xa_flush_tailp = NULL;
		}
		mutex_exit(&xap->xa_mutex);
		bioerror(bp, EIO);
		bp->b_resid = bp->b_bcount;
		biodone(bp);
		mutex_enter(&xap->xa_mutex);
	}

	mutex_exit(&xap->xa_mutex);
}