/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Note: This is the backend part of the split PV disk driver. This driver
 * is not a nexus driver, nor is it a leaf driver (block/char/stream driver).
 * Currently, it does not create any minor node. So, although it runs in
 * the backend domain, it will not be used directly from within dom0.
 * It simply gets block I/O requests issued by the frontend from a shared page
 * (blkif ring buffer - defined by Xen) between the backend and frontend
 * domains, generates a buf, and pushes it down to the underlying disk target
 * driver via the ldi interface. When the buf is done, this driver will
 * generate a response and put it into the ring buffer to inform the frontend
 * of the status of the I/O request it issued. When a new virtual device
 * entry is added in xenstore, a watch event is sent from Xen to the xvdi
 * framework, which will, in turn, create the devinfo node and try to attach
 * this driver (see xvdi_create_dev). When the frontend peer changes its
 * state to XenbusStateClose, an event will also be sent from Xen to the
 * xvdi framework, which will detach and remove this devinfo node (see
 * i_xvdi_oestate_handler). I/O requests taken from the ring buffer and
 * events coming from xenstore cannot be trusted. We verify them in
 * xdb_get_buf() and xdb_check_state_transition().
 *
 * Virtual device configuration is read/written from/to the database via
 * xenbus_* interfaces. The driver also uses xvdi_* to interact with the
 * hypervisor. There is an on-going effort to make xvdi_* cover all
 * xenbus_*.
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/dditypes.h>
#include <sys/sunddi.h>
#include <sys/list.h>
#include <sys/dkio.h>
#include <sys/cmlb.h>
#include <sys/vtoc.h>
#include <sys/modctl.h>
#include <sys/bootconf.h>
#include <sys/promif.h>
#include <sys/sysmacros.h>
#include <public/io/xenbus.h>
#include <public/io/xs_wire.h>
#include <xen/sys/xenbus_impl.h>
#include <xen/sys/xendev.h>
#include <sys/gnttab.h>
#include <sys/scsi/generic/inquiry.h>
#include <vm/seg_kmem.h>
#include <vm/hat_i86.h>
#include <sys/lofi.h>
#include <io/xdf.h>
#include <xen/io/blkif_impl.h>
#include <io/xdb.h>

static xdb_t *xdb_statep;
static int xdb_debug = 0;

static void xdb_close(dev_info_t *);
static int xdb_push_response(xdb_t *, uint64_t, uint8_t, uint16_t);
static int xdb_get_request(xdb_t *, blkif_request_t *);
static void blkif_get_x86_32_req(blkif_request_t *, blkif_x86_32_request_t *);
static void blkif_get_x86_64_req(blkif_request_t *, blkif_x86_64_request_t *);
static int xdb_biodone(buf_t *);


#ifdef DEBUG
/*
 * Debug aid functions: logva()/unlogva() track the kernel VAs of
 * currently mapped I/O pages so that an attempt to remap a VA that is
 * still in use will be caught.
 */

static void
logva(xdb_t *vdp, uint64_t va)
{
	uint64_t *page_addrs;
	int i;

	page_addrs = vdp->page_addrs;
	for (i = 0; i < XDB_MAX_IO_PAGES(vdp); i++) {
		if (page_addrs[i] == va)
			debug_enter("VA remapping found!");
	}

	for (i = 0; i < XDB_MAX_IO_PAGES(vdp); i++) {
		if (page_addrs[i] == 0) {
			page_addrs[i] = va;
			break;
		}
	}
	ASSERT(i < XDB_MAX_IO_PAGES(vdp));
}

static void
unlogva(xdb_t *vdp, uint64_t va)
{
	uint64_t *page_addrs;
	int i;

	page_addrs = vdp->page_addrs;
	for (i = 0; i < XDB_MAX_IO_PAGES(vdp); i++) {
		if (page_addrs[i] == va) {
			page_addrs[i] = 0;
			break;
		}
	}
	ASSERT(i < XDB_MAX_IO_PAGES(vdp));
}

static void
xdb_dump_request_oe(blkif_request_t *req)
{
	int i;

	/*
	 * Exploit the public interface definitions for BLKIF_OP_READ
	 * etc.
	 */
	char *op_name[] = { "read", "write", "barrier", "flush" };

	XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE, "op=%s", op_name[req->operation]));
	XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE, "num of segments=%d",
	    req->nr_segments));
	XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE, "handle=%d", req->handle));
	XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE, "id=%llu",
	    (unsigned long long)req->id));
	XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE, "start sector=%llu",
	    (unsigned long long)req->sector_number));
	for (i = 0; i < req->nr_segments; i++) {
		XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE, "gref=%d, first sec=%d,"
		    "last sec=%d", req->seg[i].gref, req->seg[i].first_sect,
		    req->seg[i].last_sect));
	}
}
#endif /* DEBUG */

/*
 * Statistics.
 */
static char *xdb_stats[] = {
	"rd_reqs",
	"wr_reqs",
	"br_reqs",
	"fl_reqs",
	"oo_reqs"
};

static int
xdb_kstat_update(kstat_t *ksp, int flag)
{
	xdb_t *vdp;
	kstat_named_t *knp;

	if (flag != KSTAT_READ)
		return (EACCES);

	vdp = ksp->ks_private;
	knp = ksp->ks_data;

	/*
	 * Assignment order should match that of the names in
	 * xdb_stats.
	 */
	(knp++)->value.ui64 = vdp->xs_stat_req_reads;
	(knp++)->value.ui64 = vdp->xs_stat_req_writes;
	(knp++)->value.ui64 = vdp->xs_stat_req_barriers;
	(knp++)->value.ui64 = vdp->xs_stat_req_flushes;
	(knp++)->value.ui64 = 0; /* oo_reqs */

	return (0);
}

static boolean_t
xdb_kstat_init(xdb_t *vdp)
{
	int nstat = sizeof (xdb_stats) / sizeof (xdb_stats[0]);
	char **cp = xdb_stats;
	kstat_named_t *knp;

	if ((vdp->xs_kstats = kstat_create("xdb",
	    ddi_get_instance(vdp->xs_dip),
	    "req_statistics", "block", KSTAT_TYPE_NAMED,
	    nstat, 0)) == NULL)
		return (B_FALSE);

	vdp->xs_kstats->ks_private = vdp;
	vdp->xs_kstats->ks_update = xdb_kstat_update;

	knp = vdp->xs_kstats->ks_data;
	while (nstat > 0) {
		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
		knp++;
		cp++;
		nstat--;
	}

	kstat_install(vdp->xs_kstats);

	return (B_TRUE);
}

static char *
i_pathname(dev_info_t *dip)
{
	char *path, *rv;

	path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
	(void) ddi_pathname(dip, path);
	rv = strdup(path);
	kmem_free(path, MAXPATHLEN);

	return (rv);
}

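/*
 * Form a buf for (part of) an I/O request.  When called with a non-NULL
 * blkif request, a new xdb request is initialised: the request fields
 * are sanity checked and its grant references are mapped into this
 * domain's I/O pages.  When called with a NULL request, the buf
 * embedded in the xdb request is reset and reused for the next run of
 * contiguous segments.  Returns NULL if grant mapping fails.
 */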
static buf_t *
xdb_get_buf(xdb_t *vdp, blkif_request_t *req, xdb_request_t *xreq)
{
	buf_t *bp;
	uint8_t segs, curseg;
	int sectors;
	int i, err;
	gnttab_map_grant_ref_t mapops[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	ddi_acc_handle_t acchdl;

	acchdl = vdp->xs_ring_hdl;
	bp = XDB_XREQ2BP(xreq);
	curseg = xreq->xr_curseg;
	/* init a new xdb request */
	if (req != NULL) {
		ASSERT(MUTEX_HELD(&vdp->xs_iomutex));
		boolean_t pagemapok = B_TRUE;
		uint8_t op = ddi_get8(acchdl, &req->operation);

		xreq->xr_vdp = vdp;
		xreq->xr_op = op;
		xreq->xr_id = ddi_get64(acchdl, &req->id);
		segs = xreq->xr_buf_pages = ddi_get8(acchdl, &req->nr_segments);
		if (segs == 0) {
			if (op != BLKIF_OP_FLUSH_DISKCACHE)
				cmn_err(CE_WARN, "!non-BLKIF_OP_FLUSH_DISKCACHE"
				    " is seen from domain %d with zero "
				    "length data buffer!", vdp->xs_peer);
			bioinit(bp);
			bp->b_bcount = 0;
			bp->b_lblkno = 0;
			bp->b_un.b_addr = NULL;
			return (bp);
		} else if (op == BLKIF_OP_FLUSH_DISKCACHE) {
			cmn_err(CE_WARN, "!BLKIF_OP_FLUSH_DISKCACHE"
			    " is seen from domain %d with non-zero "
			    "length data buffer!", vdp->xs_peer);
		}

		/*
		 * segs should be no bigger than BLKIF_MAX_SEGMENTS_PER_REQUEST
		 * according to Xen's definition of the blk interface, so we
		 * do a sanity check here
		 */
		if (segs > BLKIF_MAX_SEGMENTS_PER_REQUEST)
			segs = xreq->xr_buf_pages =
			    BLKIF_MAX_SEGMENTS_PER_REQUEST;

		for (i = 0; i < segs; i++) {
			uint8_t fs, ls;

			mapops[i].host_addr =
			    (uint64_t)(uintptr_t)XDB_IOPAGE_VA(
			    vdp->xs_iopage_va, xreq->xr_idx, i);
			mapops[i].dom = vdp->xs_peer;
			mapops[i].ref = ddi_get32(acchdl, &req->seg[i].gref);
			mapops[i].flags = GNTMAP_host_map;
			if (op != BLKIF_OP_READ)
				mapops[i].flags |= GNTMAP_readonly;

			fs = ddi_get8(acchdl, &req->seg[i].first_sect);
			ls = ddi_get8(acchdl, &req->seg[i].last_sect);

			/*
			 * first_sect should be no bigger than last_sect and
			 * both of them should be no bigger than
			 * XB_LAST_SECTOR_IN_SEG according to Xen's definition
			 * of the blk interface, so we sanity check again
			 */
			if (fs > XB_LAST_SECTOR_IN_SEG)
				fs = XB_LAST_SECTOR_IN_SEG;
			if (ls > XB_LAST_SECTOR_IN_SEG)
				ls = XB_LAST_SECTOR_IN_SEG;
			if (fs > ls)
				fs = ls;

			xreq->xr_segs[i].fs = fs;
			xreq->xr_segs[i].ls = ls;
		}

		/* map in io pages */
		err = xen_map_gref(GNTTABOP_map_grant_ref, mapops, i, B_FALSE);
		if (err != 0)
			return (NULL);
		for (i = 0; i < segs; i++) {
			/*
			 * Although HYPERVISOR_grant_table_op() returned no
			 * error, the mapping of each individual page can
			 * still fail.  So we have to check each one here and
			 * handle any error if needed
			 */
			if (mapops[i].status != GNTST_okay) {
				int j;
				for (j = 0; j < i; j++) {
#ifdef DEBUG
					unlogva(vdp, mapops[j].host_addr);
#endif
					xen_release_pfn(
					    xreq->xr_plist[j].p_pagenum);
				}
				pagemapok = B_FALSE;
				break;
			}
			/* record page mapping handle for unmapping later */
			xreq->xr_page_hdls[i] = mapops[i].handle;
#ifdef DEBUG
			logva(vdp, mapops[i].host_addr);
#endif
			/*
			 * Pass the MFNs down using the shadow list (xr_pplist)
			 *
			 * This is pretty ugly since we have implicit knowledge
			 * of how the rootnex binds buffers.
			 * The GNTTABOP_map_grant_ref op makes us do some ugly
			 * stuff since we're not allowed to touch these PTEs
			 * from the VM.
			 *
			 * Obviously, these aren't real page_t's. The rootnex
			 * only needs p_pagenum.
			 * Also, don't use btop() here or 32 bit PAE breaks.
			 */
			xreq->xr_pplist[i] = &xreq->xr_plist[i];
			xreq->xr_plist[i].p_pagenum =
			    xen_assign_pfn(mapops[i].dev_bus_addr >> PAGESHIFT);
		}

		/*
		 * Not all pages were mapped in successfully; unmap the pages
		 * that were mapped in and return failure
		 */
		if (!pagemapok) {
			gnttab_unmap_grant_ref_t unmapop;

			for (i = 0; i < segs; i++) {
				if (mapops[i].status != GNTST_okay)
					continue;
				unmapop.host_addr =
				    (uint64_t)(uintptr_t)XDB_IOPAGE_VA(
				    vdp->xs_iopage_va, xreq->xr_idx, i);
				unmapop.dev_bus_addr = 0;
				unmapop.handle = mapops[i].handle;
				(void) HYPERVISOR_grant_table_op(
				    GNTTABOP_unmap_grant_ref, &unmapop, 1);
			}

			return (NULL);
		}
		bioinit(bp);
		bp->b_lblkno = ddi_get64(acchdl, &req->sector_number);
		bp->b_flags = B_BUSY | B_SHADOW | B_PHYS;
		bp->b_flags |= (ddi_get8(acchdl, &req->operation) ==
		    BLKIF_OP_READ) ? B_READ : (B_WRITE | B_ASYNC);
	} else {
		uint64_t blkst;
		int isread;

		/* reuse this buf */
		blkst = bp->b_lblkno + bp->b_bcount / DEV_BSIZE;
		isread = bp->b_flags & B_READ;
		bioreset(bp);
		bp->b_lblkno = blkst;
		bp->b_flags = B_BUSY | B_SHADOW | B_PHYS;
		bp->b_flags |= isread ? B_READ : (B_WRITE | B_ASYNC);
		XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE, "reuse buf, xreq is %d!!",
		    xreq->xr_idx));
	}

	/* form a buf */
	bp->b_un.b_addr = XDB_IOPAGE_VA(vdp->xs_iopage_va, xreq->xr_idx,
	    curseg) + xreq->xr_segs[curseg].fs * DEV_BSIZE;
	bp->b_shadow = &xreq->xr_pplist[curseg];
	bp->b_iodone = xdb_biodone;
	sectors = 0;

	/*
	 * Run through the segments. There are XB_NUM_SECTORS_PER_SEG sectors
	 * per segment. On some OSes (e.g. Linux), there may be empty gaps
	 * between segments. (i.e. the first segment may end on sector 6 and
	 * the second segment start on sector 4).
	 *
	 * If a segment's first sector is not set to 0, and this is not the
	 * first segment in our buf, end this buf now.
	 *
	 * If a segment's last sector is not set to XB_LAST_SECTOR_IN_SEG, and
	 * this is not the last segment in the request, add this segment into
	 * the buf, then end this buf (updating the pointer to point to the
	 * next segment next time around).
	 */
	for (i = curseg; i < xreq->xr_buf_pages; i++) {
		if ((xreq->xr_segs[i].fs != 0) && (i != curseg)) {
			break;
		}
		sectors += (xreq->xr_segs[i].ls - xreq->xr_segs[i].fs + 1);
		if ((xreq->xr_segs[i].ls != XB_LAST_SECTOR_IN_SEG) &&
		    (i != (xreq->xr_buf_pages - 1))) {
			i++;
			break;
		}
	}
	xreq->xr_curseg = i;
	bp->b_bcount = sectors * DEV_BSIZE;
	bp->b_bufsize = bp->b_bcount;

	return (bp);
}

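/*
 * Allocate an xdb request from the per-instance free list.  The list is
 * threaded through xr_next, with xs_free_req holding the index of the
 * first free entry (-1 when the list is empty).
 */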
static xdb_request_t *
xdb_get_req(xdb_t *vdp)
{
	xdb_request_t *req;
	int idx;

	ASSERT(MUTEX_HELD(&vdp->xs_iomutex));
	ASSERT(vdp->xs_free_req != -1);
	req = &vdp->xs_req[vdp->xs_free_req];
	vdp->xs_free_req = req->xr_next;
	idx = req->xr_idx;
	bzero(req, sizeof (xdb_request_t));
	req->xr_idx = idx;
	return (req);
}

static void
xdb_free_req(xdb_request_t *req)
{
	xdb_t *vdp = req->xr_vdp;

	ASSERT(MUTEX_HELD(&vdp->xs_iomutex));
	req->xr_next = vdp->xs_free_req;
	vdp->xs_free_req = req->xr_idx;
}

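/*
 * Complete a ring request without doing any I/O: push a response with
 * the given status back onto the ring and notify the frontend via the
 * event channel when required.
 */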
static void
xdb_response(xdb_t *vdp, blkif_request_t *req, boolean_t ok)
{
	ddi_acc_handle_t acchdl = vdp->xs_ring_hdl;

	if (xdb_push_response(vdp, ddi_get64(acchdl, &req->id),
	    ddi_get8(acchdl, &req->operation), ok))
		xvdi_notify_oe(vdp->xs_dip);
}

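/*
 * Allocate the xdb request array sized to the ring (xs_nentry), chain
 * the entries into the free list, and reserve kernel virtual address
 * space for mapping in the I/O pages granted by the frontend.
 */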
static void
xdb_init_ioreqs(xdb_t *vdp)
{
	int i;

	ASSERT(vdp->xs_nentry);

	if (vdp->xs_req == NULL)
		vdp->xs_req = kmem_alloc(vdp->xs_nentry *
		    sizeof (xdb_request_t), KM_SLEEP);
#ifdef DEBUG
	if (vdp->page_addrs == NULL)
		vdp->page_addrs = kmem_zalloc(XDB_MAX_IO_PAGES(vdp) *
		    sizeof (uint64_t), KM_SLEEP);
#endif
	for (i = 0; i < vdp->xs_nentry; i++) {
		vdp->xs_req[i].xr_idx = i;
		vdp->xs_req[i].xr_next = i + 1;
	}
	vdp->xs_req[vdp->xs_nentry - 1].xr_next = -1;
	vdp->xs_free_req = 0;

	/* alloc va in host dom for io page mapping */
	vdp->xs_iopage_va = vmem_xalloc(heap_arena,
	    XDB_MAX_IO_PAGES(vdp) * PAGESIZE, PAGESIZE, 0, 0, 0, 0,
	    VM_SLEEP);
	for (i = 0; i < XDB_MAX_IO_PAGES(vdp); i++)
		hat_prepare_mapping(kas.a_hat,
		    vdp->xs_iopage_va + i * PAGESIZE, NULL);
}

static void
xdb_uninit_ioreqs(xdb_t *vdp)
{
	int i;

	for (i = 0; i < XDB_MAX_IO_PAGES(vdp); i++)
		hat_release_mapping(kas.a_hat,
		    vdp->xs_iopage_va + i * PAGESIZE);
	vmem_xfree(heap_arena, vdp->xs_iopage_va,
	    XDB_MAX_IO_PAGES(vdp) * PAGESIZE);
	if (vdp->xs_req != NULL) {
		kmem_free(vdp->xs_req, vdp->xs_nentry * sizeof (xdb_request_t));
		vdp->xs_req = NULL;
	}
#ifdef DEBUG
	if (vdp->page_addrs != NULL) {
		kmem_free(vdp->page_addrs, XDB_MAX_IO_PAGES(vdp) *
		    sizeof (uint64_t));
		vdp->page_addrs = NULL;
	}
#endif
}

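/*
 * Interrupt handler for the event channel bound to the frontend.
 * Drains all pending requests from the shared ring, turning each valid
 * request into a buf queued for the xdb_send_buf() taskq thread;
 * invalid or unsupported requests are failed immediately.
 */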
static uint_t
xdb_intr(caddr_t arg)
{
	xdb_t		*vdp = (xdb_t *)arg;
	dev_info_t	*dip = vdp->xs_dip;
	blkif_request_t	req, *reqp = &req;
	xdb_request_t	*xreq;
	buf_t		*bp;
	uint8_t		op;
	int		ret = DDI_INTR_UNCLAIMED;

	XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE,
	    "xdb@%s: I/O request received from dom %d",
	    ddi_get_name_addr(dip), vdp->xs_peer));

	mutex_enter(&vdp->xs_iomutex);

	/* shouldn't touch ring buffer if not in connected state */
	if (!vdp->xs_if_connected) {
		mutex_exit(&vdp->xs_iomutex);
		return (DDI_INTR_UNCLAIMED);
	}
	ASSERT(vdp->xs_hp_connected && vdp->xs_fe_initialised);

	/*
	 * We'll loop till there are no more requests in the ring.
	 * We won't get stuck in this loop forever since the size of the ring
	 * buffer is limited, and the frontend will stop pushing requests into
	 * it when the ring buffer is full
	 */

	/* req_event will be increased in xvdi_ring_get_request() */
	while (xdb_get_request(vdp, reqp)) {
		ret = DDI_INTR_CLAIMED;

		op = ddi_get8(vdp->xs_ring_hdl, &reqp->operation);
		if (op == BLKIF_OP_READ			||
		    op == BLKIF_OP_WRITE		||
		    op == BLKIF_OP_WRITE_BARRIER	||
		    op == BLKIF_OP_FLUSH_DISKCACHE) {
#ifdef DEBUG
			xdb_dump_request_oe(reqp);
#endif
			xreq = xdb_get_req(vdp);
			ASSERT(xreq);
			switch (op) {
			case BLKIF_OP_READ:
				vdp->xs_stat_req_reads++;
				break;
			case BLKIF_OP_WRITE_BARRIER:
				vdp->xs_stat_req_barriers++;
				/* FALLTHRU */
			case BLKIF_OP_WRITE:
				vdp->xs_stat_req_writes++;
				break;
			case BLKIF_OP_FLUSH_DISKCACHE:
				vdp->xs_stat_req_flushes++;
				break;
			}

			xreq->xr_curseg = 0; /* start from first segment */
			bp = xdb_get_buf(vdp, reqp, xreq);
			if (bp == NULL) {
				/* failed to form a buf */
				xdb_free_req(xreq);
				xdb_response(vdp, reqp, B_FALSE);
				continue;
			}
			bp->av_forw = NULL;

			XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE,
			    " buf %p, blkno %lld, size %lu, addr %p",
			    (void *)bp, (longlong_t)bp->b_blkno,
			    (ulong_t)bp->b_bcount, (void *)bp->b_un.b_addr));

			/*
			 * queue bp; our taskq will send it to the
			 * underlying blk driver
			 */
			if (vdp->xs_f_iobuf == NULL) {
				vdp->xs_f_iobuf = vdp->xs_l_iobuf = bp;
			} else {
				vdp->xs_l_iobuf->av_forw = bp;
				vdp->xs_l_iobuf = bp;
			}
		} else {
			xdb_response(vdp, reqp, B_FALSE);
			XDB_DBPRINT(XDB_DBG_IO, (CE_WARN, "xdb@%s: "
			    "Unsupported cmd received from dom %d",
			    ddi_get_name_addr(dip), vdp->xs_peer));
		}
	}
	/* notify our taskq to push buf to underlying blk driver */
	if (ret == DDI_INTR_CLAIMED)
		cv_broadcast(&vdp->xs_iocv);

	mutex_exit(&vdp->xs_iomutex);

	return (ret);
}

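/*
 * I/O completion callback.  If more segments of this request remain,
 * form the next buf and send it back down.  Otherwise unmap the I/O
 * pages, perform any cache flush implied by the operation, and push a
 * response onto the ring for the frontend.
 */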
static int
xdb_biodone(buf_t *bp)
{
	int i, err, bioerr;
	uint8_t segs;
	gnttab_unmap_grant_ref_t unmapops[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	xdb_request_t *xreq = XDB_BP2XREQ(bp);
	xdb_t *vdp = xreq->xr_vdp;
	buf_t *nbp;

	bioerr = geterror(bp);
	if (bioerr)
		XDB_DBPRINT(XDB_DBG_IO, (CE_WARN, "xdb@%s: I/O error %d",
		    ddi_get_name_addr(vdp->xs_dip), bioerr));

	/* check if we are done w/ this I/O request */
	if ((bioerr == 0) && (xreq->xr_curseg < xreq->xr_buf_pages)) {
		nbp = xdb_get_buf(vdp, NULL, xreq);
		if (nbp) {
			err = ldi_strategy(vdp->xs_ldi_hdl, nbp);
			if (err == 0) {
				XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE,
				    "sent buf to backend ok"));
				return (DDI_SUCCESS);
			}
			bioerr = EIO;
			XDB_DBPRINT(XDB_DBG_IO, (CE_WARN, "xdb@%s: "
			    "sent buf to backend dev failed, err=%d",
			    ddi_get_name_addr(vdp->xs_dip), err));
		} else {
			bioerr = EIO;
		}
	}

	/* unmap io pages */
	segs = xreq->xr_buf_pages;
	/*
	 * segs should be no bigger than BLKIF_MAX_SEGMENTS_PER_REQUEST
	 * according to Xen's definition of the blk interface
	 */
	ASSERT(segs <= BLKIF_MAX_SEGMENTS_PER_REQUEST);
	for (i = 0; i < segs; i++) {
		unmapops[i].host_addr = (uint64_t)(uintptr_t)XDB_IOPAGE_VA(
		    vdp->xs_iopage_va, xreq->xr_idx, i);
#ifdef DEBUG
		mutex_enter(&vdp->xs_iomutex);
		unlogva(vdp, unmapops[i].host_addr);
		mutex_exit(&vdp->xs_iomutex);
#endif
		unmapops[i].dev_bus_addr = 0;
		unmapops[i].handle = xreq->xr_page_hdls[i];
	}
	err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
	    unmapops, segs);
	ASSERT(!err);

	/*
	 * If we have reached a barrier write or a cache flush, then we must
	 * flush all our I/Os.
	 */
	if (xreq->xr_op == BLKIF_OP_WRITE_BARRIER ||
	    xreq->xr_op == BLKIF_OP_FLUSH_DISKCACHE) {
		/*
		 * XXX At this point the write did succeed, so I don't
		 * believe we should report an error because the flush
		 * failed. However, this is a debatable point, so
		 * maybe we need to think more carefully about this.
		 * For now, just cast to void.
		 */
		(void) ldi_ioctl(vdp->xs_ldi_hdl,
		    DKIOCFLUSHWRITECACHE, NULL, FKIOCTL, kcred, NULL);
	}

	mutex_enter(&vdp->xs_iomutex);

	/* send response back to frontend */
	if (vdp->xs_if_connected) {
		ASSERT(vdp->xs_hp_connected && vdp->xs_fe_initialised);
		if (xdb_push_response(vdp, xreq->xr_id, xreq->xr_op, bioerr))
			xvdi_notify_oe(vdp->xs_dip);
		XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE,
		    "sent resp back to frontend, id=%llu",
		    (unsigned long long)xreq->xr_id));
	}
	/* free io resources */
	biofini(bp);
	xdb_free_req(xreq);

	vdp->xs_ionum--;
	if (!vdp->xs_if_connected && (vdp->xs_ionum == 0)) {
		/* we're closing, someone is waiting for I/O clean-up */
		cv_signal(&vdp->xs_ionumcv);
	}

	mutex_exit(&vdp->xs_iomutex);

	return (DDI_SUCCESS);
}

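/*
 * Negotiate a connection with the frontend: read the ring reference and
 * event channel from xenstore, select the ring ABI (native, x86_32 or
 * x86_64) based on the advertised protocol, then map the shared ring
 * and bind the event channel.
 */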
static int
xdb_bindto_frontend(xdb_t *vdp)
{
	int err;
	char *oename;
	grant_ref_t gref;
	evtchn_port_t evtchn;
	dev_info_t *dip = vdp->xs_dip;
	char protocol[64] = "";

	ASSERT(MUTEX_HELD(&vdp->xs_cbmutex));

	/*
	 * Switch to the XenbusStateInitialised state.  This lets the
	 * frontend know that we're about to negotiate a connection.
	 */
	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitialised);

	/*
	 * Gather info from frontend
	 */
	oename = xvdi_get_oename(dip);
	if (oename == NULL)
		return (DDI_FAILURE);

	err = xenbus_gather(XBT_NULL, oename,
	    XBP_RING_REF, "%lu", &gref,
	    XBP_EVENT_CHAN, "%u", &evtchn,
	    NULL);
	if (err != 0) {
		xvdi_dev_error(dip, err,
		    "Getting ring-ref and evtchn from frontend");
		return (DDI_FAILURE);
	}

	vdp->xs_blk_protocol = BLKIF_PROTOCOL_NATIVE;
	vdp->xs_nentry = BLKIF_RING_SIZE;
	vdp->xs_entrysize = sizeof (union blkif_sring_entry);

	err = xenbus_gather(XBT_NULL, oename,
	    XBP_PROTOCOL, "%63s", protocol, NULL);
	if (err)
		(void) strcpy(protocol, "unspecified, assuming native");
	else {
		/*
		 * We must check for NATIVE first, so that the fast path
		 * is taken for copying data from the guest to the host.
		 */
		if (strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE) != 0) {
			if (strcmp(protocol, XEN_IO_PROTO_ABI_X86_32) == 0) {
				vdp->xs_blk_protocol = BLKIF_PROTOCOL_X86_32;
				vdp->xs_nentry = BLKIF_X86_32_RING_SIZE;
				vdp->xs_entrysize =
				    sizeof (union blkif_x86_32_sring_entry);
			} else if (strcmp(protocol, XEN_IO_PROTO_ABI_X86_64) ==
			    0) {
				vdp->xs_blk_protocol = BLKIF_PROTOCOL_X86_64;
				vdp->xs_nentry = BLKIF_X86_64_RING_SIZE;
				vdp->xs_entrysize =
				    sizeof (union blkif_x86_64_sring_entry);
			} else {
				xvdi_fatal_error(dip, err, "unknown protocol");
				return (DDI_FAILURE);
			}
		}
	}
#ifdef DEBUG
	cmn_err(CE_NOTE, "!xdb@%s: blkif protocol '%s' ",
	    ddi_get_name_addr(dip), protocol);
#endif

	/*
	 * Map and init ring.  The ring parameters must match those which
	 * have been allocated in the front end.
	 */
	if (xvdi_map_ring(dip, vdp->xs_nentry, vdp->xs_entrysize,
	    gref, &vdp->xs_ring) != DDI_SUCCESS)
		return (DDI_FAILURE);

	/*
	 * This will be removed once we use shadow I/O ring requests, since
	 * then we won't need to access the ring itself directly and the
	 * access handle will not be needed
	 */
	vdp->xs_ring_hdl = vdp->xs_ring->xr_acc_hdl;

	/* bind event channel */
	err = xvdi_bind_evtchn(dip, evtchn);
	if (err != DDI_SUCCESS) {
		xvdi_unmap_ring(vdp->xs_ring);
		return (DDI_FAILURE);
	}

	return (DDI_SUCCESS);
}

static void
xdb_unbindfrom_frontend(xdb_t *vdp)
{
	ASSERT(MUTEX_HELD(&vdp->xs_cbmutex));

	xvdi_free_evtchn(vdp->xs_dip);
	xvdi_unmap_ring(vdp->xs_ring);
}

/*
 * xdb_params_change() initiates a change to the underlying device/file
 * that the backend is accessing.  It does this by disconnecting from the
 * frontend, closing the old device, clearing a bunch of xenbus parameters,
 * and switching back to the XenbusStateInitialising state.  The frontend
 * should notice this transition to the XenbusStateInitialising state and
 * should attempt to reconnect to us (the backend).
 */
static void
xdb_params_change(xdb_t *vdp, char *params, boolean_t update_xs)
{
	xenbus_transaction_t	xbt;
	dev_info_t		*dip = vdp->xs_dip;
	char			*xsname;
	int			err;

	ASSERT(MUTEX_HELD(&vdp->xs_cbmutex));
	ASSERT(vdp->xs_params_path != NULL);

	if ((xsname = xvdi_get_xsname(dip)) == NULL)
		return;
	if (strcmp(vdp->xs_params_path, params) == 0)
		return;

	/*
	 * Close the device we're currently accessing and update the
	 * path which points to our backend device/file.
	 */
	xdb_close(dip);
	vdp->xs_fe_initialised = B_FALSE;

trans_retry:
	if ((err = xenbus_transaction_start(&xbt)) != 0) {
		xvdi_dev_error(dip, err, "params change transaction init");
		goto errout;
	}

	/*
	 * Delete all the xenbus properties that are connection dependent
	 * and go back to the initializing state so that the frontend
	 * driver can re-negotiate a connection.
	 */
	if (((err = xenbus_rm(xbt, xsname, XBP_FB)) != 0) ||
	    ((err = xenbus_rm(xbt, xsname, XBP_INFO)) != 0) ||
	    ((err = xenbus_rm(xbt, xsname, "sector-size")) != 0) ||
	    ((err = xenbus_rm(xbt, xsname, XBP_SECTORS)) != 0) ||
	    ((err = xenbus_rm(xbt, xsname, "instance")) != 0) ||
	    ((err = xenbus_rm(xbt, xsname, "node")) != 0) ||
	    (update_xs && ((err = xenbus_printf(xbt, xsname,
	    "params", "%s", params)) != 0)) ||
	    ((err = xvdi_switch_state(dip,
	    xbt, XenbusStateInitialising)) > 0)) {
		(void) xenbus_transaction_end(xbt, 1);
		xvdi_dev_error(dip, err, "params change transaction setup");
		goto errout;
	}

	if ((err = xenbus_transaction_end(xbt, 0)) != 0) {
		if (err == EAGAIN) {
			/* transaction is ended, don't need to abort it */
			goto trans_retry;
		}
		xvdi_dev_error(dip, err, "params change transaction commit");
		goto errout;
	}

	/* Change the device that we plan to access */
	strfree(vdp->xs_params_path);
	vdp->xs_params_path = strdup(params);
	return;

errout:
	(void) xvdi_switch_state(dip, xbt, XenbusStateInitialising);
}

/*
 * xdb_watch_params_cb() - This callback is invoked whenever there
 * is an update to the following xenbus parameter:
 *     /local/domain/0/backend/vbd/<domU_id>/<domU_dev>/params
 *
 * This normally happens during xm block-configure operations, which
 * are used to change CD device images for HVM domUs.
 */
/*ARGSUSED*/
static void
xdb_watch_params_cb(dev_info_t *dip, const char *path, void *arg)
{
	xdb_t			*vdp = (xdb_t *)ddi_get_driver_private(dip);
	char			*xsname, *oename, *str, *str2;

	if (((xsname = xvdi_get_xsname(dip)) == NULL) ||
	    ((oename = xvdi_get_oename(dip)) == NULL)) {
		return;
	}

	mutex_enter(&vdp->xs_cbmutex);

	if (xenbus_read_str(xsname, "params", &str) != 0) {
		mutex_exit(&vdp->xs_cbmutex);
		return;
	}

	if (strcmp(vdp->xs_params_path, str) == 0) {
		/* Nothing to do */
		mutex_exit(&vdp->xs_cbmutex);
		strfree(str);
		return;
	}

	/*
	 * If the frontend isn't a cd device, doesn't support media
	 * requests, or has locked the media, then we can't change
	 * the params value.  Restore the current value.
	 */
	str2 = NULL;
	if (!XDB_IS_FE_CD(vdp) ||
	    (xenbus_read_str(oename, XBP_MEDIA_REQ, &str2) != 0) ||
	    (strcmp(str2, XBV_MEDIA_REQ_LOCK) == 0)) {
		if (str2 != NULL)
			strfree(str2);
		strfree(str);

		str = i_pathname(dip);
		cmn_err(CE_NOTE,
		    "!%s: media locked, ignoring params update", str);
		strfree(str);

		mutex_exit(&vdp->xs_cbmutex);
		return;
	}

	XDB_DBPRINT(XDB_DBG_INFO, (CE_NOTE,
	    "block-configure params request: \"%s\"", str));

	xdb_params_change(vdp, str, B_FALSE);
	mutex_exit(&vdp->xs_cbmutex);
	strfree(str);
}

/*
 * xdb_watch_media_req_cb() - This callback is invoked whenever there
 * is an update to the following xenbus parameter:
 *     /local/domain/<domU_id>/device/vbd/<domU_dev>/media-req
 *
 * Media requests are only supported on CD devices and are issued by
 * the frontend.  Currently the only supported media request operations
 * are "lock" and "eject".  A "lock" prevents the backend from changing
 * the backing device/file (via xm block-configure).  An "eject" request
 * tells the backend device that it should disconnect from the frontend
 * and close the backing device/file that is currently in use.
 */
/*ARGSUSED*/
static void
xdb_watch_media_req_cb(dev_info_t *dip, const char *path, void *arg)
{
	xdb_t			*vdp = (xdb_t *)ddi_get_driver_private(dip);
	char			*oename, *str;

	mutex_enter(&vdp->xs_cbmutex);

	if ((oename = xvdi_get_oename(dip)) == NULL) {
		mutex_exit(&vdp->xs_cbmutex);
		return;
	}

	if (xenbus_read_str(oename, XBP_MEDIA_REQ, &str) != 0) {
		mutex_exit(&vdp->xs_cbmutex);
		return;
	}

	if (!XDB_IS_FE_CD(vdp)) {
		xvdi_dev_error(dip, EINVAL,
		    "media-req only supported for cdrom devices");
		mutex_exit(&vdp->xs_cbmutex);
		return;
	}

	if (strcmp(str, XBV_MEDIA_REQ_EJECT) != 0) {
		mutex_exit(&vdp->xs_cbmutex);
		strfree(str);
		return;
	}
	strfree(str);

	XDB_DBPRINT(XDB_DBG_INFO, (CE_NOTE, "media eject request"));

	xdb_params_change(vdp, "", B_TRUE);
	(void) xenbus_printf(XBT_NULL, oename,
	    XBP_MEDIA_REQ, "%s", XBV_MEDIA_REQ_NONE);
	mutex_exit(&vdp->xs_cbmutex);
}

/*
 * If we're dealing with a cdrom device, let the frontend know that
 * we support media requests via XBP_MEDIA_REQ_SUP, and setup a watch
 * to handle those frontend media request changes, which modify the
 * following xenstore parameter:
 *	/local/domain/<domU_id>/device/vbd/<domU_dev>/media-req
 */
static boolean_t
xdb_media_req_init(xdb_t *vdp)
{
	dev_info_t		*dip = vdp->xs_dip;
	char			*xsname, *oename;

	ASSERT(MUTEX_HELD(&vdp->xs_cbmutex));

	if (((xsname = xvdi_get_xsname(dip)) == NULL) ||
	    ((oename = xvdi_get_oename(dip)) == NULL))
		return (B_FALSE);

	if (!XDB_IS_FE_CD(vdp))
		return (B_TRUE);

	if (xenbus_printf(XBT_NULL, xsname, XBP_MEDIA_REQ_SUP, "%d", 1) != 0)
		return (B_FALSE);

	if (xvdi_add_xb_watch_handler(dip, oename,
	    XBP_MEDIA_REQ, xdb_watch_media_req_cb, NULL) != DDI_SUCCESS) {
		xvdi_dev_error(dip, EAGAIN,
		    "Failed to register watch for cdrom media requests");
		return (B_FALSE);
	}

	return (B_TRUE);
}

/*
 * Get our params value.  Also, if we're using "params" then setup a
 * watch to handle xm block-configure operations which modify the
 * following xenstore parameter:
 *	/local/domain/0/backend/vbd/<domU_id>/<domU_dev>/params
 */
static boolean_t
xdb_params_init(xdb_t *vdp)
{
	dev_info_t		*dip = vdp->xs_dip;
	char			*str, *xsname;
	int			err;

	ASSERT(MUTEX_HELD(&vdp->xs_cbmutex));
	ASSERT(vdp->xs_params_path == NULL);

	if ((xsname = xvdi_get_xsname(dip)) == NULL)
		return (B_FALSE);

	err = xenbus_read_str(xsname, "params", &str);
	if (err != 0) {
		return (B_FALSE);
	}
	vdp->xs_params_path = str;

	if (xvdi_add_xb_watch_handler(dip, xsname, "params",
	    xdb_watch_params_cb, NULL) != DDI_SUCCESS) {
		strfree(vdp->xs_params_path);
		vdp->xs_params_path = NULL;
		return (B_FALSE);
	}

	return (B_TRUE);
}

#define	LOFI_CTRL_NODE	"/dev/lofictl"
#define	LOFI_DEV_NODE	"/devices/pseudo/lofi@0:"
#define	LOFI_MODE	(FREAD | FWRITE | FEXCL)

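/*
 * Work out the path of the device node to open.  Device-backed
 * instances use xs_params_path directly; file-backed ("type" == "file")
 * instances are first mapped to a lofi device, whose /devices path is
 * returned and also recorded in xenstore.
 */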
static int
xdb_setup_node(xdb_t *vdp, char *path)
{
	dev_info_t		*dip = vdp->xs_dip;
	char			*xsname, *str;
	ldi_handle_t		ldi_hdl;
	struct lofi_ioctl	*li;
	int			minor, err;

	ASSERT(MUTEX_HELD(&vdp->xs_cbmutex));

	if ((xsname = xvdi_get_xsname(dip)) == NULL)
		return (DDI_FAILURE);

	if ((err = xenbus_read_str(xsname, "type", &str)) != 0) {
		xvdi_dev_error(dip, err, "Getting type from backend device");
		return (DDI_FAILURE);
	}
	if (strcmp(str, "file") == 0)
		vdp->xs_type |= XDB_DEV_BE_LOFI;
	strfree(str);

	if (!XDB_IS_BE_LOFI(vdp)) {
		(void) strlcpy(path, vdp->xs_params_path, MAXPATHLEN);
		ASSERT(vdp->xs_lofi_path == NULL);
		return (DDI_SUCCESS);
	}

	do {
		err = ldi_open_by_name(LOFI_CTRL_NODE, LOFI_MODE, kcred,
		    &ldi_hdl, vdp->xs_ldi_li);
	} while (err == EBUSY);
	if (err != 0) {
		return (DDI_FAILURE);
	}

	li = kmem_zalloc(sizeof (*li), KM_SLEEP);
	(void) strlcpy(li->li_filename, vdp->xs_params_path,
	    sizeof (li->li_filename));
	err = ldi_ioctl(ldi_hdl, LOFI_MAP_FILE, (intptr_t)li,
	    LOFI_MODE | FKIOCTL, kcred, &minor);
	(void) ldi_close(ldi_hdl, LOFI_MODE, kcred);
	kmem_free(li, sizeof (*li));

	if (err != 0) {
		cmn_err(CE_WARN, "xdb@%s: Failed to create lofi dev for %s",
		    ddi_get_name_addr(dip), vdp->xs_params_path);
		return (DDI_FAILURE);
	}

	/*
	 * return '/devices/...' instead of '/dev/lofi/...' since the
	 * former is available immediately after calling ldi_ioctl
	 */
	(void) snprintf(path, MAXPATHLEN, LOFI_DEV_NODE "%d", minor);
	(void) xenbus_printf(XBT_NULL, xsname, "node", "%s", path);

	ASSERT(vdp->xs_lofi_path == NULL);
	vdp->xs_lofi_path = strdup(path);

	return (DDI_SUCCESS);
}

static void
xdb_teardown_node(xdb_t *vdp)
{
	dev_info_t *dip = vdp->xs_dip;
	ldi_handle_t ldi_hdl;
	struct lofi_ioctl *li;
	int err;

	ASSERT(MUTEX_HELD(&vdp->xs_cbmutex));

	if (!XDB_IS_BE_LOFI(vdp))
		return;

	vdp->xs_type &= ~XDB_DEV_BE_LOFI;
	ASSERT(vdp->xs_lofi_path != NULL);

	li = kmem_zalloc(sizeof (*li), KM_SLEEP);
	(void) strlcpy(li->li_filename, vdp->xs_params_path,
	    sizeof (li->li_filename));

	do {
		err = ldi_open_by_name(LOFI_CTRL_NODE, LOFI_MODE, kcred,
		    &ldi_hdl, vdp->xs_ldi_li);
	} while (err == EBUSY);

	if (err != 0) {
		kmem_free(li, sizeof (*li));
		return;
	}

	if (ldi_ioctl(ldi_hdl, LOFI_UNMAP_FILE, (intptr_t)li,
	    LOFI_MODE | FKIOCTL, kcred, NULL) != 0) {
		cmn_err(CE_WARN, "xdb@%s: Failed to delete lofi dev for %s",
		    ddi_get_name_addr(dip), li->li_filename);
	}

	(void) ldi_close(ldi_hdl, LOFI_MODE, kcred);
	kmem_free(li, sizeof (*li));

	strfree(vdp->xs_lofi_path);
	vdp->xs_lofi_path = NULL;
}

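/*
 * Open the backend device/file via LDI and record its geometry (sector
 * size and count) and properties (CD, removable) for later export to
 * the frontend.
 */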
static int
xdb_open_device(xdb_t *vdp)
{
	dev_info_t *dip = vdp->xs_dip;
	uint64_t devsize;
	int blksize;
	char *nodepath;
	char *xsname;
	char *str;
	int err;

	ASSERT(MUTEX_HELD(&vdp->xs_cbmutex));

	if (strlen(vdp->xs_params_path) == 0) {
		/*
		 * it's possible to have no backing device when dealing
		 * with a pv cdrom drive that has no virtual cd associated
		 * with it.
		 */
		ASSERT(XDB_IS_FE_CD(vdp));
		ASSERT(vdp->xs_sectors == 0);
		ASSERT(vdp->xs_ldi_li == NULL);
		ASSERT(vdp->xs_ldi_hdl == NULL);
		return (DDI_SUCCESS);
	}

	/*
	 * after the hotplug scripts have "connected" the device, check to see
	 * if we're using a dynamic device.  If so, replace the params path
	 * with the dynamic one.
	 */
	xsname = xvdi_get_xsname(dip);
	err = xenbus_read_str(xsname, "dynamic-device-path", &str);
	if (err == 0) {
		strfree(vdp->xs_params_path);
		vdp->xs_params_path = str;
	}

	if (ldi_ident_from_dip(dip, &vdp->xs_ldi_li) != 0)
		return (DDI_FAILURE);

	nodepath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);

	/* try to open backend device */
	if (xdb_setup_node(vdp, nodepath) != DDI_SUCCESS) {
		xvdi_dev_error(dip, ENXIO,
		    "Getting device path of backend device");
		ldi_ident_release(vdp->xs_ldi_li);
		kmem_free(nodepath, MAXPATHLEN);
		return (DDI_FAILURE);
	}

	if (ldi_open_by_name(nodepath,
	    FREAD | (XDB_IS_RO(vdp) ? 0 : FWRITE),
	    kcred, &vdp->xs_ldi_hdl, vdp->xs_ldi_li) != 0) {
		xdb_teardown_node(vdp);
		ldi_ident_release(vdp->xs_ldi_li);
		cmn_err(CE_WARN, "xdb@%s: Failed to open: %s",
		    ddi_get_name_addr(dip), nodepath);
		kmem_free(nodepath, MAXPATHLEN);
		return (DDI_FAILURE);
	}

	if (ldi_get_size(vdp->xs_ldi_hdl, &devsize) != DDI_SUCCESS) {
		(void) ldi_close(vdp->xs_ldi_hdl,
		    FREAD | (XDB_IS_RO(vdp) ? 0 : FWRITE), kcred);
		xdb_teardown_node(vdp);
		ldi_ident_release(vdp->xs_ldi_li);
		kmem_free(nodepath, MAXPATHLEN);
		return (DDI_FAILURE);
	}

	blksize = ldi_prop_get_int64(vdp->xs_ldi_hdl,
	    DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
	    "blksize", DEV_BSIZE);
	if (blksize == DEV_BSIZE)
		blksize = ldi_prop_get_int(vdp->xs_ldi_hdl,
		    LDI_DEV_T_ANY | DDI_PROP_DONTPASS |
		    DDI_PROP_NOTPROM, "device-blksize", DEV_BSIZE);

	vdp->xs_sec_size = blksize;
	vdp->xs_sectors = devsize / blksize;

	/* check if the underlying device is a CD/DVD disc */
	if (ldi_prop_get_int(vdp->xs_ldi_hdl, LDI_DEV_T_ANY | DDI_PROP_DONTPASS,
	    INQUIRY_DEVICE_TYPE, DTYPE_DIRECT) == DTYPE_RODIRECT)
		vdp->xs_type |= XDB_DEV_BE_CD;

	/* check if the underlying device is a removable disk */
	if (ldi_prop_exists(vdp->xs_ldi_hdl,
	    LDI_DEV_T_ANY | DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
	    "removable-media"))
		vdp->xs_type |= XDB_DEV_BE_RMB;

	kmem_free(nodepath, MAXPATHLEN);
	return (DDI_SUCCESS);
}

static void
xdb_close_device(xdb_t *vdp)
{
	ASSERT(MUTEX_HELD(&vdp->xs_cbmutex));

	if (strlen(vdp->xs_params_path) == 0) {
		ASSERT(XDB_IS_FE_CD(vdp));
		ASSERT(vdp->xs_sectors == 0);
		ASSERT(vdp->xs_ldi_li == NULL);
		ASSERT(vdp->xs_ldi_hdl == NULL);
		return;
	}

	(void) ldi_close(vdp->xs_ldi_hdl,
	    FREAD | (XDB_IS_RO(vdp) ? 0 : FWRITE), kcred);
	xdb_teardown_node(vdp);
	ldi_ident_release(vdp->xs_ldi_li);
	vdp->xs_type &= ~(XDB_DEV_BE_CD | XDB_DEV_BE_RMB);
	vdp->xs_sectors = 0;
	vdp->xs_ldi_li = NULL;
	vdp->xs_ldi_hdl = NULL;
}

/*
 * Kick off the connect process.
 * If xs_fe_initialised == B_TRUE and xs_hp_connected == B_TRUE,
 * then xs_if_connected will be changed to B_TRUE on success.
 */
static void
xdb_start_connect(xdb_t *vdp)
{
	xenbus_transaction_t	xbt;
	dev_info_t		*dip = vdp->xs_dip;
	boolean_t		fb_exists;
	int			err, instance = ddi_get_instance(dip);
	uint64_t		sectors;
	uint_t			dinfo, ssize;
	char			*xsname;

	ASSERT(MUTEX_HELD(&vdp->xs_cbmutex));

	if (((xsname = xvdi_get_xsname(dip)) == NULL) ||
	    ((vdp->xs_peer = xvdi_get_oeid(dip)) == (domid_t)-1))
		return;

	mutex_enter(&vdp->xs_iomutex);
	/*
	 * if the hotplug scripts haven't run or if the frontend is not
	 * initialized, then we can't try to connect.
	 */
	if (!vdp->xs_hp_connected || !vdp->xs_fe_initialised) {
		ASSERT(!vdp->xs_if_connected);
		mutex_exit(&vdp->xs_iomutex);
		return;
	}

	/* If we're already connected then there's nothing to do */
	if (vdp->xs_if_connected) {
		mutex_exit(&vdp->xs_iomutex);
		return;
	}
	mutex_exit(&vdp->xs_iomutex);

	/*
	 * Start connecting to the frontend only when the backend device is
	 * ready and the frontend has moved to XenbusStateInitialised, which
	 * means it is ready to connect.
	 */
	XDB_DBPRINT(XDB_DBG_INFO, (CE_NOTE,
	    "xdb@%s: starting connection process", ddi_get_name_addr(dip)));

	if (xdb_open_device(vdp) != DDI_SUCCESS)
		return;

	if (xdb_bindto_frontend(vdp) != DDI_SUCCESS) {
		xdb_close_device(vdp);
		return;
	}

	/* init i/o requests */
	xdb_init_ioreqs(vdp);

	if (ddi_add_intr(dip, 0, NULL, NULL, xdb_intr, (caddr_t)vdp)
	    != DDI_SUCCESS) {
		xdb_uninit_ioreqs(vdp);
		xdb_unbindfrom_frontend(vdp);
		xdb_close_device(vdp);
		return;
	}

	dinfo = 0;
	if (XDB_IS_RO(vdp))
		dinfo |= VDISK_READONLY;
	if (XDB_IS_BE_RMB(vdp))
		dinfo |= VDISK_REMOVABLE;
	if (XDB_IS_BE_CD(vdp))
		dinfo |= VDISK_CDROM;
	if (XDB_IS_FE_CD(vdp))
		dinfo |= VDISK_REMOVABLE | VDISK_CDROM;

	/*
	 * We can receive interrupts any time from now on, so mark that
	 * we're ready to take them
	 */
	mutex_enter(&vdp->xs_iomutex);
	ASSERT(vdp->xs_fe_initialised);
	vdp->xs_if_connected = B_TRUE;
	mutex_exit(&vdp->xs_iomutex);

trans_retry:
	/* write into xenstore the info needed by frontend */
	if ((err = xenbus_transaction_start(&xbt)) != 0) {
		xvdi_dev_error(dip, err, "connect transaction init");
		goto errout;
	}

	/* If feature-barrier isn't present in xenstore, add it.  */
	fb_exists = xenbus_exists(xsname, XBP_FB);

	ssize = (vdp->xs_sec_size == 0) ? DEV_BSIZE : vdp->xs_sec_size;
	sectors = vdp->xs_sectors;
	if (((!fb_exists &&
	    (err = xenbus_printf(xbt, xsname, XBP_FB, "%d", 1)))) ||
	    (err = xenbus_printf(xbt, xsname, XBP_INFO, "%u", dinfo)) ||
	    (err = xenbus_printf(xbt, xsname, XBP_SECTOR_SIZE, "%u", ssize)) ||
	    (err = xenbus_printf(xbt, xsname,
	    XBP_SECTORS, "%"PRIu64, sectors)) ||
	    (err = xenbus_printf(xbt, xsname, "instance", "%d", instance)) ||
	    ((err = xvdi_switch_state(dip, xbt, XenbusStateConnected)) > 0)) {
		(void) xenbus_transaction_end(xbt, 1);
		xvdi_dev_error(dip, err, "connect transaction setup");
		goto errout;
	}

	if ((err = xenbus_transaction_end(xbt, 0)) != 0) {
		if (err == EAGAIN) {
			/* transaction is ended, don't need to abort it */
			goto trans_retry;
		}
		xvdi_dev_error(dip, err, "connect transaction commit");
		goto errout;
	}

	return;

errout:
	xdb_close(dip);
}

/*
 * Disconnect from frontend and close backend device
 */
static void
xdb_close(dev_info_t *dip)
{
	xdb_t *vdp = (xdb_t *)ddi_get_driver_private(dip);

	ASSERT(MUTEX_HELD(&vdp->xs_cbmutex));
	mutex_enter(&vdp->xs_iomutex);

	/*
	 * if the hotplug scripts haven't run or if the frontend is not
	 * initialized, then we can't be connected, so there's no
	 * connection to close.
	 */
	if (!vdp->xs_hp_connected || !vdp->xs_fe_initialised) {
		ASSERT(!vdp->xs_if_connected);
		mutex_exit(&vdp->xs_iomutex);
		return;
	}

	/* if we're not connected, there's nothing to do */
	if (!vdp->xs_if_connected) {
		cv_broadcast(&vdp->xs_iocv);
		mutex_exit(&vdp->xs_iomutex);
		return;
	}

	XDB_DBPRINT(XDB_DBG_INFO, (CE_NOTE, "closing while connected"));

	vdp->xs_if_connected = B_FALSE;
	cv_broadcast(&vdp->xs_iocv);

	mutex_exit(&vdp->xs_iomutex);

	/* stop accepting I/O request from frontend */
	ddi_remove_intr(dip, 0, NULL);

	/* clear all on-going I/Os, if any */
	mutex_enter(&vdp->xs_iomutex);
	while (vdp->xs_ionum > 0)
		cv_wait(&vdp->xs_ionumcv, &vdp->xs_iomutex);
	mutex_exit(&vdp->xs_iomutex);

	/* clean up resources and close this interface */
	xdb_uninit_ioreqs(vdp);
	xdb_unbindfrom_frontend(vdp);
	xdb_close_device(vdp);
	vdp->xs_peer = (domid_t)-1;
}

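/*
 * Taskq thread which pulls bufs off the xs_f_iobuf list (queued by
 * xdb_intr()) and issues them to the underlying block driver via
 * ldi_strategy().  Zero-length bufs (e.g. cache flushes) are completed
 * via xdb_biodone() without issuing a transfer.  Runs until
 * xs_send_buf is cleared.
 */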
static void
xdb_send_buf(void *arg)
{
	xdb_t	*vdp = (xdb_t *)arg;
	buf_t	*bp;
	int	err;

	mutex_enter(&vdp->xs_iomutex);
	while (vdp->xs_send_buf) {
		if ((bp = vdp->xs_f_iobuf) == NULL) {
			/* wait for some io to send */
			XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE,
			    "send buf waiting for io"));
			cv_wait(&vdp->xs_iocv, &vdp->xs_iomutex);
			continue;
		}

		vdp->xs_f_iobuf = bp->av_forw;
		bp->av_forw = NULL;
		vdp->xs_ionum++;

		mutex_exit(&vdp->xs_iomutex);
		if (bp->b_bcount == 0) {
			/* no I/O needs to be done */
			(void) xdb_biodone(bp);
			mutex_enter(&vdp->xs_iomutex);
			continue;
		}

		err = EIO;
		if (vdp->xs_ldi_hdl != NULL)
			err = ldi_strategy(vdp->xs_ldi_hdl, bp);
		if (err != 0) {
			bp->b_flags |= B_ERROR;
			(void) xdb_biodone(bp);
			XDB_DBPRINT(XDB_DBG_IO, (CE_WARN,
			    "xdb@%s: sent buf to backend devfailed, err=%d",
			    ddi_get_name_addr(vdp->xs_dip), err));
		} else {
			XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE,
			    "sent buf to backend ok"));
		}
		mutex_enter(&vdp->xs_iomutex);
	}
	XDB_DBPRINT(XDB_DBG_IO, (CE_NOTE, "send buf finishing"));
	mutex_exit(&vdp->xs_iomutex);
}

/*ARGSUSED*/
static void
xdb_hp_state_change(dev_info_t *dip, ddi_eventcookie_t id, void *arg,
    void *impl_data)
{
	xendev_hotplug_state_t state = *(xendev_hotplug_state_t *)impl_data;
	xdb_t *vdp = (xdb_t *)ddi_get_driver_private(dip);

	XDB_DBPRINT(XDB_DBG_INFO, (CE_NOTE, "xdb@%s: "
	    "hotplug status change to %d!", ddi_get_name_addr(dip), state));

	if (state != Connected)
		return;

	mutex_enter(&vdp->xs_cbmutex);

	/* If the hotplug scripts have already run, there's nothing to do */
	if (vdp->xs_hp_connected) {
		mutex_exit(&vdp->xs_cbmutex);
		return;
	}

	vdp->xs_hp_connected = B_TRUE;
	xdb_start_connect(vdp);
	mutex_exit(&vdp->xs_cbmutex);
}

/*ARGSUSED*/
static void
xdb_oe_state_change(dev_info_t *dip, ddi_eventcookie_t id, void *arg,
    void *impl_data)
{
	XenbusState new_state = *(XenbusState *)impl_data;
	xdb_t *vdp = (xdb_t *)ddi_get_driver_private(dip);

	XDB_DBPRINT(XDB_DBG_INFO, (CE_NOTE, "xdb@%s: "
	    "otherend state change to %d!", ddi_get_name_addr(dip), new_state));

	mutex_enter(&vdp->xs_cbmutex);

	/*
	 * Now it'd really be nice if there was a well defined state
	 * transition model for xen frontend drivers, but unfortunately
	 * there isn't.  So we're stuck with assuming that all state
	 * transitions are possible, and we'll just have to deal with
	 * them regardless of what state we're in.
	 */
	switch (new_state) {
	case XenbusStateUnknown:
	case XenbusStateInitialising:
	case XenbusStateInitWait:
		/* tear down our connection to the frontend */
		xdb_close(dip);
		vdp->xs_fe_initialised = B_FALSE;
		break;

	case XenbusStateInitialised:
		/*
		 * If we were connected, then we need to drop the connection
		 * and re-negotiate it.
		 */
		xdb_close(dip);
		vdp->xs_fe_initialised = B_TRUE;
		xdb_start_connect(vdp);
		break;

	case XenbusStateConnected:
		/* nothing to do here other than congratulate the frontend */
		break;

	case XenbusStateClosing:
		/* monkey see monkey do */
		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosing);
		break;

	case XenbusStateClosed:
		/* tear down our connection to the frontend */
		xdb_close(dip);
		vdp->xs_fe_initialised = B_FALSE;
		(void) xvdi_switch_state(dip, XBT_NULL, new_state);
		break;
	}

	mutex_exit(&vdp->xs_cbmutex);
}

static int
xdb_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	ddi_iblock_cookie_t	ibc;
	xdb_t			*vdp;
	int			instance = ddi_get_instance(dip);
	char			*xsname, *oename;
	char			*str;

	switch (cmd) {
	case DDI_RESUME:
		return (DDI_FAILURE);
	case DDI_ATTACH:
		break;
	default:
		return (DDI_FAILURE);
	}
	/* DDI_ATTACH */

	if (((xsname = xvdi_get_xsname(dip)) == NULL) ||
	    ((oename = xvdi_get_oename(dip)) == NULL))
		return (DDI_FAILURE);

	/*
	 * Disable auto-detach.  This is necessary so that we don't get
	 * detached while we're disconnected from the front end.
	 */
	(void) ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1);

	if (ddi_get_iblock_cookie(dip, 0, &ibc) != DDI_SUCCESS)
		return (DDI_FAILURE);

	if (ddi_soft_state_zalloc(xdb_statep, instance) != DDI_SUCCESS)
		return (DDI_FAILURE);

	vdp = ddi_get_soft_state(xdb_statep, instance);
	vdp->xs_dip = dip;
	mutex_init(&vdp->xs_iomutex, NULL, MUTEX_DRIVER, (void *)ibc);
	mutex_init(&vdp->xs_cbmutex, NULL, MUTEX_DRIVER, (void *)ibc);
	cv_init(&vdp->xs_iocv, NULL, CV_DRIVER, NULL);
	cv_init(&vdp->xs_ionumcv, NULL, CV_DRIVER, NULL);
	ddi_set_driver_private(dip, vdp);

	if (!xdb_kstat_init(vdp))
		goto errout1;

	/* Check if the frontend device is supposed to be a cdrom */
	if (xenbus_read_str(oename, XBP_DEV_TYPE, &str) != 0)
		goto errout2;
	if (strcmp(str, XBV_DEV_TYPE_CD) == 0)
		vdp->xs_type |= XDB_DEV_FE_CD;
	strfree(str);

	/* Check if the frontend device is supposed to be read only */
	if (xenbus_read_str(xsname, "mode", &str) != 0)
		goto errout2;
	if ((strcmp(str, "r") == 0) || (strcmp(str, "ro") == 0))
		vdp->xs_type |= XDB_DEV_RO;
	strfree(str);

	mutex_enter(&vdp->xs_cbmutex);
	if (!xdb_media_req_init(vdp) || !xdb_params_init(vdp)) {
		xvdi_remove_xb_watch_handlers(dip);
		mutex_exit(&vdp->xs_cbmutex);
		goto errout2;
	}
	mutex_exit(&vdp->xs_cbmutex);

	vdp->xs_send_buf = B_TRUE;
	vdp->xs_iotaskq = ddi_taskq_create(dip, "xdb_iotask", 1,
	    TASKQ_DEFAULTPRI, 0);
	(void) ddi_taskq_dispatch(vdp->xs_iotaskq, xdb_send_buf, vdp,
	    DDI_SLEEP);

	/* Watch frontend and hotplug state change */
	if ((xvdi_add_event_handler(dip, XS_OE_STATE, xdb_oe_state_change,
	    NULL) != DDI_SUCCESS) ||
	    (xvdi_add_event_handler(dip, XS_HP_STATE, xdb_hp_state_change,
	    NULL) != DDI_SUCCESS))
		goto errout3;

	/*
	 * Kick-off hotplug script
	 */
	if (xvdi_post_event(dip, XEN_HP_ADD) != DDI_SUCCESS) {
		cmn_err(CE_WARN, "xdb@%s: failed to start hotplug script",
		    ddi_get_name_addr(dip));
		goto errout3;
	}

	/*
	 * Start waiting for hotplug events and otherend state events.
	 * This is mainly for debugging; the frontend will not take any
	 * action upon seeing this state.
	 */
	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitWait);

	XDB_DBPRINT(XDB_DBG_INFO, (CE_NOTE, "xdb@%s: attached!",
	    ddi_get_name_addr(dip)));
	return (DDI_SUCCESS);

errout3:
	ASSERT(!vdp->xs_hp_connected && !vdp->xs_if_connected);

	xvdi_remove_event_handler(dip, NULL);

	/* Stop the xdb_send_buf() taskq thread */
	mutex_enter(&vdp->xs_cbmutex);
	mutex_enter(&vdp->xs_iomutex);
	vdp->xs_send_buf = B_FALSE;
	cv_broadcast(&vdp->xs_iocv);
	mutex_exit(&vdp->xs_iomutex);
	mutex_exit(&vdp->xs_cbmutex);

	/* wait for all io to drain and destroy the io taskq */
	ddi_taskq_destroy(vdp->xs_iotaskq);

	/* tear down block-configure watch */
	mutex_enter(&vdp->xs_cbmutex);
	xvdi_remove_xb_watch_handlers(dip);
	mutex_exit(&vdp->xs_cbmutex);

errout2:
	/* remove kstats */
	kstat_delete(vdp->xs_kstats);

errout1:
	/* free up driver state */
	ddi_set_driver_private(dip, NULL);
	cv_destroy(&vdp->xs_iocv);
	cv_destroy(&vdp->xs_ionumcv);
	mutex_destroy(&vdp->xs_cbmutex);
	mutex_destroy(&vdp->xs_iomutex);
	ddi_soft_state_free(xdb_statep, instance);

	return (DDI_FAILURE);
}

/*ARGSUSED*/
static int
xdb_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	int instance = ddi_get_instance(dip);
	xdb_t *vdp = XDB_INST2SOFTS(instance);

	switch (cmd) {
	case DDI_SUSPEND:
		return (DDI_FAILURE);
	case DDI_DETACH:
		break;
	default:
		return (DDI_FAILURE);
	}

	/* DDI_DETACH handling */

	/* refuse to detach if we're still in use by the frontend */
	mutex_enter(&vdp->xs_iomutex);
	if (vdp->xs_if_connected) {
		mutex_exit(&vdp->xs_iomutex);
		return (DDI_FAILURE);
	}
	vdp->xs_send_buf = B_FALSE;
	cv_broadcast(&vdp->xs_iocv);
	mutex_exit(&vdp->xs_iomutex);

	xvdi_remove_event_handler(dip, NULL);
	(void) xvdi_post_event(dip, XEN_HP_REMOVE);

	ddi_taskq_destroy(vdp->xs_iotaskq);

	mutex_enter(&vdp->xs_cbmutex);
	xvdi_remove_xb_watch_handlers(dip);
	mutex_exit(&vdp->xs_cbmutex);

	cv_destroy(&vdp->xs_iocv);
	cv_destroy(&vdp->xs_ionumcv);
	mutex_destroy(&vdp->xs_cbmutex);
	mutex_destroy(&vdp->xs_iomutex);
	kstat_delete(vdp->xs_kstats);
	ddi_set_driver_private(dip, NULL);
	ddi_soft_state_free(xdb_statep, instance);

	XDB_DBPRINT(XDB_DBG_INFO, (CE_NOTE, "xdb@%s: detached!",
	    ddi_get_name_addr(dip)));
	return (DDI_SUCCESS);
}

static struct dev_ops xdb_dev_ops = {
	DEVO_REV,	/* devo_rev */
	0,		/* devo_refcnt */
	ddi_getinfo_1to1, /* devo_getinfo */
	nulldev,	/* devo_identify */
	nulldev,	/* devo_probe */
	xdb_attach,	/* devo_attach */
	xdb_detach,	/* devo_detach */
	nodev,		/* devo_reset */
	NULL,		/* devo_cb_ops */
	NULL,		/* devo_bus_ops */
	NULL,		/* power */
	ddi_quiesce_not_needed, /* quiesce */
};

/*
 * Module linkage information for the kernel.
 */
static struct modldrv modldrv = {
	&mod_driverops,			/* Type of module. */
	"vbd backend driver",		/* Name of the module */
	&xdb_dev_ops			/* driver ops */
};

static struct modlinkage xdb_modlinkage = {
	MODREV_1,
	&modldrv,
	NULL
};

int
_init(void)
{
	int rv;

	if ((rv = ddi_soft_state_init((void **)&xdb_statep,
	    sizeof (xdb_t), 0)) == 0)
		if ((rv = mod_install(&xdb_modlinkage)) != 0)
			ddi_soft_state_fini((void **)&xdb_statep);
	return (rv);
}

int
_fini(void)
{
	int rv;

	if ((rv = mod_remove(&xdb_modlinkage)) != 0)
		return (rv);
	ddi_soft_state_fini((void **)&xdb_statep);
	return (rv);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&xdb_modlinkage, modinfop));
}

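/*
 * Copy the next request off the shared ring into a local
 * blkif_request_t, translating from the x86_32 or x86_64 ring ABI if
 * necessary.  Returns 0 when the ring is empty.
 */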
static int
xdb_get_request(xdb_t *vdp, blkif_request_t *req)
{
	void *src = xvdi_ring_get_request(vdp->xs_ring);

	if (src == NULL)
		return (0);

	switch (vdp->xs_blk_protocol) {
	case BLKIF_PROTOCOL_NATIVE:
		(void) memcpy(req, src, sizeof (*req));
		break;
	case BLKIF_PROTOCOL_X86_32:
		blkif_get_x86_32_req(req, src);
		break;
	case BLKIF_PROTOCOL_X86_64:
		blkif_get_x86_64_req(req, src);
		break;
	default:
		cmn_err(CE_PANIC, "xdb@%s: unrecognised protocol: %d",
		    ddi_get_name_addr(vdp->xs_dip),
		    vdp->xs_blk_protocol);
	}
	return (1);
}

static int
xdb_push_response(xdb_t *vdp, uint64_t id, uint8_t op, uint16_t status)
{
	ddi_acc_handle_t acchdl = vdp->xs_ring_hdl;
	blkif_response_t *rsp = xvdi_ring_get_response(vdp->xs_ring);
	blkif_x86_32_response_t *rsp_32 = (blkif_x86_32_response_t *)rsp;
	blkif_x86_64_response_t *rsp_64 = (blkif_x86_64_response_t *)rsp;

	ASSERT(rsp);

	switch (vdp->xs_blk_protocol) {
	case BLKIF_PROTOCOL_NATIVE:
		ddi_put64(acchdl, &rsp->id, id);
		ddi_put8(acchdl, &rsp->operation, op);
		ddi_put16(acchdl, (uint16_t *)&rsp->status,
		    status == 0 ? BLKIF_RSP_OKAY : BLKIF_RSP_ERROR);
		break;
	case BLKIF_PROTOCOL_X86_32:
		ddi_put64(acchdl, &rsp_32->id, id);
		ddi_put8(acchdl, &rsp_32->operation, op);
		ddi_put16(acchdl, (uint16_t *)&rsp_32->status,
		    status == 0 ? BLKIF_RSP_OKAY : BLKIF_RSP_ERROR);
		break;
	case BLKIF_PROTOCOL_X86_64:
		ddi_put64(acchdl, &rsp_64->id, id);
		ddi_put8(acchdl, &rsp_64->operation, op);
		ddi_put16(acchdl, (uint16_t *)&rsp_64->status,
		    status == 0 ? BLKIF_RSP_OKAY : BLKIF_RSP_ERROR);
		break;
	default:
		cmn_err(CE_PANIC, "xdb@%s: unrecognised protocol: %d",
		    ddi_get_name_addr(vdp->xs_dip),
		    vdp->xs_blk_protocol);
	}

	return (xvdi_ring_push_response(vdp->xs_ring));
}

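/*
 * ABI translation helpers: convert requests in the x86_32/x86_64 ring
 * layouts into the native blkif_request_t layout, copying at most
 * BLKIF_MAX_SEGMENTS_PER_REQUEST segments.
 */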
static void
blkif_get_x86_32_req(blkif_request_t *dst, blkif_x86_32_request_t *src)
{
	int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
	dst->operation = src->operation;
	dst->nr_segments = src->nr_segments;
	dst->handle = src->handle;
	dst->id = src->id;
	dst->sector_number = src->sector_number;
	if (n > src->nr_segments)
		n = src->nr_segments;
	for (i = 0; i < n; i++)
		dst->seg[i] = src->seg[i];
}

static void
blkif_get_x86_64_req(blkif_request_t *dst, blkif_x86_64_request_t *src)
{
	int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
	dst->operation = src->operation;
	dst->nr_segments = src->nr_segments;
	dst->handle = src->handle;
	dst->id = src->id;
	dst->sector_number = src->sector_number;
	if (n > src->nr_segments)
		n = src->nr_segments;
	for (i = 0; i < n; i++)
		dst->seg[i] = src->seg[i];
}