/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 *
 * Copyright (c) 2004 Christian Limpach.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. This section intentionally left blank.
 * 4. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * Section 3 of the above license was updated in response to bug 6379571.
 */

/*
 * xnf.c - GLDv3 network driver for domU.
 */

/*
 * This driver uses four per-instance locks:
 *
 * xnf_gref_lock:
 *
 *    Protects access to the grant reference list stored in
 *    xnf_gref_head. Grant references should be acquired and released
 *    using gref_get() and gref_put() respectively.
 *
 * xnf_schedlock:
 *
 *    Protects:
 *    xnf_need_sched - used to record that a previous transmit attempt
 *       failed (and consequently it will be necessary to call
 *       mac_tx_update() when transmit resources are available).
 *    xnf_pending_multicast - the number of multicast requests that
 *       have been submitted to the backend for which we have not
 *       processed responses.
 *
 * xnf_txlock:
 *
 *    Protects the transmit ring (xnf_tx_ring) and associated
 *    structures (notably xnf_tx_pkt_id and xnf_tx_pkt_id_head).
 *
 * xnf_rxlock:
 *
 *    Protects the receive ring (xnf_rx_ring) and associated
 *    structures (notably xnf_rx_pkt_info).
 *
 * If driver-global state that affects both the transmit and receive
 * rings is manipulated, both xnf_txlock and xnf_rxlock should be
 * held, in that order.
 *
 * xnf_schedlock is acquired both whilst holding xnf_txlock and
 * without. It should always be acquired after xnf_txlock if both are
 * held.
 *
 * Notes:
 * - atomic_add_64() is used to manipulate counters where we require
 *   accuracy. For counters intended only for observation by humans,
 *   post increment/decrement are used instead.
 */

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/strsun.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/devops.h>
#include <sys/sunddi.h>
#include <sys/sunndi.h>
#include <sys/dlpi.h>
#include <sys/ethernet.h>
#include <sys/strsun.h>
#include <sys/pattr.h>
#include <inet/ip.h>
#include <inet/ip_impl.h>
#include <sys/gld.h>
#include <sys/modctl.h>
#include <sys/mac_provider.h>
#include <sys/mac_ether.h>
#include <sys/bootinfo.h>
#include <sys/mach_mmu.h>
#ifdef	XPV_HVM_DRIVER
#include <sys/xpv_support.h>
#include <sys/hypervisor.h>
#else
#include <sys/hypervisor.h>
#include <sys/evtchn_impl.h>
#include <sys/balloon_impl.h>
#endif
#include <xen/public/io/netif.h>
#include <sys/gnttab.h>
#include <xen/sys/xendev.h>
#include <sys/sdt.h>
#include <sys/note.h>
#include <sys/debug.h>

#include <io/xnf.h>

#if defined(DEBUG) || defined(__lint)
#define	XNF_DEBUG
#endif

#ifdef XNF_DEBUG
int xnf_debug = 0;
xnf_t *xnf_debug_instance = NULL;
#endif

/*
 * On a 32 bit PAE system physical and machine addresses are larger
 * than 32 bits.  ddi_btop() on such systems take an unsigned long
 * argument, and so addresses above 4G are truncated before ddi_btop()
 * gets to see them.  To avoid this, code the shift operation here.
 */
#define	xnf_btop(addr)	((addr) >> PAGESHIFT)

unsigned int	xnf_max_tx_frags = 1;

/*
 * Should we use the multicast control feature if the backend provides
 * it?
 */
boolean_t xnf_multicast_control = B_TRUE;

/*
 * Received packets below this size are copied to a new streams buffer
 * rather than being desballoc'ed.
 *
 * This value is chosen to accommodate traffic where there are a large
 * number of small packets. For data showing a typical distribution,
 * see:
 *
 * Sinha07a:
 *	Rishi Sinha, Christos Papadopoulos, and John
 *	Heidemann. Internet Packet Size Distributions: Some
 *	Observations. Technical Report ISI-TR-2007-643,
 *	USC/Information Sciences Institute, May, 2007. Orignally
 *	released October 2005 as web page
 *	http://netweb.usc.edu/~sinha/pkt-sizes/.
 *	<http://www.isi.edu/~johnh/PAPERS/Sinha07a.html>.
 */
size_t xnf_rx_copy_limit = 64;

#define	INVALID_GRANT_HANDLE	((grant_handle_t)-1)
#define	INVALID_GRANT_REF	((grant_ref_t)-1)
#define	INVALID_TX_ID		((uint16_t)-1)

#define	TX_ID_TO_TXID(p, id) (&((p)->xnf_tx_pkt_id[(id)]))
#define	TX_ID_VALID(i) (((i) != INVALID_TX_ID) && ((i) < NET_TX_RING_SIZE))

/* Required system entry points */
static int	xnf_attach(dev_info_t *, ddi_attach_cmd_t);
static int	xnf_detach(dev_info_t *, ddi_detach_cmd_t);

/* Required driver entry points for Nemo */
static int	xnf_start(void *);
static void	xnf_stop(void *);
static int	xnf_set_mac_addr(void *, const uint8_t *);
static int	xnf_set_multicast(void *, boolean_t, const uint8_t *);
static int	xnf_set_promiscuous(void *, boolean_t);
static mblk_t	*xnf_send(void *, mblk_t *);
static uint_t	xnf_intr(caddr_t);
static int	xnf_stat(void *, uint_t, uint64_t *);
static boolean_t xnf_getcapab(void *, mac_capab_t, void *);

/* Driver private functions */
static int xnf_alloc_dma_resources(xnf_t *);
static void xnf_release_dma_resources(xnf_t *);
static void xnf_release_mblks(xnf_t *);

static int xnf_buf_constructor(void *, void *, int);
static void xnf_buf_destructor(void *, void *);
static xnf_buf_t *xnf_buf_get(xnf_t *, int, boolean_t);
#pragma inline(xnf_buf_get)
static void xnf_buf_put(xnf_t *, xnf_buf_t *, boolean_t);
#pragma inline(xnf_buf_put)
static void xnf_buf_refresh(xnf_buf_t *);
#pragma inline(xnf_buf_refresh)
static void xnf_buf_recycle(xnf_buf_t *);

static int xnf_tx_buf_constructor(void *, void *, int);
static void xnf_tx_buf_destructor(void *, void *);

static grant_ref_t gref_get(xnf_t *);
#pragma inline(gref_get)
static void gref_put(xnf_t *, grant_ref_t);
#pragma inline(gref_put)

static xnf_txid_t *txid_get(xnf_t *);
#pragma inline(txid_get)
static void txid_put(xnf_t *, xnf_txid_t *);
#pragma inline(txid_put)

void xnf_send_driver_status(int, int);
static void xnf_rxbuf_hang(xnf_t *, xnf_buf_t *);
static int xnf_tx_clean_ring(xnf_t  *);
static void oe_state_change(dev_info_t *, ddi_eventcookie_t,
    void *, void *);
static boolean_t xnf_kstat_init(xnf_t *);
static void xnf_rx_collect(xnf_t *);

static mac_callbacks_t xnf_callbacks = {
	MC_GETCAPAB,
	xnf_stat,
	xnf_start,
	xnf_stop,
	xnf_set_promiscuous,
	xnf_set_multicast,
	xnf_set_mac_addr,
	xnf_send,
	NULL,
	xnf_getcapab
};

/* DMA attributes for network ring buffer */
static ddi_dma_attr_t ringbuf_dma_attr = {
	DMA_ATTR_V0,		/* version of this structure */
	0,			/* lowest usable address */
	0xffffffffffffffffULL,	/* highest usable address */
	0x7fffffff,		/* maximum DMAable byte count */
	MMU_PAGESIZE,		/* alignment in bytes */
	0x7ff,			/* bitmap of burst sizes */
	1,			/* minimum transfer */
	0xffffffffU,		/* maximum transfer */
	0xffffffffffffffffULL,	/* maximum segment length */
	1,			/* maximum number of segments */
	1,			/* granularity */
	0,			/* flags (reserved) */
};

/* DMA attributes for transmit and receive data */
static ddi_dma_attr_t buf_dma_attr = {
	DMA_ATTR_V0,		/* version of this structure */
	0,			/* lowest usable address */
	0xffffffffffffffffULL,	/* highest usable address */
	0x7fffffff,		/* maximum DMAable byte count */
	MMU_PAGESIZE,		/* alignment in bytes */
	0x7ff,			/* bitmap of burst sizes */
	1,			/* minimum transfer */
	0xffffffffU,		/* maximum transfer */
	0xffffffffffffffffULL,	/* maximum segment length */
	1,			/* maximum number of segments */
	1,			/* granularity */
	0,			/* flags (reserved) */
};

/* DMA access attributes for registers and descriptors */
static ddi_device_acc_attr_t accattr = {
	DDI_DEVICE_ATTR_V0,
	DDI_STRUCTURE_LE_ACC,	/* This is a little-endian device */
	DDI_STRICTORDER_ACC
};

/* DMA access attributes for data: NOT to be byte swapped. */
static ddi_device_acc_attr_t data_accattr = {
	DDI_DEVICE_ATTR_V0,
	DDI_NEVERSWAP_ACC,
	DDI_STRICTORDER_ACC
};

DDI_DEFINE_STREAM_OPS(xnf_dev_ops, nulldev, nulldev, xnf_attach, xnf_detach,
    nodev, NULL, D_MP, NULL, ddi_quiesce_not_supported);

static struct modldrv xnf_modldrv = {
	&mod_driverops,
	"Virtual Ethernet driver",
	&xnf_dev_ops
};

static struct modlinkage modlinkage = {
	MODREV_1, &xnf_modldrv, NULL
};

int
_init(void)
{
	int r;

	mac_init_ops(&xnf_dev_ops, "xnf");
	r = mod_install(&modlinkage);
	if (r != DDI_SUCCESS)
		mac_fini_ops(&xnf_dev_ops);

	return (r);
}

int
_fini(void)
{
	return (EBUSY); /* XXPV should be removable */
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

/*
 * Acquire a grant reference.
 */
static grant_ref_t
gref_get(xnf_t *xnfp)
{
	grant_ref_t gref;

	mutex_enter(&xnfp->xnf_gref_lock);

	do {
		gref = gnttab_claim_grant_reference(&xnfp->xnf_gref_head);

	} while ((gref == INVALID_GRANT_REF) &&
	    (gnttab_alloc_grant_references(16, &xnfp->xnf_gref_head) == 0));

	mutex_exit(&xnfp->xnf_gref_lock);

	if (gref == INVALID_GRANT_REF) {
		xnfp->xnf_stat_gref_failure++;
	} else {
		atomic_add_64(&xnfp->xnf_stat_gref_outstanding, 1);
		if (xnfp->xnf_stat_gref_outstanding > xnfp->xnf_stat_gref_peak)
			xnfp->xnf_stat_gref_peak =
			    xnfp->xnf_stat_gref_outstanding;
	}

	return (gref);
}

/*
 * Release a grant reference.
 */
static void
gref_put(xnf_t *xnfp, grant_ref_t gref)
{
	ASSERT(gref != INVALID_GRANT_REF);

	mutex_enter(&xnfp->xnf_gref_lock);
	gnttab_release_grant_reference(&xnfp->xnf_gref_head, gref);
	mutex_exit(&xnfp->xnf_gref_lock);

	atomic_add_64(&xnfp->xnf_stat_gref_outstanding, -1);
}

/*
 * Acquire a transmit id.
 */
static xnf_txid_t *
txid_get(xnf_t *xnfp)
{
	xnf_txid_t *tidp;

	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));

	if (xnfp->xnf_tx_pkt_id_head == INVALID_TX_ID)
		return (NULL);

	ASSERT(TX_ID_VALID(xnfp->xnf_tx_pkt_id_head));

	tidp = TX_ID_TO_TXID(xnfp, xnfp->xnf_tx_pkt_id_head);
	xnfp->xnf_tx_pkt_id_head = tidp->next;
	tidp->next = INVALID_TX_ID;

	ASSERT(tidp->txbuf == NULL);

	return (tidp);
}

/*
 * Release a transmit id.
 */
static void
txid_put(xnf_t *xnfp, xnf_txid_t *tidp)
{
	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
	ASSERT(TX_ID_VALID(tidp->id));
	ASSERT(tidp->next == INVALID_TX_ID);

	tidp->txbuf = NULL;
	tidp->next = xnfp->xnf_tx_pkt_id_head;
	xnfp->xnf_tx_pkt_id_head = tidp->id;
}

/*
 * Get `wanted' slots in the transmit ring, waiting for at least that
 * number if `wait' is B_TRUE. Force the ring to be cleaned by setting
 * `wanted' to zero.
 *
 * Return the number of slots available.
 */
static int
tx_slots_get(xnf_t *xnfp, int wanted, boolean_t wait)
{
	int slotsfree;
	boolean_t forced_clean = (wanted == 0);

	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));

	/* LINTED: constant in conditional context */
	while (B_TRUE) {
		slotsfree = RING_FREE_REQUESTS(&xnfp->xnf_tx_ring);

		if ((slotsfree < wanted) || forced_clean)
			slotsfree = xnf_tx_clean_ring(xnfp);

		/*
		 * If there are more than we need free, tell other
		 * people to come looking again. We hold txlock, so we
		 * are able to take our slots before anyone else runs.
		 */
		if (slotsfree > wanted)
			cv_broadcast(&xnfp->xnf_cv_tx_slots);

		if (slotsfree >= wanted)
			break;

		if (!wait)
			break;

		cv_wait(&xnfp->xnf_cv_tx_slots, &xnfp->xnf_txlock);
	}

	ASSERT(slotsfree <= RING_SIZE(&(xnfp->xnf_tx_ring)));

	return (slotsfree);
}

static int
xnf_setup_rings(xnf_t *xnfp)
{
	domid_t			oeid;
	struct xenbus_device	*xsd;
	RING_IDX		i;
	int			err;
	xnf_txid_t		*tidp;
	xnf_buf_t **bdescp;

	oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
	xsd = xvdi_get_xsd(xnfp->xnf_devinfo);

	if (xnfp->xnf_tx_ring_ref != INVALID_GRANT_REF)
		gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0);

	err = gnttab_grant_foreign_access(oeid,
	    xnf_btop(pa_to_ma(xnfp->xnf_tx_ring_phys_addr)), 0);
	if (err <= 0) {
		err = -err;
		xenbus_dev_error(xsd, err, "granting access to tx ring page");
		goto out;
	}
	xnfp->xnf_tx_ring_ref = (grant_ref_t)err;

	if (xnfp->xnf_rx_ring_ref != INVALID_GRANT_REF)
		gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0);

	err = gnttab_grant_foreign_access(oeid,
	    xnf_btop(pa_to_ma(xnfp->xnf_rx_ring_phys_addr)), 0);
	if (err <= 0) {
		err = -err;
		xenbus_dev_error(xsd, err, "granting access to rx ring page");
		goto out;
	}
	xnfp->xnf_rx_ring_ref = (grant_ref_t)err;

	mutex_enter(&xnfp->xnf_txlock);

	/*
	 * Setup/cleanup the TX ring.  Note that this can lose packets
	 * after a resume, but we expect to stagger on.
	 */
	xnfp->xnf_tx_pkt_id_head = INVALID_TX_ID; /* I.e. emtpy list. */
	for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0];
	    i < NET_TX_RING_SIZE;
	    i++, tidp++) {
		xnf_txbuf_t *txp;

		tidp->id = i;

		txp = tidp->txbuf;
		if (txp == NULL) {
			tidp->next = INVALID_TX_ID; /* Appease txid_put(). */
			txid_put(xnfp, tidp);
			continue;
		}

		ASSERT(txp->tx_txreq.gref != INVALID_GRANT_REF);
		ASSERT(txp->tx_mp != NULL);

		switch (txp->tx_type) {
		case TX_DATA:
			VERIFY(gnttab_query_foreign_access(txp->tx_txreq.gref)
			    == 0);

			if (txp->tx_bdesc == NULL) {
				(void) gnttab_end_foreign_access_ref(
				    txp->tx_txreq.gref, 1);
				gref_put(xnfp, txp->tx_txreq.gref);
				(void) ddi_dma_unbind_handle(
				    txp->tx_dma_handle);
			} else {
				xnf_buf_put(xnfp, txp->tx_bdesc, B_TRUE);
			}

			freemsg(txp->tx_mp);
			txid_put(xnfp, tidp);
			kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);

			break;

		case TX_MCAST_REQ:
			txp->tx_type = TX_MCAST_RSP;
			txp->tx_status = NETIF_RSP_DROPPED;
			cv_broadcast(&xnfp->xnf_cv_multicast);

			/*
			 * The request consumed two slots in the ring,
			 * yet only a single xnf_txid_t is used. Step
			 * over the empty slot.
			 */
			i++;
			ASSERT(i < NET_TX_RING_SIZE);

			break;

		case TX_MCAST_RSP:
			break;
		}
	}

	/* LINTED: constant in conditional context */
	SHARED_RING_INIT(xnfp->xnf_tx_ring.sring);
	/* LINTED: constant in conditional context */
	FRONT_RING_INIT(&xnfp->xnf_tx_ring,
	    xnfp->xnf_tx_ring.sring, PAGESIZE);

	mutex_exit(&xnfp->xnf_txlock);

	mutex_enter(&xnfp->xnf_rxlock);

	/*
	 * Clean out any buffers currently posted to the receive ring
	 * before we reset it.
	 */
	for (i = 0, bdescp = &xnfp->xnf_rx_pkt_info[0];
	    i < NET_RX_RING_SIZE;
	    i++, bdescp++) {
		if (*bdescp != NULL) {
			xnf_buf_put(xnfp, *bdescp, B_FALSE);
			*bdescp = NULL;
		}
	}

	/* LINTED: constant in conditional context */
	SHARED_RING_INIT(xnfp->xnf_rx_ring.sring);
	/* LINTED: constant in conditional context */
	FRONT_RING_INIT(&xnfp->xnf_rx_ring,
	    xnfp->xnf_rx_ring.sring, PAGESIZE);

	/*
	 * Fill the ring with buffers.
	 */
	for (i = 0; i < NET_RX_RING_SIZE; i++) {
		xnf_buf_t *bdesc;

		bdesc = xnf_buf_get(xnfp, KM_SLEEP, B_FALSE);
		VERIFY(bdesc != NULL);
		xnf_rxbuf_hang(xnfp, bdesc);
	}

	/* LINTED: constant in conditional context */
	RING_PUSH_REQUESTS(&xnfp->xnf_rx_ring);

	mutex_exit(&xnfp->xnf_rxlock);

	return (0);

out:
	if (xnfp->xnf_tx_ring_ref != INVALID_GRANT_REF)
		gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0);
	xnfp->xnf_tx_ring_ref = INVALID_GRANT_REF;

	if (xnfp->xnf_rx_ring_ref != INVALID_GRANT_REF)
		gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0);
	xnfp->xnf_rx_ring_ref = INVALID_GRANT_REF;

	return (err);
}

/*
 * Connect driver to back end, called to set up communication with
 * back end driver both initially and on resume after restore/migrate.
 */
void
xnf_be_connect(xnf_t *xnfp)
{
	const char	*message;
	xenbus_transaction_t xbt;
	struct		xenbus_device *xsd;
	char		*xsname;
	int		err;

	ASSERT(!xnfp->xnf_connected);

	xsd = xvdi_get_xsd(xnfp->xnf_devinfo);
	xsname = xvdi_get_xsname(xnfp->xnf_devinfo);

	err = xnf_setup_rings(xnfp);
	if (err != 0) {
		cmn_err(CE_WARN, "failed to set up tx/rx rings");
		xenbus_dev_error(xsd, err, "setting up ring");
		return;
	}

again:
	err = xenbus_transaction_start(&xbt);
	if (err != 0) {
		xenbus_dev_error(xsd, EIO, "starting transaction");
		return;
	}

	err = xenbus_printf(xbt, xsname, "tx-ring-ref", "%u",
	    xnfp->xnf_tx_ring_ref);
	if (err != 0) {
		message = "writing tx ring-ref";
		goto abort_transaction;
	}

	err = xenbus_printf(xbt, xsname, "rx-ring-ref", "%u",
	    xnfp->xnf_rx_ring_ref);
	if (err != 0) {
		message = "writing rx ring-ref";
		goto abort_transaction;
	}

	err = xenbus_printf(xbt, xsname, "event-channel", "%u",
	    xnfp->xnf_evtchn);
	if (err != 0) {
		message = "writing event-channel";
		goto abort_transaction;
	}

	err = xenbus_printf(xbt, xsname, "feature-rx-notify", "%d", 1);
	if (err != 0) {
		message = "writing feature-rx-notify";
		goto abort_transaction;
	}

	err = xenbus_printf(xbt, xsname, "request-rx-copy", "%d", 1);
	if (err != 0) {
		message = "writing request-rx-copy";
		goto abort_transaction;
	}

	if (xnfp->xnf_be_mcast_control) {
		err = xenbus_printf(xbt, xsname, "request-multicast-control",
		    "%d", 1);
		if (err != 0) {
			message = "writing request-multicast-control";
			goto abort_transaction;
		}
	}

	err = xvdi_switch_state(xnfp->xnf_devinfo, xbt, XenbusStateConnected);
	if (err != 0) {
		message = "switching state to XenbusStateConnected";
		goto abort_transaction;
	}

	err = xenbus_transaction_end(xbt, 0);
	if (err != 0) {
		if (err == EAGAIN)
			goto again;
		xenbus_dev_error(xsd, err, "completing transaction");
	}

	return;

abort_transaction:
	(void) xenbus_transaction_end(xbt, 1);
	xenbus_dev_error(xsd, err, "%s", message);
}

/*
 * Read configuration information from xenstore.
 */
void
xnf_read_config(xnf_t *xnfp)
{
	int err, be_cap;
	char mac[ETHERADDRL * 3];
	char *oename = xvdi_get_oename(xnfp->xnf_devinfo);

	err = xenbus_scanf(XBT_NULL, oename, "mac",
	    "%s", (char *)&mac[0]);
	if (err != 0) {
		/*
		 * bad: we're supposed to be set up with a proper mac
		 * addr. at this point
		 */
		cmn_err(CE_WARN, "%s%d: no mac address",
		    ddi_driver_name(xnfp->xnf_devinfo),
		    ddi_get_instance(xnfp->xnf_devinfo));
			return;
	}
	if (ether_aton(mac, xnfp->xnf_mac_addr) != ETHERADDRL) {
		err = ENOENT;
		xenbus_dev_error(xvdi_get_xsd(xnfp->xnf_devinfo), ENOENT,
		    "parsing %s/mac", xvdi_get_xsname(xnfp->xnf_devinfo));
		return;
	}

	err = xenbus_scanf(XBT_NULL, oename,
	    "feature-rx-copy", "%d", &be_cap);
	/*
	 * If we fail to read the store we assume that the key is
	 * absent, implying an older domain at the far end.  Older
	 * domains cannot do HV copy.
	 */
	if (err != 0)
		be_cap = 0;
	xnfp->xnf_be_rx_copy = (be_cap != 0);

	err = xenbus_scanf(XBT_NULL, oename,
	    "feature-multicast-control", "%d", &be_cap);
	/*
	 * If we fail to read the store we assume that the key is
	 * absent, implying an older domain at the far end.  Older
	 * domains do not support multicast control.
	 */
	if (err != 0)
		be_cap = 0;
	xnfp->xnf_be_mcast_control = (be_cap != 0) && xnf_multicast_control;
}

/*
 *  attach(9E) -- Attach a device to the system
 */
static int
xnf_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd)
{
	mac_register_t *macp;
	xnf_t *xnfp;
	int err;
	char cachename[32];

#ifdef XNF_DEBUG
	if (xnf_debug & XNF_DEBUG_DDI)
		printf("xnf%d: attach(0x%p)\n", ddi_get_instance(devinfo),
		    (void *)devinfo);
#endif

	switch (cmd) {
	case DDI_RESUME:
		xnfp = ddi_get_driver_private(devinfo);
		xnfp->xnf_gen++;

		(void) xvdi_resume(devinfo);
		(void) xvdi_alloc_evtchn(devinfo);
		xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo);
#ifdef XPV_HVM_DRIVER
		ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr,
		    xnfp);
#else
		(void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr,
		    (caddr_t)xnfp);
#endif
		return (DDI_SUCCESS);

	case DDI_ATTACH:
		break;

	default:
		return (DDI_FAILURE);
	}

	/*
	 *  Allocate gld_mac_info_t and xnf_instance structures
	 */
	macp = mac_alloc(MAC_VERSION);
	if (macp == NULL)
		return (DDI_FAILURE);
	xnfp = kmem_zalloc(sizeof (*xnfp), KM_SLEEP);

	macp->m_dip = devinfo;
	macp->m_driver = xnfp;
	xnfp->xnf_devinfo = devinfo;

	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
	macp->m_src_addr = xnfp->xnf_mac_addr;
	macp->m_callbacks = &xnf_callbacks;
	macp->m_min_sdu = 0;
	macp->m_max_sdu = XNF_MAXPKT;

	xnfp->xnf_running = B_FALSE;
	xnfp->xnf_connected = B_FALSE;
	xnfp->xnf_be_rx_copy = B_FALSE;
	xnfp->xnf_be_mcast_control = B_FALSE;
	xnfp->xnf_need_sched = B_FALSE;

	xnfp->xnf_rx_head = NULL;
	xnfp->xnf_rx_tail = NULL;
	xnfp->xnf_rx_new_buffers_posted = B_FALSE;

#ifdef XPV_HVM_DRIVER
	/*
	 * Report our version to dom0.
	 */
	if (xenbus_printf(XBT_NULL, "guest/xnf", "version", "%d",
	    HVMPV_XNF_VERS))
		cmn_err(CE_WARN, "xnf: couldn't write version\n");
#endif

	/*
	 * Get the iblock cookie with which to initialize the mutexes.
	 */
	if (ddi_get_iblock_cookie(devinfo, 0, &xnfp->xnf_icookie)
	    != DDI_SUCCESS)
		goto failure;

	mutex_init(&xnfp->xnf_txlock,
	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
	mutex_init(&xnfp->xnf_rxlock,
	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
	mutex_init(&xnfp->xnf_schedlock,
	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
	mutex_init(&xnfp->xnf_gref_lock,
	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);

	cv_init(&xnfp->xnf_cv_state, NULL, CV_DEFAULT, NULL);
	cv_init(&xnfp->xnf_cv_multicast, NULL, CV_DEFAULT, NULL);
	cv_init(&xnfp->xnf_cv_tx_slots, NULL, CV_DEFAULT, NULL);

	(void) sprintf(cachename, "xnf_buf_cache_%d",
	    ddi_get_instance(devinfo));
	xnfp->xnf_buf_cache = kmem_cache_create(cachename,
	    sizeof (xnf_buf_t), 0,
	    xnf_buf_constructor, xnf_buf_destructor,
	    NULL, xnfp, NULL, 0);
	if (xnfp->xnf_buf_cache == NULL)
		goto failure_0;

	(void) sprintf(cachename, "xnf_tx_buf_cache_%d",
	    ddi_get_instance(devinfo));
	xnfp->xnf_tx_buf_cache = kmem_cache_create(cachename,
	    sizeof (xnf_txbuf_t), 0,
	    xnf_tx_buf_constructor, xnf_tx_buf_destructor,
	    NULL, xnfp, NULL, 0);
	if (xnfp->xnf_tx_buf_cache == NULL)
		goto failure_1;

	xnfp->xnf_gref_head = INVALID_GRANT_REF;

	if (xnf_alloc_dma_resources(xnfp) == DDI_FAILURE) {
		cmn_err(CE_WARN, "xnf%d: failed to allocate and initialize "
		    "driver data structures",
		    ddi_get_instance(xnfp->xnf_devinfo));
		goto failure_2;
	}

	xnfp->xnf_rx_ring.sring->rsp_event =
	    xnfp->xnf_tx_ring.sring->rsp_event = 1;

	xnfp->xnf_tx_ring_ref = INVALID_GRANT_REF;
	xnfp->xnf_rx_ring_ref = INVALID_GRANT_REF;

	/* set driver private pointer now */
	ddi_set_driver_private(devinfo, xnfp);

	if (!xnf_kstat_init(xnfp))
		goto failure_3;

	/*
	 * Allocate an event channel, add the interrupt handler and
	 * bind it to the event channel.
	 */
	(void) xvdi_alloc_evtchn(devinfo);
	xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo);
#ifdef XPV_HVM_DRIVER
	ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr, xnfp);
#else
	(void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr, (caddr_t)xnfp);
#endif

	err = mac_register(macp, &xnfp->xnf_mh);
	mac_free(macp);
	macp = NULL;
	if (err != 0)
		goto failure_4;

	if (xvdi_add_event_handler(devinfo, XS_OE_STATE, oe_state_change, NULL)
	    != DDI_SUCCESS)
		goto failure_5;

#ifdef XPV_HVM_DRIVER
	/*
	 * In the HVM case, this driver essentially replaces a driver for
	 * a 'real' PCI NIC. Without the "model" property set to
	 * "Ethernet controller", like the PCI code does, netbooting does
	 * not work correctly, as strplumb_get_netdev_path() will not find
	 * this interface.
	 */
	(void) ndi_prop_update_string(DDI_DEV_T_NONE, devinfo, "model",
	    "Ethernet controller");
#endif

#ifdef XNF_DEBUG
	if (xnf_debug_instance == NULL)
		xnf_debug_instance = xnfp;
#endif

	return (DDI_SUCCESS);

failure_5:
	(void) mac_unregister(xnfp->xnf_mh);

failure_4:
#ifdef XPV_HVM_DRIVER
	ec_unbind_evtchn(xnfp->xnf_evtchn);
	xvdi_free_evtchn(devinfo);
#else
	ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
#endif
	xnfp->xnf_evtchn = INVALID_EVTCHN;
	kstat_delete(xnfp->xnf_kstat_aux);

failure_3:
	xnf_release_dma_resources(xnfp);

failure_2:
	kmem_cache_destroy(xnfp->xnf_tx_buf_cache);

failure_1:
	kmem_cache_destroy(xnfp->xnf_buf_cache);

failure_0:
	cv_destroy(&xnfp->xnf_cv_tx_slots);
	cv_destroy(&xnfp->xnf_cv_multicast);
	cv_destroy(&xnfp->xnf_cv_state);

	mutex_destroy(&xnfp->xnf_gref_lock);
	mutex_destroy(&xnfp->xnf_schedlock);
	mutex_destroy(&xnfp->xnf_rxlock);
	mutex_destroy(&xnfp->xnf_txlock);

failure:
	kmem_free(xnfp, sizeof (*xnfp));
	if (macp != NULL)
		mac_free(macp);

	return (DDI_FAILURE);
}

/*  detach(9E) -- Detach a device from the system */
static int
xnf_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd)
{
	xnf_t *xnfp;		/* Our private device info */

#ifdef XNF_DEBUG
	if (xnf_debug & XNF_DEBUG_DDI)
		printf("xnf_detach(0x%p)\n", (void *)devinfo);
#endif

	xnfp = ddi_get_driver_private(devinfo);

	switch (cmd) {
	case DDI_SUSPEND:
#ifdef XPV_HVM_DRIVER
		ec_unbind_evtchn(xnfp->xnf_evtchn);
		xvdi_free_evtchn(devinfo);
#else
		ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
#endif

		xvdi_suspend(devinfo);

		mutex_enter(&xnfp->xnf_rxlock);
		mutex_enter(&xnfp->xnf_txlock);

		xnfp->xnf_evtchn = INVALID_EVTCHN;
		xnfp->xnf_connected = B_FALSE;
		mutex_exit(&xnfp->xnf_txlock);
		mutex_exit(&xnfp->xnf_rxlock);

		/* claim link to be down after disconnect */
		mac_link_update(xnfp->xnf_mh, LINK_STATE_DOWN);
		return (DDI_SUCCESS);

	case DDI_DETACH:
		break;

	default:
		return (DDI_FAILURE);
	}

	if (xnfp->xnf_connected)
		return (DDI_FAILURE);

	/*
	 * Cannot detach if we have xnf_buf_t outstanding.
	 */
	if (xnfp->xnf_stat_buf_allocated > 0)
		return (DDI_FAILURE);

	if (mac_unregister(xnfp->xnf_mh) != 0)
		return (DDI_FAILURE);

	kstat_delete(xnfp->xnf_kstat_aux);

	/* Stop the receiver */
	xnf_stop(xnfp);

	xvdi_remove_event_handler(devinfo, XS_OE_STATE);

	/* Remove the interrupt */
#ifdef XPV_HVM_DRIVER
	ec_unbind_evtchn(xnfp->xnf_evtchn);
	xvdi_free_evtchn(devinfo);
#else
	ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
#endif

	/* Release any pending xmit mblks */
	xnf_release_mblks(xnfp);

	/* Release all DMA resources */
	xnf_release_dma_resources(xnfp);

	cv_destroy(&xnfp->xnf_cv_tx_slots);
	cv_destroy(&xnfp->xnf_cv_multicast);
	cv_destroy(&xnfp->xnf_cv_state);

	kmem_cache_destroy(xnfp->xnf_tx_buf_cache);
	kmem_cache_destroy(xnfp->xnf_buf_cache);

	mutex_destroy(&xnfp->xnf_gref_lock);
	mutex_destroy(&xnfp->xnf_schedlock);
	mutex_destroy(&xnfp->xnf_rxlock);
	mutex_destroy(&xnfp->xnf_txlock);

	kmem_free(xnfp, sizeof (*xnfp));

	return (DDI_SUCCESS);
}

/*
 *  xnf_set_mac_addr() -- set the physical network address on the board.
 */
static int
xnf_set_mac_addr(void *arg, const uint8_t *macaddr)
{
	_NOTE(ARGUNUSED(arg, macaddr));

	/*
	 * We can't set our macaddr.
	 */
	return (ENOTSUP);
}

/*
 *  xnf_set_multicast() -- set (enable) or disable a multicast address.
 *
 *  Program the hardware to enable/disable the multicast address
 *  in "mca".  Enable if "add" is true, disable if false.
 */
static int
xnf_set_multicast(void *arg, boolean_t add, const uint8_t *mca)
{
	xnf_t *xnfp = arg;
	xnf_txbuf_t *txp;
	int n_slots;
	RING_IDX slot;
	xnf_txid_t *tidp;
	netif_tx_request_t *txrp;
	struct netif_extra_info *erp;
	boolean_t notify, result;

	/*
	 * If the backend does not support multicast control then we
	 * must assume that the right packets will just arrive.
	 */
	if (!xnfp->xnf_be_mcast_control)
		return (0);

	txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, KM_SLEEP);

	mutex_enter(&xnfp->xnf_txlock);

	/*
	 * If we're not yet connected then claim success. This is
	 * acceptable because we refresh the entire set of multicast
	 * addresses when we get connected.
	 *
	 * We can't wait around here because the MAC layer expects
	 * this to be a non-blocking operation - waiting ends up
	 * causing a deadlock during resume.
	 */
	if (!xnfp->xnf_connected) {
		mutex_exit(&xnfp->xnf_txlock);
		return (0);
	}

	/*
	 * 1. Acquire two slots in the ring.
	 * 2. Fill in the slots.
	 * 3. Request notification when the operation is done.
	 * 4. Kick the peer.
	 * 5. Wait for the response via xnf_tx_clean_ring().
	 */

	n_slots = tx_slots_get(xnfp, 2, B_TRUE);
	ASSERT(n_slots >= 2);

	slot = xnfp->xnf_tx_ring.req_prod_pvt;
	tidp = txid_get(xnfp);
	VERIFY(tidp != NULL);

	txp->tx_type = TX_MCAST_REQ;
	txp->tx_slot = slot;

	txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
	erp = (struct netif_extra_info *)
	    RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot + 1);

	txrp->gref = 0;
	txrp->size = 0;
	txrp->offset = 0;
	/* Set tx_txreq.id to appease xnf_tx_clean_ring(). */
	txrp->id = txp->tx_txreq.id = tidp->id;
	txrp->flags = NETTXF_extra_info;

	erp->type = add ? XEN_NETIF_EXTRA_TYPE_MCAST_ADD :
	    XEN_NETIF_EXTRA_TYPE_MCAST_DEL;
	bcopy((void *)mca, &erp->u.mcast.addr, ETHERADDRL);

	tidp->txbuf = txp;

	xnfp->xnf_tx_ring.req_prod_pvt = slot + 2;

	mutex_enter(&xnfp->xnf_schedlock);
	xnfp->xnf_pending_multicast++;
	mutex_exit(&xnfp->xnf_schedlock);

	/* LINTED: constant in conditional context */
	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring,
	    notify);
	if (notify)
		ec_notify_via_evtchn(xnfp->xnf_evtchn);

	while (txp->tx_type == TX_MCAST_REQ)
		cv_wait(&xnfp->xnf_cv_multicast,
		    &xnfp->xnf_txlock);

	ASSERT(txp->tx_type == TX_MCAST_RSP);

	mutex_enter(&xnfp->xnf_schedlock);
	xnfp->xnf_pending_multicast--;
	mutex_exit(&xnfp->xnf_schedlock);

	result = (txp->tx_status == NETIF_RSP_OKAY);

	txid_put(xnfp, tidp);

	mutex_exit(&xnfp->xnf_txlock);

	kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);

	return (result ? 0 : 1);
}

/*
 * xnf_set_promiscuous() -- set or reset promiscuous mode on the board
 *
 *  Program the hardware to enable/disable promiscuous mode.
 */
static int
xnf_set_promiscuous(void *arg, boolean_t on)
{
	_NOTE(ARGUNUSED(arg, on));

	/*
	 * We can't really do this, but we pretend that we can in
	 * order that snoop will work.
	 */
	return (0);
}

/*
 * Clean buffers that we have responses for from the transmit ring.
 */
static int
xnf_tx_clean_ring(xnf_t *xnfp)
{
	boolean_t work_to_do;

	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));

loop:
	while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_tx_ring)) {
		RING_IDX cons, prod, i;

		cons = xnfp->xnf_tx_ring.rsp_cons;
		prod = xnfp->xnf_tx_ring.sring->rsp_prod;
		membar_consumer();
		/*
		 * Clean tx requests from ring that we have responses
		 * for.
		 */
		DTRACE_PROBE2(xnf_tx_clean_range, int, cons, int, prod);
		for (i = cons; i != prod; i++) {
			netif_tx_response_t *trp;
			xnf_txid_t *tidp;
			xnf_txbuf_t *txp;

			trp = RING_GET_RESPONSE(&xnfp->xnf_tx_ring, i);
			ASSERT(TX_ID_VALID(trp->id));

			tidp = TX_ID_TO_TXID(xnfp, trp->id);
			ASSERT(tidp->id == trp->id);
			ASSERT(tidp->next == INVALID_TX_ID);

			txp = tidp->txbuf;
			ASSERT(txp != NULL);
			ASSERT(txp->tx_txreq.id == trp->id);

			switch (txp->tx_type) {
			case TX_DATA:
				if (gnttab_query_foreign_access(
				    txp->tx_txreq.gref) != 0)
					cmn_err(CE_PANIC,
					    "tx grant %d still in use by "
					    "backend domain",
					    txp->tx_txreq.gref);

				if (txp->tx_bdesc == NULL) {
					(void) gnttab_end_foreign_access_ref(
					    txp->tx_txreq.gref, 1);
					gref_put(xnfp, txp->tx_txreq.gref);
					(void) ddi_dma_unbind_handle(
					    txp->tx_dma_handle);
				} else {
					xnf_buf_put(xnfp, txp->tx_bdesc,
					    B_TRUE);
				}

				freemsg(txp->tx_mp);
				txid_put(xnfp, tidp);
				kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);

				break;

			case TX_MCAST_REQ:
				txp->tx_type = TX_MCAST_RSP;
				txp->tx_status = trp->status;
				cv_broadcast(&xnfp->xnf_cv_multicast);

				break;

			case TX_MCAST_RSP:
				break;

			default:
				cmn_err(CE_PANIC, "xnf_tx_clean_ring: "
				    "invalid xnf_txbuf_t type: %d",
				    txp->tx_type);
				break;
			}
		}
		/*
		 * Record the last response we dealt with so that we
		 * know where to start next time around.
		 */
		xnfp->xnf_tx_ring.rsp_cons = prod;
		membar_enter();
	}

	/* LINTED: constant in conditional context */
	RING_FINAL_CHECK_FOR_RESPONSES(&xnfp->xnf_tx_ring, work_to_do);
	if (work_to_do)
		goto loop;

	return (RING_FREE_REQUESTS(&xnfp->xnf_tx_ring));
}

/*
 * Allocate and fill in a look-aside buffer for the packet `mp'. Used
 * to ensure that the packet is physically contiguous and contained
 * within a single page.
 */
static xnf_buf_t *
xnf_tx_pullup(xnf_t *xnfp, mblk_t *mp)
{
	xnf_buf_t *bd;
	caddr_t bp;

	bd = xnf_buf_get(xnfp, KM_SLEEP, B_TRUE);
	if (bd == NULL)
		return (NULL);

	bp = bd->buf;
	while (mp != NULL) {
		size_t len = MBLKL(mp);

		bcopy(mp->b_rptr, bp, len);
		bp += len;

		mp = mp->b_cont;
	}

	ASSERT((bp - bd->buf) <= PAGESIZE);

	xnfp->xnf_stat_tx_pullup++;

	return (bd);
}

/*
 * Insert the pseudo-header checksum into the packet `buf'.
 */
void
xnf_pseudo_cksum(caddr_t buf, int length)
{
	struct ether_header *ehp;
	uint16_t sap, len, *stuff;
	uint32_t cksum;
	size_t offset;
	ipha_t *ipha;
	ipaddr_t src, dst;

	ASSERT(length >= sizeof (*ehp));
	ehp = (struct ether_header *)buf;

	if (ntohs(ehp->ether_type) == VLAN_TPID) {
		struct ether_vlan_header *evhp;

		ASSERT(length >= sizeof (*evhp));
		evhp = (struct ether_vlan_header *)buf;
		sap = ntohs(evhp->ether_type);
		offset = sizeof (*evhp);
	} else {
		sap = ntohs(ehp->ether_type);
		offset = sizeof (*ehp);
	}

	ASSERT(sap == ETHERTYPE_IP);

	/* Packet should have been pulled up by the caller. */
	if ((offset + sizeof (ipha_t)) > length) {
		cmn_err(CE_WARN, "xnf_pseudo_cksum: no room for checksum");
		return;
	}

	ipha = (ipha_t *)(buf + offset);

	ASSERT(IPH_HDR_LENGTH(ipha) == IP_SIMPLE_HDR_LENGTH);

	len = ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH;

	switch (ipha->ipha_protocol) {
	case IPPROTO_TCP:
		stuff = IPH_TCPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH);
		cksum = IP_TCP_CSUM_COMP;
		break;
	case IPPROTO_UDP:
		stuff = IPH_UDPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH);
		cksum = IP_UDP_CSUM_COMP;
		break;
	default:
		cmn_err(CE_WARN, "xnf_pseudo_cksum: unexpected protocol %d",
		    ipha->ipha_protocol);
		return;
	}

	src = ipha->ipha_src;
	dst = ipha->ipha_dst;

	cksum += (dst >> 16) + (dst & 0xFFFF);
	cksum += (src >> 16) + (src & 0xFFFF);
	cksum += htons(len);

	cksum = (cksum >> 16) + (cksum & 0xFFFF);
	cksum = (cksum >> 16) + (cksum & 0xFFFF);

	ASSERT(cksum <= 0xFFFF);

	*stuff = (uint16_t)(cksum ? cksum : ~cksum);
}

/*
 * Push a list of prepared packets (`txp') into the transmit ring.
 */
static xnf_txbuf_t *
tx_push_packets(xnf_t *xnfp, xnf_txbuf_t *txp)
{
	int slots_free;
	RING_IDX slot;
	boolean_t notify;

	mutex_enter(&xnfp->xnf_txlock);

	ASSERT(xnfp->xnf_running);

	/*
	 * Wait until we are connected to the backend.
	 */
	while (!xnfp->xnf_connected)
		cv_wait(&xnfp->xnf_cv_state, &xnfp->xnf_txlock);

	slots_free = tx_slots_get(xnfp, 1, B_FALSE);
	DTRACE_PROBE1(xnf_send_slotsfree, int, slots_free);

	slot = xnfp->xnf_tx_ring.req_prod_pvt;

	while ((txp != NULL) && (slots_free > 0)) {
		xnf_txid_t *tidp;
		netif_tx_request_t *txrp;

		tidp = txid_get(xnfp);
		VERIFY(tidp != NULL);

		txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);

		txp->tx_slot = slot;
		txp->tx_txreq.id = tidp->id;
		*txrp = txp->tx_txreq;

		tidp->txbuf = txp;

		xnfp->xnf_stat_opackets++;
		xnfp->xnf_stat_obytes += txp->tx_txreq.size;

		txp = txp->tx_next;
		slots_free--;
		slot++;

	}

	xnfp->xnf_tx_ring.req_prod_pvt = slot;

	/*
	 * Tell the peer that we sent something, if it cares.
	 */
	/* LINTED: constant in conditional context */
	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring,
	    notify);
	if (notify)
		ec_notify_via_evtchn(xnfp->xnf_evtchn);

	mutex_exit(&xnfp->xnf_txlock);

	return (txp);
}

/*
 * Send the chain of packets `mp'. Called by the MAC framework.
 */
static mblk_t *
xnf_send(void *arg, mblk_t *mp)
{
	xnf_t *xnfp = arg;
	domid_t oeid;
	xnf_txbuf_t *head, *tail;
	mblk_t *ml;
	int prepared;

	oeid = xvdi_get_oeid(xnfp->xnf_devinfo);

	/*
	 * Prepare packets for transmission.
	 */
	head = tail = NULL;
	prepared = 0;
	while (mp != NULL) {
		xnf_txbuf_t *txp;
		int n_chunks, length;
		boolean_t page_oops;
		uint32_t pflags;

		for (ml = mp, n_chunks = length = 0, page_oops = B_FALSE;
		    ml != NULL;
		    ml = ml->b_cont, n_chunks++) {

			/*
			 * Test if this buffer includes a page
			 * boundary. The test assumes that the range
			 * b_rptr...b_wptr can include only a single
			 * boundary.
			 */
			if (xnf_btop((size_t)ml->b_rptr) !=
			    xnf_btop((size_t)ml->b_wptr)) {
				xnfp->xnf_stat_tx_pagebndry++;
				page_oops = B_TRUE;
			}

			length += MBLKL(ml);
		}
		DTRACE_PROBE1(xnf_send_b_cont, int, n_chunks);

		/*
		 * Make sure packet isn't too large.
		 */
		if (length > XNF_FRAMESIZE) {
			cmn_err(CE_WARN,
			    "xnf%d: oversized packet (%d bytes) dropped",
			    ddi_get_instance(xnfp->xnf_devinfo), length);
			freemsg(mp);
			continue;
		}

		txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, KM_SLEEP);

		txp->tx_type = TX_DATA;

		if ((n_chunks > xnf_max_tx_frags) || page_oops) {
			/*
			 * Loan a side buffer rather than the mblk
			 * itself.
			 */
			txp->tx_bdesc = xnf_tx_pullup(xnfp, mp);
			if (txp->tx_bdesc == NULL) {
				kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
				break;
			}

			txp->tx_bufp = txp->tx_bdesc->buf;
			txp->tx_mfn = txp->tx_bdesc->buf_mfn;
			txp->tx_txreq.gref = txp->tx_bdesc->grant_ref;

		} else {
			int rc;
			ddi_dma_cookie_t dma_cookie;
			uint_t ncookies;

			rc = ddi_dma_addr_bind_handle(txp->tx_dma_handle,
			    NULL, (char *)mp->b_rptr, length,
			    DDI_DMA_WRITE | DDI_DMA_STREAMING,
			    DDI_DMA_DONTWAIT, 0, &dma_cookie,
			    &ncookies);
			if (rc != DDI_DMA_MAPPED) {
				ASSERT(rc != DDI_DMA_INUSE);
				ASSERT(rc != DDI_DMA_PARTIAL_MAP);

#ifdef XNF_DEBUG
				if (rc != DDI_DMA_NORESOURCES)
					cmn_err(CE_WARN,
					    "xnf%d: bind_handle failed (%x)",
					    ddi_get_instance(xnfp->xnf_devinfo),
					    rc);
#endif
				kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
				break;
			}
			ASSERT(ncookies == 1);

			txp->tx_bdesc = NULL;
			txp->tx_bufp = (caddr_t)mp->b_rptr;
			txp->tx_mfn =
			    xnf_btop(pa_to_ma(dma_cookie.dmac_laddress));
			txp->tx_txreq.gref = gref_get(xnfp);
			if (txp->tx_txreq.gref == INVALID_GRANT_REF) {
				(void) ddi_dma_unbind_handle(
				    txp->tx_dma_handle);
				kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
				break;
			}
			gnttab_grant_foreign_access_ref(txp->tx_txreq.gref,
			    oeid, txp->tx_mfn, 1);
		}

		txp->tx_next = NULL;
		txp->tx_mp = mp;
		txp->tx_txreq.size = length;
		txp->tx_txreq.offset = (uintptr_t)txp->tx_bufp & PAGEOFFSET;
		txp->tx_txreq.flags = 0;
		hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL,
		    &pflags);
		if (pflags != 0) {
			/*
			 * If the local protocol stack requests checksum
			 * offload we set the 'checksum blank' flag,
			 * indicating to the peer that we need the checksum
			 * calculated for us.
			 *
			 * We _don't_ set the validated flag, because we haven't
			 * validated that the data and the checksum match.
			 */
			xnf_pseudo_cksum(txp->tx_bufp, length);
			txp->tx_txreq.flags |= NETTXF_csum_blank;

			xnfp->xnf_stat_tx_cksum_deferred++;
		}

		if (head == NULL) {
			ASSERT(tail == NULL);

			head = txp;
		} else {
			ASSERT(tail != NULL);

			tail->tx_next = txp;
		}
		tail = txp;

		mp = mp->b_next;
		prepared++;

		/*
		 * There is no point in preparing more than
		 * NET_TX_RING_SIZE, as we won't be able to push them
		 * into the ring in one go and would hence have to
		 * un-prepare the extra.
		 */
		if (prepared == NET_TX_RING_SIZE)
			break;
	}

	DTRACE_PROBE1(xnf_send_prepared, int, prepared);

	if (mp != NULL) {
#ifdef XNF_DEBUG
		int notprepared = 0;
		mblk_t *l = mp;

		while (l != NULL) {
			notprepared++;
			l = l->b_next;
		}

		DTRACE_PROBE1(xnf_send_notprepared, int, notprepared);
#else /* !XNF_DEBUG */
		DTRACE_PROBE1(xnf_send_notprepared, int, -1);
#endif /* XNF_DEBUG */
	}

	/*
	 * Push the packets we have prepared into the ring. They may
	 * not all go.
	 */
	if (head != NULL)
		head = tx_push_packets(xnfp, head);

	/*
	 * If some packets that we prepared were not sent, unprepare
	 * them and add them back to the head of those we didn't
	 * prepare.
	 */
	{
		xnf_txbuf_t *loop;
		mblk_t *mp_head, *mp_tail;
		int unprepared = 0;

		mp_head = mp_tail = NULL;
		loop = head;

		while (loop != NULL) {
			xnf_txbuf_t *next = loop->tx_next;

			if (loop->tx_bdesc == NULL) {
				(void) gnttab_end_foreign_access_ref(
				    loop->tx_txreq.gref, 1);
				gref_put(xnfp, loop->tx_txreq.gref);
				(void) ddi_dma_unbind_handle(
				    loop->tx_dma_handle);
			} else {
				xnf_buf_put(xnfp, loop->tx_bdesc, B_TRUE);
			}

			ASSERT(loop->tx_mp != NULL);
			if (mp_head == NULL)
				mp_head = loop->tx_mp;
			mp_tail = loop->tx_mp;

			kmem_cache_free(xnfp->xnf_tx_buf_cache, loop);
			loop = next;
			unprepared++;
		}

		if (mp_tail == NULL) {
			ASSERT(mp_head == NULL);
		} else {
			ASSERT(mp_head != NULL);

			mp_tail->b_next = mp;
			mp = mp_head;
		}

		DTRACE_PROBE1(xnf_send_unprepared, int, unprepared);
	}

	/*
	 * If any mblks are left then we have deferred for some reason
	 * and need to ask for a re-schedule later. This is typically
	 * due to the ring filling.
	 */
	if (mp != NULL) {
		mutex_enter(&xnfp->xnf_schedlock);
		xnfp->xnf_need_sched = B_TRUE;
		mutex_exit(&xnfp->xnf_schedlock);

		xnfp->xnf_stat_tx_defer++;
	}

	return (mp);
}

/*
 * Notification of RX packets. Currently no TX-complete interrupt is
 * used, as we clean the TX ring lazily.
 */
static uint_t
xnf_intr(caddr_t arg)
{
	xnf_t *xnfp = (xnf_t *)arg;
	mblk_t *mp;
	boolean_t need_sched, clean_ring;

	mutex_enter(&xnfp->xnf_rxlock);

	/*
	 * Interrupts before we are connected are spurious.
	 */
	if (!xnfp->xnf_connected) {
		mutex_exit(&xnfp->xnf_rxlock);
		xnfp->xnf_stat_unclaimed_interrupts++;
		return (DDI_INTR_UNCLAIMED);
	}

	/*
	 * Receive side processing.
	 */
	do {
		/*
		 * Collect buffers from the ring.
		 */
		xnf_rx_collect(xnfp);

		/*
		 * Interrupt me when the next receive buffer is consumed.
		 */
		xnfp->xnf_rx_ring.sring->rsp_event =
		    xnfp->xnf_rx_ring.rsp_cons + 1;
		xen_mb();

	} while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring));

	if (xnfp->xnf_rx_new_buffers_posted) {
		boolean_t notify;

		/*
		 * Indicate to the peer that we have re-filled the
		 * receive ring, if it cares.
		 */
		/* LINTED: constant in conditional context */
		RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_rx_ring, notify);
		if (notify)
			ec_notify_via_evtchn(xnfp->xnf_evtchn);
		xnfp->xnf_rx_new_buffers_posted = B_FALSE;
	}

	mp = xnfp->xnf_rx_head;
	xnfp->xnf_rx_head = xnfp->xnf_rx_tail = NULL;

	xnfp->xnf_stat_interrupts++;
	mutex_exit(&xnfp->xnf_rxlock);

	if (mp != NULL)
		mac_rx(xnfp->xnf_mh, NULL, mp);

	/*
	 * Transmit side processing.
	 *
	 * If a previous transmit attempt failed or we have pending
	 * multicast requests, clean the ring.
	 *
	 * If we previously stalled transmission and cleaning produces
	 * some free slots, tell upstream to attempt sending again.
	 *
	 * The odd style is to avoid acquiring xnf_txlock unless we
	 * will actually look inside the tx machinery.
	 */
	mutex_enter(&xnfp->xnf_schedlock);
	need_sched = xnfp->xnf_need_sched;
	clean_ring = need_sched || (xnfp->xnf_pending_multicast > 0);
	mutex_exit(&xnfp->xnf_schedlock);

	if (clean_ring) {
		int free_slots;

		mutex_enter(&xnfp->xnf_txlock);
		free_slots = tx_slots_get(xnfp, 0, B_FALSE);

		if (need_sched && (free_slots > 0)) {
			mutex_enter(&xnfp->xnf_schedlock);
			xnfp->xnf_need_sched = B_FALSE;
			mutex_exit(&xnfp->xnf_schedlock);

			mac_tx_update(xnfp->xnf_mh);
		}
		mutex_exit(&xnfp->xnf_txlock);
	}

	return (DDI_INTR_CLAIMED);
}

/*
 *  xnf_start() -- start the board receiving and enable interrupts.
 */
static int
xnf_start(void *arg)
{
	xnf_t *xnfp = arg;

#ifdef XNF_DEBUG
	if (xnf_debug & XNF_DEBUG_TRACE)
		printf("xnf%d start(0x%p)\n",
		    ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp);
#endif

	mutex_enter(&xnfp->xnf_rxlock);
	mutex_enter(&xnfp->xnf_txlock);

	/* Accept packets from above. */
	xnfp->xnf_running = B_TRUE;

	mutex_exit(&xnfp->xnf_txlock);
	mutex_exit(&xnfp->xnf_rxlock);

	return (0);
}

/* xnf_stop() - disable hardware */
static void
xnf_stop(void *arg)
{
	xnf_t *xnfp = arg;

#ifdef XNF_DEBUG
	if (xnf_debug & XNF_DEBUG_TRACE)
		printf("xnf%d stop(0x%p)\n",
		    ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp);
#endif

	mutex_enter(&xnfp->xnf_rxlock);
	mutex_enter(&xnfp->xnf_txlock);

	xnfp->xnf_running = B_FALSE;

	mutex_exit(&xnfp->xnf_txlock);
	mutex_exit(&xnfp->xnf_rxlock);
}

/*
 * Hang buffer `bdesc' on the RX ring.
 */
static void
xnf_rxbuf_hang(xnf_t *xnfp, xnf_buf_t *bdesc)
{
	netif_rx_request_t *reqp;
	RING_IDX hang_ix;

	ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock));

	reqp = RING_GET_REQUEST(&xnfp->xnf_rx_ring,
	    xnfp->xnf_rx_ring.req_prod_pvt);
	hang_ix = (RING_IDX) (reqp - RING_GET_REQUEST(&xnfp->xnf_rx_ring, 0));
	ASSERT(xnfp->xnf_rx_pkt_info[hang_ix] == NULL);

	reqp->id = bdesc->id = hang_ix;
	reqp->gref = bdesc->grant_ref;

	xnfp->xnf_rx_pkt_info[hang_ix] = bdesc;
	xnfp->xnf_rx_ring.req_prod_pvt++;

	xnfp->xnf_rx_new_buffers_posted = B_TRUE;
}

/*
 * Collect packets from the RX ring, storing them in `xnfp' for later
 * use.
 */
static void
xnf_rx_collect(xnf_t *xnfp)
{
	mblk_t *head, *tail;

	ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock));

	/*
	 * Loop over unconsumed responses:
	 * 1. get a response
	 * 2. take corresponding buffer off recv. ring
	 * 3. indicate this by setting slot to NULL
	 * 4. create a new message and
	 * 5. copy data in, adjust ptr
	 */

	head = tail = NULL;

	while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring)) {
		netif_rx_response_t *rxpkt;
		xnf_buf_t *bdesc;
		ssize_t len;
		size_t off;
		mblk_t *mp = NULL;
		boolean_t hwcsum = B_FALSE;
		grant_ref_t ref;

		/* 1. */
		rxpkt = RING_GET_RESPONSE(&xnfp->xnf_rx_ring,
		    xnfp->xnf_rx_ring.rsp_cons);

		DTRACE_PROBE4(xnf_rx_got_rsp, int, (int)rxpkt->id,
		    int, (int)rxpkt->offset,
		    int, (int)rxpkt->flags,
		    int, (int)rxpkt->status);

		/*
		 * 2.
		 */
		bdesc = xnfp->xnf_rx_pkt_info[rxpkt->id];

		/*
		 * 3.
		 */
		xnfp->xnf_rx_pkt_info[rxpkt->id] = NULL;
		ASSERT(bdesc->id == rxpkt->id);

		ref = bdesc->grant_ref;
		off = rxpkt->offset;
		len = rxpkt->status;

		if (!xnfp->xnf_running) {
			DTRACE_PROBE4(xnf_rx_not_running,
			    int, rxpkt->status,
			    char *, bdesc->buf, int, rxpkt->offset,
			    char *, ((char *)bdesc->buf) + rxpkt->offset);

			xnfp->xnf_stat_drop++;

		} else if (len <= 0) {
			DTRACE_PROBE4(xnf_rx_pkt_status_negative,
			    int, rxpkt->status,
			    char *, bdesc->buf, int, rxpkt->offset,
			    char *, ((char *)bdesc->buf) + rxpkt->offset);

			xnfp->xnf_stat_errrx++;

			switch (len) {
			case 0:
				xnfp->xnf_stat_runt++;
				break;
			case NETIF_RSP_ERROR:
				xnfp->xnf_stat_mac_rcv_error++;
				break;
			case NETIF_RSP_DROPPED:
				xnfp->xnf_stat_norxbuf++;
				break;
			}

		} else if (bdesc->grant_ref == INVALID_GRANT_REF) {
			cmn_err(CE_WARN, "Bad rx grant reference %d "
			    "from domain %d", ref,
			    xvdi_get_oeid(xnfp->xnf_devinfo));

		} else if ((off + len) > PAGESIZE) {
			cmn_err(CE_WARN, "Rx packet overflows page "
			    "(offset %ld, length %ld) from domain %d",
			    off, len, xvdi_get_oeid(xnfp->xnf_devinfo));
		} else {
			xnf_buf_t *nbuf = NULL;

			DTRACE_PROBE4(xnf_rx_packet, int, len,
			    char *, bdesc->buf, int, off,
			    char *, ((char *)bdesc->buf) + off);

			ASSERT(off + len <= PAGEOFFSET);

			if (rxpkt->flags & NETRXF_data_validated)
				hwcsum = B_TRUE;

			/*
			 * If the packet is below a pre-determined
			 * size we will copy data out rather than
			 * replace it.
			 */
			if (len > xnf_rx_copy_limit)
				nbuf = xnf_buf_get(xnfp, KM_NOSLEEP, B_FALSE);

			/*
			 * If we have a replacement buffer, attempt to
			 * wrap the existing one with an mblk_t in
			 * order that the upper layers of the stack
			 * might use it directly.
			 */
			if (nbuf != NULL) {
				mp = desballoc((unsigned char *)bdesc->buf,
				    bdesc->len, 0, &bdesc->free_rtn);
				if (mp == NULL) {
					xnfp->xnf_stat_rx_desballoc_fail++;
					xnfp->xnf_stat_norxbuf++;

					xnf_buf_put(xnfp, nbuf, B_FALSE);
					nbuf = NULL;
				} else {
					mp->b_rptr = mp->b_rptr + off;
					mp->b_wptr = mp->b_rptr + len;

					/*
					 * Release the grant reference
					 * associated with this buffer
					 * - they are scarce and the
					 * upper layers of the stack
					 * don't need it.
					 */
					(void) gnttab_end_foreign_access_ref(
					    bdesc->grant_ref, 0);
					gref_put(xnfp, bdesc->grant_ref);
					bdesc->grant_ref = INVALID_GRANT_REF;

					bdesc = nbuf;
				}
			}

			if (nbuf == NULL) {
				/*
				 * No replacement buffer allocated -
				 * attempt to copy the data out and
				 * re-hang the existing buffer.
				 */

				/* 4. */
				mp = allocb(len, BPRI_MED);
				if (mp == NULL) {
					xnfp->xnf_stat_rx_allocb_fail++;
					xnfp->xnf_stat_norxbuf++;
				} else {
					/* 5. */
					bcopy(bdesc->buf + off, mp->b_wptr,
					    len);
					mp->b_wptr += len;
				}
			}
		}

		/* Re-hang the buffer. */
		xnf_rxbuf_hang(xnfp, bdesc);

		if (mp != NULL) {
			if (hwcsum) {
				/*
				 * If the peer says that the data has
				 * been validated then we declare that
				 * the full checksum has been
				 * verified.
				 *
				 * We don't look at the "checksum
				 * blank" flag, and hence could have a
				 * packet here that we are asserting
				 * is good with a blank checksum.
				 *
				 * The hardware checksum offload
				 * specification says that we must
				 * provide the actual checksum as well
				 * as an assertion that it is valid,
				 * but the protocol stack doesn't
				 * actually use it and some other
				 * drivers don't bother, so we don't.
				 * If it was necessary we could grovel
				 * in the packet to find it.
				 */
				(void) hcksum_assoc(mp, NULL,
				    NULL, 0, 0, 0, 0,
				    HCK_FULLCKSUM |
				    HCK_FULLCKSUM_OK, 0);
				xnfp->xnf_stat_rx_cksum_no_need++;
			}
			if (head == NULL) {
				ASSERT(tail == NULL);

				head = mp;
			} else {
				ASSERT(tail != NULL);

				tail->b_next = mp;
			}
			tail = mp;

			ASSERT(mp->b_next == NULL);

			xnfp->xnf_stat_ipackets++;
			xnfp->xnf_stat_rbytes += len;
		}

		xnfp->xnf_rx_ring.rsp_cons++;
	}

	/*
	 * Store the mblks we have collected.
	 */
	if (head != NULL) {
		ASSERT(tail != NULL);

		if (xnfp->xnf_rx_head == NULL) {
			ASSERT(xnfp->xnf_rx_tail == NULL);

			xnfp->xnf_rx_head = head;
		} else {
			ASSERT(xnfp->xnf_rx_tail != NULL);

			xnfp->xnf_rx_tail->b_next = head;
		}
		xnfp->xnf_rx_tail = tail;
	}
}

/*
 *  xnf_alloc_dma_resources() -- initialize the drivers structures
 */
static int
xnf_alloc_dma_resources(xnf_t *xnfp)
{
	dev_info_t 		*devinfo = xnfp->xnf_devinfo;
	size_t			len;
	ddi_dma_cookie_t	dma_cookie;
	uint_t			ncookies;
	int			rc;
	caddr_t			rptr;

	/*
	 * The code below allocates all the DMA data structures that
	 * need to be released when the driver is detached.
	 *
	 * Allocate page for the transmit descriptor ring.
	 */
	if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
	    DDI_DMA_SLEEP, 0, &xnfp->xnf_tx_ring_dma_handle) != DDI_SUCCESS)
		goto alloc_error;

	if (ddi_dma_mem_alloc(xnfp->xnf_tx_ring_dma_handle,
	    PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
	    DDI_DMA_SLEEP, 0, &rptr, &len,
	    &xnfp->xnf_tx_ring_dma_acchandle) != DDI_SUCCESS) {
		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
		xnfp->xnf_tx_ring_dma_handle = NULL;
		goto alloc_error;
	}

	if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_tx_ring_dma_handle, NULL,
	    rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
	    DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) {
		ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle);
		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
		xnfp->xnf_tx_ring_dma_handle = NULL;
		xnfp->xnf_tx_ring_dma_acchandle = NULL;
		if (rc == DDI_DMA_NORESOURCES)
			goto alloc_error;
		else
			goto error;
	}

	ASSERT(ncookies == 1);
	bzero(rptr, PAGESIZE);
	/* LINTED: constant in conditional context */
	SHARED_RING_INIT((netif_tx_sring_t *)rptr);
	/* LINTED: constant in conditional context */
	FRONT_RING_INIT(&xnfp->xnf_tx_ring, (netif_tx_sring_t *)rptr, PAGESIZE);
	xnfp->xnf_tx_ring_phys_addr = dma_cookie.dmac_laddress;

	/*
	 * Allocate page for the receive descriptor ring.
	 */
	if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
	    DDI_DMA_SLEEP, 0, &xnfp->xnf_rx_ring_dma_handle) != DDI_SUCCESS)
		goto alloc_error;

	if (ddi_dma_mem_alloc(xnfp->xnf_rx_ring_dma_handle,
	    PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
	    DDI_DMA_SLEEP, 0, &rptr, &len,
	    &xnfp->xnf_rx_ring_dma_acchandle) != DDI_SUCCESS) {
		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
		xnfp->xnf_rx_ring_dma_handle = NULL;
		goto alloc_error;
	}

	if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_rx_ring_dma_handle, NULL,
	    rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
	    DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) {
		ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle);
		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
		xnfp->xnf_rx_ring_dma_handle = NULL;
		xnfp->xnf_rx_ring_dma_acchandle = NULL;
		if (rc == DDI_DMA_NORESOURCES)
			goto alloc_error;
		else
			goto error;
	}

	ASSERT(ncookies == 1);
	bzero(rptr, PAGESIZE);
	/* LINTED: constant in conditional context */
	SHARED_RING_INIT((netif_rx_sring_t *)rptr);
	/* LINTED: constant in conditional context */
	FRONT_RING_INIT(&xnfp->xnf_rx_ring, (netif_rx_sring_t *)rptr, PAGESIZE);
	xnfp->xnf_rx_ring_phys_addr = dma_cookie.dmac_laddress;

	return (DDI_SUCCESS);

alloc_error:
	cmn_err(CE_WARN, "xnf%d: could not allocate enough DMA memory",
	    ddi_get_instance(xnfp->xnf_devinfo));
error:
	xnf_release_dma_resources(xnfp);
	return (DDI_FAILURE);
}

/*
 * Release all DMA resources in the opposite order from acquisition
 */
static void
xnf_release_dma_resources(xnf_t *xnfp)
{
	int i;

	/*
	 * Free receive buffers which are currently associated with
	 * descriptors.
	 */
	mutex_enter(&xnfp->xnf_rxlock);
	for (i = 0; i < NET_RX_RING_SIZE; i++) {
		xnf_buf_t *bp;

		if ((bp = xnfp->xnf_rx_pkt_info[i]) == NULL)
			continue;
		xnfp->xnf_rx_pkt_info[i] = NULL;
		xnf_buf_put(xnfp, bp, B_FALSE);
	}
	mutex_exit(&xnfp->xnf_rxlock);

	/* Free the receive ring buffer. */
	if (xnfp->xnf_rx_ring_dma_acchandle != NULL) {
		(void) ddi_dma_unbind_handle(xnfp->xnf_rx_ring_dma_handle);
		ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle);
		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
		xnfp->xnf_rx_ring_dma_acchandle = NULL;
	}
	/* Free the transmit ring buffer. */
	if (xnfp->xnf_tx_ring_dma_acchandle != NULL) {
		(void) ddi_dma_unbind_handle(xnfp->xnf_tx_ring_dma_handle);
		ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle);
		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
		xnfp->xnf_tx_ring_dma_acchandle = NULL;
	}

}

/*
 * Release any packets and associated structures used by the TX ring.
 */
static void
xnf_release_mblks(xnf_t *xnfp)
{
	RING_IDX i;
	xnf_txid_t *tidp;

	for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0];
	    i < NET_TX_RING_SIZE;
	    i++, tidp++) {
		xnf_txbuf_t *txp = tidp->txbuf;

		if (txp != NULL) {
			ASSERT(txp->tx_mp != NULL);
			freemsg(txp->tx_mp);

			txid_put(xnfp, tidp);
			kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
		}
	}
}

static int
xnf_buf_constructor(void *buf, void *arg, int kmflag)
{
	int (*ddiflags)(caddr_t) = DDI_DMA_SLEEP;
	xnf_buf_t *bdesc = buf;
	xnf_t *xnfp = arg;
	ddi_dma_cookie_t dma_cookie;
	uint_t ncookies;
	size_t len;

	if (kmflag & KM_NOSLEEP)
		ddiflags = DDI_DMA_DONTWAIT;

	/* Allocate a DMA access handle for the buffer. */
	if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &buf_dma_attr,
	    ddiflags, 0, &bdesc->dma_handle) != DDI_SUCCESS)
		goto failure;

	/* Allocate DMA-able memory for buffer. */
	if (ddi_dma_mem_alloc(bdesc->dma_handle,
	    PAGESIZE, &data_accattr, DDI_DMA_STREAMING, ddiflags, 0,
	    &bdesc->buf, &len, &bdesc->acc_handle) != DDI_SUCCESS)
		goto failure_1;

	/* Bind to virtual address of buffer to get physical address. */
	if (ddi_dma_addr_bind_handle(bdesc->dma_handle, NULL,
	    bdesc->buf, len, DDI_DMA_RDWR | DDI_DMA_STREAMING,
	    ddiflags, 0, &dma_cookie, &ncookies) != DDI_DMA_MAPPED)
		goto failure_2;
	ASSERT(ncookies == 1);

	bdesc->free_rtn.free_func = xnf_buf_recycle;
	bdesc->free_rtn.free_arg = (caddr_t)bdesc;
	bdesc->xnfp = xnfp;
	bdesc->buf_phys = dma_cookie.dmac_laddress;
	bdesc->buf_mfn = pfn_to_mfn(xnf_btop(bdesc->buf_phys));
	bdesc->len = dma_cookie.dmac_size;
	bdesc->grant_ref = INVALID_GRANT_REF;
	bdesc->gen = xnfp->xnf_gen;

	atomic_add_64(&xnfp->xnf_stat_buf_allocated, 1);

	return (0);

failure_2:
	ddi_dma_mem_free(&bdesc->acc_handle);

failure_1:
	ddi_dma_free_handle(&bdesc->dma_handle);

failure:

	ASSERT(kmflag & KM_NOSLEEP); /* Cannot fail for KM_SLEEP. */
	return (-1);
}

static void
xnf_buf_destructor(void *buf, void *arg)
{
	xnf_buf_t *bdesc = buf;
	xnf_t *xnfp = arg;

	(void) ddi_dma_unbind_handle(bdesc->dma_handle);
	ddi_dma_mem_free(&bdesc->acc_handle);
	ddi_dma_free_handle(&bdesc->dma_handle);

	atomic_add_64(&xnfp->xnf_stat_buf_allocated, -1);
}

static xnf_buf_t *
xnf_buf_get(xnf_t *xnfp, int flags, boolean_t readonly)
{
	grant_ref_t gref;
	xnf_buf_t *bufp;

	/*
	 * Usually grant references are more scarce than memory, so we
	 * attempt to acquire a grant reference first.
	 */
	gref = gref_get(xnfp);
	if (gref == INVALID_GRANT_REF)
		return (NULL);

	bufp = kmem_cache_alloc(xnfp->xnf_buf_cache, flags);
	if (bufp == NULL) {
		gref_put(xnfp, gref);
		return (NULL);
	}

	ASSERT(bufp->grant_ref == INVALID_GRANT_REF);

	bufp->grant_ref = gref;

	if (bufp->gen != xnfp->xnf_gen)
		xnf_buf_refresh(bufp);

	gnttab_grant_foreign_access_ref(bufp->grant_ref,
	    xvdi_get_oeid(bufp->xnfp->xnf_devinfo),
	    bufp->buf_mfn, readonly ? 1 : 0);

	atomic_add_64(&xnfp->xnf_stat_buf_outstanding, 1);

	return (bufp);
}

static void
xnf_buf_put(xnf_t *xnfp, xnf_buf_t *bufp, boolean_t readonly)
{
	if (bufp->grant_ref != INVALID_GRANT_REF) {
		(void) gnttab_end_foreign_access_ref(
		    bufp->grant_ref, readonly ? 1 : 0);
		gref_put(xnfp, bufp->grant_ref);
		bufp->grant_ref = INVALID_GRANT_REF;
	}

	kmem_cache_free(xnfp->xnf_buf_cache, bufp);

	atomic_add_64(&xnfp->xnf_stat_buf_outstanding, -1);
}

/*
 * Refresh any cached data about a buffer after resume.
 */
static void
xnf_buf_refresh(xnf_buf_t *bdesc)
{
	bdesc->buf_mfn = pfn_to_mfn(xnf_btop(bdesc->buf_phys));
	bdesc->gen = bdesc->xnfp->xnf_gen;
}

/*
 * Streams `freeb' routine for `xnf_buf_t' when used as transmit
 * look-aside buffers.
 */
static void
xnf_buf_recycle(xnf_buf_t *bdesc)
{
	xnf_t *xnfp = bdesc->xnfp;

	xnf_buf_put(xnfp, bdesc, B_TRUE);
}

static int
xnf_tx_buf_constructor(void *buf, void *arg, int kmflag)
{
	int (*ddiflags)(caddr_t) = DDI_DMA_SLEEP;
	xnf_txbuf_t *txp = buf;
	xnf_t *xnfp = arg;

	if (kmflag & KM_NOSLEEP)
		ddiflags = DDI_DMA_DONTWAIT;

	if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &buf_dma_attr,
	    ddiflags, 0, &txp->tx_dma_handle) != DDI_SUCCESS) {
		ASSERT(kmflag & KM_NOSLEEP); /* Cannot fail for KM_SLEEP. */
		return (-1);
	}

	return (0);
}

static void
xnf_tx_buf_destructor(void *buf, void *arg)
{
	_NOTE(ARGUNUSED(arg));
	xnf_txbuf_t *txp = buf;

	ddi_dma_free_handle(&txp->tx_dma_handle);
}

/*
 * Statistics.
 */
static char *xnf_aux_statistics[] = {
	"tx_cksum_deferred",
	"rx_cksum_no_need",
	"interrupts",
	"unclaimed_interrupts",
	"tx_pullup",
	"tx_pagebndry",
	"tx_attempt",
	"buf_allocated",
	"buf_outstanding",
	"gref_outstanding",
	"gref_failure",
	"gref_peak",
	"rx_allocb_fail",
	"rx_desballoc_fail",
};

static int
xnf_kstat_aux_update(kstat_t *ksp, int flag)
{
	xnf_t *xnfp;
	kstat_named_t *knp;

	if (flag != KSTAT_READ)
		return (EACCES);

	xnfp = ksp->ks_private;
	knp = ksp->ks_data;

	/*
	 * Assignment order must match that of the names in
	 * xnf_aux_statistics.
	 */
	(knp++)->value.ui64 = xnfp->xnf_stat_tx_cksum_deferred;
	(knp++)->value.ui64 = xnfp->xnf_stat_rx_cksum_no_need;

	(knp++)->value.ui64 = xnfp->xnf_stat_interrupts;
	(knp++)->value.ui64 = xnfp->xnf_stat_unclaimed_interrupts;
	(knp++)->value.ui64 = xnfp->xnf_stat_tx_pullup;
	(knp++)->value.ui64 = xnfp->xnf_stat_tx_pagebndry;
	(knp++)->value.ui64 = xnfp->xnf_stat_tx_attempt;

	(knp++)->value.ui64 = xnfp->xnf_stat_buf_allocated;
	(knp++)->value.ui64 = xnfp->xnf_stat_buf_outstanding;
	(knp++)->value.ui64 = xnfp->xnf_stat_gref_outstanding;
	(knp++)->value.ui64 = xnfp->xnf_stat_gref_failure;
	(knp++)->value.ui64 = xnfp->xnf_stat_gref_peak;
	(knp++)->value.ui64 = xnfp->xnf_stat_rx_allocb_fail;
	(knp++)->value.ui64 = xnfp->xnf_stat_rx_desballoc_fail;

	return (0);
}

static boolean_t
xnf_kstat_init(xnf_t *xnfp)
{
	int nstat = sizeof (xnf_aux_statistics) /
	    sizeof (xnf_aux_statistics[0]);
	char **cp = xnf_aux_statistics;
	kstat_named_t *knp;

	/*
	 * Create and initialise kstats.
	 */
	if ((xnfp->xnf_kstat_aux = kstat_create("xnf",
	    ddi_get_instance(xnfp->xnf_devinfo),
	    "aux_statistics", "net", KSTAT_TYPE_NAMED,
	    nstat, 0)) == NULL)
		return (B_FALSE);

	xnfp->xnf_kstat_aux->ks_private = xnfp;
	xnfp->xnf_kstat_aux->ks_update = xnf_kstat_aux_update;

	knp = xnfp->xnf_kstat_aux->ks_data;
	while (nstat > 0) {
		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);

		knp++;
		cp++;
		nstat--;
	}

	kstat_install(xnfp->xnf_kstat_aux);

	return (B_TRUE);
}

static int
xnf_stat(void *arg, uint_t stat, uint64_t *val)
{
	xnf_t *xnfp = arg;

	mutex_enter(&xnfp->xnf_rxlock);
	mutex_enter(&xnfp->xnf_txlock);

#define	mac_stat(q, r)				\
	case (MAC_STAT_##q):			\
		*val = xnfp->xnf_stat_##r;	\
		break

#define	ether_stat(q, r)			\
	case (ETHER_STAT_##q):			\
		*val = xnfp->xnf_stat_##r;	\
		break

	switch (stat) {

	mac_stat(IPACKETS, ipackets);
	mac_stat(OPACKETS, opackets);
	mac_stat(RBYTES, rbytes);
	mac_stat(OBYTES, obytes);
	mac_stat(NORCVBUF, norxbuf);
	mac_stat(IERRORS, errrx);
	mac_stat(NOXMTBUF, tx_defer);

	ether_stat(MACRCV_ERRORS, mac_rcv_error);
	ether_stat(TOOSHORT_ERRORS, runt);

	/* always claim to be in full duplex mode */
	case ETHER_STAT_LINK_DUPLEX:
		*val = LINK_DUPLEX_FULL;
		break;

	/* always claim to be at 1Gb/s link speed */
	case MAC_STAT_IFSPEED:
		*val = 1000000000ull;
		break;

	default:
		mutex_exit(&xnfp->xnf_txlock);
		mutex_exit(&xnfp->xnf_rxlock);

		return (ENOTSUP);
	}

#undef mac_stat
#undef ether_stat

	mutex_exit(&xnfp->xnf_txlock);
	mutex_exit(&xnfp->xnf_rxlock);

	return (0);
}

static boolean_t
xnf_getcapab(void *arg, mac_capab_t cap, void *cap_data)
{
	_NOTE(ARGUNUSED(arg));

	switch (cap) {
	case MAC_CAPAB_HCKSUM: {
		uint32_t *capab = cap_data;

		/*
		 * Whilst the flag used to communicate with the IO
		 * domain is called "NETTXF_csum_blank", the checksum
		 * in the packet must contain the pseudo-header
		 * checksum and not zero.
		 *
		 * To help out the IO domain, we might use
		 * HCKSUM_INET_PARTIAL. Unfortunately our stack will
		 * then use checksum offload for IPv6 packets, which
		 * the IO domain can't handle.
		 *
		 * As a result, we declare outselves capable of
		 * HCKSUM_INET_FULL_V4. This means that we receive
		 * IPv4 packets from the stack with a blank checksum
		 * field and must insert the pseudo-header checksum
		 * before passing the packet to the IO domain.
		 */
		*capab = HCKSUM_INET_FULL_V4;
		break;
	}
	default:
		return (B_FALSE);
	}

	return (B_TRUE);
}

/*
 * The state of the peer has changed - react accordingly.
 */
static void
oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
    void *arg, void *impl_data)
{
	_NOTE(ARGUNUSED(id, arg));
	xnf_t *xnfp = ddi_get_driver_private(dip);
	XenbusState new_state = *(XenbusState *)impl_data;

	ASSERT(xnfp != NULL);

	switch (new_state) {
	case XenbusStateUnknown:
	case XenbusStateInitialising:
	case XenbusStateInitialised:
	case XenbusStateClosing:
	case XenbusStateClosed:
	case XenbusStateReconfiguring:
	case XenbusStateReconfigured:
		break;

	case XenbusStateInitWait:
		xnf_read_config(xnfp);

		if (!xnfp->xnf_be_rx_copy) {
			cmn_err(CE_WARN,
			    "The xnf driver requires a dom0 that "
			    "supports 'feature-rx-copy'.");
			(void) xvdi_switch_state(xnfp->xnf_devinfo,
			    XBT_NULL, XenbusStateClosed);
			break;
		}

		/*
		 * Connect to the backend.
		 */
		xnf_be_connect(xnfp);

		/*
		 * Our MAC address as discovered by xnf_read_config().
		 */
		mac_unicst_update(xnfp->xnf_mh, xnfp->xnf_mac_addr);

		break;

	case XenbusStateConnected:
		mutex_enter(&xnfp->xnf_rxlock);
		mutex_enter(&xnfp->xnf_txlock);

		xnfp->xnf_connected = B_TRUE;
		/*
		 * Wake up any threads waiting to send data to
		 * backend.
		 */
		cv_broadcast(&xnfp->xnf_cv_state);

		mutex_exit(&xnfp->xnf_txlock);
		mutex_exit(&xnfp->xnf_rxlock);

		/*
		 * Kick the peer in case it missed any transmits
		 * request in the TX ring.
		 */
		ec_notify_via_evtchn(xnfp->xnf_evtchn);

		/*
		 * There may already be completed receive requests in
		 * the ring sent by backend after it gets connected
		 * but before we see its state change here, so we call
		 * xnf_intr() to handle them, if any.
		 */
		(void) xnf_intr((caddr_t)xnfp);

		/*
		 * Mark the link up now that we are connected.
		 */
		mac_link_update(xnfp->xnf_mh, LINK_STATE_UP);

		/*
		 * Tell the backend about the multicast addresses in
		 * which we are interested.
		 */
		mac_multicast_refresh(xnfp->xnf_mh, NULL, xnfp, B_TRUE);

		break;

	default:
		break;
	}
}