/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2018 Joyent, Inc.
 */

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/param.h>
#include <sys/callb.h>
#include <sys/stream.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/ksynch.h>
#include <sys/stat.h>
#include <sys/modctl.h>
#include <sys/modhash.h>
#include <sys/debug.h>
#include <sys/ethernet.h>
#include <sys/dlpi.h>
#include <net/if.h>
#include <sys/mac_provider.h>
#include <sys/mac_client.h>
#include <sys/mac_client_priv.h>
#include <sys/mac_ether.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/strsun.h>
#include <sys/note.h>
#include <sys/atomic.h>
#include <sys/vnet.h>
#include <sys/vlan.h>
#include <sys/vnet_mailbox.h>
#include <sys/vnet_common.h>
#include <sys/dds.h>
#include <sys/strsubr.h>
#include <sys/taskq.h>

/*
 * Function prototypes.
 */

/* DDI entrypoints */
static int vnetdevinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static int vnetattach(dev_info_t *, ddi_attach_cmd_t);
static int vnetdetach(dev_info_t *, ddi_detach_cmd_t);

/* MAC entrypoints  */
static int vnet_m_stat(void *, uint_t, uint64_t *);
static int vnet_m_start(void *);
static void vnet_m_stop(void *);
static int vnet_m_promisc(void *, boolean_t);
static int vnet_m_multicst(void *, boolean_t, const uint8_t *);
static int vnet_m_unicst(void *, const uint8_t *);
mblk_t *vnet_m_tx(void *, mblk_t *);
static void vnet_m_ioctl(void *arg, queue_t *q, mblk_t *mp);
#ifdef	VNET_IOC_DEBUG
static void vnet_force_link_state(vnet_t *vnetp, queue_t *q, mblk_t *mp);
#endif
static boolean_t vnet_m_capab(void *arg, mac_capab_t cap, void *cap_data);
static void vnet_get_ring(void *arg, mac_ring_type_t rtype, const int g_index,
	const int r_index, mac_ring_info_t *infop, mac_ring_handle_t r_handle);
static void vnet_get_group(void *arg, mac_ring_type_t type, const int index,
	mac_group_info_t *infop, mac_group_handle_t handle);
static int vnet_rx_ring_start(mac_ring_driver_t rdriver, uint64_t mr_gen_num);
static void vnet_rx_ring_stop(mac_ring_driver_t rdriver);
static int vnet_rx_ring_stat(mac_ring_driver_t rdriver, uint_t stat,
	uint64_t *val);
static int vnet_tx_ring_start(mac_ring_driver_t rdriver, uint64_t mr_gen_num);
static void vnet_tx_ring_stop(mac_ring_driver_t rdriver);
static int vnet_tx_ring_stat(mac_ring_driver_t rdriver, uint_t stat,
	uint64_t *val);
static int vnet_ring_enable_intr(void *arg);
static int vnet_ring_disable_intr(void *arg);
static mblk_t *vnet_rx_poll(void *arg, int bytes_to_pickup);
static int vnet_addmac(void *arg, const uint8_t *mac_addr);
static int vnet_remmac(void *arg, const uint8_t *mac_addr);

/* vnet internal functions */
static int vnet_unattach(vnet_t *vnetp);
static void vnet_ring_grp_init(vnet_t *vnetp);
static void vnet_ring_grp_uninit(vnet_t *vnetp);
static int vnet_mac_register(vnet_t *);
static int vnet_read_mac_address(vnet_t *vnetp);
static int vnet_bind_vgenring(vnet_res_t *vresp);
static void vnet_unbind_vgenring(vnet_res_t *vresp);
static int vnet_bind_hwrings(vnet_t *vnetp);
static void vnet_unbind_hwrings(vnet_t *vnetp);
static int vnet_bind_rings(vnet_res_t *vresp);
static void vnet_unbind_rings(vnet_res_t *vresp);
static int vnet_hio_stat(void *, uint_t, uint64_t *);
static int vnet_hio_start(void *);
static void vnet_hio_stop(void *);
mblk_t *vnet_hio_tx(void *, mblk_t *);

/* Forwarding database (FDB) routines */
static void vnet_fdb_create(vnet_t *vnetp);
static void vnet_fdb_destroy(vnet_t *vnetp);
static vnet_res_t *vnet_fdbe_find(vnet_t *vnetp, struct ether_addr *addrp);
static void vnet_fdbe_find_cb(mod_hash_key_t key, mod_hash_val_t val);
void vnet_fdbe_add(vnet_t *vnetp, vnet_res_t *vresp);
static void vnet_fdbe_del(vnet_t *vnetp, vnet_res_t *vresp);

static void vnet_rx_frames_untag(uint16_t pvid, mblk_t **mp);
static void vnet_rx(vio_net_handle_t vrh, mblk_t *mp);
static void vnet_tx_update(vio_net_handle_t vrh);
static void vnet_res_start_task(void *arg);
static void vnet_start_resources(vnet_t *vnetp);
static void vnet_stop_resources(vnet_t *vnetp);
static void vnet_dispatch_res_task(vnet_t *vnetp);
static void vnet_handle_res_err(vio_net_handle_t vrh, vio_net_err_val_t err);
static void vnet_add_resource(vnet_t *vnetp, vnet_res_t *vresp);
static vnet_res_t *vnet_rem_resource(vnet_t *vnetp, vnet_res_t *vresp);
static void vnet_tx_notify_thread(void *);

/* Exported to vnet_gen */
int vnet_mtu_update(vnet_t *vnetp, uint32_t mtu);
void vnet_link_update(vnet_t *vnetp, link_state_t link_state);
void vnet_dds_cleanup_hio(vnet_t *vnetp);

static kstat_t *vnet_hio_setup_kstats(char *ks_mod, char *ks_name,
    vnet_res_t *vresp);
static int vnet_hio_update_kstats(kstat_t *ksp, int rw);
static void vnet_hio_get_stats(vnet_res_t *vresp, vnet_hio_stats_t *statsp);
static void vnet_hio_destroy_kstats(kstat_t *ksp);

/* Exported to vnet_dds */
int vnet_send_dds_msg(vnet_t *vnetp, void *dmsg);
int vnet_hio_mac_init(vnet_t *vnetp, char *ifname);
void vnet_hio_mac_cleanup(vnet_t *vnetp);

/* Externs that are imported from vnet_gen */
extern int vgen_init(void *vnetp, uint64_t regprop, dev_info_t *vnetdip,
    const uint8_t *macaddr, void **vgenhdl);
extern int vgen_init_mdeg(void *arg);
extern void vgen_uninit(void *arg);
extern int vgen_dds_tx(void *arg, void *dmsg);
extern int vgen_enable_intr(void *arg);
extern int vgen_disable_intr(void *arg);
extern mblk_t *vgen_rx_poll(void *arg, int bytes_to_pickup);

/* Externs that are imported from vnet_dds */
extern void vdds_mod_init(void);
extern void vdds_mod_fini(void);
extern int vdds_init(vnet_t *vnetp);
extern void vdds_cleanup(vnet_t *vnetp);
extern void vdds_process_dds_msg(vnet_t *vnetp, vio_dds_msg_t *dmsg);
extern void vdds_cleanup_hybrid_res(void *arg);
extern void vdds_cleanup_hio(vnet_t *vnetp);

extern pri_t	minclsyspri;

#define	DRV_NAME	"vnet"
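/*
 * Reference counting on fdb entries (vnet_res_t): a reference is held while a
 * thread is using a resource (e.g. transmitting through it), so that the
 * entry cannot be torn down underneath it; see vnet_fdbe_del().
 */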
#define	VNET_FDBE_REFHOLD(p)						\
{									\
	atomic_inc_32(&(p)->refcnt);					\
	ASSERT((p)->refcnt != 0);					\
}

#define	VNET_FDBE_REFRELE(p)						\
{									\
	ASSERT((p)->refcnt != 0);					\
	atomic_dec_32(&(p)->refcnt);					\
}

#ifdef	VNET_IOC_DEBUG
#define	VNET_M_CALLBACK_FLAGS	(MC_IOCTL | MC_GETCAPAB)
#else
#define	VNET_M_CALLBACK_FLAGS	(MC_GETCAPAB)
#endif

static mac_callbacks_t vnet_m_callbacks = {
	VNET_M_CALLBACK_FLAGS,
	vnet_m_stat,
	vnet_m_start,
	vnet_m_stop,
	vnet_m_promisc,
	vnet_m_multicst,
	NULL,	/* m_unicst entry must be NULL while rx rings are exposed */
	NULL,	/* m_tx entry must be NULL while tx rings are exposed */
	NULL,
	vnet_m_ioctl,
	vnet_m_capab,
	NULL
};

static mac_callbacks_t vnet_hio_res_callbacks = {
	0,
	vnet_hio_stat,
	vnet_hio_start,
	vnet_hio_stop,
	NULL,
	NULL,
	NULL,
	vnet_hio_tx,
	NULL,
	NULL,
	NULL
};

/*
 * Linked list of "vnet_t" structures - one per instance.
 */
static vnet_t	*vnet_headp = NULL;
static krwlock_t vnet_rw;

/* Tunables */
uint32_t vnet_num_descriptors = VNET_NUM_DESCRIPTORS;

/*
 * Configure tx serialization in mac layer for the vnet device. This tunable
 * should be enabled to improve performance only if HybridIO is configured for
 * the vnet device.
 */
boolean_t vnet_mac_tx_serialize = B_FALSE;

/* Configure enqueuing at Rx soft rings in mac layer for the vnet device */
boolean_t vnet_mac_rx_queuing = B_TRUE;

/*
 * Set this to non-zero to enable additional internal receive buffer pools
 * based on the MTU of the device for better performance at the cost of more
 * memory consumption. This is turned off by default, to use allocb(9F) for
 * receive buffer allocations of sizes > 2K.
 */
boolean_t vnet_jumbo_rxpools = B_FALSE;

/* # of chains in fdb hash table */
uint32_t	vnet_fdb_nchains = VNET_NFDB_HASH;

/* Internal tunables */
uint32_t	vnet_ethermtu = 1500;	/* mtu of the device */

/*
 * Default vlan id. This is only used internally when the "default-vlan-id"
 * property is not present in the MD device node. Therefore, this should not be
 * used as a tunable; if this value is changed, the corresponding variable
 * should be updated to the same value in vsw and also other vnets connected to
 * the same vsw.
 */
uint16_t	vnet_default_vlan_id = 1;

/* delay in usec to wait for all references on a fdb entry to be dropped */
uint32_t vnet_fdbe_refcnt_delay = 10;

static struct ether_addr etherbroadcastaddr = {
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};

/* mac_open() retry delay in usec */
uint32_t vnet_mac_open_delay = 100;	/* 0.1 ms */

/* max # of mac_open() retries */
uint32_t vnet_mac_open_retries = 100;

/*
 * Property names
 */
static char macaddr_propname[] = "local-mac-address";

/*
 * This is the string displayed by modinfo(1M).
 */
static char vnet_ident[] = "vnet driver";
extern struct mod_ops mod_driverops;
static struct cb_ops cb_vnetops = {
	nulldev,		/* cb_open */
	nulldev,		/* cb_close */
	nodev,			/* cb_strategy */
	nodev,			/* cb_print */
	nodev,			/* cb_dump */
	nodev,			/* cb_read */
	nodev,			/* cb_write */
	nodev,			/* cb_ioctl */
	nodev,			/* cb_devmap */
	nodev,			/* cb_mmap */
	nodev,			/* cb_segmap */
	nochpoll,		/* cb_chpoll */
	ddi_prop_op,		/* cb_prop_op */
	NULL,			/* cb_stream */
	(int)(D_MP)		/* cb_flag */
};

static struct dev_ops vnetops = {
	DEVO_REV,		/* devo_rev */
	0,			/* devo_refcnt */
	NULL,			/* devo_getinfo */
	nulldev,		/* devo_identify */
	nulldev,		/* devo_probe */
	vnetattach,		/* devo_attach */
	vnetdetach,		/* devo_detach */
	nodev,			/* devo_reset */
	&cb_vnetops,		/* devo_cb_ops */
	(struct bus_ops *)NULL,	/* devo_bus_ops */
	NULL,			/* devo_power */
	ddi_quiesce_not_supported,	/* devo_quiesce */
};

static struct modldrv modldrv = {
	&mod_driverops,		/* Type of module.  This one is a driver */
	vnet_ident,		/* ID string */
	&vnetops		/* driver specific ops */
};

static struct modlinkage modlinkage = {
	MODREV_1, (void *)&modldrv, NULL
};

#ifdef DEBUG

#define	DEBUG_PRINTF	debug_printf

/*
 * Print debug messages - set to 0xf to enable all msgs
 */
int vnet_dbglevel = 0x8;

static void
debug_printf(const char *fname, void *arg, const char *fmt, ...)
{
	char    buf[512];
	va_list ap;
	vnet_t *vnetp = (vnet_t *)arg;
	char    *bufp = buf;

	if (vnetp == NULL) {
		(void) sprintf(bufp, "%s: ", fname);
		bufp += strlen(bufp);
	} else {
		(void) sprintf(bufp, "vnet%d:%s: ", vnetp->instance, fname);
		bufp += strlen(bufp);
	}
	va_start(ap, fmt);
	(void) vsprintf(bufp, fmt, ap);
	va_end(ap);
	cmn_err(CE_CONT, "%s\n", buf);
}

#endif

/* _init(9E): initialize the loadable module */
int
_init(void)
{
	int status;

	DBG1(NULL, "enter\n");

	mac_init_ops(&vnetops, "vnet");
	status = mod_install(&modlinkage);
	if (status != 0) {
		mac_fini_ops(&vnetops);
	}
	vdds_mod_init();
	DBG1(NULL, "exit(%d)\n", status);
	return (status);
}

/* _fini(9E): prepare the module for unloading. */
int
_fini(void)
{
	int		status;

	DBG1(NULL, "enter\n");

	status = mod_remove(&modlinkage);
	if (status != 0)
		return (status);
	mac_fini_ops(&vnetops);
	vdds_mod_fini();

	DBG1(NULL, "exit(%d)\n", status);
	return (status);
}

/* _info(9E): return information about the loadable module */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

/*
 * attach(9E): attach a device to the system.
 * called once for each instance of the device on the system.
 */
static int
vnetattach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	vnet_t			*vnetp;
	int			status;
	int			instance;
	uint64_t		reg;
	char			qname[TASKQ_NAMELEN];
	vnet_attach_progress_t	attach_progress;

	attach_progress = AST_init;

	switch (cmd) {
	case DDI_ATTACH:
		break;
	case DDI_RESUME:
	case DDI_PM_RESUME:
	default:
		goto vnet_attach_fail;
	}

	instance = ddi_get_instance(dip);
	DBG1(NULL, "instance(%d) enter\n", instance);

	/* allocate vnet_t and mac_t structures */
	vnetp = kmem_zalloc(sizeof (vnet_t), KM_SLEEP);
	vnetp->dip = dip;
	vnetp->instance = instance;
	rw_init(&vnetp->vrwlock, NULL, RW_DRIVER, NULL);
	rw_init(&vnetp->vsw_fp_rw, NULL, RW_DRIVER, NULL);
	attach_progress |= AST_vnet_alloc;

	vnet_ring_grp_init(vnetp);
	attach_progress |= AST_ring_init;

	status = vdds_init(vnetp);
	if (status != 0) {
		goto vnet_attach_fail;
	}
	attach_progress |= AST_vdds_init;

	/* setup links to vnet_t from both devinfo and mac_t */
	ddi_set_driver_private(dip, (caddr_t)vnetp);

	/* read the mac address */
	status = vnet_read_mac_address(vnetp);
	if (status != DDI_SUCCESS) {
		goto vnet_attach_fail;
	}
	attach_progress |= AST_read_macaddr;

	reg = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "reg", -1);
	if (reg == -1) {
		goto vnet_attach_fail;
	}
	vnetp->reg = reg;

	vnet_fdb_create(vnetp);
	attach_progress |= AST_fdbh_alloc;

	(void) snprintf(qname, TASKQ_NAMELEN, "vres_taskq%d", instance);
	if ((vnetp->taskqp = ddi_taskq_create(dip, qname, 1,
	    TASKQ_DEFAULTPRI, 0)) == NULL) {
		cmn_err(CE_WARN, "!vnet%d: Unable to create task queue",
		    instance);
		goto vnet_attach_fail;
	}
	attach_progress |= AST_taskq_create;

	/* add to the list of vnet devices */
	WRITE_ENTER(&vnet_rw);
	vnetp->nextp = vnet_headp;
	vnet_headp = vnetp;
	RW_EXIT(&vnet_rw);

	attach_progress |= AST_vnet_list;

	/*
	 * Initialize the generic vnet plugin which provides communication via
	 * sun4v LDC (logical domain channel) based resources. This involves 2
	 * steps; first, vgen_init() is invoked to read the various properties
	 * of the vnet device from its MD node (including its mtu which is
	 * needed to mac_register()) and obtain a handle to the vgen layer.
	 * After mac_register() is done and we have a mac handle, we then
	 * invoke vgen_init_mdeg() which registers with the MD event
	 * generator (mdeg) framework to allow LDC resource notifications.
	 * Note: this sequence also allows us to report the correct default #
	 * of pseudo rings (2 TX and 3 RX) in vnet_m_capab() which gets invoked
	 * in the context of mac_register(); and avoids conflicting with
	 * dynamic pseudo rx rings which get added/removed as a result of mdeg
	 * events in vgen.
	 */
	status = vgen_init(vnetp, reg, vnetp->dip,
	    (uint8_t *)vnetp->curr_macaddr, &vnetp->vgenhdl);
	if (status != DDI_SUCCESS) {
		DERR(vnetp, "vgen_init() failed\n");
		goto vnet_attach_fail;
	}
	attach_progress |= AST_vgen_init;

	status = vnet_mac_register(vnetp);
	if (status != DDI_SUCCESS) {
		goto vnet_attach_fail;
	}
	vnetp->link_state = LINK_STATE_UNKNOWN;
	attach_progress |= AST_macreg;

	status = vgen_init_mdeg(vnetp->vgenhdl);
	if (status != DDI_SUCCESS) {
		goto vnet_attach_fail;
	}
	attach_progress |= AST_init_mdeg;

	vnetp->attach_progress = attach_progress;

	DBG1(NULL, "instance(%d) exit\n", instance);
	return (DDI_SUCCESS);

vnet_attach_fail:
	vnetp->attach_progress = attach_progress;
	status = vnet_unattach(vnetp);
	ASSERT(status == 0);
	return (DDI_FAILURE);
}

/*
 * detach(9E): detach a device from the system.
 */
static int
vnetdetach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	vnet_t		*vnetp;
	int		instance;

	instance = ddi_get_instance(dip);
	DBG1(NULL, "instance(%d) enter\n", instance);

	vnetp = ddi_get_driver_private(dip);
	if (vnetp == NULL) {
		goto vnet_detach_fail;
	}

	switch (cmd) {
	case DDI_DETACH:
		break;
	case DDI_SUSPEND:
	case DDI_PM_SUSPEND:
	default:
		goto vnet_detach_fail;
	}

	if (vnet_unattach(vnetp) != 0) {
		goto vnet_detach_fail;
	}

	return (DDI_SUCCESS);

vnet_detach_fail:
	return (DDI_FAILURE);
}

/*
 * Common routine to handle vnetattach() failure and vnetdetach(). Note that
 * the only reason this function could fail is if mac_disable() fails.
 * Otherwise, this function must ensure that all resources are freed and return
 * success.
 */
static int
vnet_unattach(vnet_t *vnetp)
{
	vnet_attach_progress_t	attach_progress;

	attach_progress = vnetp->attach_progress;

	/*
	 * Disable the mac device in the gldv3 subsystem. This can fail, in
	 * particular if there are still any open references to this mac
	 * device; in which case we just return failure without continuing to
	 * detach further.
	 * If it succeeds, we then invoke vgen_uninit() which should unregister
	 * any pseudo rings registered with the mac layer. Note we keep the
	 * AST_macreg flag on, so we can unregister with the mac layer at
	 * the end of this routine.
	 */
	if (attach_progress & AST_macreg) {
		if (mac_disable(vnetp->mh) != 0) {
			return (1);
		}
	}

	/*
	 * Now that we have disabled the device, we must finish all other steps
	 * and successfully return from this function; otherwise we will end up
	 * leaving the device in a broken/unusable state.
	 *
	 * First, release any hybrid resources assigned to this vnet device.
	 */
	if (attach_progress & AST_vdds_init) {
		vdds_cleanup(vnetp);
		attach_progress &= ~AST_vdds_init;
	}

	/*
	 * Uninit vgen. This stops further mdeg callbacks to this vnet
	 * device and/or its ports; and detaches any existing ports.
	 */
	if (attach_progress & (AST_vgen_init|AST_init_mdeg)) {
		vgen_uninit(vnetp->vgenhdl);
		attach_progress &= ~AST_vgen_init;
		attach_progress &= ~AST_init_mdeg;
	}

	/* Destroy the taskq. */
	if (attach_progress & AST_taskq_create) {
		ddi_taskq_destroy(vnetp->taskqp);
		attach_progress &= ~AST_taskq_create;
	}

	/* Destroy fdb. */
	if (attach_progress & AST_fdbh_alloc) {
		vnet_fdb_destroy(vnetp);
		attach_progress &= ~AST_fdbh_alloc;
	}

	/* Remove from the device list */
	if (attach_progress & AST_vnet_list) {
		vnet_t		**vnetpp;
		/* unlink from instance(vnet_t) list */
		WRITE_ENTER(&vnet_rw);
		for (vnetpp = &vnet_headp; *vnetpp;
		    vnetpp = &(*vnetpp)->nextp) {
			if (*vnetpp == vnetp) {
				*vnetpp = vnetp->nextp;
				break;
			}
		}
		RW_EXIT(&vnet_rw);
		attach_progress &= ~AST_vnet_list;
	}

	if (attach_progress & AST_ring_init) {
		vnet_ring_grp_uninit(vnetp);
		attach_progress &= ~AST_ring_init;
	}

	if (attach_progress & AST_macreg) {
		VERIFY(mac_unregister(vnetp->mh) == 0);
		vnetp->mh = NULL;
		attach_progress &= ~AST_macreg;
	}

	if (attach_progress & AST_vnet_alloc) {
		rw_destroy(&vnetp->vrwlock);
		rw_destroy(&vnetp->vsw_fp_rw);
		attach_progress &= ~AST_vnet_list;
		KMEM_FREE(vnetp);
	}

	return (0);
}

/* enable the device for transmit/receive */
static int
vnet_m_start(void *arg)
{
	vnet_t		*vnetp = arg;

	DBG1(vnetp, "enter\n");

	WRITE_ENTER(&vnetp->vrwlock);
	vnetp->flags |= VNET_STARTED;
	vnet_start_resources(vnetp);
	RW_EXIT(&vnetp->vrwlock);

	DBG1(vnetp, "exit\n");
	return (VNET_SUCCESS);

}

/* stop transmit/receive for the device */
static void
vnet_m_stop(void *arg)
{
	vnet_t		*vnetp = arg;

	DBG1(vnetp, "enter\n");

	WRITE_ENTER(&vnetp->vrwlock);
	if (vnetp->flags & VNET_STARTED) {
		/*
		 * Set the flags appropriately; this should prevent starting of
		 * any new resources that are added (see vnet_res_start_task()),
		 * while we release the vrwlock in vnet_stop_resources() before
		 * stopping each resource.
		 */
		vnetp->flags &= ~VNET_STARTED;
		vnetp->flags |= VNET_STOPPING;
		vnet_stop_resources(vnetp);
		vnetp->flags &= ~VNET_STOPPING;
	}
	RW_EXIT(&vnetp->vrwlock);

	DBG1(vnetp, "exit\n");
}

/* set the unicast mac address of the device */
static int
vnet_m_unicst(void *arg, const uint8_t *macaddr)
{
	_NOTE(ARGUNUSED(macaddr))

	vnet_t *vnetp = arg;

	DBG1(vnetp, "enter\n");
	/*
	 * NOTE: setting mac address dynamically is not supported.
	 */
	DBG1(vnetp, "exit\n");

	return (VNET_FAILURE);
}

/* enable/disable a multicast address */
static int
vnet_m_multicst(void *arg, boolean_t add, const uint8_t *mca)
{
	_NOTE(ARGUNUSED(add, mca))

	vnet_t		*vnetp = arg;
	vnet_res_t	*vresp;
	mac_register_t	*macp;
	mac_callbacks_t	*cbp;
	int		rv = VNET_SUCCESS;

	DBG1(vnetp, "enter\n");

	READ_ENTER(&vnetp->vsw_fp_rw);
	if (vnetp->vsw_fp == NULL) {
		RW_EXIT(&vnetp->vsw_fp_rw);
		return (EAGAIN);
	}
	VNET_FDBE_REFHOLD(vnetp->vsw_fp);
	RW_EXIT(&vnetp->vsw_fp_rw);

	vresp = vnetp->vsw_fp;
	macp = &vresp->macreg;
	cbp = macp->m_callbacks;
	rv = cbp->mc_multicst(macp->m_driver, add, mca);

	VNET_FDBE_REFRELE(vnetp->vsw_fp);

	DBG1(vnetp, "exit(%d)\n", rv);
	return (rv);
}

/* set or clear promiscuous mode on the device */
static int
vnet_m_promisc(void *arg, boolean_t on)
{
	_NOTE(ARGUNUSED(on))

	vnet_t *vnetp = arg;
	DBG1(vnetp, "enter\n");
	/*
	 * NOTE: setting promiscuous mode is not supported, just return success.
	 */
	DBG1(vnetp, "exit\n");
	return (VNET_SUCCESS);
}

/*
 * Transmit a chain of packets. This function provides switching functionality
 * based on the destination mac address to reach other guests (within ldoms) or
 * external hosts.
 */
mblk_t *
vnet_tx_ring_send(void *arg, mblk_t *mp)
{
	vnet_pseudo_tx_ring_t	*tx_ringp;
	vnet_tx_ring_stats_t	*statsp;
	vnet_t			*vnetp;
	vnet_res_t		*vresp;
	mblk_t			*next;
	mblk_t			*resid_mp;
	mac_register_t		*macp;
	struct ether_header	*ehp;
	boolean_t		is_unicast;
	boolean_t		is_pvid;	/* non-default pvid ? */
	boolean_t		hres;		/* Hybrid resource ? */
	void			*tx_arg;
	size_t			size;

	tx_ringp = (vnet_pseudo_tx_ring_t *)arg;
	statsp = &tx_ringp->tx_ring_stats;
	vnetp = (vnet_t *)tx_ringp->vnetp;
	DBG1(vnetp, "enter\n");
	ASSERT(mp != NULL);

	is_pvid = (vnetp->pvid != vnetp->default_vlan_id) ? B_TRUE : B_FALSE;

	while (mp != NULL) {

		next = mp->b_next;
		mp->b_next = NULL;

		/* update stats */
		size = msgsize(mp);

		/*
		 * Find fdb entry for the destination
		 * and hold a reference to it.
		 */
		ehp = (struct ether_header *)mp->b_rptr;
		vresp = vnet_fdbe_find(vnetp, &ehp->ether_dhost);
		if (vresp != NULL) {

			/*
			 * Destination found in FDB.
			 * The destination is a vnet device within ldoms
			 * and directly reachable, invoke the tx function
			 * in the fdb entry.
			 */
			macp = &vresp->macreg;
			resid_mp = macp->m_callbacks->mc_tx(macp->m_driver, mp);

			/* tx done; now release ref on fdb entry */
			VNET_FDBE_REFRELE(vresp);

			if (resid_mp != NULL) {
				/* m_tx failed */
				mp->b_next = next;
				break;
			}
		} else {
			is_unicast = !(IS_BROADCAST(ehp) ||
			    (IS_MULTICAST(ehp)));
			/*
			 * Destination is not in FDB.
			 * If the destination is broadcast or multicast,
			 * then forward the packet to vswitch.
			 * If a Hybrid resource is available, then send the
			 * unicast packet via hybrid resource, otherwise
			 * forward it to vswitch.
			 */
			READ_ENTER(&vnetp->vsw_fp_rw);

			if ((is_unicast) && (vnetp->hio_fp != NULL)) {
				vresp = vnetp->hio_fp;
				hres = B_TRUE;
			} else {
				vresp = vnetp->vsw_fp;
				hres = B_FALSE;
			}
			if (vresp == NULL) {
				/*
				 * no fdb entry to vsw? drop the packet.
				 */
				RW_EXIT(&vnetp->vsw_fp_rw);
				freemsg(mp);
				mp = next;
				continue;
			}

			/* ref hold the fdb entry to vsw */
			VNET_FDBE_REFHOLD(vresp);

			RW_EXIT(&vnetp->vsw_fp_rw);

			/*
			 * In the case of a hybrid resource we need to insert
			 * the tag for the pvid case here; unlike packets that
			 * are destined to a vnet/vsw in which case the vgen
			 * layer does the tagging before sending it over ldc.
			 */
			if (hres == B_TRUE) {
				/*
				 * Determine if the frame being transmitted
				 * over the hybrid resource is untagged. If so,
				 * insert the tag before transmitting.
				 */
				if (is_pvid == B_TRUE &&
				    ehp->ether_type != htons(ETHERTYPE_VLAN)) {

					mp = vnet_vlan_insert_tag(mp,
					    vnetp->pvid);
					if (mp == NULL) {
						VNET_FDBE_REFRELE(vresp);
						mp = next;
						continue;
					}

				}

				macp = &vresp->macreg;
				tx_arg = tx_ringp;
			} else {
				macp = &vresp->macreg;
				tx_arg = macp->m_driver;
			}
			resid_mp = macp->m_callbacks->mc_tx(tx_arg, mp);

			/* tx done; now release ref on fdb entry */
			VNET_FDBE_REFRELE(vresp);

			if (resid_mp != NULL) {
				/* m_tx failed */
				mp->b_next = next;
				break;
			}
		}

		statsp->obytes += size;
		statsp->opackets++;
		mp = next;
	}

	DBG1(vnetp, "exit\n");
	return (mp);
}

/* get statistics from the device */
int
vnet_m_stat(void *arg, uint_t stat, uint64_t *val)
{
	vnet_t *vnetp = arg;
	vnet_res_t	*vresp;
	mac_register_t	*macp;
	mac_callbacks_t	*cbp;
	uint64_t val_total = 0;

	DBG1(vnetp, "enter\n");

	/*
	 * get the specified statistic from each transport and return the
	 * aggregate value.  This obviously only works for counters.
	 */
	if ((IS_MAC_STAT(stat) && !MAC_STAT_ISACOUNTER(stat)) ||
	    (IS_MACTYPE_STAT(stat) && !ETHER_STAT_ISACOUNTER(stat))) {
		return (ENOTSUP);
	}

	READ_ENTER(&vnetp->vrwlock);
	for (vresp = vnetp->vres_list; vresp != NULL; vresp = vresp->nextp) {
		macp = &vresp->macreg;
		cbp = macp->m_callbacks;
		if (cbp->mc_getstat(macp->m_driver, stat, val) == 0)
			val_total += *val;
	}
	RW_EXIT(&vnetp->vrwlock);

	*val = val_total;

	DBG1(vnetp, "exit\n");
	return (0);
}

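/*
 * Initialize the pseudo tx/rx ring groups of the vnet device: the tx group
 * gets a fixed set of shared pseudo rings and a flow control notify thread;
 * the rx group reserves its first rings for the vswitch and Hybrid resources
 * and marks the remaining rings free for dynamic allocation.
 */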
static void
vnet_ring_grp_init(vnet_t *vnetp)
{
	vnet_pseudo_rx_group_t	*rx_grp;
	vnet_pseudo_rx_ring_t	*rx_ringp;
	vnet_pseudo_tx_group_t	*tx_grp;
	vnet_pseudo_tx_ring_t	*tx_ringp;
	int			i;

	tx_grp = &vnetp->tx_grp[0];
	tx_ringp = kmem_zalloc(sizeof (vnet_pseudo_tx_ring_t) *
	    VNET_NUM_PSEUDO_TXRINGS, KM_SLEEP);
	for (i = 0; i < VNET_NUM_PSEUDO_TXRINGS; i++) {
		tx_ringp[i].state |= VNET_TXRING_SHARED;
	}
	tx_grp->rings = tx_ringp;
	tx_grp->ring_cnt = VNET_NUM_PSEUDO_TXRINGS;
	mutex_init(&tx_grp->flowctl_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&tx_grp->flowctl_cv, NULL, CV_DRIVER, NULL);
	tx_grp->flowctl_thread = thread_create(NULL, 0,
	    vnet_tx_notify_thread, tx_grp, 0, &p0, TS_RUN, minclsyspri);

	rx_grp = &vnetp->rx_grp[0];
	rx_grp->max_ring_cnt = MAX_RINGS_PER_GROUP;
	rw_init(&rx_grp->lock, NULL, RW_DRIVER, NULL);
	rx_ringp = kmem_zalloc(sizeof (vnet_pseudo_rx_ring_t) *
	    rx_grp->max_ring_cnt, KM_SLEEP);

	/*
	 * Setup the first 3 Pseudo RX Rings that are reserved;
	 * 1 for LDC resource to vswitch + 2 for RX rings of Hybrid resource.
	 */
	rx_ringp[0].state |= VNET_RXRING_INUSE|VNET_RXRING_LDC_SERVICE;
	rx_ringp[0].index = 0;
	rx_ringp[1].state |= VNET_RXRING_INUSE|VNET_RXRING_HYBRID;
	rx_ringp[1].index = 1;
	rx_ringp[2].state |= VNET_RXRING_INUSE|VNET_RXRING_HYBRID;
	rx_ringp[2].index = 2;

	rx_grp->ring_cnt = VNET_NUM_PSEUDO_RXRINGS_DEFAULT;
	rx_grp->rings = rx_ringp;

	for (i = VNET_NUM_PSEUDO_RXRINGS_DEFAULT;
	    i < rx_grp->max_ring_cnt; i++) {
		rx_ringp = &rx_grp->rings[i];
		rx_ringp->state = VNET_RXRING_FREE;
		rx_ringp->index = i;
	}
}

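/*
 * Teardown counterpart of vnet_ring_grp_init(): stop the tx flow control
 * notify thread and free the pseudo tx/rx ring arrays.
 */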
static void
vnet_ring_grp_uninit(vnet_t *vnetp)
{
	vnet_pseudo_rx_group_t	*rx_grp;
	vnet_pseudo_tx_group_t	*tx_grp;
	kt_did_t		tid = 0;

	tx_grp = &vnetp->tx_grp[0];

	/* Inform tx_notify_thread to exit */
	mutex_enter(&tx_grp->flowctl_lock);
	if (tx_grp->flowctl_thread != NULL) {
		tid = tx_grp->flowctl_thread->t_did;
		tx_grp->flowctl_done = B_TRUE;
		cv_signal(&tx_grp->flowctl_cv);
	}
	mutex_exit(&tx_grp->flowctl_lock);
	if (tid != 0)
		thread_join(tid);

	if (tx_grp->rings != NULL) {
		ASSERT(tx_grp->ring_cnt == VNET_NUM_PSEUDO_TXRINGS);
		kmem_free(tx_grp->rings, sizeof (vnet_pseudo_tx_ring_t) *
		    tx_grp->ring_cnt);
		tx_grp->rings = NULL;
	}

	rx_grp = &vnetp->rx_grp[0];
	if (rx_grp->rings != NULL) {
		ASSERT(rx_grp->max_ring_cnt == MAX_RINGS_PER_GROUP);
		ASSERT(rx_grp->ring_cnt == VNET_NUM_PSEUDO_RXRINGS_DEFAULT);
		kmem_free(rx_grp->rings, sizeof (vnet_pseudo_rx_ring_t) *
		    rx_grp->max_ring_cnt);
		rx_grp->rings = NULL;
	}
}

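/*
 * Allocate a free pseudo rx ring from the rx group; returns NULL if all
 * rings are already in use.
 */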
static vnet_pseudo_rx_ring_t *
vnet_alloc_pseudo_rx_ring(vnet_t *vnetp)
{
	vnet_pseudo_rx_group_t  *rx_grp;
	vnet_pseudo_rx_ring_t	*rx_ringp;
	int			index;

	rx_grp = &vnetp->rx_grp[0];
	WRITE_ENTER(&rx_grp->lock);

	if (rx_grp->ring_cnt == rx_grp->max_ring_cnt) {
		/* no rings available */
		RW_EXIT(&rx_grp->lock);
		return (NULL);
	}

	for (index = VNET_NUM_PSEUDO_RXRINGS_DEFAULT;
	    index < rx_grp->max_ring_cnt; index++) {
		rx_ringp = &rx_grp->rings[index];
		if (rx_ringp->state == VNET_RXRING_FREE) {
			rx_ringp->state |= VNET_RXRING_INUSE;
			rx_grp->ring_cnt++;
			break;
		}
	}

	RW_EXIT(&rx_grp->lock);
	return (rx_ringp);
}

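/* Return a dynamically allocated pseudo rx ring to the free pool. */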
static void
vnet_free_pseudo_rx_ring(vnet_t *vnetp, vnet_pseudo_rx_ring_t *ringp)
{
	vnet_pseudo_rx_group_t  *rx_grp;

	ASSERT(ringp->index >= VNET_NUM_PSEUDO_RXRINGS_DEFAULT);
	rx_grp = &vnetp->rx_grp[0];
	WRITE_ENTER(&rx_grp->lock);

	if (ringp->state != VNET_RXRING_FREE) {
		ringp->state = VNET_RXRING_FREE;
		ringp->handle = NULL;
		rx_grp->ring_cnt--;
	}

	RW_EXIT(&rx_grp->lock);
}

/* wrapper function for mac_register() */
static int
vnet_mac_register(vnet_t *vnetp)
{
	mac_register_t	*macp;
	int		err;

	if ((macp = mac_alloc(MAC_VERSION)) == NULL)
		return (DDI_FAILURE);
	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
	macp->m_driver = vnetp;
	macp->m_dip = vnetp->dip;
	macp->m_src_addr = vnetp->curr_macaddr;
	macp->m_callbacks = &vnet_m_callbacks;
	macp->m_min_sdu = 0;
	macp->m_max_sdu = vnetp->mtu;
	macp->m_margin = VLAN_TAGSZ;

	macp->m_v12n = MAC_VIRT_LEVEL1;

	/*
	 * Finally, we're ready to register ourselves with the MAC layer
	 * interface; if this succeeds, we're all ready to start()
	 */
	err = mac_register(macp, &vnetp->mh);
	mac_free(macp);
	return (err == 0 ? DDI_SUCCESS : DDI_FAILURE);
}

/* read the mac address of the device */
static int
vnet_read_mac_address(vnet_t *vnetp)
{
	uchar_t		*macaddr;
	uint32_t	size;
	int		rv;

	rv = ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, vnetp->dip,
	    DDI_PROP_DONTPASS, macaddr_propname, &macaddr, &size);
	if ((rv != DDI_PROP_SUCCESS) || (size != ETHERADDRL)) {
		DWARN(vnetp, "prop_lookup failed(%s) err(%d)\n",
		    macaddr_propname, rv);
		return (DDI_FAILURE);
	}
	bcopy(macaddr, (caddr_t)vnetp->vendor_addr, ETHERADDRL);
	bcopy(macaddr, (caddr_t)vnetp->curr_macaddr, ETHERADDRL);
	ddi_prop_free(macaddr);

	return (DDI_SUCCESS);
}

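/* Create the forwarding database (fdb) hash table for this vnet instance. */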
static void
vnet_fdb_create(vnet_t *vnetp)
{
	char		hashname[MAXNAMELEN];

	(void) snprintf(hashname, MAXNAMELEN, "vnet%d-fdbhash",
	    vnetp->instance);
	vnetp->fdb_nchains = vnet_fdb_nchains;
	vnetp->fdb_hashp = mod_hash_create_ptrhash(hashname, vnetp->fdb_nchains,
	    mod_hash_null_valdtor, sizeof (void *));
}

static void
vnet_fdb_destroy(vnet_t *vnetp)
{
	/* destroy fdb-hash-table */
	if (vnetp->fdb_hashp != NULL) {
		mod_hash_destroy_hash(vnetp->fdb_hashp);
		vnetp->fdb_hashp = NULL;
		vnetp->fdb_nchains = 0;
	}
}

/*
 * Add an entry into the fdb.
 */
void
vnet_fdbe_add(vnet_t *vnetp, vnet_res_t *vresp)
{
	uint64_t	addr = 0;
	int		rv;

	KEY_HASH(addr, vresp->rem_macaddr);

	/*
	 * If the entry being added corresponds to an LDC_SERVICE resource,
	 * that is, the vswitch connection, it is added to the hash and is
	 * also cached; an additional reference count reflects this. The
	 * HYBRID resource is not added to the hash, but only cached, as it
	 * is only used for sending out packets to unknown unicast
	 * destinations.
	 */
	(vresp->type == VIO_NET_RES_LDC_SERVICE) ?
	    (vresp->refcnt = 1) : (vresp->refcnt = 0);

	/*
	 * Note: duplicate keys will be rejected by mod_hash.
	 */
	if (vresp->type != VIO_NET_RES_HYBRID) {
		rv = mod_hash_insert(vnetp->fdb_hashp, (mod_hash_key_t)addr,
		    (mod_hash_val_t)vresp);
		if (rv != 0) {
			DWARN(vnetp, "Duplicate macaddr key(%lx)\n", addr);
			return;
		}
	}

	if (vresp->type == VIO_NET_RES_LDC_SERVICE) {
		/* Cache the fdb entry to vsw-port */
		WRITE_ENTER(&vnetp->vsw_fp_rw);
		if (vnetp->vsw_fp == NULL)
			vnetp->vsw_fp = vresp;
		RW_EXIT(&vnetp->vsw_fp_rw);
	} else if (vresp->type == VIO_NET_RES_HYBRID) {
		/* Cache the fdb entry to hybrid resource */
		WRITE_ENTER(&vnetp->vsw_fp_rw);
		if (vnetp->hio_fp == NULL)
			vnetp->hio_fp = vresp;
		RW_EXIT(&vnetp->vsw_fp_rw);
	}
}

/*
 * Remove an entry from fdb.
 */
static void
vnet_fdbe_del(vnet_t *vnetp, vnet_res_t *vresp)
{
	uint64_t	addr = 0;
	int		rv;
	uint32_t	refcnt;
	vnet_res_t	*tmp;

	KEY_HASH(addr, vresp->rem_macaddr);

	/*
	 * Remove the entry from fdb hash table.
	 * This prevents further references to this fdb entry.
	 */
	if (vresp->type != VIO_NET_RES_HYBRID) {
		rv = mod_hash_remove(vnetp->fdb_hashp, (mod_hash_key_t)addr,
		    (mod_hash_val_t *)&tmp);
		if (rv != 0) {
			/*
			 * As the resources are added to the hash only
			 * after they are started, this can occur if
			 * a resource unregisters before it is ever started.
			 */
			return;
		}
	}

	if (vresp->type == VIO_NET_RES_LDC_SERVICE) {
		WRITE_ENTER(&vnetp->vsw_fp_rw);

		ASSERT(tmp == vnetp->vsw_fp);
		vnetp->vsw_fp = NULL;

		RW_EXIT(&vnetp->vsw_fp_rw);
	} else if (vresp->type == VIO_NET_RES_HYBRID) {
		WRITE_ENTER(&vnetp->vsw_fp_rw);

		vnetp->hio_fp = NULL;

		RW_EXIT(&vnetp->vsw_fp_rw);
	}

	/*
	 * If there are threads already ref holding before the entry was
	 * removed from hash table, then wait for ref count to drop to zero.
	 */
	(vresp->type == VIO_NET_RES_LDC_SERVICE) ?
	    (refcnt = 1) : (refcnt = 0);
	while (vresp->refcnt > refcnt) {
		delay(drv_usectohz(vnet_fdbe_refcnt_delay));
	}
}

/*
 * Search fdb for a given mac address. If an entry is found, hold
 * a reference to it and return the entry; else returns NULL.
 */
static vnet_res_t *
vnet_fdbe_find(vnet_t *vnetp, struct ether_addr *addrp)
{
	uint64_t	key = 0;
	vnet_res_t	*vresp;
	int		rv;

	KEY_HASH(key, addrp->ether_addr_octet);

	rv = mod_hash_find_cb(vnetp->fdb_hashp, (mod_hash_key_t)key,
	    (mod_hash_val_t *)&vresp, vnet_fdbe_find_cb);

	if (rv != 0)
		return (NULL);

	return (vresp);
}

/*
 * Callback function provided to mod_hash_find_cb(). After finding the fdb
 * entry corresponding to the key (macaddr), this callback will be invoked by
 * mod_hash_find_cb() to atomically increment the reference count on the fdb
 * entry before returning the found entry.
 */
static void
vnet_fdbe_find_cb(mod_hash_key_t key, mod_hash_val_t val)
{
	_NOTE(ARGUNUSED(key))
	VNET_FDBE_REFHOLD((vnet_res_t *)val);
}

/*
 * Frames received that are tagged with the pvid of the vnet device must be
 * untagged before sending up the stack. This function walks the chain of rx
 * frames, untags any such frames and returns the updated chain.
 *
 * Arguments:
 *    pvid:  pvid of the vnet device for which packets are being received
 *    mp:    head of pkt chain to be validated and untagged
 *
 * Returns:
 *    mp:    head of updated chain of packets
 */
static void
vnet_rx_frames_untag(uint16_t pvid, mblk_t **mp)
{
	struct ether_vlan_header	*evhp;
	mblk_t				*bp;
	mblk_t				*bpt;
	mblk_t				*bph;
	mblk_t				*bpn;

	bpn = bph = bpt = NULL;

	for (bp = *mp; bp != NULL; bp = bpn) {

		bpn = bp->b_next;
		bp->b_next = bp->b_prev = NULL;

		evhp = (struct ether_vlan_header *)bp->b_rptr;

		if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN &&
		    VLAN_ID(ntohs(evhp->ether_tci)) == pvid) {

			bp = vnet_vlan_remove_tag(bp);
			if (bp == NULL) {
				continue;
			}

		}

		/* build a chain of processed packets */
		if (bph == NULL) {
			bph = bpt = bp;
		} else {
			bpt->b_next = bp;
			bpt = bp;
		}

	}

	*mp = bph;
}

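/*
 * Receive callback invoked by a resource: deliver the packet chain to the
 * mac layer on the pseudo rx ring bound to that resource.
 */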
static void
vnet_rx(vio_net_handle_t vrh, mblk_t *mp)
{
	vnet_res_t		*vresp = (vnet_res_t *)vrh;
	vnet_t			*vnetp = vresp->vnetp;
	vnet_pseudo_rx_ring_t	*ringp;

	if ((vnetp == NULL) || (vnetp->mh == 0)) {
		freemsgchain(mp);
		return;
	}

	ringp = vresp->rx_ringp;
	mac_rx_ring(vnetp->mh, ringp->handle, mp, ringp->gen_num);
}

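/*
 * Transmit update callback invoked by a resource when its flow control
 * condition has been relieved; see the comments below on how this is
 * propagated to the pseudo tx rings.
 */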
void
vnet_tx_update(vio_net_handle_t vrh)
{
	vnet_res_t		*vresp = (vnet_res_t *)vrh;
	vnet_t			*vnetp = vresp->vnetp;
	vnet_pseudo_tx_ring_t	*tx_ringp;
	vnet_pseudo_tx_group_t	*tx_grp;
	int			i;

	if (vnetp == NULL || vnetp->mh == NULL) {
		return;
	}

	/*
	 * Currently, the tx hwring API (used to access rings that belong to
	 * a Hybrid IO resource) does not provide us a per ring flow ctrl
	 * update; also the pseudo rings are shared by the ports/ldcs in the
	 * vgen layer. Thus we can't figure out which pseudo ring is being
	 * re-enabled for transmits. To work around this, when we get a tx
	 * restart notification from below, we simply propagate that to all
	 * the tx pseudo rings registered with the mac layer above.
	 *
	 * There are a couple of side effects with this approach, but they are
	 * not harmful, as outlined below:
	 *
	 * A) We might send an invalid ring_update() for a ring that is not
	 * really flow controlled. This will not have any effect in the mac
	 * layer and packets will continue to be transmitted on that ring.
	 *
	 * B) We might end up clearing the flow control in the mac layer for
	 * a ring that is still flow controlled in the underlying resource.
	 * This will result in the mac layer restarting transmit, only to be
	 * flow controlled again on that ring.
	 */
	tx_grp = &vnetp->tx_grp[0];
	for (i = 0; i < tx_grp->ring_cnt; i++) {
		tx_ringp = &tx_grp->rings[i];
		mac_tx_ring_update(vnetp->mh, tx_ringp->handle);
	}
}

/*
 * vnet_tx_notify_thread:
 *
 * vnet_tx_ring_update() callback function wakes up this thread when
 * it gets called. This thread will call mac_tx_ring_update() to
 * notify upper mac of flow control getting relieved. Note that
 * vnet_tx_ring_update() cannot call mac_tx_ring_update() directly
 * because vnet_tx_ring_update() is called from lower mac with
 * mi_rw_lock held and mac_tx_ring_update() would also try to grab
 * the same lock.
 */
static void
vnet_tx_notify_thread(void *arg)
{
	callb_cpr_t		cprinfo;
	vnet_pseudo_tx_group_t	*tx_grp = (vnet_pseudo_tx_group_t *)arg;
	vnet_pseudo_tx_ring_t	*tx_ringp;
	vnet_t			*vnetp;
	int			i;

	CALLB_CPR_INIT(&cprinfo, &tx_grp->flowctl_lock, callb_generic_cpr,
	    "vnet_tx_notify_thread");

	mutex_enter(&tx_grp->flowctl_lock);
	while (!tx_grp->flowctl_done) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(&tx_grp->flowctl_cv, &tx_grp->flowctl_lock);
		CALLB_CPR_SAFE_END(&cprinfo, &tx_grp->flowctl_lock);

		for (i = 0; i < tx_grp->ring_cnt; i++) {
			tx_ringp = &tx_grp->rings[i];
			if (tx_ringp->woken_up) {
				tx_ringp->woken_up = B_FALSE;
				vnetp = tx_ringp->vnetp;
				mac_tx_ring_update(vnetp->mh, tx_ringp->handle);
			}
		}
	}
	/*
	 * The tx_grp is being destroyed, exit the thread.
	 */
	tx_grp->flowctl_thread = NULL;
	CALLB_CPR_EXIT(&cprinfo);
	thread_exit();
}

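/*
 * Ring level transmit update callback invoked by the lower mac for a Hybrid
 * resource ring; wakes up vnet_tx_notify_thread() to do the actual
 * mac_tx_ring_update(), as that cannot be called directly from this context
 * (see the comments above vnet_tx_notify_thread()).
 */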
void
vnet_tx_ring_update(void *arg1, uintptr_t arg2)
{
	vnet_t			*vnetp = (vnet_t *)arg1;
	vnet_pseudo_tx_group_t	*tx_grp;
	vnet_pseudo_tx_ring_t	*tx_ringp;
	int			i;

	tx_grp = &vnetp->tx_grp[0];
	for (i = 0; i < tx_grp->ring_cnt; i++) {
		tx_ringp = &tx_grp->rings[i];
		if (tx_ringp->hw_rh == (mac_ring_handle_t)arg2) {
			mutex_enter(&tx_grp->flowctl_lock);
			tx_ringp->woken_up = B_TRUE;
			cv_signal(&tx_grp->flowctl_cv);
			mutex_exit(&tx_grp->flowctl_lock);
			break;
		}
	}
}

/*
 * Update the new mtu of vnet into the mac layer. First check if the device has
 * been plumbed and if so fail the mtu update. Returns 0 on success.
 */
int
vnet_mtu_update(vnet_t *vnetp, uint32_t mtu)
{
	int	rv;

	if (vnetp == NULL || vnetp->mh == NULL) {
		return (EINVAL);
	}

	WRITE_ENTER(&vnetp->vrwlock);

	if (vnetp->flags & VNET_STARTED) {
		RW_EXIT(&vnetp->vrwlock);
		cmn_err(CE_NOTE, "!vnet%d: Unable to process mtu "
		    "update as the device is plumbed\n",
		    vnetp->instance);
		return (EBUSY);
	}

	/* update mtu in the mac layer */
	rv = mac_maxsdu_update(vnetp->mh, mtu);
	if (rv != 0) {
		RW_EXIT(&vnetp->vrwlock);
		cmn_err(CE_NOTE,
		    "!vnet%d: Unable to update mtu with mac layer\n",
		    vnetp->instance);
		return (EIO);
	}

	vnetp->mtu = mtu;

	RW_EXIT(&vnetp->vrwlock);

	return (0);
}

/*
 * Update the link state of vnet to the mac layer.
 */
void
vnet_link_update(vnet_t *vnetp, link_state_t link_state)
{
	if (vnetp == NULL || vnetp->mh == NULL) {
		return;
	}

	WRITE_ENTER(&vnetp->vrwlock);
	if (vnetp->link_state == link_state) {
		RW_EXIT(&vnetp->vrwlock);
		return;
	}
	vnetp->link_state = link_state;
	RW_EXIT(&vnetp->vrwlock);

	mac_link_update(vnetp->mh, link_state);
}

/*
 * vio_net_resource_reg -- An interface called to register a resource
 *	with vnet.
 *	macp -- a GLDv3 mac_register that has all the details of
 *		a resource and its callbacks etc.
 *	type -- resource type.
 *	local_macaddr -- resource's MAC address. This is used to
 *			 associate a resource with a corresponding vnet.
 *	remote_macaddr -- remote side MAC address. This is ignored for
 *			  the Hybrid resources.
 *	vhp -- A handle returned to the caller.
 *	vcb -- A set of callbacks provided to the callers.
 */
int vio_net_resource_reg(mac_register_t *macp, vio_net_res_type_t type,
    ether_addr_t local_macaddr, ether_addr_t rem_macaddr, vio_net_handle_t *vhp,
    vio_net_callbacks_t *vcb)
{
	vnet_t		*vnetp;
	vnet_res_t	*vresp;

	vresp = kmem_zalloc(sizeof (vnet_res_t), KM_SLEEP);
	ether_copy(local_macaddr, vresp->local_macaddr);
	ether_copy(rem_macaddr, vresp->rem_macaddr);
	vresp->type = type;
	bcopy(macp, &vresp->macreg, sizeof (mac_register_t));

	DBG1(NULL, "Resource Registering type=0x%X\n", type);

	READ_ENTER(&vnet_rw);
	vnetp = vnet_headp;
	while (vnetp != NULL) {
		if (VNET_MATCH_RES(vresp, vnetp)) {
			vresp->vnetp = vnetp;

			/* Setup kstats for hio resource */
			if (vresp->type == VIO_NET_RES_HYBRID) {
				vresp->ksp = vnet_hio_setup_kstats(DRV_NAME,
				    "hio", vresp);
				if (vresp->ksp == NULL) {
					cmn_err(CE_NOTE, "!vnet%d: Cannot "
					    "create kstats for hio resource",
					    vnetp->instance);
				}
			}
			vnet_add_resource(vnetp, vresp);
			break;
		}
		vnetp = vnetp->nextp;
	}
	RW_EXIT(&vnet_rw);
	if (vresp->vnetp == NULL) {
		DWARN(NULL, "No vnet instance");
		kmem_free(vresp, sizeof (vnet_res_t));
		return (ENXIO);
	}

	*vhp = vresp;
	vcb->vio_net_rx_cb = vnet_rx;
	vcb->vio_net_tx_update = vnet_tx_update;
	vcb->vio_net_report_err = vnet_handle_res_err;

	/* Bind the resource to pseudo ring(s) */
	if (vnet_bind_rings(vresp) != 0) {
		(void) vnet_rem_resource(vnetp, vresp);
		vnet_hio_destroy_kstats(vresp->ksp);
		KMEM_FREE(vresp);
		return (1);
	}

	/* Dispatch a task to start resources */
	vnet_dispatch_res_task(vnetp);
	return (0);
}

/*
 * vio_net_resource_unreg -- An interface to unregister a resource.
 */
void
vio_net_resource_unreg(vio_net_handle_t vhp)
{
	vnet_res_t	*vresp = (vnet_res_t *)vhp;
	vnet_t		*vnetp = vresp->vnetp;

	DBG1(NULL, "Resource Unregistering hdl=0x%p", vhp);

	ASSERT(vnetp != NULL);
	/*
	 * Remove the resource from fdb; this ensures
	 * there are no references to the resource.
	 */
	vnet_fdbe_del(vnetp, vresp);

	vnet_unbind_rings(vresp);

	/* Now remove the resource from the list */
	(void) vnet_rem_resource(vnetp, vresp);

	vnet_hio_destroy_kstats(vresp->ksp);
	KMEM_FREE(vresp);
}

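/* Add a resource to the head of the per-vnet resource list. */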
static void
vnet_add_resource(vnet_t *vnetp, vnet_res_t *vresp)
{
	WRITE_ENTER(&vnetp->vrwlock);
	vresp->nextp = vnetp->vres_list;
	vnetp->vres_list = vresp;
	RW_EXIT(&vnetp->vrwlock);
}

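/* Unlink a resource from the per-vnet resource list and return it. */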
static vnet_res_t *
vnet_rem_resource(vnet_t *vnetp, vnet_res_t *vresp)
{
	vnet_res_t	*vrp;

	WRITE_ENTER(&vnetp->vrwlock);
	if (vresp == vnetp->vres_list) {
		vnetp->vres_list = vresp->nextp;
	} else {
		vrp = vnetp->vres_list;
		while (vrp->nextp != NULL) {
			if (vrp->nextp == vresp) {
				vrp->nextp = vresp->nextp;
				break;
			}
			vrp = vrp->nextp;
		}
	}
	vresp->vnetp = NULL;
	vresp->nextp = NULL;

	RW_EXIT(&vnetp->vrwlock);

	return (vresp);
}

/*
 * vnet_dds_rx -- an interface called by vgen to process DDS messages.
 */
void
vnet_dds_rx(void *arg, void *dmsg)
{
	vnet_t *vnetp = arg;
	vdds_process_dds_msg(vnetp, dmsg);
}

/*
 * vnet_send_dds_msg -- An interface provided to DDS to send
 *	DDS messages. This simply sends messages via vgen.
 */
int
vnet_send_dds_msg(vnet_t *vnetp, void *dmsg)
{
	int rv = EIO;	/* fail by default if vgen is not initialized */

	if (vnetp->vgenhdl != NULL) {
		rv = vgen_dds_tx(vnetp->vgenhdl, dmsg);
	}
	return (rv);
}

/*
 * vnet_dds_cleanup_hio -- an interface called by vgen to clean up hio
 *	resources.
 */
void
vnet_dds_cleanup_hio(vnet_t *vnetp)
{
	vdds_cleanup_hio(vnetp);
}

/*
 * vnet_handle_res_err -- A callback function called by a resource
 *	to report an error. For example, vgen can call to report
 *	an LDC down/reset event. This will trigger cleanup of associated
 *	Hybrid resource.
 */
/* ARGSUSED */
static void
vnet_handle_res_err(vio_net_handle_t vrh, vio_net_err_val_t err)
{
	vnet_res_t *vresp = (vnet_res_t *)vrh;
	vnet_t *vnetp = vresp->vnetp;

	if (vnetp == NULL) {
		return;
	}
	if ((vresp->type != VIO_NET_RES_LDC_SERVICE) &&
	    (vresp->type != VIO_NET_RES_HYBRID)) {
		return;
	}

	vdds_cleanup_hio(vnetp);
}

/*
 * vnet_dispatch_res_task -- A function to dispatch a task to start resources.
 */
static void
vnet_dispatch_res_task(vnet_t *vnetp)
{
	int rv;

	/*
	 * Dispatch the task. It could be the case that vnetp->flags does
	 * not have VNET_STARTED set. This is ok as vnet_res_start_task()
	 * can abort the task when the task is started. See related comments
	 * in vnet_m_stop() and vnet_stop_resources().
	 */
	rv = ddi_taskq_dispatch(vnetp->taskqp, vnet_res_start_task,
	    vnetp, DDI_NOSLEEP);
	if (rv != DDI_SUCCESS) {
		cmn_err(CE_WARN,
		    "vnet%d:Can't dispatch start resource task",
		    vnetp->instance);
	}
}

/*
 * vnet_res_start_task -- A taskq callback function that starts a resource.
 */
static void
vnet_res_start_task(void *arg)
{
	vnet_t *vnetp = arg;

	WRITE_ENTER(&vnetp->vrwlock);
	if (vnetp->flags & VNET_STARTED) {
		vnet_start_resources(vnetp);
	}
	RW_EXIT(&vnetp->vrwlock);
}

/*
 * vnet_start_resources -- starts all resources associated with
 *	a vnet.
 */
static void
vnet_start_resources(vnet_t *vnetp)
{
	mac_register_t	*macp;
	mac_callbacks_t	*cbp;
	vnet_res_t	*vresp;
	int rv;

	DBG1(vnetp, "enter\n");

	ASSERT(RW_WRITE_HELD(&vnetp->vrwlock));

	for (vresp = vnetp->vres_list; vresp != NULL; vresp = vresp->nextp) {
		/* skip if it is already started */
		if (vresp->flags & VNET_STARTED) {
			continue;
		}
		macp = &vresp->macreg;
		cbp = macp->m_callbacks;
		rv = cbp->mc_start(macp->m_driver);
		if (rv == 0) {
			/*
			 * Successfully started the resource, so now
			 * add it to the fdb.
			 */
			vresp->flags |= VNET_STARTED;
			vnet_fdbe_add(vnetp, vresp);
		}
	}

	DBG1(vnetp, "exit\n");

}

/*
 * vnet_stop_resources -- stop all resources associated with a vnet.
 */
static void
vnet_stop_resources(vnet_t *vnetp)
{
	vnet_res_t	*vresp;
	mac_register_t	*macp;
	mac_callbacks_t	*cbp;

	DBG1(vnetp, "enter\n");

	ASSERT(RW_WRITE_HELD(&vnetp->vrwlock));

	for (vresp = vnetp->vres_list; vresp != NULL; ) {
		if (vresp->flags & VNET_STARTED) {
			/*
			 * Release the lock while invoking mc_stop() of the
			 * underlying resource. We hold a reference to this
			 * resource to prevent being removed from the list in
			 * vio_net_resource_unreg(). Note that new resources
			 * can be added to the head of the list while the lock
			 * is released, but they won't be started, as
			 * VNET_STARTED flag has been cleared for the vnet
			 * device in vnet_m_stop(). Also, while the lock is
			 * released a resource could be removed from the list
			 * in vio_net_resource_unreg(); but that is ok, as we
			 * re-acquire the lock and only then access the forward
			 * link (vresp->nextp) to continue with the next
			 * resource.
			 */
			vresp->flags &= ~VNET_STARTED;
			vresp->flags |= VNET_STOPPING;
			macp = &vresp->macreg;
			cbp = macp->m_callbacks;
			VNET_FDBE_REFHOLD(vresp);
			RW_EXIT(&vnetp->vrwlock);

			cbp->mc_stop(macp->m_driver);

			WRITE_ENTER(&vnetp->vrwlock);
			vresp->flags &= ~VNET_STOPPING;
			VNET_FDBE_REFRELE(vresp);
		}
		vresp = vresp->nextp;
	}
	DBG1(vnetp, "exit\n");
}

/*
 * Setup kstats for the HIO statistics.
 * NOTE: the synchronization for the statistics is the
 * responsibility of the caller.
 */
kstat_t *
vnet_hio_setup_kstats(char *ks_mod, char *ks_name, vnet_res_t *vresp)
{
	kstat_t *ksp;
	vnet_t *vnetp = vresp->vnetp;
	vnet_hio_kstats_t *hiokp;
	size_t size;

	ASSERT(vnetp != NULL);
	size = sizeof (vnet_hio_kstats_t) / sizeof (kstat_named_t);
	ksp = kstat_create(ks_mod, vnetp->instance, ks_name, "net",
	    KSTAT_TYPE_NAMED, size, 0);
	if (ksp == NULL) {
		return (NULL);
	}

	hiokp = (vnet_hio_kstats_t *)ksp->ks_data;
	kstat_named_init(&hiokp->ipackets,		"ipackets",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&hiokp->ierrors,		"ierrors",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&hiokp->opackets,		"opackets",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&hiokp->oerrors,		"oerrors",
	    KSTAT_DATA_ULONG);


	/* MIB II kstat variables */
	kstat_named_init(&hiokp->rbytes,		"rbytes",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&hiokp->obytes,		"obytes",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&hiokp->multircv,		"multircv",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&hiokp->multixmt,		"multixmt",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&hiokp->brdcstrcv,		"brdcstrcv",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&hiokp->brdcstxmt,		"brdcstxmt",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&hiokp->norcvbuf,		"norcvbuf",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&hiokp->noxmtbuf,		"noxmtbuf",
	    KSTAT_DATA_ULONG);

	ksp->ks_update = vnet_hio_update_kstats;
	ksp->ks_private = (void *)vresp;
	kstat_install(ksp);
	return (ksp);
}

/*
 * Destroy kstats.
 */
static void
vnet_hio_destroy_kstats(kstat_t *ksp)
{
	if (ksp != NULL)
		kstat_delete(ksp);
}

/*
 * Update the kstats.
 */
static int
vnet_hio_update_kstats(kstat_t *ksp, int rw)
{
	vnet_t *vnetp;
	vnet_res_t *vresp;
	vnet_hio_stats_t statsp;
	vnet_hio_kstats_t *hiokp;

	vresp = (vnet_res_t *)ksp->ks_private;
	vnetp = vresp->vnetp;

	bzero(&statsp, sizeof (vnet_hio_stats_t));

	READ_ENTER(&vnetp->vsw_fp_rw);
	if (vnetp->hio_fp == NULL) {
		/* not using hio resources, just return */
		RW_EXIT(&vnetp->vsw_fp_rw);
		return (0);
	}
	VNET_FDBE_REFHOLD(vnetp->hio_fp);
	RW_EXIT(&vnetp->vsw_fp_rw);
	vnet_hio_get_stats(vnetp->hio_fp, &statsp);
	VNET_FDBE_REFRELE(vnetp->hio_fp);

	hiokp = (vnet_hio_kstats_t *)ksp->ks_data;

	if (rw == KSTAT_READ) {
		/* Link Input/Output stats */
		hiokp->ipackets.value.ul	= (uint32_t)statsp.ipackets;
		hiokp->ipackets64.value.ull	= statsp.ipackets;
		hiokp->ierrors.value.ul		= statsp.ierrors;
		hiokp->opackets.value.ul	= (uint32_t)statsp.opackets;
		hiokp->opackets64.value.ull	= statsp.opackets;
		hiokp->oerrors.value.ul		= statsp.oerrors;

		/* MIB II kstat variables */
		hiokp->rbytes.value.ul		= (uint32_t)statsp.rbytes;
		hiokp->rbytes64.value.ull	= statsp.rbytes;
		hiokp->obytes.value.ul		= (uint32_t)statsp.obytes;
		hiokp->obytes64.value.ull	= statsp.obytes;
		hiokp->multircv.value.ul	= statsp.multircv;
		hiokp->multixmt.value.ul	= statsp.multixmt;
		hiokp->brdcstrcv.value.ul	= statsp.brdcstrcv;
		hiokp->brdcstxmt.value.ul	= statsp.brdcstxmt;
		hiokp->norcvbuf.value.ul	= statsp.norcvbuf;
		hiokp->noxmtbuf.value.ul	= statsp.noxmtbuf;
	} else {
		return (EACCES);
	}

	return (0);
}

static void
vnet_hio_get_stats(vnet_res_t *vresp, vnet_hio_stats_t *statsp)
{
	mac_register_t		*macp;
	mac_callbacks_t		*cbp;
	uint64_t		val;
	int			stat;

	/*
	 * get the specified statistics from the underlying nxge.
	 */
	macp = &vresp->macreg;
	cbp = macp->m_callbacks;
	for (stat = MAC_STAT_MIN; stat < MAC_STAT_OVERFLOWS; stat++) {
		if (cbp->mc_getstat(macp->m_driver, stat, &val) == 0) {
			switch (stat) {
			case MAC_STAT_IPACKETS:
				statsp->ipackets = val;
				break;

			case MAC_STAT_IERRORS:
				statsp->ierrors = val;
				break;

			case MAC_STAT_OPACKETS:
				statsp->opackets = val;
				break;

			case MAC_STAT_OERRORS:
				statsp->oerrors = val;
				break;

			case MAC_STAT_RBYTES:
				statsp->rbytes = val;
				break;

			case MAC_STAT_OBYTES:
				statsp->obytes = val;
				break;

			case MAC_STAT_MULTIRCV:
				statsp->multircv = val;
				break;

			case MAC_STAT_MULTIXMT:
				statsp->multixmt = val;
				break;

			case MAC_STAT_BRDCSTRCV:
				statsp->brdcstrcv = val;
				break;

			case MAC_STAT_BRDCSTXMT:
				statsp->brdcstxmt = val;
				break;

			case MAC_STAT_NOXMTBUF:
				statsp->noxmtbuf = val;
				break;

			case MAC_STAT_NORCVBUF:
				statsp->norcvbuf = val;
				break;

			default:
				/*
				 * not a stat we are interested in.
				 */
				break;
			}
		}
	}
}

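/*
 * mc_getcapab entry point: advertise the rings capability (MAC_CAPAB_RINGS)
 * to the mac layer; see the detailed notes within.
 */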
static boolean_t
vnet_m_capab(void *arg, mac_capab_t cap, void *cap_data)
{
	vnet_t	*vnetp = (vnet_t *)arg;

	if (vnetp == NULL) {
		return (0);
	}

	switch (cap) {

	case MAC_CAPAB_RINGS: {

		mac_capab_rings_t *cap_rings = cap_data;
		/*
		 * Rings Capability Notes:
		 * We advertise rings to make use of the rings framework in
		 * gldv3 mac layer, to improve the performance. This is
		 * specifically needed when a Hybrid resource (with multiple
		 * tx/rx hardware rings) is assigned to a vnet device. We also
		 * leverage this for the normal case when no Hybrid resource is
		 * assigned.
		 *
		 * Ring Allocation:
		 * - TX path:
		 * We expose a pseudo ring group with 2 pseudo tx rings (as
		 * currently HybridIO exports only 2 rings) In the normal case,
		 * transmit traffic that comes down to the driver through the
		 * mri_tx (vnet_tx_ring_send()) entry point goes through the
		 * distributed switching algorithm in vnet and gets transmitted
		 * over a port/LDC in the vgen layer to either the vswitch or a
		 * peer vnet. If and when a Hybrid resource is assigned to the
		 * vnet, we obtain the tx ring information of the Hybrid device
		 * (nxge) and map the pseudo rings 1:1 to the 2 hw tx rings.
		 * Traffic being sent over the Hybrid resource by the mac layer
		 * gets spread across both hw rings, as they are mapped to the
		 * 2 pseudo tx rings in vnet.
		 *
		 * - RX path:
		 * We expose a pseudo ring group with 3 pseudo rx rings (static
		 * rings) initially. The first (default) pseudo rx ring is
		 * reserved for the resource that connects to the vswitch
		 * service. The next 2 rings are reserved for a Hybrid resource
		 * that may be assigned to the vnet device. If and when a
		 * Hybrid resource is assigned to the vnet, we obtain the rx
		 * ring information of the Hybrid device (nxge) and map these
		 * pseudo rings 1:1 to the 2 hw rx rings. For each additional
		 * resource that connects to a peer vnet, we dynamically
		 * allocate a pseudo rx ring and map it to that resource, when
		 * the resource gets added; and the pseudo rx ring is
		 * dynamically registered with the upper mac layer. We do the
		 * reverse and unregister the ring with the mac layer when
		 * the resource gets removed.
		 *
		 * Synchronization notes:
		 * We don't need any lock to protect members of ring structure,
		 * specifically ringp->hw_rh, in either the TX or the RX ring,
		 * as explained below.
		 * - TX ring:
		 * ring->hw_rh is initialized only when a Hybrid resource is
		 * associated; and gets referenced only in vnet_hio_tx(). The
		 * Hybrid resource itself is available in fdb only after tx
		 * hwrings are found and mapped; i.e, in vio_net_resource_reg()
		 * we call vnet_bind_rings() first and then call
		 * vnet_start_resources() which adds an entry to fdb. For
		 * traffic going over LDC resources, we don't reference
		 * ring->hw_rh at all.
		 * - RX ring:
		 * For rings mapped to Hybrid resource ring->hw_rh is
		 * initialized and only then do we add the rx callback for
		 * the underlying Hybrid resource; we disable callbacks before
		 * we unmap ring->hw_rh. For rings mapped to LDC resources, we
		 * stop the rx callbacks (in vgen) before we remove ring->hw_rh
		 * (vio_net_resource_unreg()).
		 * Also, we access ring->hw_rh in vnet_rx_ring_stat().
		 * Note that for rings mapped to Hybrid resource, though the
		 * rings are statically registered with the mac layer, its
		 * hardware ring mapping (ringp->hw_rh) can be torn down in
		 * vnet_unbind_hwrings() while the kstat operation is in
		 * progress. To protect against this, we hold a reference to
		 * the resource in FDB; this ensures that the thread in
		 * vio_net_resource_unreg() waits for the reference to be
		 * dropped before unbinding the ring.
		 *
		 * We don't need to do this for rings mapped to LDC resources.
		 * These rings are registered/unregistered dynamically with
		 * the mac layer and so any attempt to unregister the ring
		 * while a kstat operation is in progress will block in
		 * mac_group_rem_ring(); this implicitly protects the
		 * resource (ringp->hw_rh) from disappearing.
		 */

		if (cap_rings->mr_type == MAC_RING_TYPE_RX) {
			cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;

			/*
			 * The ring_cnt for rx grp is initialized in
			 * vnet_ring_grp_init(). Later, the ring_cnt gets
			 * updated dynamically whenever LDC resources are added
			 * or removed.
			 */
			cap_rings->mr_rnum = vnetp->rx_grp[0].ring_cnt;
			cap_rings->mr_rget = vnet_get_ring;

			cap_rings->mr_gnum = VNET_NUM_PSEUDO_GROUPS;
			cap_rings->mr_gget = vnet_get_group;
			cap_rings->mr_gaddring = NULL;
			cap_rings->mr_gremring = NULL;
		} else {
			cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;

			/*
			 * The ring_cnt for tx grp is initialized in
			 * vnet_ring_grp_init() and remains constant, as we
			 * do not support dynamic tx rings for now.
			 */
			cap_rings->mr_rnum = vnetp->tx_grp[0].ring_cnt;
			cap_rings->mr_rget = vnet_get_ring;

			/*
			 * Transmit rings are not grouped; i.e, the number of
			 * transmit ring groups advertised should be set to 0.
			 */
			cap_rings->mr_gnum = 0;

			cap_rings->mr_gget = vnet_get_group;
			cap_rings->mr_gaddring = NULL;
			cap_rings->mr_gremring = NULL;
		}
		return (B_TRUE);

	}

	default:
		break;

	}

	return (B_FALSE);
}

/*
 * Callback function for the MAC layer to get ring information.
 */
static void
vnet_get_ring(void *arg, mac_ring_type_t rtype, const int g_index,
    const int r_index, mac_ring_info_t *infop, mac_ring_handle_t r_handle)
{
	vnet_t	*vnetp = arg;

	switch (rtype) {

	case MAC_RING_TYPE_RX: {

		vnet_pseudo_rx_group_t	*rx_grp;
		vnet_pseudo_rx_ring_t	*rx_ringp;
		mac_intr_t		*mintr;

		/* We advertised only one RX group */
		ASSERT(g_index == 0);
		rx_grp = &vnetp->rx_grp[g_index];

		/* Check the current # of rings in the rx group */
		ASSERT((r_index >= 0) && (r_index < rx_grp->max_ring_cnt));

		/* Get the ring based on the index */
		rx_ringp = &rx_grp->rings[r_index];

		rx_ringp->handle = r_handle;
		/*
		 * Note: we don't need to save the incoming r_index in rx_ring,
		 * as vnet_ring_grp_init() would have initialized the index for
		 * each ring in the array.
		 */
		rx_ringp->grp = rx_grp;
		rx_ringp->vnetp = vnetp;

		mintr = &infop->mri_intr;
		mintr->mi_handle = (mac_intr_handle_t)rx_ringp;
		mintr->mi_enable = (mac_intr_enable_t)vnet_ring_enable_intr;
		mintr->mi_disable = (mac_intr_disable_t)vnet_ring_disable_intr;

		infop->mri_driver = (mac_ring_driver_t)rx_ringp;
		infop->mri_start = vnet_rx_ring_start;
		infop->mri_stop = vnet_rx_ring_stop;
		infop->mri_stat = vnet_rx_ring_stat;

		/* Set the poll function, as this is an rx ring */
		infop->mri_poll = vnet_rx_poll;
		/*
		 * The MAC_RING_RX_ENQUEUE bit needed to be set for nxge,
		 * which was not sending packet chains in interrupt
		 * context. For such drivers, packets are queued in
		 * Rx soft rings so that we get a chance to switch
		 * into a polling mode under backlog. This bug (not
		 * sending packet chains) has now been fixed. Once
		 * the performance impact is measured, this change
		 * will be removed.
		 */
		infop->mri_flags = (vnet_mac_rx_queuing ?
		    MAC_RING_RX_ENQUEUE : 0);
		break;
	}

	case MAC_RING_TYPE_TX: {
		vnet_pseudo_tx_group_t	*tx_grp;
		vnet_pseudo_tx_ring_t	*tx_ringp;

		/*
		 * No need to check grp index; mac layer passes -1 for it.
		 */
		tx_grp = &vnetp->tx_grp[0];

		/* Check the # of rings in the tx group */
		ASSERT((r_index >= 0) && (r_index < tx_grp->ring_cnt));

		/* Get the ring based on the index */
		tx_ringp = &tx_grp->rings[r_index];

		tx_ringp->handle = r_handle;
		tx_ringp->index = r_index;
		tx_ringp->grp = tx_grp;
		tx_ringp->vnetp = vnetp;

		infop->mri_driver = (mac_ring_driver_t)tx_ringp;
		infop->mri_start = vnet_tx_ring_start;
		infop->mri_stop = vnet_tx_ring_stop;
		infop->mri_stat = vnet_tx_ring_stat;

		/* Set the transmit function, as this is a tx ring */
		infop->mri_tx = vnet_tx_ring_send;
		/*
		 * MAC_RING_TX_SERIALIZE bit needs to be set while
		 * hybridIO is enabled to workaround tx lock
		 * contention issues in nxge.
		 */
		infop->mri_flags = (vnet_mac_tx_serialize ?
		    MAC_RING_TX_SERIALIZE : 0);
		break;
	}

	default:
		break;
	}
}

/*
 * Callback function for the MAC layer to get group information.
 */
static void
vnet_get_group(void *arg, mac_ring_type_t type, const int index,
    mac_group_info_t *infop, mac_group_handle_t handle)
{
	vnet_t	*vnetp = (vnet_t *)arg;

	switch (type) {

	case MAC_RING_TYPE_RX:
	{
		vnet_pseudo_rx_group_t	*rx_grp;

		/* We advertised only one RX group */
		ASSERT(index == 0);

		rx_grp = &vnetp->rx_grp[index];
		rx_grp->handle = handle;
		rx_grp->index = index;
		rx_grp->vnetp = vnetp;

		infop->mgi_driver = (mac_group_driver_t)rx_grp;
		infop->mgi_start = NULL;
		infop->mgi_stop = NULL;
		infop->mgi_addmac = vnet_addmac;
		infop->mgi_remmac = vnet_remmac;
		infop->mgi_count = rx_grp->ring_cnt;

		break;
	}

	case MAC_RING_TYPE_TX:
	{
		vnet_pseudo_tx_group_t	*tx_grp;

		/* We advertised only one TX group */
		ASSERT(index == 0);

		tx_grp = &vnetp->tx_grp[index];
		tx_grp->handle = handle;
		tx_grp->index = index;
		tx_grp->vnetp = vnetp;

		infop->mgi_driver = (mac_group_driver_t)tx_grp;
		infop->mgi_start = NULL;
		infop->mgi_stop = NULL;
		infop->mgi_addmac = NULL;
		infop->mgi_remmac = NULL;
		infop->mgi_count = VNET_NUM_PSEUDO_TXRINGS;

		break;
	}

	default:
		break;

	}
}

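/*
 * mri_start entry point for a pseudo rx ring: mark the ring as started and,
 * if a Hybrid hw ring is already bound to it, activate that hw ring.
 */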
static int
vnet_rx_ring_start(mac_ring_driver_t arg, uint64_t mr_gen_num)
{
	vnet_pseudo_rx_ring_t	*rx_ringp = (vnet_pseudo_rx_ring_t *)arg;
	int			err;

	/*
	 * If this ring is mapped to a LDC resource, simply mark the state to
	 * indicate the ring is started and return.
	 */
	if ((rx_ringp->state &
	    (VNET_RXRING_LDC_SERVICE|VNET_RXRING_LDC_GUEST)) != 0) {
		rx_ringp->gen_num = mr_gen_num;
		rx_ringp->state |= VNET_RXRING_STARTED;
		return (0);
	}

	ASSERT((rx_ringp->state & VNET_RXRING_HYBRID) != 0);

	/*
	 * This must be a ring reserved for a hwring. If the hwring is not
	 * bound yet, simply mark the state to indicate the ring is started and
	 * return. If and when a hybrid resource is activated for this vnet
	 * device, we will bind the hwring and start it then. If a hwring is
	 * already bound, start it now.
	 */
	if (rx_ringp->hw_rh == NULL) {
		rx_ringp->gen_num = mr_gen_num;
		rx_ringp->state |= VNET_RXRING_STARTED;
		return (0);
	}

	err = mac_hwring_activate(rx_ringp->hw_rh);
	if (err == 0) {
		rx_ringp->gen_num = mr_gen_num;
		rx_ringp->state |= VNET_RXRING_STARTED;
	} else {
		err = ENXIO;
	}

	return (err);
}

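/*
 * mri_stop entry point for a pseudo rx ring: quiesce the bound Hybrid hw
 * ring (if any) and mark the ring as stopped.
 */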
static void
vnet_rx_ring_stop(mac_ring_driver_t arg)
{
	vnet_pseudo_rx_ring_t	*rx_ringp = (vnet_pseudo_rx_ring_t *)arg;

	/*
	 * If this ring is mapped to a LDC resource, simply mark the state to
	 * indicate the ring is now stopped and return.
	 */
	if ((rx_ringp->state &
	    (VNET_RXRING_LDC_SERVICE|VNET_RXRING_LDC_GUEST)) != 0) {
		rx_ringp->state &= ~VNET_RXRING_STARTED;
		return;
	}

	ASSERT((rx_ringp->state & VNET_RXRING_HYBRID) != 0);

	/*
	 * This must be a ring reserved for a hwring. If the hwring is not
	 * bound yet, simply mark the state to indicate the ring is stopped and
	 * return. If a hwring is already bound, stop it now.
	 */
	if (rx_ringp->hw_rh == NULL) {
		rx_ringp->state &= ~VNET_RXRING_STARTED;
		return;
	}

	mac_hwring_quiesce(rx_ringp->hw_rh);
	rx_ringp->state &= ~VNET_RXRING_STARTED;
}

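/*
 * mri_stat entry point for a pseudo rx ring: get the requested stat from the
 * underlying Hybrid hw ring or from the LDC resource (vgen) that this ring
 * is mapped to.
 */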
static int
vnet_rx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
{
	vnet_pseudo_rx_ring_t	*rx_ringp = (vnet_pseudo_rx_ring_t *)rdriver;
	vnet_t			*vnetp = (vnet_t *)rx_ringp->vnetp;
	vnet_res_t		*vresp;
	mac_register_t		*macp;
	mac_callbacks_t		*cbp;

	/*
	 * Refer to vnet_m_capab() function for detailed comments on ring
	 * synchronization.
	 */
	if ((rx_ringp->state & VNET_RXRING_HYBRID) != 0) {
		READ_ENTER(&vnetp->vsw_fp_rw);
		if (vnetp->hio_fp == NULL) {
			RW_EXIT(&vnetp->vsw_fp_rw);
			return (0);
		}

		VNET_FDBE_REFHOLD(vnetp->hio_fp);
		RW_EXIT(&vnetp->vsw_fp_rw);
		(void) mac_hwring_getstat(rx_ringp->hw_rh, stat, val);
		VNET_FDBE_REFRELE(vnetp->hio_fp);
		return (0);
	}

	ASSERT((rx_ringp->state &
	    (VNET_RXRING_LDC_SERVICE|VNET_RXRING_LDC_GUEST)) != 0);
	vresp = (vnet_res_t *)rx_ringp->hw_rh;
	macp = &vresp->macreg;
	cbp = macp->m_callbacks;

	(void) cbp->mc_getstat(macp->m_driver, stat, val);

	return (0);
}

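/* mri_start entry point for a pseudo tx ring: simply mark the ring started */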
/* ARGSUSED */
static int
vnet_tx_ring_start(mac_ring_driver_t arg, uint64_t mr_gen_num)
{
	vnet_pseudo_tx_ring_t	*tx_ringp = (vnet_pseudo_tx_ring_t *)arg;

	tx_ringp->state |= VNET_TXRING_STARTED;
	return (0);
}

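/* mri_stop entry point for a pseudo tx ring: simply mark the ring stopped */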
static void
vnet_tx_ring_stop(mac_ring_driver_t arg)
{
	vnet_pseudo_tx_ring_t	*tx_ringp = (vnet_pseudo_tx_ring_t *)arg;

	tx_ringp->state &= ~VNET_TXRING_STARTED;
}

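/*
 * mri_stat entry point for a pseudo tx ring: report the outbound packet and
 * byte counts maintained by the driver for this ring.
 */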
static int
vnet_tx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
{
	vnet_pseudo_tx_ring_t	*tx_ringp = (vnet_pseudo_tx_ring_t *)rdriver;
	vnet_tx_ring_stats_t	*statsp;

	statsp = &tx_ringp->tx_ring_stats;

	switch (stat) {
	case MAC_STAT_OPACKETS:
		*val = statsp->opackets;
		break;

	case MAC_STAT_OBYTES:
		*val = statsp->obytes;
		break;

	default:
		*val = 0;
		return (ENOTSUP);
	}

	return (0);
}

/*
 * Disable polling for a ring and enable its interrupt.
 */
static int
vnet_ring_enable_intr(void *arg)
{
	vnet_pseudo_rx_ring_t	*rx_ringp = (vnet_pseudo_rx_ring_t *)arg;
	vnet_res_t		*vresp;

	if (rx_ringp->hw_rh == NULL) {
		/*
		 * The ring enable intr function is being invoked, but the
		 * ring is not bound to any underlying resource. This must be
		 * a ring reserved for the Hybrid resource and no such
		 * resource has been assigned to this vnet device yet. We
		 * simply return success.
		 */
		ASSERT((rx_ringp->state & VNET_RXRING_HYBRID) != 0);
		return (0);
	}

	/*
	 * The rx ring has been bound to either a LDC or a Hybrid resource.
	 * Call the appropriate function to enable interrupts for the ring.
	 */
	if (rx_ringp->state & VNET_RXRING_HYBRID) {
		return (mac_hwring_enable_intr(rx_ringp->hw_rh));
	} else {
		vresp = (vnet_res_t *)rx_ringp->hw_rh;
		return (vgen_enable_intr(vresp->macreg.m_driver));
	}
}

/*
 * Enable polling for a ring and disable its interrupt.
 */
static int
vnet_ring_disable_intr(void *arg)
{
	vnet_pseudo_rx_ring_t	*rx_ringp = (vnet_pseudo_rx_ring_t *)arg;
	vnet_res_t		*vresp;

	if (rx_ringp->hw_rh == NULL) {
		/*
		 * The ring disable intr function is being invoked, but the
		 * ring is not bound to any underlying resource. This must be
		 * a ring reserved for the Hybrid resource and no such
		 * resource has been assigned to this vnet device yet. We
		 * simply return success.
		 */
		ASSERT((rx_ringp->state & VNET_RXRING_HYBRID) != 0);
		return (0);
	}

	/*
	 * The rx ring has been bound to either a LDC or a Hybrid resource.
	 * Call the appropriate function to disable interrupts for the ring.
	 */
	if (rx_ringp->state & VNET_RXRING_HYBRID) {
		return (mac_hwring_disable_intr(rx_ringp->hw_rh));
	} else {
		vresp = (vnet_res_t *)rx_ringp->hw_rh;
		return (vgen_disable_intr(vresp->macreg.m_driver));
	}
}

/*
 * Poll up to 'bytes_to_pickup' bytes of packets from the rx ring.
 */
static mblk_t *
vnet_rx_poll(void *arg, int bytes_to_pickup)
{
	vnet_pseudo_rx_ring_t	*rx_ringp = (vnet_pseudo_rx_ring_t *)arg;
	mblk_t			*mp = NULL;
	vnet_res_t		*vresp;
	vnet_t			*vnetp = rx_ringp->vnetp;

	if (rx_ringp->hw_rh == NULL) {
		return (NULL);
	}

	if (rx_ringp->state & VNET_RXRING_HYBRID) {
		mp = mac_hwring_poll(rx_ringp->hw_rh, bytes_to_pickup);
		/*
		 * Packets received over a hybrid resource need additional
		 * processing to remove the tag, for the pvid case. The
		 * underlying resource is not aware of the vnet's pvid and thus
		 * packets are received with the vlan tag in the header; unlike
		 * packets that are received over a LDC channel, in which case
		 * the peer vnet/vsw would have already removed the tag.
		 */
		if (vnetp->pvid != vnetp->default_vlan_id) {
			vnet_rx_frames_untag(vnetp->pvid, &mp);
		}
	} else {
		vresp = (vnet_res_t *)rx_ringp->hw_rh;
		mp = vgen_rx_poll(vresp->macreg.m_driver, bytes_to_pickup);
	}
	return (mp);
}

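/*
 * Receive callback for the Hybrid resource: untag the frames if a pvid is
 * configured and pass the chain up to the mac layer on the pseudo rx ring
 * mapped to the underlying hw ring (mrh).
 */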
/* ARGSUSED */
void
vnet_hio_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
    boolean_t loopback)
{
	vnet_t			*vnetp = (vnet_t *)arg;
	vnet_pseudo_rx_ring_t	*ringp = (vnet_pseudo_rx_ring_t *)mrh;

	/*
	 * Packets received over a hybrid resource need additional processing
	 * to remove the tag, for the pvid case. The underlying resource is
	 * not aware of the vnet's pvid and thus packets are received with the
	 * vlan tag in the header; unlike packets that are received over a
	 * LDC channel, in which case the peer vnet/vsw would have already
	 * removed the tag.
	 */
	if (vnetp->pvid != vnetp->default_vlan_id) {
		vnet_rx_frames_untag(vnetp->pvid, &mp);
		if (mp == NULL) {
			return;
		}
	}
	mac_rx_ring(vnetp->mh, ringp->handle, mp, ringp->gen_num);
}

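/*
 * mgi_addmac entry point for the rx group: only the vnet's current unicast
 * address is supported; any other address is rejected with EINVAL.
 */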
static int
vnet_addmac(void *arg, const uint8_t *mac_addr)
{
	vnet_pseudo_rx_group_t  *rx_grp = (vnet_pseudo_rx_group_t *)arg;
	vnet_t			*vnetp;

	vnetp = rx_grp->vnetp;

	if (bcmp(mac_addr, vnetp->curr_macaddr, ETHERADDRL) == 0) {
		return (0);
	}

	cmn_err(CE_CONT, "!vnet%d: %s: Multiple macaddr unsupported\n",
	    vnetp->instance, __func__);
	return (EINVAL);
}

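/*
 * mgi_remmac entry point for the rx group: only removal of the vnet's
 * current unicast address is accepted; any other address is rejected with
 * EINVAL.
 */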
static int
vnet_remmac(void *arg, const uint8_t *mac_addr)
{
	vnet_pseudo_rx_group_t  *rx_grp = (vnet_pseudo_rx_group_t *)arg;
	vnet_t			*vnetp;

	vnetp = rx_grp->vnetp;

	if (bcmp(mac_addr, vnetp->curr_macaddr, ETHERADDRL) == 0) {
		return (0);
	}

	cmn_err(CE_CONT, "!vnet%d: %s: Invalid macaddr: %s\n",
	    vnetp->instance, __func__, ether_sprintf((void *)mac_addr));
	return (EINVAL);
}

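/*
 * Set up the Hybrid device given by 'ifname': open it by link name, create
 * an exclusive mac client over it, add our unicast address, register it as a
 * Hybrid resource and install the receive callback.
 */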
int
vnet_hio_mac_init(vnet_t *vnetp, char *ifname)
{
	mac_handle_t		mh;
	mac_client_handle_t	mch = NULL;
	mac_unicast_handle_t	muh = NULL;
	mac_diag_t		diag;
	mac_register_t		*macp;
	char			client_name[MAXNAMELEN];
	int			rv;
	uint16_t		mac_flags = MAC_UNICAST_TAG_DISABLE |
	    MAC_UNICAST_STRIP_DISABLE | MAC_UNICAST_PRIMARY;
	vio_net_callbacks_t	vcb;
	ether_addr_t		rem_addr =
		{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
	uint32_t		retries = 0;

	if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
		return (EAGAIN);
	}

	do {
		rv = mac_open_by_linkname(ifname, &mh);
		if (rv == 0) {
			break;
		}
		if (rv != ENOENT || (retries++ >= vnet_mac_open_retries)) {
			mac_free(macp);
			return (rv);
		}
		drv_usecwait(vnet_mac_open_delay);
	} while (rv == ENOENT);

	vnetp->hio_mh = mh;

	(void) snprintf(client_name, MAXNAMELEN, "vnet%d-%s", vnetp->instance,
	    ifname);
	rv = mac_client_open(mh, &mch, client_name, MAC_OPEN_FLAGS_EXCLUSIVE);
	if (rv != 0) {
		goto fail;
	}
	vnetp->hio_mch = mch;

	rv = mac_unicast_add(mch, vnetp->curr_macaddr, mac_flags, &muh, 0,
	    &diag);
	if (rv != 0) {
		goto fail;
	}
	vnetp->hio_muh = muh;

	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
	macp->m_driver = vnetp;
	macp->m_dip = NULL;
	macp->m_src_addr = NULL;
	macp->m_callbacks = &vnet_hio_res_callbacks;
	macp->m_min_sdu = 0;
	macp->m_max_sdu = ETHERMTU;

	rv = vio_net_resource_reg(macp, VIO_NET_RES_HYBRID,
	    vnetp->curr_macaddr, rem_addr, &vnetp->hio_vhp, &vcb);
	if (rv != 0) {
		goto fail;
	}
	mac_free(macp);

	/* add the recv callback */
	mac_rx_set(vnetp->hio_mch, vnet_hio_rx_cb, vnetp);

	return (0);

fail:
	mac_free(macp);
	vnet_hio_mac_cleanup(vnetp);
	return (1);
}

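/*
 * Undo vnet_hio_mac_init(): unregister the Hybrid resource and tear down the
 * unicast address, mac client and mac handle of the Hybrid device.
 */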
void
vnet_hio_mac_cleanup(vnet_t *vnetp)
{
	if (vnetp->hio_vhp != NULL) {
		vio_net_resource_unreg(vnetp->hio_vhp);
		vnetp->hio_vhp = NULL;
	}

	if (vnetp->hio_muh != NULL) {
		(void) mac_unicast_remove(vnetp->hio_mch, vnetp->hio_muh);
		vnetp->hio_muh = NULL;
	}

	if (vnetp->hio_mch != NULL) {
		mac_client_close(vnetp->hio_mch, 0);
		vnetp->hio_mch = NULL;
	}

	if (vnetp->hio_mh != NULL) {
		mac_close(vnetp->hio_mh);
		vnetp->hio_mh = NULL;
	}
}

/* Bind pseudo rings to hwrings */
static int
vnet_bind_hwrings(vnet_t *vnetp)
{
	mac_ring_handle_t	hw_rh[VNET_NUM_HYBRID_RINGS];
	mac_perim_handle_t	mph1;
	vnet_pseudo_rx_group_t	*rx_grp;
	vnet_pseudo_rx_ring_t	*rx_ringp;
	vnet_pseudo_tx_group_t	*tx_grp;
	vnet_pseudo_tx_ring_t	*tx_ringp;
	int			hw_ring_cnt;
	int			i;
	int			rv;

	mac_perim_enter_by_mh(vnetp->hio_mh, &mph1);

	/* Get the list of the underlying RX rings. */
	hw_ring_cnt = mac_hwrings_get(vnetp->hio_mch, &vnetp->rx_hwgh, hw_rh,
	    MAC_RING_TYPE_RX);

	/* We expect the # of hw rx rings to match VNET_NUM_HYBRID_RINGS */
	if (hw_ring_cnt != VNET_NUM_HYBRID_RINGS) {
		cmn_err(CE_WARN,
		    "!vnet%d: vnet_bind_hwrings: bad rx hw_ring_cnt(%d)\n",
		    vnetp->instance, hw_ring_cnt);
		goto fail;
	}

	if (vnetp->rx_hwgh != NULL) {
		/*
		 * Quiesce the HW ring and the mac srs on the ring. Note
		 * that the HW ring will be restarted when the pseudo ring
		 * is started. At that time all the packets will be
		 * directly passed up to the pseudo RX ring and handled
		 * by mac srs created over the pseudo RX ring.
		 */
		mac_rx_client_quiesce(vnetp->hio_mch);
		mac_srs_perm_quiesce(vnetp->hio_mch, B_TRUE);
	}

	/*
	 * Bind the pseudo rings to the hwrings and start the hwrings.
	 * Note we don't need to register these with the upper mac, as we have
	 * statically exported these pseudo rxrings, which are reserved for
	 * the rxrings of the Hybrid resource.
	 */
	rx_grp = &vnetp->rx_grp[0];
	for (i = 0; i < VNET_NUM_HYBRID_RINGS; i++) {
		/* Pick the rxrings reserved for Hybrid resource */
		rx_ringp = &rx_grp->rings[i + VNET_HYBRID_RXRING_INDEX];

		/* Store the hw ring handle */
		rx_ringp->hw_rh = hw_rh[i];

		/* Bind the pseudo ring to the underlying hwring */
		mac_hwring_setup(rx_ringp->hw_rh,
		    (mac_resource_handle_t)rx_ringp, NULL);

		/* Start the hwring if needed */
		if (rx_ringp->state & VNET_RXRING_STARTED) {
			rv = mac_hwring_activate(rx_ringp->hw_rh);
			if (rv != 0) {
				mac_hwring_teardown(rx_ringp->hw_rh);
				rx_ringp->hw_rh = NULL;
				goto fail;
			}
		}
	}

	/* Get the list of the underlying TX rings. */
	hw_ring_cnt = mac_hwrings_get(vnetp->hio_mch, &vnetp->tx_hwgh, hw_rh,
	    MAC_RING_TYPE_TX);

	/* We expect the # of hw tx rings to match VNET_NUM_HYBRID_RINGS */
	if (hw_ring_cnt != VNET_NUM_HYBRID_RINGS) {
		cmn_err(CE_WARN,
		    "!vnet%d: vnet_bind_hwrings: bad tx hw_ring_cnt(%d)\n",
		    vnetp->instance, hw_ring_cnt);
		goto fail;
	}

	/*
	 * Now map the pseudo txrings to the hw txrings. Note we don't need
	 * to register these with the upper mac, as we have statically exported
	 * these rings. Note that these rings will continue to be used for LDC
	 * resources to peer vnets and vswitch (shared ring).
	 */
	tx_grp = &vnetp->tx_grp[0];
	for (i = 0; i < tx_grp->ring_cnt; i++) {
		tx_ringp = &tx_grp->rings[i];
		tx_ringp->hw_rh = hw_rh[i];
		tx_ringp->state |= VNET_TXRING_HYBRID;
	}
	tx_grp->tx_notify_handle =
	    mac_client_tx_notify(vnetp->hio_mch, vnet_tx_ring_update, vnetp);

	mac_perim_exit(mph1);
	return (0);

fail:
	mac_perim_exit(mph1);
	vnet_unbind_hwrings(vnetp);
	return (1);
}

/* Unbind pseudo rings from hwrings */
static void
vnet_unbind_hwrings(vnet_t *vnetp)
{
	mac_perim_handle_t	mph1;
	vnet_pseudo_rx_ring_t	*rx_ringp;
	vnet_pseudo_rx_group_t	*rx_grp;
	vnet_pseudo_tx_group_t	*tx_grp;
	vnet_pseudo_tx_ring_t	*tx_ringp;
	int			i;

	mac_perim_enter_by_mh(vnetp->hio_mh, &mph1);

	tx_grp = &vnetp->tx_grp[0];
	for (i = 0; i < VNET_NUM_HYBRID_RINGS; i++) {
		tx_ringp = &tx_grp->rings[i];
		if (tx_ringp->state & VNET_TXRING_HYBRID) {
			tx_ringp->state &= ~VNET_TXRING_HYBRID;
			tx_ringp->hw_rh = NULL;
		}
	}
	(void) mac_client_tx_notify(vnetp->hio_mch, NULL,
	    tx_grp->tx_notify_handle);

	rx_grp = &vnetp->rx_grp[0];
	for (i = 0; i < VNET_NUM_HYBRID_RINGS; i++) {
		rx_ringp = &rx_grp->rings[i + VNET_HYBRID_RXRING_INDEX];
		if (rx_ringp->hw_rh != NULL) {
			/* Stop the hwring */
			mac_hwring_quiesce(rx_ringp->hw_rh);

			/* Teardown the hwring */
			mac_hwring_teardown(rx_ringp->hw_rh);
			rx_ringp->hw_rh = NULL;
		}
	}

	if (vnetp->rx_hwgh != NULL) {
		vnetp->rx_hwgh = NULL;
		/*
		 * First clear the permanent-quiesced flag of the RX srs then
		 * restart the HW ring and the mac srs on the ring.
		 */
		mac_srs_perm_quiesce(vnetp->hio_mch, B_FALSE);
		mac_rx_client_restart(vnetp->hio_mch);
	}

	mac_perim_exit(mph1);
}

/* Bind pseudo ring to a LDC resource */
static int
vnet_bind_vgenring(vnet_res_t *vresp)
{
	vnet_t			*vnetp;
	vnet_pseudo_rx_group_t	*rx_grp;
	vnet_pseudo_rx_ring_t	*rx_ringp;
	mac_perim_handle_t	mph1;
	int			rv;
	int			type;

	vnetp = vresp->vnetp;
	type = vresp->type;
	rx_grp = &vnetp->rx_grp[0];

	if (type == VIO_NET_RES_LDC_SERVICE) {
		/*
		 * Ring Index 0 is the default ring in the group and is
		 * reserved for LDC_SERVICE in vnet_ring_grp_init(). This ring
		 * is allocated statically and is reported to the mac layer
		 * in vnet_m_capab(). So, all we need to do here, is save a
		 * reference to the associated vresp.
		 */
		rx_ringp = &rx_grp->rings[0];
		rx_ringp->hw_rh = (mac_ring_handle_t)vresp;
		vresp->rx_ringp = (void *)rx_ringp;
		return (0);
	}
	ASSERT(type == VIO_NET_RES_LDC_GUEST);

	mac_perim_enter_by_mh(vnetp->mh, &mph1);

	rx_ringp = vnet_alloc_pseudo_rx_ring(vnetp);
	if (rx_ringp == NULL) {
		cmn_err(CE_WARN, "!vnet%d: Failed to allocate pseudo rx ring",
		    vnetp->instance);
		goto fail;
	}

	/* Store the LDC resource itself as the ring handle */
	rx_ringp->hw_rh = (mac_ring_handle_t)vresp;

	/*
	 * Save a reference to the ring in the resource for lookup during
	 * unbind. Note this is only done for LDC resources. We don't need this
	 * in the case of a Hybrid resource (see vnet_bind_hwrings()), as its
	 * rx rings are mapped to reserved pseudo rx rings (index 1 and 2).
	 */
	vresp->rx_ringp = (void *)rx_ringp;
	rx_ringp->state |= VNET_RXRING_LDC_GUEST;

	/* Register the pseudo ring with upper-mac */
	rv = mac_group_add_ring(rx_grp->handle, rx_ringp->index);
	if (rv != 0) {
		rx_ringp->state &= ~VNET_RXRING_LDC_GUEST;
		rx_ringp->hw_rh = NULL;
		vnet_free_pseudo_rx_ring(vnetp, rx_ringp);
		goto fail;
	}

	mac_perim_exit(mph1);
	return (0);
fail:
	mac_perim_exit(mph1);
	return (1);
}

/* Unbind pseudo ring from a LDC resource */
static void
vnet_unbind_vgenring(vnet_res_t *vresp)
{
	vnet_t			*vnetp;
	vnet_pseudo_rx_group_t	*rx_grp;
	vnet_pseudo_rx_ring_t	*rx_ringp;
	mac_perim_handle_t	mph1;
	int			type;

	vnetp = vresp->vnetp;
	type = vresp->type;
	rx_grp = &vnetp->rx_grp[0];

	if (vresp->rx_ringp == NULL) {
		return;
	}

	if (type == VIO_NET_RES_LDC_SERVICE) {
		/*
		 * Ring Index 0 is the default ring in the group and is
		 * reserved for LDC_SERVICE in vnet_ring_grp_init(). This ring
		 * is allocated statically and is reported to the mac layer
		 * in vnet_m_capab(). So, all we need to do here, is remove its
		 * reference to the associated vresp.
		 */
		rx_ringp = &rx_grp->rings[0];
		rx_ringp->hw_rh = NULL;
		vresp->rx_ringp = NULL;
		return;
	}
	ASSERT(type == VIO_NET_RES_LDC_GUEST);

	mac_perim_enter_by_mh(vnetp->mh, &mph1);

	rx_ringp = (vnet_pseudo_rx_ring_t *)vresp->rx_ringp;
	vresp->rx_ringp = NULL;

	if (rx_ringp != NULL && (rx_ringp->state & VNET_RXRING_LDC_GUEST)) {
		/* Unregister the pseudo ring with upper-mac */
		mac_group_rem_ring(rx_grp->handle, rx_ringp->handle);

		rx_ringp->hw_rh = NULL;
		rx_ringp->state &= ~VNET_RXRING_LDC_GUEST;

		/* Free the pseudo rx ring */
		vnet_free_pseudo_rx_ring(vnetp, rx_ringp);
	}

	mac_perim_exit(mph1);
}

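/* Unbind the pseudo ring(s) of the given resource, based on its type */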
static void
vnet_unbind_rings(vnet_res_t *vresp)
{
	switch (vresp->type) {

	case VIO_NET_RES_LDC_SERVICE:
	case VIO_NET_RES_LDC_GUEST:
		vnet_unbind_vgenring(vresp);
		break;

	case VIO_NET_RES_HYBRID:
		vnet_unbind_hwrings(vresp->vnetp);
		break;

	default:
		break;

	}
}

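/* Bind pseudo ring(s) to the given resource, based on its type */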
static int
vnet_bind_rings(vnet_res_t *vresp)
{
	int	rv;

	switch (vresp->type) {

	case VIO_NET_RES_LDC_SERVICE:
	case VIO_NET_RES_LDC_GUEST:
		rv = vnet_bind_vgenring(vresp);
		break;

	case VIO_NET_RES_HYBRID:
		rv = vnet_bind_hwrings(vresp->vnetp);
		break;

	default:
		rv = 1;
		break;

	}

	return (rv);
}

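/* Get the requested statistic of the underlying Hybrid device */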
/* ARGSUSED */
int
vnet_hio_stat(void *arg, uint_t stat, uint64_t *val)
{
	vnet_t	*vnetp = (vnet_t *)arg;

	*val = mac_stat_get(vnetp->hio_mh, stat);
	return (0);
}

/*
 * The start() and stop() routines for the Hybrid resource below are just
 * dummy functions. This is provided to avoid resource type specific code in
 * vnet_start_resources() and vnet_stop_resources(). The starting and stopping
 * of the Hybrid resource happens in the context of the mac_client interfaces
 * that are invoked in vnet_hio_mac_init() and vnet_hio_mac_cleanup().
 */
/* ARGSUSED */
static int
vnet_hio_start(void *arg)
{
	return (0);
}

/* ARGSUSED */
static void
vnet_hio_stop(void *arg)
{
}

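/*
 * Transmit a chain of packets over the hw tx ring bound to this pseudo tx
 * ring. If the hw ring flow controls, the untransmitted remainder of the
 * chain is returned to the caller.
 */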
mblk_t *
vnet_hio_tx(void *arg, mblk_t *mp)
{
	vnet_pseudo_tx_ring_t	*tx_ringp;
	mblk_t			*nextp;
	mblk_t			*ret_mp;

	tx_ringp = (vnet_pseudo_tx_ring_t *)arg;
	for (;;) {
		nextp = mp->b_next;
		mp->b_next = NULL;

		ret_mp = mac_hwring_tx(tx_ringp->hw_rh, mp);
		if (ret_mp != NULL) {
			ret_mp->b_next = nextp;
			mp = ret_mp;
			break;
		}

		if ((mp = nextp) == NULL)
			break;
	}
	return (mp);
}

#ifdef	VNET_IOC_DEBUG

/*
 * The ioctl entry point is used only for debugging for now. The ioctl commands
 * can be used to force the link state of the channel connected to vsw.
 */
static void
vnet_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
{
	struct iocblk	*iocp;
	vnet_t		*vnetp;

	iocp = (struct iocblk *)(uintptr_t)mp->b_rptr;
	iocp->ioc_error = 0;
	vnetp = (vnet_t *)arg;

	if (vnetp == NULL) {
		miocnak(q, mp, 0, EINVAL);
		return;
	}

	switch (iocp->ioc_cmd) {

	case VNET_FORCE_LINK_DOWN:
	case VNET_FORCE_LINK_UP:
		vnet_force_link_state(vnetp, q, mp);
		break;

	default:
		iocp->ioc_error = EINVAL;
		miocnak(q, mp, 0, iocp->ioc_error);
		break;

	}
}

static void
vnet_force_link_state(vnet_t *vnetp, queue_t *q, mblk_t *mp)
{
	mac_register_t	*macp;
	mac_callbacks_t	*cbp;
	vnet_res_t	*vresp;

	READ_ENTER(&vnetp->vsw_fp_rw);

	vresp = vnetp->vsw_fp;
	if (vresp == NULL) {
		RW_EXIT(&vnetp->vsw_fp_rw);
		return;
	}

	macp = &vresp->macreg;
	cbp = macp->m_callbacks;
	cbp->mc_ioctl(macp->m_driver, q, mp);

	RW_EXIT(&vnetp->vsw_fp_rw);
}

#else

static void
vnet_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
{
	vnet_t		*vnetp;

	vnetp = (vnet_t *)arg;

	if (vnetp == NULL) {
		miocnak(q, mp, 0, EINVAL);
		return;
	}

	/* ioctl support only for debugging */
	miocnak(q, mp, 0, ENOTSUP);
}

#endif