/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/time.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/stropts.h>
#include <sys/stream.h>
#include <sys/strlog.h>
#include <sys/strsubr.h>
#include <sys/cmn_err.h>
#include <sys/cpu.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/ksynch.h>
#include <sys/stat.h>
#include <sys/kstat.h>
#include <sys/vtrace.h>
#include <sys/strsun.h>
#include <sys/dlpi.h>
#include <sys/ethernet.h>
#include <net/if.h>
#include <sys/varargs.h>
#include <sys/machsystm.h>
#include <sys/modctl.h>
#include <sys/modhash.h>
#include <sys/mac.h>
#include <sys/mac_ether.h>
#include <sys/taskq.h>
#include <sys/note.h>
#include <sys/mach_descrip.h>
#include <sys/mdeg.h>
#include <sys/ldc.h>
#include <sys/vsw_fdb.h>
#include <sys/vsw.h>
#include <sys/vio_mailbox.h>
#include <sys/vnet_mailbox.h>
#include <sys/vnet_common.h>
#include <sys/vio_util.h>
#include <sys/sdt.h>
#include <sys/atomic.h>
#include <sys/callb.h>
#include <sys/vlan.h>

/* Port add/delete/etc. routines */
static	void vsw_port_delete(vsw_port_t *port);
static	int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id);
static	void vsw_ldc_detach(vsw_ldc_t *ldcp);
static	int vsw_ldc_init(vsw_ldc_t *ldcp);
static	void vsw_ldc_uninit(vsw_ldc_t *ldcp);
static	void vsw_ldc_drain(vsw_ldc_t *ldcp);
static	void vsw_drain_port_taskq(vsw_port_t *port);
static	void vsw_marker_task(void *);
static	int vsw_plist_del_node(vsw_t *, vsw_port_t *port);
void vsw_detach_ports(vsw_t *vswp);
int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
mcst_addr_t *vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr);
int vsw_port_detach(vsw_t *vswp, int p_instance);
int vsw_portsend(vsw_port_t *port, mblk_t *mp);
int vsw_port_attach(vsw_port_t *portp);
vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);
void vsw_vlan_unaware_port_reset(vsw_port_t *portp);
void vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate);
void vsw_reset_ports(vsw_t *vswp);
void vsw_port_reset(vsw_port_t *portp);
void vsw_physlink_update_ports(vsw_t *vswp);
static	void vsw_port_physlink_update(vsw_port_t *portp);

/* Interrupt routines */
static	uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg);

/* Handshake routines */
static	void vsw_ldc_reinit(vsw_ldc_t *);
static	void vsw_conn_task(void *);
static	int vsw_check_flag(vsw_ldc_t *, int, uint64_t);
static	void vsw_next_milestone(vsw_ldc_t *);
static	int vsw_supported_version(vio_ver_msg_t *);
static	void vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp);
static	void vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp);
void vsw_process_conn_evt(vsw_ldc_t *, uint16_t);

/* Data processing routines */
void vsw_process_pkt(void *);
static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t *, int);
static void vsw_process_ctrl_pkt(void *);
static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *);
static void vsw_process_physlink_msg(vsw_ldc_t *, void *);
static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *,
	uint32_t);
static void vsw_process_pkt_data_nop(void *, void *, uint32_t);
static void vsw_process_pkt_data(void *, void *, uint32_t);
static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *);
static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *);
static void vsw_process_evt_read(vsw_ldc_t *ldcp);
static void vsw_ldc_rcv(vsw_ldc_t *ldcp);

/* Switching/data transmit routines */
static	int vsw_descrsend(vsw_ldc_t *, mblk_t *);
static void vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp);
static int vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries);
static int vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count);
static int vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count);

/* Packet creation routines */
static void vsw_send_ver(void *);
static void vsw_send_attr(vsw_ldc_t *);
static void vsw_send_dring_info(vsw_ldc_t *);
static void vsw_send_rdx(vsw_ldc_t *);
static void vsw_send_physlink_msg(vsw_ldc_t *ldcp, link_state_t plink_state);

/* Dring routines */
static void vsw_create_privring(vsw_ldc_t *);
static dring_info_t *vsw_map_dring(vsw_ldc_t *ldcp, void *pkt);
static void vsw_unmap_dring(vsw_ldc_t *ldcp);
static void vsw_destroy_dring(vsw_ldc_t *ldcp);
static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t);
static int vsw_map_data(vsw_ldc_t *ldcp, dring_info_t *dp, void *pkt);
static void vsw_set_lane_attr(vsw_t *, lane_t *);
dring_info_t *vsw_map_dring_cmn(vsw_ldc_t *ldcp,
    vio_dring_reg_msg_t *dring_pkt);
static int vsw_mapin_avail(vsw_ldc_t *ldcp);

/* tx/msg/rcv thread routines */
static void vsw_stop_tx_thread(vsw_ldc_t *ldcp);
static void vsw_ldc_tx_worker(void *arg);

/* Misc support routines */
static void vsw_save_lmacaddr(vsw_t *vswp, uint64_t macaddr);
static int vsw_get_same_dest_list(struct ether_header *ehp,
    mblk_t **rhead, mblk_t **rtail, mblk_t **mpp);
static mblk_t *vsw_dupmsgchain(mblk_t *mp);

/* Debugging routines */
static void dump_flags(uint64_t);
static void display_state(void);
static void display_lane(lane_t *);
static void display_ring(dring_info_t *);

/*
 * Functions imported from other files.
 */
extern int vsw_set_hw(vsw_t *, vsw_port_t *, int);
extern void vsw_unset_hw(vsw_t *, vsw_port_t *, int);
extern int vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port);
extern void vsw_del_mcst_port(vsw_port_t *port);
extern int vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
extern int vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
extern void vsw_fdbe_add(vsw_t *vswp, void *port);
extern void vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr);
extern void vsw_create_vlans(void *arg, int type);
extern void vsw_destroy_vlans(void *arg, int type);
extern void vsw_vlan_add_ids(void *arg, int type);
extern void vsw_vlan_remove_ids(void *arg, int type);
extern boolean_t vsw_frame_lookup_vid(void *arg, int caller,
	struct ether_header *ehp, uint16_t *vidp);
extern mblk_t *vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp);
extern uint32_t vsw_vlan_frame_untag(void *arg, int type, mblk_t **np,
	mblk_t **npt);
extern boolean_t vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid);
extern void vsw_hio_start(vsw_t *vswp, vsw_ldc_t *ldcp);
extern void vsw_hio_stop(vsw_t *vswp, vsw_ldc_t *ldcp);
extern void vsw_process_dds_msg(vsw_t *vswp, vsw_ldc_t *ldcp, void *msg);
extern void vsw_hio_stop_port(vsw_port_t *portp);
extern void vsw_publish_macaddr(vsw_t *vswp, vsw_port_t *portp);
extern int vsw_mac_client_init(vsw_t *vswp, vsw_port_t *port, int type);
extern void vsw_mac_client_cleanup(vsw_t *vswp, vsw_port_t *port, int type);
extern void vsw_destroy_rxpools(void *arg);
extern void vsw_stop_msg_thread(vsw_ldc_t *ldcp);
extern int vsw_send_msg(vsw_ldc_t *, void *, int, boolean_t);
extern int vsw_dringsend(vsw_ldc_t *, mblk_t *);
extern int vsw_reclaim_dring(dring_info_t *dp, int start);
extern int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **,
    int *);
extern vio_dring_reg_msg_t *vsw_create_tx_dring_info(vsw_ldc_t *);
extern int vsw_setup_tx_dring(vsw_ldc_t *ldcp, dring_info_t *dp);
extern void vsw_destroy_tx_dring(vsw_ldc_t *ldcp);
extern dring_info_t *vsw_map_rx_dring(vsw_ldc_t *ldcp, void *pkt);
extern void vsw_unmap_rx_dring(vsw_ldc_t *ldcp);
extern void vsw_ldc_msg_worker(void *arg);
extern void vsw_process_dringdata(void *, void *);
extern vio_dring_reg_msg_t *vsw_create_rx_dring_info(vsw_ldc_t *);
extern void vsw_destroy_rx_dring(vsw_ldc_t *ldcp);
extern dring_info_t *vsw_map_tx_dring(vsw_ldc_t *ldcp, void *pkt);
extern void vsw_unmap_tx_dring(vsw_ldc_t *ldcp);
extern void vsw_ldc_rcv_worker(void *arg);
extern void vsw_stop_rcv_thread(vsw_ldc_t *ldcp);
extern int vsw_dringsend_shm(vsw_ldc_t *, mblk_t *);
extern void vsw_process_dringdata_shm(void *, void *);

/*
 * Tunables used in this file.
 */
extern int vsw_num_handshakes;
extern int vsw_ldc_tx_delay;
extern int vsw_ldc_tx_retries;
extern int vsw_ldc_retries;
extern int vsw_ldc_delay;
extern boolean_t vsw_ldc_rxthr_enabled;
extern boolean_t vsw_ldc_txthr_enabled;
extern uint32_t vsw_num_descriptors;
extern uint8_t  vsw_dring_mode;
extern uint32_t vsw_max_tx_qcount;
extern boolean_t vsw_obp_ver_proto_workaround;
extern uint32_t vsw_publish_macaddr_count;
extern uint32_t vsw_nrbufs_factor;

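/*
 * Channel lock macros. The locks are always acquired in the order
 * ldc_cblock -> ldc_rxlock -> ldc_txlock and released in the reverse
 * order.
 */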
#define	LDC_ENTER_LOCK(ldcp)	\
				mutex_enter(&((ldcp)->ldc_cblock));\
				mutex_enter(&((ldcp)->ldc_rxlock));\
				mutex_enter(&((ldcp)->ldc_txlock));
#define	LDC_EXIT_LOCK(ldcp)	\
				mutex_exit(&((ldcp)->ldc_txlock));\
				mutex_exit(&((ldcp)->ldc_rxlock));\
				mutex_exit(&((ldcp)->ldc_cblock));

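/*
 * Macros to compare the protocol version negotiated on the outbound
 * lane against a given major/minor pair; for example,
 * VSW_VER_GTEQ(ldcp, 1, 6) is true once version 1.6 or later has been
 * negotiated with the peer.
 */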
#define	VSW_VER_EQ(ldcp, major, minor)	\
	((ldcp)->lane_out.ver_major == (major) &&	\
	    (ldcp)->lane_out.ver_minor == (minor))

#define	VSW_VER_LT(ldcp, major, minor)	\
	(((ldcp)->lane_out.ver_major < (major)) ||	\
	    ((ldcp)->lane_out.ver_major == (major) &&	\
	    (ldcp)->lane_out.ver_minor < (minor)))

#define	VSW_VER_GTEQ(ldcp, major, minor)	\
	(((ldcp)->lane_out.ver_major > (major)) ||	\
	    ((ldcp)->lane_out.ver_major == (major) &&	\
	    (ldcp)->lane_out.ver_minor >= (minor)))

#define	VSW_VER_LTEQ(ldcp, major, minor)	\
	(((ldcp)->lane_out.ver_major < (major)) ||	\
	    ((ldcp)->lane_out.ver_major == (major) &&	\
	    (ldcp)->lane_out.ver_minor <= (minor)))

/*
 * VIO Protocol Version Info:
 *
 * The version specified below represents the version of the protocol currently
 * supported in the driver. It means the driver can negotiate with peers running
 * versions <= this version. Here is a summary of the feature(s) supported at
 * each version of the protocol:
 *
 * 1.0			Basic VIO protocol.
 * 1.1			vDisk protocol update (no virtual network update).
 * 1.2			Support for priority frames (priority-ether-types).
 * 1.3			VLAN and HybridIO support.
 * 1.4			Jumbo Frame support.
 * 1.5			Link State Notification support with optional support
 * 			for Physical Link information.
 * 1.6			Support for RxDringData mode.
 */
static	ver_sup_t	vsw_versions[] = { {1, 6} };

/*
 * For the moment the state dump routines have their own
 * private flag.
 */
#define	DUMP_STATE	0

#if DUMP_STATE

#define	DUMP_TAG(tag) \
{			\
	D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype); \
	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype);	\
	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env);	\
}

#define	DUMP_TAG_PTR(tag) \
{			\
	D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype); \
	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype);	\
	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env);	\
}

#define	DUMP_FLAGS(flags) dump_flags(flags);
#define	DISPLAY_STATE()	display_state()

#else

#define	DUMP_TAG(tag)
#define	DUMP_TAG_PTR(tag)
#define	DUMP_FLAGS(state)
#define	DISPLAY_STATE()

#endif	/* DUMP_STATE */

/*
 * Attach the specified port.
 *
 * Returns 0 on success, 1 on failure.
 */
int
vsw_port_attach(vsw_port_t *port)
{
	vsw_t			*vswp = port->p_vswp;
	vsw_port_list_t		*plist = &vswp->plist;
	vsw_port_t		*p, **pp;
	int			nids = port->num_ldcs;
	uint64_t		*ldcids;
	int			rv;

	D1(vswp, "%s: enter : port %d", __func__, port->p_instance);

	/* port already exists? */
	READ_ENTER(&plist->lockrw);
	for (p = plist->head; p != NULL; p = p->p_next) {
		if (p->p_instance == port->p_instance) {
			DWARN(vswp, "%s: port instance %d already attached",
			    __func__, p->p_instance);
			RW_EXIT(&plist->lockrw);
			return (1);
		}
	}
	RW_EXIT(&plist->lockrw);

	mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL);
	rw_init(&port->maccl_rwlock, NULL, RW_DRIVER, NULL);

	mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&port->state_cv, NULL, CV_DRIVER, NULL);
	port->state = VSW_PORT_INIT;

	D2(vswp, "%s: %d nids", __func__, nids);
	ldcids = port->ldc_ids;
	D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[0]);
	if (vsw_ldc_attach(port, (uint64_t)ldcids[0]) != 0) {
		DERR(vswp, "%s: ldc_attach failed", __func__);
		goto exit_error;
	}

	if (vswp->switching_setup_done == B_TRUE) {
		/*
		 * If the underlying network device has been set up,
		 * then open a mac client and program the mac address
		 * for this port.
		 */
		rv = vsw_mac_client_init(vswp, port, VSW_VNETPORT);
		if (rv != 0) {
			goto exit_error;
		}
	}

	/* create the fdb entry for this port/mac address */
	vsw_fdbe_add(vswp, port);

	vsw_create_vlans(port, VSW_VNETPORT);

	WRITE_ENTER(&plist->lockrw);

	/* link it into the list of ports for this vsw instance */
	pp = (vsw_port_t **)(&plist->head);
	port->p_next = *pp;
	*pp = port;
	plist->num_ports++;

	RW_EXIT(&plist->lockrw);

	/*
	 * Initialise the port and any ldcs under it.
	 */
	(void) vsw_ldc_init(port->ldcp);

	/* announce macaddr of vnet to the physical switch */
	if (vsw_publish_macaddr_count != 0) {	/* enabled */
		vsw_publish_macaddr(vswp, port);
	}

	D1(vswp, "%s: exit", __func__);
	return (0);

exit_error:

	cv_destroy(&port->state_cv);
	mutex_destroy(&port->state_lock);

	rw_destroy(&port->maccl_rwlock);
	mutex_destroy(&port->tx_lock);
	mutex_destroy(&port->mca_lock);
	kmem_free(port, sizeof (vsw_port_t));
	return (1);
}

/*
 * Detach the specified port.
 *
 * Returns 0 on success, 1 on failure.
 */
int
vsw_port_detach(vsw_t *vswp, int p_instance)
{
	vsw_port_t	*port = NULL;
	vsw_port_list_t	*plist = &vswp->plist;

	D1(vswp, "%s: enter: port id %d", __func__, p_instance);

	WRITE_ENTER(&plist->lockrw);

	if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) {
		RW_EXIT(&plist->lockrw);
		return (1);
	}

	if (vsw_plist_del_node(vswp, port)) {
		RW_EXIT(&plist->lockrw);
		return (1);
	}

	/* cleanup any HybridIO for this port */
	vsw_hio_stop_port(port);

	/*
	 * No longer need to hold writer lock on port list now
	 * that we have unlinked the target port from the list.
	 */
	RW_EXIT(&plist->lockrw);

	/* Cleanup and close the mac client */
	vsw_mac_client_cleanup(vswp, port, VSW_VNETPORT);

	/* Remove the fdb entry for this port/mac address */
	vsw_fdbe_del(vswp, &(port->p_macaddr));
	vsw_destroy_vlans(port, VSW_VNETPORT);

	/* Remove any multicast addresses.. */
	vsw_del_mcst_port(port);

	vsw_port_delete(port);

	D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance);
	return (0);
}

/*
 * Detach all active ports.
 */
void
vsw_detach_ports(vsw_t *vswp)
{
	vsw_port_list_t 	*plist = &vswp->plist;
	vsw_port_t		*port = NULL;

	D1(vswp, "%s: enter", __func__);

	WRITE_ENTER(&plist->lockrw);

	while ((port = plist->head) != NULL) {
		(void) vsw_plist_del_node(vswp, port);

		/* cleanup any HybridIO for this port */
		vsw_hio_stop_port(port);

		/* Cleanup and close the mac client */
		vsw_mac_client_cleanup(vswp, port, VSW_VNETPORT);

		/* Remove the fdb entry for this port/mac address */
		vsw_fdbe_del(vswp, &(port->p_macaddr));
		vsw_destroy_vlans(port, VSW_VNETPORT);

		/* Remove any multicast addresses.. */
		vsw_del_mcst_port(port);

		/*
		 * No longer need to hold the lock on the port list
		 * now that we have unlinked the target port from the
		 * list.
		 */
		RW_EXIT(&plist->lockrw);
		vsw_port_delete(port);
		WRITE_ENTER(&plist->lockrw);
	}
	RW_EXIT(&plist->lockrw);

	D1(vswp, "%s: exit", __func__);
}

/*
 * Delete the specified port.
 */
static void
vsw_port_delete(vsw_port_t *port)
{
	vsw_t			*vswp = port->p_vswp;

	D1(vswp, "%s: enter : port id %d", __func__, port->p_instance);

	vsw_ldc_uninit(port->ldcp);

	/*
	 * Wait for any pending ctrl msg tasks which reference this
	 * port to finish.
	 */
	vsw_drain_port_taskq(port);

	/*
	 * Wait for any active callbacks to finish
	 */
	vsw_ldc_drain(port->ldcp);

	vsw_ldc_detach(port->ldcp);

	rw_destroy(&port->maccl_rwlock);
	mutex_destroy(&port->mca_lock);
	mutex_destroy(&port->tx_lock);

	cv_destroy(&port->state_cv);
	mutex_destroy(&port->state_lock);

	if (port->num_ldcs != 0) {
		kmem_free(port->ldc_ids, port->num_ldcs * sizeof (uint64_t));
		port->num_ldcs = 0;
	}

	if (port->nvids != 0) {
		kmem_free(port->vids, sizeof (vsw_vlanid_t) * port->nvids);
	}

	kmem_free(port, sizeof (vsw_port_t));

	D1(vswp, "%s: exit", __func__);
}

/*
 * Attach a logical domain channel (ldc) under a specified port.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id)
{
	vsw_t 		*vswp = port->p_vswp;
	vsw_ldc_t 	*ldcp = NULL;
	ldc_attr_t 	attr;
	ldc_status_t	istatus;
	int 		status = DDI_FAILURE;
	char		kname[MAXNAMELEN];
	enum		{ PROG_init = 0x0,
			    PROG_callback = 0x1,
			    PROG_tx_thread = 0x2}
			progress;

	progress = PROG_init;

	D1(vswp, "%s: enter", __func__);

	ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP);
	if (ldcp == NULL) {
		DERR(vswp, "%s: kmem_zalloc failed", __func__);
		return (1);
	}
	ldcp->ldc_id = ldc_id;

	mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&ldcp->ldc_rxlock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL);
	ldcp->msg_thr_flags = 0;
	mutex_init(&ldcp->msg_thr_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&ldcp->msg_thr_cv, NULL, CV_DRIVER, NULL);
	ldcp->rcv_thr_flags = 0;
	mutex_init(&ldcp->rcv_thr_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&ldcp->rcv_thr_cv, NULL, CV_DRIVER, NULL);
	mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL);

	/* required for handshake with peer */
	ldcp->local_session = (uint64_t)ddi_get_lbolt();
	ldcp->peer_session = 0;
	ldcp->session_status = 0;
	ldcp->hss_id = 1;	/* Initial handshake session id */
	ldcp->hphase = VSW_MILESTONE0;

	(void) atomic_swap_32(&port->p_hio_capable, B_FALSE);

	/* only set for outbound lane, inbound set by peer */
	vsw_set_lane_attr(vswp, &ldcp->lane_out);

	attr.devclass = LDC_DEV_NT_SVC;
	attr.instance = ddi_get_instance(vswp->dip);
	attr.mode = LDC_MODE_UNRELIABLE;
	attr.mtu = VSW_LDC_MTU;
	status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle);
	if (status != 0) {
		DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)",
		    __func__, ldc_id, status);
		goto ldc_attach_fail;
	}

	if (vsw_ldc_txthr_enabled) {
		ldcp->tx_thr_flags = 0;
		ldcp->tx_mhead = ldcp->tx_mtail = NULL;

		mutex_init(&ldcp->tx_thr_lock, NULL, MUTEX_DRIVER, NULL);
		cv_init(&ldcp->tx_thr_cv, NULL, CV_DRIVER, NULL);
		ldcp->tx_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
		    vsw_ldc_tx_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);

		progress |= PROG_tx_thread;
		if (ldcp->tx_thread == NULL) {
			DWARN(vswp, "%s(%lld): Failed to create worker thread",
			    __func__, ldc_id);
			goto ldc_attach_fail;
		}
	}

	status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp);
	if (status != 0) {
		DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)",
		    __func__, ldc_id, status);
		(void) ldc_fini(ldcp->ldc_handle);
		goto ldc_attach_fail;
	}
	/*
	 * allocate a message for ldc_read()s, big enough to hold ctrl and
	 * data msgs, including raw data msgs used to recv priority frames.
	 */
	ldcp->msglen = VIO_PKT_DATA_HDRSIZE + vswp->max_frame_size;
	ldcp->ldcmsg = kmem_alloc(ldcp->msglen, KM_SLEEP);

	progress |= PROG_callback;

	mutex_init(&ldcp->status_lock, NULL, MUTEX_DRIVER, NULL);

	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
		DERR(vswp, "%s: ldc_status failed", __func__);
		mutex_destroy(&ldcp->status_lock);
		goto ldc_attach_fail;
	}

	ldcp->ldc_status = istatus;
	ldcp->ldc_port = port;
	ldcp->ldc_vswp = vswp;

	vsw_reset_vnet_proto_ops(ldcp);

	(void) sprintf(kname, "%sldc0x%lx", DRV_NAME, ldcp->ldc_id);
	ldcp->ksp = vgen_setup_kstats(DRV_NAME, vswp->instance,
	    kname, &ldcp->ldc_stats);
	if (ldcp->ksp == NULL) {
		DERR(vswp, "%s: kstats setup failed", __func__);
		goto ldc_attach_fail;
	}

	/* link it into this port */
	port->ldcp = ldcp;

	D1(vswp, "%s: exit", __func__);
	return (0);

ldc_attach_fail:

	if (progress & PROG_callback) {
		(void) ldc_unreg_callback(ldcp->ldc_handle);
		kmem_free(ldcp->ldcmsg, ldcp->msglen);
	}

	if (progress & PROG_tx_thread) {
		if (ldcp->tx_thread != NULL) {
			vsw_stop_tx_thread(ldcp);
		}
		mutex_destroy(&ldcp->tx_thr_lock);
		cv_destroy(&ldcp->tx_thr_cv);
	}
	if (ldcp->ksp != NULL) {
		vgen_destroy_kstats(ldcp->ksp);
	}
	mutex_destroy(&ldcp->msg_thr_lock);
	mutex_destroy(&ldcp->rcv_thr_lock);
	mutex_destroy(&ldcp->ldc_txlock);
	mutex_destroy(&ldcp->ldc_rxlock);
	mutex_destroy(&ldcp->ldc_cblock);
	mutex_destroy(&ldcp->drain_cv_lock);
	cv_destroy(&ldcp->msg_thr_cv);
	cv_destroy(&ldcp->rcv_thr_cv);
	cv_destroy(&ldcp->drain_cv);

	kmem_free(ldcp, sizeof (vsw_ldc_t));

	return (1);
}

/*
 * Detach a logical domain channel (ldc) belonging to a
 * particular port.
 */
static void
vsw_ldc_detach(vsw_ldc_t *ldcp)
{
	int 		rv;
	vsw_t 		*vswp = ldcp->ldc_port->p_vswp;
	int		retries = 0;

	D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id);

	/* Stop msg/rcv thread */
	if (ldcp->rcv_thread != NULL) {
		vsw_stop_rcv_thread(ldcp);
	} else if (ldcp->msg_thread != NULL) {
		vsw_stop_msg_thread(ldcp);
	}
	kmem_free(ldcp->ldcmsg, ldcp->msglen);

	/* Stop the tx thread */
	if (ldcp->tx_thread != NULL) {
		vsw_stop_tx_thread(ldcp);
		mutex_destroy(&ldcp->tx_thr_lock);
		cv_destroy(&ldcp->tx_thr_cv);
		if (ldcp->tx_mhead != NULL) {
			freemsgchain(ldcp->tx_mhead);
			ldcp->tx_mhead = ldcp->tx_mtail = NULL;
			ldcp->tx_cnt = 0;
		}
	}

	/* Destroy kstats */
	vgen_destroy_kstats(ldcp->ksp);

	/*
	 * Before we can close the channel we must release any mapped
	 * resources (e.g. drings).
	 */
	vsw_free_lane_resources(ldcp, INBOUND);
	vsw_free_lane_resources(ldcp, OUTBOUND);

	/*
	 * Close the channel, retrying on EAGAIN.
	 */
	while ((rv = ldc_close(ldcp->ldc_handle)) == EAGAIN) {
		if (++retries > vsw_ldc_retries) {
			break;
		}
		drv_usecwait(vsw_ldc_delay);
	}
	if (rv != 0) {
		cmn_err(CE_NOTE,
		    "!vsw%d: Error(%d) closing the channel(0x%lx)\n",
		    vswp->instance, rv, ldcp->ldc_id);
	}

	(void) ldc_fini(ldcp->ldc_handle);

	ldcp->ldc_status = LDC_INIT;
	ldcp->ldc_handle = NULL;
	ldcp->ldc_vswp = NULL;

	mutex_destroy(&ldcp->msg_thr_lock);
	mutex_destroy(&ldcp->rcv_thr_lock);
	mutex_destroy(&ldcp->ldc_txlock);
	mutex_destroy(&ldcp->ldc_rxlock);
	mutex_destroy(&ldcp->ldc_cblock);
	mutex_destroy(&ldcp->drain_cv_lock);
	mutex_destroy(&ldcp->status_lock);
	cv_destroy(&ldcp->msg_thr_cv);
	cv_destroy(&ldcp->rcv_thr_cv);
	cv_destroy(&ldcp->drain_cv);

	kmem_free(ldcp, sizeof (vsw_ldc_t));
}

/*
 * Open and attempt to bring up the channel. Note that the channel
 * can only be brought up if the peer has also opened the channel.
 *
 * Returns 0 if the channel can be opened and brought up, otherwise
 * returns 1.
 */
static int
vsw_ldc_init(vsw_ldc_t *ldcp)
{
	vsw_t 		*vswp = ldcp->ldc_vswp;
	ldc_status_t	istatus = 0;
	int		rv;

	D1(vswp, "%s: enter", __func__);

	LDC_ENTER_LOCK(ldcp);

	/* don't start at 0 in case clients don't like that */
	ldcp->next_ident = 1;

	rv = ldc_open(ldcp->ldc_handle);
	if (rv != 0) {
		DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)",
		    __func__, ldcp->ldc_id, rv);
		LDC_EXIT_LOCK(ldcp);
		return (1);
	}

	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
		DERR(vswp, "%s: unable to get status", __func__);
		LDC_EXIT_LOCK(ldcp);
		return (1);

	} else if (istatus != LDC_OPEN && istatus != LDC_READY) {
		DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY",
		    __func__, ldcp->ldc_id, istatus);
		LDC_EXIT_LOCK(ldcp);
		return (1);
	}

	mutex_enter(&ldcp->status_lock);
	ldcp->ldc_status = istatus;
	mutex_exit(&ldcp->status_lock);

	rv = ldc_up(ldcp->ldc_handle);
	if (rv != 0) {
		/*
		 * Not a fatal error for ldc_up() to fail, as peer
		 * end point may simply not be ready yet.
		 */
		D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__,
		    ldcp->ldc_id, rv);
		LDC_EXIT_LOCK(ldcp);
		return (1);
	}

	/*
	 * The ldc_up() call is non-blocking, so we need to explicitly
	 * check the channel status to see if the channel is in fact
	 * UP.
	 */
	mutex_enter(&ldcp->status_lock);
	if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) {
		DERR(vswp, "%s: unable to get status", __func__);
		mutex_exit(&ldcp->status_lock);
		LDC_EXIT_LOCK(ldcp);
		return (1);

	}

	if (ldcp->ldc_status == LDC_UP) {
		D2(vswp, "%s: channel %ld now UP (%ld)", __func__,
		    ldcp->ldc_id, istatus);
		mutex_exit(&ldcp->status_lock);
		LDC_EXIT_LOCK(ldcp);

		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
		return (0);
	}

	mutex_exit(&ldcp->status_lock);
	LDC_EXIT_LOCK(ldcp);

	D1(vswp, "%s: exit", __func__);
	return (0);
}

/* disable callbacks on the channel */
static void
vsw_ldc_uninit(vsw_ldc_t *ldcp)
{
	vsw_t	*vswp = ldcp->ldc_vswp;
	int	rv;

	D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id);

	LDC_ENTER_LOCK(ldcp);

	rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE);
	if (rv != 0) {
		cmn_err(CE_NOTE, "!vsw_ldc_uninit(%ld): error disabling "
		    "interrupts (rv = %d)\n", ldcp->ldc_id, rv);
	}

	mutex_enter(&ldcp->status_lock);
	ldcp->ldc_status = LDC_INIT;
	mutex_exit(&ldcp->status_lock);

	LDC_EXIT_LOCK(ldcp);

	D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id);
}

/*
 * Wait until the callback(s) associated with the ldcs under the specified
 * port have completed.
 *
 * Prior to this function being invoked each channel under this port
 * should have been quiesced via ldc_set_cb_mode(DISABLE).
 *
 * A short explanation of what we are doing below:
 *
 * The simplest approach would be to have a reference counter in
 * the ldc structure which is incremented/decremented by the callbacks as
 * they use the channel. The drain function could then simply disable any
 * further callbacks and do a cv_wait for the ref to hit zero. Unfortunately
 * there is a tiny window here: before the callback is able to get the lock
 * on the channel it is interrupted and this function gets to execute. It
 * sees that the ref count is zero and believes it is free to delete the
 * associated data structures.
 *
 * We get around this by taking advantage of the fact that before the ldc
 * framework invokes a callback it sets a flag to indicate that there is a
 * callback active (or about to become active). If we attempt to
 * unregister a callback while this active flag is set, the unregister
 * will fail with EWOULDBLOCK.
 *
 * If the unregister fails we do a cv_timedwait. We will either be signaled
 * by the callback as it is exiting (note we have to wait a short period to
 * allow the callback to return fully to the ldc framework and it to clear
 * the active flag), or by the timer expiring. In either case we again attempt
 * the unregister. We repeat this until we can successfully unregister the
 * callback.
 *
 * The reason we use a cv_timedwait rather than a simple cv_wait is to catch
 * the case where the callback has finished but the ldc framework has not yet
 * cleared the active flag. In this case we would never get a cv_signal.
 */
static void
vsw_ldc_drain(vsw_ldc_t *ldcp)
{
	vsw_t	*vswp = ldcp->ldc_port->p_vswp;

	D1(vswp, "%s: enter", __func__);

	/*
	 * If we can unregister the channel callback then we
	 * know that there is no callback either running or
	 * scheduled to run for this channel so move on to next
	 * channel in the list.
	 */
	mutex_enter(&ldcp->drain_cv_lock);

	/* prompt active callbacks to quit */
	ldcp->drain_state = VSW_LDC_DRAINING;

	if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) {
		D2(vswp, "%s: unreg callback for chan %ld", __func__,
		    ldcp->ldc_id);
		mutex_exit(&ldcp->drain_cv_lock);
	} else {
		/*
		 * If we end up here we know that either 1) a callback
		 * is currently executing, 2) is about to start (i.e.
		 * the ldc framework has set the active flag but
		 * has not actually invoked the callback yet, or 3)
		 * has finished and has returned to the ldc framework
		 * but the ldc framework has not yet cleared the
		 * active bit.
		 *
		 * Wait for it to finish.
		 */
		while (ldc_unreg_callback(ldcp->ldc_handle) == EWOULDBLOCK) {
			(void) cv_timedwait(&ldcp->drain_cv,
			    &ldcp->drain_cv_lock, ddi_get_lbolt() + hz);
		}

		mutex_exit(&ldcp->drain_cv_lock);
		D2(vswp, "%s: unreg callback for chan %ld after "
		    "timeout", __func__, ldcp->ldc_id);
	}

	D1(vswp, "%s: exit", __func__);
}

/*
 * Wait until all tasks which reference this port have completed.
 *
 * Prior to this function being invoked each channel under this port
 * should have been quiesced via ldc_set_cb_mode(DISABLE).
 */
static void
vsw_drain_port_taskq(vsw_port_t *port)
{
	vsw_t		*vswp = port->p_vswp;

	D1(vswp, "%s: enter", __func__);

	/*
	 * Mark the port as in the process of being detached, and
	 * dispatch a marker task to the queue so we know when all
	 * relevant tasks have completed.
	 */
	mutex_enter(&port->state_lock);
	port->state = VSW_PORT_DETACHING;

	if ((vswp->taskq_p == NULL) ||
	    (ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task,
	    port, DDI_NOSLEEP) != DDI_SUCCESS)) {
		cmn_err(CE_NOTE, "!vsw%d: unable to dispatch marker task",
		    vswp->instance);
		mutex_exit(&port->state_lock);
		return;
	}

	/*
	 * Wait for the marker task to finish.
	 */
	while (port->state != VSW_PORT_DETACHABLE)
		cv_wait(&port->state_cv, &port->state_lock);

	mutex_exit(&port->state_lock);

	D1(vswp, "%s: exit", __func__);
}

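/*
 * Marker task dispatched by vsw_drain_port_taskq(). When this task runs,
 * any tasks referencing the port that were dispatched before it are
 * assumed to have completed, so the port can be marked as safe to detach.
 */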
static void
vsw_marker_task(void *arg)
{
	vsw_port_t	*port = arg;
	vsw_t		*vswp = port->p_vswp;

	D1(vswp, "%s: enter", __func__);

	mutex_enter(&port->state_lock);

	/*
	 * No further tasks should be dispatched which reference
	 * this port so ok to mark it as safe to detach.
	 */
	port->state = VSW_PORT_DETACHABLE;

	cv_signal(&port->state_cv);

	mutex_exit(&port->state_lock);

	D1(vswp, "%s: exit", __func__);
}

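/*
 * Look up the port with the given instance number in this vsw instance's
 * port list. Returns a pointer to the port, or NULL if no match is found.
 * Callers are expected to hold the port list lock (plist->lockrw).
 */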
vsw_port_t *
vsw_lookup_port(vsw_t *vswp, int p_instance)
{
	vsw_port_list_t *plist = &vswp->plist;
	vsw_port_t	*port;

	for (port = plist->head; port != NULL; port = port->p_next) {
		if (port->p_instance == p_instance) {
			D2(vswp, "vsw_lookup_port: found p_instance\n");
			return (port);
		}
	}

	return (NULL);
}

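/*
 * Reset the channel of a port whose peer is VLAN unaware (ver < 1.3)
 * while VLAN ids are configured on the port.
 */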
void
vsw_vlan_unaware_port_reset(vsw_port_t *portp)
{
	vsw_ldc_t	*ldcp = portp->ldcp;

	mutex_enter(&ldcp->ldc_cblock);

	/*
	 * If the peer is VLAN unaware (ver < 1.3), reset the channel and
	 * terminate the connection. See comments in vsw_set_vnet_proto_ops().
	 */
	if (ldcp->hphase == VSW_MILESTONE4 && VSW_VER_LT(ldcp, 1, 3) &&
	    portp->nvids != 0) {
		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
	}

	mutex_exit(&ldcp->ldc_cblock);
}

void
vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate)
{
	vsw_ldc_t	*ldcp = portp->ldcp;

	mutex_enter(&ldcp->ldc_cblock);

	/*
	 * If the peer is HybridIO capable (ver >= 1.3), reset the channel
	 * to trigger re-negotiation, which in turn triggers HybridIO
	 * setup/cleanup.
	 */
	if ((ldcp->hphase == VSW_MILESTONE4) &&
	    (portp->p_hio_capable == B_TRUE)) {
		if (immediate == B_TRUE) {
			(void) ldc_down(ldcp->ldc_handle);
		} else {
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
		}
	}

	mutex_exit(&ldcp->ldc_cblock);
}

void
vsw_port_reset(vsw_port_t *portp)
{
	vsw_ldc_t	*ldcp = portp->ldcp;

	mutex_enter(&ldcp->ldc_cblock);

	/*
	 * reset channel and terminate the connection.
	 */
	vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);

	mutex_exit(&ldcp->ldc_cblock);
}

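/*
 * Reset all ports attached to this vsw instance, stopping HybridIO
 * first on any port where it is enabled.
 */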
void
vsw_reset_ports(vsw_t *vswp)
{
	vsw_port_list_t	*plist = &vswp->plist;
	vsw_port_t	*portp;

	READ_ENTER(&plist->lockrw);
	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
		if ((portp->p_hio_capable) && (portp->p_hio_enabled)) {
			vsw_hio_stop_port(portp);
		}
		vsw_port_reset(portp);
	}
	RW_EXIT(&plist->lockrw);
}

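/*
 * Send a physical link state update (VNET_PHYSLINK_INFO) message,
 * reflecting the given link state, to the peer on this channel.
 */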
static void
vsw_send_physlink_msg(vsw_ldc_t *ldcp, link_state_t plink_state)
{
	vnet_physlink_msg_t	msg;
	vnet_physlink_msg_t	*msgp = &msg;
	uint32_t		physlink_info = 0;

	if (plink_state == LINK_STATE_UP) {
		physlink_info |= VNET_PHYSLINK_STATE_UP;
	} else {
		physlink_info |= VNET_PHYSLINK_STATE_DOWN;
	}

	msgp->tag.vio_msgtype = VIO_TYPE_CTRL;
	msgp->tag.vio_subtype = VIO_SUBTYPE_INFO;
	msgp->tag.vio_subtype_env = VNET_PHYSLINK_INFO;
	msgp->tag.vio_sid = ldcp->local_session;
	msgp->physlink_info = physlink_info;

	(void) vsw_send_msg(ldcp, msgp, sizeof (msg), B_TRUE);
}

static void
vsw_port_physlink_update(vsw_port_t *portp)
{
	vsw_ldc_t	*ldcp;
	vsw_t		*vswp;

	vswp = portp->p_vswp;
	ldcp = portp->ldcp;

	mutex_enter(&ldcp->ldc_cblock);

	/*
	 * If handshake has completed successfully and if the vnet device
	 * has negotiated to get physical link state updates, send a message
	 * with the current state.
	 */
	if (ldcp->hphase == VSW_MILESTONE4 && ldcp->pls_negotiated == B_TRUE) {
		vsw_send_physlink_msg(ldcp, vswp->phys_link_state);
	}

	mutex_exit(&ldcp->ldc_cblock);
}

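/*
 * Update all ports with the current physical link state; each port
 * sends a message only if its peer has negotiated to receive physical
 * link state updates.
 */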
void
vsw_physlink_update_ports(vsw_t *vswp)
{
	vsw_port_list_t	*plist = &vswp->plist;
	vsw_port_t	*portp;

	READ_ENTER(&plist->lockrw);
	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
		vsw_port_physlink_update(portp);
	}
	RW_EXIT(&plist->lockrw);
}

/*
 * Search for and remove the specified port from the port
 * list. Returns 0 if able to locate and remove port, otherwise
 * returns 1.
 */
static int
vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port)
{
	vsw_port_list_t *plist = &vswp->plist;
	vsw_port_t	*curr_p, *prev_p;

	if (plist->head == NULL)
		return (1);

	curr_p = prev_p = plist->head;

	while (curr_p != NULL) {
		if (curr_p == port) {
			if (prev_p == curr_p) {
				plist->head = curr_p->p_next;
			} else {
				prev_p->p_next = curr_p->p_next;
			}
			plist->num_ports--;
			break;
		} else {
			prev_p = curr_p;
			curr_p = curr_p->p_next;
		}
	}
	return (0);
}

/*
 * Interrupt handler for ldc messages.
 */
static uint_t
vsw_ldc_cb(uint64_t event, caddr_t arg)
{
	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
	vsw_t 		*vswp = ldcp->ldc_vswp;

	D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);

	mutex_enter(&ldcp->ldc_cblock);
	ldcp->ldc_stats.callbacks++;

	mutex_enter(&ldcp->status_lock);
	if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) {
		mutex_exit(&ldcp->status_lock);
		mutex_exit(&ldcp->ldc_cblock);
		return (LDC_SUCCESS);
	}
	mutex_exit(&ldcp->status_lock);

	if (event & LDC_EVT_UP) {
		/*
		 * Channel has come up.
		 */
		D2(vswp, "%s: id(%ld) event(%llx) UP: status(%ld)",
		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);

		vsw_process_conn_evt(ldcp, VSW_CONN_UP);

		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
	}

	if (event & LDC_EVT_READ) {
		/*
		 * Data available for reading.
		 */
		D2(vswp, "%s: id(%ld) event(%llx) data READ",
		    __func__, ldcp->ldc_id, event);

		vsw_process_evt_read(ldcp);

		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);

		goto vsw_cb_exit;
	}

	if (event & (LDC_EVT_DOWN | LDC_EVT_RESET)) {
		D2(vswp, "%s: id(%ld) event (%lx) DOWN/RESET: status(%ld)",
		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);

		vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
	}

	/*
	 * Catch either LDC_EVT_WRITE which we don't support or any
	 * unknown event.
	 */
	if (event &
	    ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ)) {
		DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)",
		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
	}

vsw_cb_exit:
	mutex_exit(&ldcp->ldc_cblock);

	/*
	 * Let the drain function know we are finishing if it
	 * is waiting.
	 */
	mutex_enter(&ldcp->drain_cv_lock);
	if (ldcp->drain_state == VSW_LDC_DRAINING)
		cv_signal(&ldcp->drain_cv);
	mutex_exit(&ldcp->drain_cv_lock);

	return (LDC_SUCCESS);
}

/*
 * Reinitialise data structures associated with the channel.
 */
static void
vsw_ldc_reinit(vsw_ldc_t *ldcp)
{
	vsw_t		*vswp = ldcp->ldc_vswp;
	vsw_port_t	*port;

	D1(vswp, "%s: enter", __func__);

	port = ldcp->ldc_port;

	D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__,
	    ldcp->lane_in.lstate, ldcp->lane_out.lstate);

	vsw_free_lane_resources(ldcp, INBOUND);
	vsw_free_lane_resources(ldcp, OUTBOUND);

	ldcp->lane_in.lstate = 0;
	ldcp->lane_out.lstate = 0;

	/*
	 * Remove parent port from any multicast groups
	 * it may have registered with. Client must resend
	 * multicast add command after handshake completes.
	 */
	vsw_del_mcst_port(port);

	ldcp->peer_session = 0;
	ldcp->session_status = 0;
	ldcp->hcnt = 0;
	ldcp->hphase = VSW_MILESTONE0;

	vsw_reset_vnet_proto_ops(ldcp);

	D1(vswp, "%s: exit", __func__);
}

/*
 * Process a connection event.
 */
void
vsw_process_conn_evt(vsw_ldc_t *ldcp, uint16_t evt)
{
	vsw_t		*vswp = ldcp->ldc_vswp;
	vsw_conn_evt_t	*conn = NULL;

	D1(vswp, "%s: enter", __func__);

	/*
	 * Check if either a reset or restart event is pending
	 * or in progress. If so just return.
	 *
	 * A VSW_CONN_RESET event originates either with an LDC_RESET_EVT
	 * being received by the callback handler, or an ECONNRESET error
	 * code being returned from an ldc_read() or ldc_write() call.
	 *
	 * A VSW_CONN_RESTART event occurs when some error checking code
	 * decides that there is a problem with data from the channel,
	 * and that the handshake should be restarted.
	 */
	if (((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART)) &&
	    (ldstub((uint8_t *)&ldcp->reset_active)))
		return;

	/*
	 * If it is an LDC_UP event we first check the recorded
	 * state of the channel. If this is UP then we know that
	 * the channel moving to the UP state has already been dealt
	 * with and we don't need to dispatch a new task.
	 *
	 * The reason for this check is that when we do an ldc_up(),
	 * depending on the state of the peer, we may or may not get
	 * an LDC_UP event. As we can't depend on getting an LDC_UP evt
	 * every time we do ldc_up() we explicitly check the channel
	 * status to see if it has come up (ldc_up() is asynch and will
	 * complete at some undefined time), and take the appropriate
	 * action.
	 *
	 * The flip side of this is that we may get an LDC_UP event
	 * when we have already seen that the channel is up and have
	 * dealt with that.
	 */
	mutex_enter(&ldcp->status_lock);
	if (evt == VSW_CONN_UP) {
		if ((ldcp->ldc_status == LDC_UP) || (ldcp->reset_active != 0)) {
			mutex_exit(&ldcp->status_lock);
			return;
		}
	}
	mutex_exit(&ldcp->status_lock);

	/*
	 * The transaction group id allows us to identify and discard
	 * any tasks which are still pending on the taskq and refer
	 * to the handshake session we are about to restart or reset.
	 * These stale messages no longer have any real meaning.
	 */
	(void) atomic_inc_32(&ldcp->hss_id);

	ASSERT(vswp->taskq_p != NULL);

	if ((conn = kmem_zalloc(sizeof (vsw_conn_evt_t), KM_NOSLEEP)) == NULL) {
		cmn_err(CE_WARN, "!vsw%d: unable to allocate memory for"
		    " connection event", vswp->instance);
		goto err_exit;
	}

	conn->evt = evt;
	conn->ldcp = ldcp;

	if (ddi_taskq_dispatch(vswp->taskq_p, vsw_conn_task, conn,
	    DDI_NOSLEEP) != DDI_SUCCESS) {
		cmn_err(CE_WARN, "!vsw%d: Can't dispatch connection task",
		    vswp->instance);

		kmem_free(conn, sizeof (vsw_conn_evt_t));
		goto err_exit;
	}

	D1(vswp, "%s: exit", __func__);
	return;

err_exit:
	/*
	 * We have most likely failed due to a memory shortage. Clear the flag
	 * so
	 * that future requests will at least be attempted and will hopefully
	 * succeed.
	 */
	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
		ldcp->reset_active = 0;
}

/*
 * Deal with events relating to a connection. Invoked from a taskq.
 */
static void
vsw_conn_task(void *arg)
{
	vsw_conn_evt_t	*conn = (vsw_conn_evt_t *)arg;
	vsw_ldc_t	*ldcp = NULL;
	vsw_port_t	*portp;
	vsw_t		*vswp = NULL;
	uint16_t	evt;
	ldc_status_t	curr_status;

	ldcp = conn->ldcp;
	evt = conn->evt;
	vswp = ldcp->ldc_vswp;
	portp = ldcp->ldc_port;

	D1(vswp, "%s: enter", __func__);

	/* can safely free now have copied out data */
	kmem_free(conn, sizeof (vsw_conn_evt_t));

	if (ldcp->rcv_thread != NULL) {
		vsw_stop_rcv_thread(ldcp);
	} else if (ldcp->msg_thread != NULL) {
		vsw_stop_msg_thread(ldcp);
	}

	mutex_enter(&ldcp->status_lock);
	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
		    "channel %ld", vswp->instance, ldcp->ldc_id);
		mutex_exit(&ldcp->status_lock);
		return;
	}

	/*
	 * If we wish to restart the handshake on this channel, then if
	 * the channel is UP we bring it DOWN to flush the underlying
	 * ldc queue.
	 */
	if ((evt == VSW_CONN_RESTART) && (curr_status == LDC_UP))
		(void) ldc_down(ldcp->ldc_handle);

	if ((portp->p_hio_capable) && (portp->p_hio_enabled)) {
		vsw_hio_stop(vswp, ldcp);
	}

	/*
	 * re-init all the associated data structures.
	 */
	vsw_ldc_reinit(ldcp);

	/*
	 * Bring the channel back up (note it does no harm to
	 * do this even if the channel is already UP; it just
	 * becomes effectively a no-op).
	 */
	(void) ldc_up(ldcp->ldc_handle);

	/*
	 * Check if channel is now UP. This will only happen if
	 * peer has also done a ldc_up().
	 */
	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
		    "channel %ld", vswp->instance, ldcp->ldc_id);
		mutex_exit(&ldcp->status_lock);
		return;
	}

	ldcp->ldc_status = curr_status;

	/* channel UP so restart handshake by sending version info */
	if (curr_status == LDC_UP) {
		if (ldcp->hcnt++ > vsw_num_handshakes) {
			cmn_err(CE_WARN, "!vsw%d: exceeded number of permitted"
			    " handshake attempts (%d) on channel %ld",
			    vswp->instance, ldcp->hcnt, ldcp->ldc_id);
			mutex_exit(&ldcp->status_lock);
			return;
		}

		if (vsw_obp_ver_proto_workaround == B_FALSE &&
		    (ddi_taskq_dispatch(vswp->taskq_p, vsw_send_ver, ldcp,
		    DDI_NOSLEEP) != DDI_SUCCESS)) {
			cmn_err(CE_WARN, "!vsw%d: Can't dispatch version task",
			    vswp->instance);

			/*
			 * Don't count as valid restart attempt if couldn't
			 * send version msg.
			 */
			if (ldcp->hcnt > 0)
				ldcp->hcnt--;
		}
	}

	/*
	 * Mark that the process is complete by clearing the flag.
	 *
	 * Note it is possible that the taskq dispatch above may have failed,
	 * most likely due to a memory shortage. We still clear the flag so
	 * that future requests will at least be attempted and will hopefully
	 * succeed.
	 */
	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
		ldcp->reset_active = 0;

	mutex_exit(&ldcp->status_lock);

	D1(vswp, "%s: exit", __func__);
}

/*
 * Returns 0 if it was legal for the event signified by the flag to have
 * occurred at the time it did. Otherwise returns 1.
 */
int
vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag)
{
	vsw_t		*vswp = ldcp->ldc_vswp;
	uint64_t	state;
	uint64_t	phase;

	if (dir == INBOUND)
		state = ldcp->lane_in.lstate;
	else
		state = ldcp->lane_out.lstate;

	phase = ldcp->hphase;

	switch (flag) {
	case VSW_VER_INFO_RECV:
		if (phase > VSW_MILESTONE0) {
			DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV"
			    " when in state %d\n", ldcp->ldc_id, phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		}
		break;

	case VSW_VER_ACK_RECV:
	case VSW_VER_NACK_RECV:
		if (!(state & VSW_VER_INFO_SENT)) {
			DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK or "
			    "VER_NACK when in state %d\n", ldcp->ldc_id, phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		} else
			state &= ~VSW_VER_INFO_SENT;
		break;

	case VSW_ATTR_INFO_RECV:
		if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) {
			DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV"
			    " when in state %d\n", ldcp->ldc_id, phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		}
		break;

	case VSW_ATTR_ACK_RECV:
	case VSW_ATTR_NACK_RECV:
		if (!(state & VSW_ATTR_INFO_SENT)) {
			DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK"
			    " or ATTR_NACK when in state %d\n",
			    ldcp->ldc_id, phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		} else
			state &= ~VSW_ATTR_INFO_SENT;
		break;

	case VSW_DRING_INFO_RECV:
		if (phase < VSW_MILESTONE1) {
			DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV"
			    " when in state %d\n", ldcp->ldc_id, phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		}
		break;

	case VSW_DRING_ACK_RECV:
	case VSW_DRING_NACK_RECV:
		if (!(state & VSW_DRING_INFO_SENT)) {
			DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK "
			    " or DRING_NACK when in state %d\n",
			    ldcp->ldc_id, phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		} else
			state &= ~VSW_DRING_INFO_SENT;
		break;

	case VSW_RDX_INFO_RECV:
		if (phase < VSW_MILESTONE3) {
			DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV"
			    " when in state %d\n", ldcp->ldc_id, phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		}
		break;

	case VSW_RDX_ACK_RECV:
	case VSW_RDX_NACK_RECV:
		if (!(state & VSW_RDX_INFO_SENT)) {
			DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK or "
			    "RDX_NACK when in state %d\n", ldcp->ldc_id, phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		} else
			state &= ~VSW_RDX_INFO_SENT;
		break;

	case VSW_MCST_INFO_RECV:
		if (phase < VSW_MILESTONE3) {
			DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV"
			    " when in state %d\n", ldcp->ldc_id, phase);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return (1);
		}
		break;

	default:
		DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)",
		    ldcp->ldc_id, flag);
		return (1);
	}

	if (dir == INBOUND)
		ldcp->lane_in.lstate = state;
	else
		ldcp->lane_out.lstate = state;

	D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id);

	return (0);
}

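/*
 * Drive the handshake state machine for the channel: if the conditions
 * for the current milestone have been met, advance to the next phase
 * and send the message(s) that start it.
 */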
void
vsw_next_milestone(vsw_ldc_t *ldcp)
{
	vsw_t		*vswp = ldcp->ldc_vswp;
	vsw_port_t	*portp = ldcp->ldc_port;
	lane_t		*lane_out = &ldcp->lane_out;
	lane_t		*lane_in = &ldcp->lane_in;

	D1(vswp, "%s (chan %lld): enter (phase %ld)", __func__,
	    ldcp->ldc_id, ldcp->hphase);

	DUMP_FLAGS(lane_in->lstate);
	DUMP_FLAGS(lane_out->lstate);

	switch (ldcp->hphase) {

	case VSW_MILESTONE0:
		/*
		 * If we haven't started to handshake with our peer,
		 * start to do so now.
		 */
		if (lane_out->lstate == 0) {
			D2(vswp, "%s: (chan %lld) starting handshake "
			    "with peer", __func__, ldcp->ldc_id);
			vsw_process_conn_evt(ldcp, VSW_CONN_UP);
		}

		/*
		 * Only way to pass this milestone is to have successfully
		 * negotiated version info.
		 */
		if ((lane_in->lstate & VSW_VER_ACK_SENT) &&
		    (lane_out->lstate & VSW_VER_ACK_RECV)) {

			D2(vswp, "%s: (chan %lld) leaving milestone 0",
			    __func__, ldcp->ldc_id);

			vsw_set_vnet_proto_ops(ldcp);

			/*
			 * Next milestone is passed when attribute
			 * information has been successfully exchanged.
			 */
			ldcp->hphase = VSW_MILESTONE1;
			vsw_send_attr(ldcp);

		}
		break;

	case VSW_MILESTONE1:
		/*
		 * Only way to pass this milestone is to have successfully
		 * negotiated attribute information, in both directions.
		 */
		if (!((lane_in->lstate & VSW_ATTR_ACK_SENT) &&
		    (lane_out->lstate & VSW_ATTR_ACK_RECV))) {
			break;
		}

		ldcp->hphase = VSW_MILESTONE2;

		/*
		 * If the peer device has said it wishes to
		 * use descriptor rings then we send it our ring
		 * info, otherwise we just set up a private ring
		 * which uses an internal buffer.
		 */
		if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
		    (lane_in->xfer_mode & VIO_DRING_MODE_V1_2)) ||
		    (VSW_VER_LT(ldcp, 1, 2) &&
		    (lane_in->xfer_mode == VIO_DRING_MODE_V1_0))) {
			vsw_send_dring_info(ldcp);
			break;
		}

		/*
		 * The peer doesn't operate in dring mode; we
		 * can simply fallthru to the RDX phase from
		 * here.
		 */
		/*FALLTHRU*/

	case VSW_MILESTONE2:
		/*
		 * If peer has indicated in its attribute message that
		 * it wishes to use descriptor rings then the only way
		 * to pass this milestone is for us to have received
		 * valid dring info.
		 *
		 * If peer is not using descriptor rings then just fall
		 * through.
		 */
		if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
		    (lane_in->xfer_mode & VIO_DRING_MODE_V1_2)) ||
		    (VSW_VER_LT(ldcp, 1, 2) &&
		    (lane_in->xfer_mode ==
		    VIO_DRING_MODE_V1_0))) {
			if (!(lane_in->lstate & VSW_DRING_ACK_SENT))
				break;
		}

		D2(vswp, "%s: (chan %lld) leaving milestone 2",
		    __func__, ldcp->ldc_id);

		ldcp->hphase = VSW_MILESTONE3;
		vsw_send_rdx(ldcp);
		break;

	case VSW_MILESTONE3:
		/*
		 * Pass this milestone when all parameters have been
		 * successfully exchanged and RDX sent in both directions.
		 *
		 * Mark the relevant lane as available to transmit data. In
		 * RxDringData mode, lane_in is associated with transmit and
		 * lane_out is associated with receive. It is the reverse in
		 * TxDring mode.
		 */
		if ((lane_out->lstate & VSW_RDX_ACK_SENT) &&
		    (lane_in->lstate & VSW_RDX_ACK_RECV)) {

			D2(vswp, "%s: (chan %lld) leaving milestone 3",
			    __func__, ldcp->ldc_id);
			D2(vswp, "%s: ** handshake complete (0x%llx : "
			    "0x%llx) **", __func__, lane_in->lstate,
			    lane_out->lstate);
			if (lane_out->dring_mode == VIO_RX_DRING_DATA) {
				lane_in->lstate |= VSW_LANE_ACTIVE;
			} else {
				lane_out->lstate |= VSW_LANE_ACTIVE;
			}
			ldcp->hphase = VSW_MILESTONE4;
			ldcp->hcnt = 0;
			DISPLAY_STATE();
			/* Start HIO if enabled and capable */
			if ((portp->p_hio_enabled) && (portp->p_hio_capable)) {
				D2(vswp, "%s: start HybridIO setup", __func__);
				vsw_hio_start(vswp, ldcp);
			}

			if (ldcp->pls_negotiated == B_TRUE) {
				/*
				 * The vnet device has negotiated to get phys
				 * link updates. Now that the handshake with
				 * the vnet device is complete, send an initial
				 * update with the current physical link state.
				 */
				vsw_send_physlink_msg(ldcp,
				    vswp->phys_link_state);
			}

		} else {
			D2(vswp, "%s: still in milestone 3 (0x%llx : 0x%llx)",
			    __func__, lane_in->lstate,
			    lane_out->lstate);
		}
		break;

	case VSW_MILESTONE4:
		D2(vswp, "%s: (chan %lld) in milestone 4", __func__,
		    ldcp->ldc_id);
		break;

	default:
		DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__,
		    ldcp->ldc_id, ldcp->hphase);
	}

	D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id,
	    ldcp->hphase);
}

/*
 * Check if major version is supported.
 *
 * Returns 0 if it finds a supported major number, adjusting the minor
 * field if necessary.
 *
 * Returns 1 if it can't match the major number exactly. Sets major/minor
 * to the next lowest supported values, or to zero if no other values are
 * possible.
 */
static int
vsw_supported_version(vio_ver_msg_t *vp)
{
	int	i;

	D1(NULL, "vsw_supported_version: enter");

	for (i = 0; i < VSW_NUM_VER; i++) {
		if (vsw_versions[i].ver_major == vp->ver_major) {
			/*
			 * Matching or lower major version found. Update
			 * minor number if necessary.
			 */
			if (vp->ver_minor > vsw_versions[i].ver_minor) {
				D2(NULL, "%s: adjusting minor value from %d "
				    "to %d", __func__, vp->ver_minor,
				    vsw_versions[i].ver_minor);
				vp->ver_minor = vsw_versions[i].ver_minor;
			}

			return (0);
		}

		/*
		 * If the message contains a higher major version number, set
		 * the message's major/minor versions to the current values
		 * and return false, so this message will get resent with
		 * these values.
		 */
		if (vsw_versions[i].ver_major < vp->ver_major) {
			D2(NULL, "%s: adjusting major and minor "
			    "values to %d, %d\n",
			    __func__, vsw_versions[i].ver_major,
			    vsw_versions[i].ver_minor);
			vp->ver_major = vsw_versions[i].ver_major;
			vp->ver_minor = vsw_versions[i].ver_minor;
			return (1);
		}
	}

	/* No match was possible, zero out fields */
	vp->ver_major = 0;
	vp->ver_minor = 0;

	D1(NULL, "vsw_supported_version: exit");

	return (1);
}

/*
 * Set vnet-protocol-version dependent functions based on version.
 */
static void
vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp)
{
	vsw_t	*vswp = ldcp->ldc_vswp;
	lane_t	*lp = &ldcp->lane_out;

	/*
	 * Setup the appropriate dring data processing routine and any
	 * associated thread based on the version.
	 *
	 * In versions < 1.6, we support only TxDring mode. In this mode, the
	 * msg worker thread processes all types of VIO msgs (ctrl and data).
	 *
	 * In versions >= 1.6, we also support RxDringData mode. In this mode,
	 * the rcv worker thread processes dring data messages (msgtype:
	 * VIO_TYPE_DATA, subtype: VIO_SUBTYPE_INFO, env: VIO_DRING_DATA). The
	 * rest of the data messages (including acks) and ctrl messages are
	 * handled directly by the callback (intr) thread.
	 *
	 * However, for versions >= 1.6, we could still fallback to TxDring
	 * mode. This could happen if RxDringData mode has been disabled (see
	 * below) on this guest or on the peer guest. This info is determined
	 * as part of attr exchange phase of handshake. Hence, we setup these
	 * pointers for v1.6 after attr msg phase completes during handshake.
	 */
	if (VSW_VER_GTEQ(ldcp, 1, 6)) {
		/*
		 * Set data dring mode for vsw_send_attr(). We setup msg worker
		 * thread in TxDring mode or rcv worker thread in RxDringData
		 * mode when attr phase of handshake completes.
		 */
		if (vsw_mapin_avail(ldcp) == B_TRUE) {
			lp->dring_mode = (VIO_RX_DRING_DATA | VIO_TX_DRING);
		} else {
			lp->dring_mode = VIO_TX_DRING;
		}
	} else {
		lp->dring_mode = VIO_TX_DRING;
	}

	/*
	 * Setup the MTU for attribute negotiation based on the version.
	 */
	if (VSW_VER_GTEQ(ldcp, 1, 4)) {
		/*
		 * If the version negotiated with the peer is >= 1.4 (Jumbo
		 * Frame Support), set the mtu in our attributes to
		 * max_frame_size.
		 */
		lp->mtu = vswp->max_frame_size;
	} else if (VSW_VER_EQ(ldcp, 1, 3)) {
		/*
		 * If the version negotiated with the peer is == 1.3 (Vlan Tag
		 * Support), set the attr.mtu to ETHERMAX + VLAN_TAGSZ.
		 */
		lp->mtu = ETHERMAX + VLAN_TAGSZ;
	} else {
		vsw_port_t	*portp = ldcp->ldc_port;
		/*
		 * Pre-1.3 peers expect a max frame size of ETHERMAX.
		 * We can negotiate that size with those peers provided only
		 * a pvid is defined for our peer and there are no vids. Then
		 * we can send/recv only untagged frames of max size ETHERMAX.
		 * Note that the pvid of the peer can be different, as vsw has
		 * to serve the vnet in that vlan even if it is not itself
		 * assigned to that vlan.
		 */
		if (portp->nvids == 0) {
			lp->mtu = ETHERMAX;
		}
	}

	/*
	 * Setup version dependent data processing functions.
	 */
	if (VSW_VER_GTEQ(ldcp, 1, 2)) {
		/* Versions >= 1.2 */

		if (VSW_PRI_ETH_DEFINED(vswp)) {
			/*
			 * enable priority routines and pkt mode only if
			 * at least one pri-eth-type is specified in MD.
			 */
			ldcp->tx = vsw_ldctx_pri;
			ldcp->rx_pktdata = vsw_process_pkt_data;

			/* set xfer mode for vsw_send_attr() */
			lp->xfer_mode = VIO_PKT_MODE | VIO_DRING_MODE_V1_2;
		} else {
			/* no priority eth types defined in MD */

			ldcp->tx = vsw_ldctx;
			ldcp->rx_pktdata = vsw_process_pkt_data_nop;

			/* set xfer mode for vsw_send_attr() */
			lp->xfer_mode = VIO_DRING_MODE_V1_2;
		}

	} else {
		/* Versions prior to 1.2  */

		vsw_reset_vnet_proto_ops(ldcp);
	}
}

/*
 * Reset vnet-protocol-version dependent functions to v1.0.
 */
static void
vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp)
{
	lane_t	*lp = &ldcp->lane_out;

	ldcp->tx = vsw_ldctx;
	ldcp->rx_pktdata = vsw_process_pkt_data_nop;

	/* set xfer mode for vsw_send_attr() */
	lp->xfer_mode = VIO_DRING_MODE_V1_0;
}

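/*
 * Handle an LDC_EVT_READ event. In TxDring mode the message worker
 * thread is woken to process the incoming VIO messages; otherwise the
 * messages are processed directly in the callback context.
 */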
static void
vsw_process_evt_read(vsw_ldc_t *ldcp)
{
	if (ldcp->msg_thread != NULL) {
		/*
		 * TxDring mode; wakeup message worker
		 * thread to process the VIO messages.
		 */
		mutex_exit(&ldcp->ldc_cblock);
		mutex_enter(&ldcp->msg_thr_lock);
		if (!(ldcp->msg_thr_flags & VSW_WTHR_DATARCVD)) {
			ldcp->msg_thr_flags |= VSW_WTHR_DATARCVD;
			cv_signal(&ldcp->msg_thr_cv);
		}
		mutex_exit(&ldcp->msg_thr_lock);
		mutex_enter(&ldcp->ldc_cblock);
	} else {
		/*
		 * We invoke vsw_process_pkt() in the context of the LDC
		 * callback (vsw_ldc_cb()) during handshake, until the dring
		 * mode is negotiated. After the dring mode is negotiated, the
		 * msgs are processed by the msg worker thread (above case) if
		 * the dring mode is TxDring. Otherwise (in RxDringData mode)
		 * we continue to process the msgs directly in the callback
		 * context.
		 */
		vsw_process_pkt(ldcp);
	}
}
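
/*
 * Note: the msg worker thread referenced above is created in
 * vsw_send_dring_info() once TxDring mode has been negotiated; until
 * then, and always in RxDringData mode, messages are handled inline in
 * the callback context via vsw_process_pkt().
 */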

/*
 * Main routine for processing messages received over LDC.
 */
void
vsw_process_pkt(void *arg)
{
	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
	vsw_t 		*vswp = ldcp->ldc_vswp;
	size_t		msglen;
	vio_msg_tag_t	*tagp;
	uint64_t	*ldcmsg;
	int 		rv = 0;


	D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);

	ASSERT(MUTEX_HELD(&ldcp->ldc_cblock));

	ldcmsg = ldcp->ldcmsg;
	/*
	 * If channel is up read messages until channel is empty.
	 */
	do {
		msglen = ldcp->msglen;
		rv = ldc_read(ldcp->ldc_handle, (caddr_t)ldcmsg, &msglen);

		if (rv != 0) {
			DERR(vswp, "%s :ldc_read err id(%lld) rv(%d) len(%d)\n",
			    __func__, ldcp->ldc_id, rv, msglen);
		}

		/* channel has been reset */
		if (rv == ECONNRESET) {
			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
			break;
		}

		if (msglen == 0) {
			D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__,
			    ldcp->ldc_id);
			break;
		}

		D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__,
		    ldcp->ldc_id, msglen);

		/*
		 * Figure out what sort of packet we have gotten by
		 * examining the msg tag, and then switch it appropriately.
		 */
		tagp = (vio_msg_tag_t *)ldcmsg;

		switch (tagp->vio_msgtype) {
		case VIO_TYPE_CTRL:
			vsw_dispatch_ctrl_task(ldcp, ldcmsg, tagp, msglen);
			break;
		case VIO_TYPE_DATA:
			vsw_process_data_pkt(ldcp, ldcmsg, tagp, msglen);
			break;
		case VIO_TYPE_ERR:
			vsw_process_err_pkt(ldcp, ldcmsg, tagp);
			break;
		default:
			DERR(vswp, "%s: Unknown tag(%lx) ", __func__,
			    "id(%lx)\n", tagp->vio_msgtype, ldcp->ldc_id);
			break;
		}
	} while (msglen);

	D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id);
}

/*
 * Dispatch a task to process a VIO control message.
 */
static void
vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t *tagp,
	int msglen)
{
	vsw_ctrl_task_t		*ctaskp = NULL;
	vsw_port_t		*port = ldcp->ldc_port;
	vsw_t			*vswp = port->p_vswp;

	D1(vswp, "%s: enter", __func__);

	/*
	 * We need to handle RDX ACK messages in-band as once they
	 * are exchanged it is possible that we will get an
	 * immediate (legitimate) data packet.
	 */
	if ((tagp->vio_subtype_env == VIO_RDX) &&
	    (tagp->vio_subtype == VIO_SUBTYPE_ACK)) {

		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_ACK_RECV))
			return;

		ldcp->lane_in.lstate |= VSW_RDX_ACK_RECV;
		D2(vswp, "%s (%ld) handling RDX_ACK in place "
		    "(ostate 0x%llx : hphase %d)", __func__,
		    ldcp->ldc_id, ldcp->lane_in.lstate, ldcp->hphase);
		vsw_next_milestone(ldcp);
		return;
	}

	ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP);

	if (ctaskp == NULL) {
		DERR(vswp, "%s: unable to alloc space for ctrl msg", __func__);
		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
		return;
	}

	ctaskp->ldcp = ldcp;
	bcopy((def_msg_t *)cpkt, &ctaskp->pktp, msglen);
	ctaskp->hss_id = ldcp->hss_id;

	/*
	 * Dispatch task to processing taskq if port is not in
	 * the process of being detached.
	 */
	mutex_enter(&port->state_lock);
	if (port->state == VSW_PORT_INIT) {
		if ((vswp->taskq_p == NULL) ||
		    (ddi_taskq_dispatch(vswp->taskq_p, vsw_process_ctrl_pkt,
		    ctaskp, DDI_NOSLEEP) != DDI_SUCCESS)) {
			mutex_exit(&port->state_lock);
			DERR(vswp, "%s: unable to dispatch task to taskq",
			    __func__);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
			return;
		}
	} else {
		kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
		DWARN(vswp, "%s: port %d detaching, not dispatching "
		    "task", __func__, port->p_instance);
	}

	mutex_exit(&port->state_lock);

	D2(vswp, "%s: dispatched task to taskq for chan %d", __func__,
	    ldcp->ldc_id);
	D1(vswp, "%s: exit", __func__);
}

/*
 * Process a VIO ctrl message. Invoked from taskq.
 */
static void
vsw_process_ctrl_pkt(void *arg)
{
	vsw_ctrl_task_t	*ctaskp = (vsw_ctrl_task_t *)arg;
	vsw_ldc_t	*ldcp = ctaskp->ldcp;
	vsw_t 		*vswp = ldcp->ldc_vswp;
	vio_msg_tag_t	tag;
	uint16_t	env;

	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

	bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t));
	env = tag.vio_subtype_env;

	/* stale pkt check */
	if (ctaskp->hss_id < ldcp->hss_id) {
		DWARN(vswp, "%s: discarding stale packet belonging to earlier"
		    " (%ld) handshake session", __func__, ctaskp->hss_id);
		kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
		return;
	}

	/* session id check */
	if (ldcp->session_status & VSW_PEER_SESSION) {
		if (ldcp->peer_session != tag.vio_sid) {
			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
			    __func__, ldcp->ldc_id, tag.vio_sid);
			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return;
		}
	}

	/*
	 * Switch on the vio_subtype envelope, then let lower routines
	 * decide if it's an INFO, ACK or NACK packet.
	 */
	switch (env) {
	case VIO_VER_INFO:
		vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp);
		break;
	case VIO_DRING_REG:
		vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp);
		break;
	case VIO_DRING_UNREG:
		vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp);
		break;
	case VIO_ATTR_INFO:
		vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp);
		break;
	case VNET_MCAST_INFO:
		vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp);
		break;
	case VIO_RDX:
		vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp);
		break;
	case VIO_DDS_INFO:
		vsw_process_dds_msg(vswp, ldcp, &ctaskp->pktp);
		break;

	case VNET_PHYSLINK_INFO:
		vsw_process_physlink_msg(ldcp, &ctaskp->pktp);
		break;
	default:
		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env);
	}

	kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
}

/*
 * Version negotiation. We can end up here either because our peer
 * has responded to a handshake message we have sent it, or our peer
 * has initiated a handshake with us. If it's the former then it can only
 * be an ACK or NACK; if it's the latter it can only be an INFO.
 *
 * If it's an ACK we move to the next stage of the handshake, namely
 * attribute exchange. If it's a NACK we see if we can specify another
 * version; if we can't we stop.
 *
 * If it is an INFO we reset all params associated with communication
 * in that direction over this channel (remember connection is
 * essentially 2 independent simplex channels).
 */
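
/*
 * An illustrative example of the exchange (version numbers are made up):
 * if we send an INFO offering 1.6 and the peer NACKs suggesting 1.4, then
 * vsw_supported_version() either accepts 1.4 (or a lower pairing we do
 * support) and we resend an INFO with the updated values, or it zeroes
 * the major/minor numbers and the negotiation stops.
 */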
void
vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt)
{
	vio_ver_msg_t	*ver_pkt;
	vsw_t 		*vswp = ldcp->ldc_vswp;

	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

	/*
	 * We know this is a ctrl/version packet so
	 * cast it into the correct structure.
	 */
	ver_pkt = (vio_ver_msg_t *)pkt;

	switch (ver_pkt->tag.vio_subtype) {
	case VIO_SUBTYPE_INFO:
		D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n");

		/*
		 * Record the session id, which we will use from now
		 * until we see another VER_INFO msg. Even then the
		 * session id in most cases will be unchanged, except
		 * if the channel was reset.
		 */
		if ((ldcp->session_status & VSW_PEER_SESSION) &&
		    (ldcp->peer_session != ver_pkt->tag.vio_sid)) {
			DERR(vswp, "%s: updating session id for chan %lld "
			    "from %llx to %llx", __func__, ldcp->ldc_id,
			    ldcp->peer_session, ver_pkt->tag.vio_sid);
		}

		ldcp->peer_session = ver_pkt->tag.vio_sid;
		ldcp->session_status |= VSW_PEER_SESSION;

		/* Legal message at this time ? */
		if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV))
			return;

		/*
		 * First check the device class. Currently only expect
		 * to be talking to a network device. In the future may
		 * also talk to another switch.
		 */
		if (ver_pkt->dev_class != VDEV_NETWORK) {
			DERR(vswp, "%s: illegal device class %d", __func__,
			    ver_pkt->dev_class);

			ver_pkt->tag.vio_sid = ldcp->local_session;
			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;

			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);

			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
			    sizeof (vio_ver_msg_t), B_TRUE);

			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
			vsw_next_milestone(ldcp);
			return;
		} else {
			ldcp->dev_class = ver_pkt->dev_class;
		}

		/*
		 * Now check the version.
		 */
		if (vsw_supported_version(ver_pkt) == 0) {
			/*
			 * Support this major version and possibly
			 * adjusted minor version.
			 */

			D2(vswp, "%s: accepted ver %d:%d", __func__,
			    ver_pkt->ver_major, ver_pkt->ver_minor);

			/* Store accepted values */
			ldcp->lane_in.ver_major = ver_pkt->ver_major;
			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;

			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;

			ldcp->lane_in.lstate |= VSW_VER_ACK_SENT;

			if (vsw_obp_ver_proto_workaround == B_TRUE) {
				/*
				 * Send a version info message
				 * using the accepted version that
				 * we are about to ack. Also note that
				 * we send our ver info before we ack.
				 * Otherwise, as soon as receiving the
				 * ack, obp sends attr info msg, which
				 * breaks vsw_check_flag() invoked
				 * from vsw_process_ctrl_attr_pkt();
				 * as we also need VSW_VER_ACK_RECV to
				 * be set in lane_out.lstate, before
				 * we can receive attr info.
				 */
				vsw_send_ver(ldcp);
			}
		} else {
			/*
			 * NACK back with the next lower major/minor
			 * pairing we support (if we don't support any more
			 * versions then they will be set to zero).
			 */

			D2(vswp, "%s: replying with ver %d:%d", __func__,
			    ver_pkt->ver_major, ver_pkt->ver_minor);

			/* Store updated values */
			ldcp->lane_in.ver_major = ver_pkt->ver_major;
			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;

			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;

			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
		}

		DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
		ver_pkt->tag.vio_sid = ldcp->local_session;
		(void) vsw_send_msg(ldcp, (void *)ver_pkt,
		    sizeof (vio_ver_msg_t), B_TRUE);

		vsw_next_milestone(ldcp);
		break;

	case VIO_SUBTYPE_ACK:
		D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__);

		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV))
			return;

		/* Store updated values */
		ldcp->lane_out.ver_major = ver_pkt->ver_major;
		ldcp->lane_out.ver_minor = ver_pkt->ver_minor;

		ldcp->lane_out.lstate |= VSW_VER_ACK_RECV;
		vsw_next_milestone(ldcp);

		break;

	case VIO_SUBTYPE_NACK:
		D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__);

		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV))
			return;

		/*
		 * If our peer sent us a NACK with the ver fields set to
		 * zero then there is nothing more we can do. Otherwise see
		 * if we support either the version suggested, or a lesser
		 * one.
		 */
		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
			DERR(vswp, "%s: peer unable to negotiate any "
			    "further.", __func__);
			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
			vsw_next_milestone(ldcp);
			return;
		}

		/*
		 * Check to see if we support this major version or
		 * a lower one. If we don't then maj/min will be set
		 * to zero.
		 */
		(void) vsw_supported_version(ver_pkt);
		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
			/* Nothing more we can do */
			DERR(vswp, "%s: version negotiation failed.\n",
			    __func__);
			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
			vsw_next_milestone(ldcp);
		} else {
			/* found a supported major version */
			ldcp->lane_out.ver_major = ver_pkt->ver_major;
			ldcp->lane_out.ver_minor = ver_pkt->ver_minor;

			D2(vswp, "%s: resending with updated values (%x, %x)",
			    __func__, ver_pkt->ver_major, ver_pkt->ver_minor);

			ldcp->lane_out.lstate |= VSW_VER_INFO_SENT;
			ver_pkt->tag.vio_sid = ldcp->local_session;
			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;

			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);

			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
			    sizeof (vio_ver_msg_t), B_TRUE);

			vsw_next_milestone(ldcp);

		}
		break;

	default:
		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
		    ver_pkt->tag.vio_subtype);
	}

	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
}

static int
vsw_process_attr_info(vsw_ldc_t *ldcp, vnet_attr_msg_t *msg)
{
	vsw_t			*vswp = ldcp->ldc_vswp;
	vsw_port_t		*port = ldcp->ldc_port;
	struct ether_addr	ea;
	uint64_t		macaddr = 0;
	lane_t			*lane_out = &ldcp->lane_out;
	lane_t			*lane_in = &ldcp->lane_in;
	uint32_t		mtu;
	int			i;
	uint8_t			dring_mode;

	D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);

	if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV)) {
		return (1);
	}

	if ((msg->xfer_mode != VIO_DESC_MODE) &&
	    (msg->xfer_mode != lane_out->xfer_mode)) {
		D2(NULL, "%s: unknown mode %x\n", __func__, msg->xfer_mode);
		return (1);
	}

	/* Only support MAC addresses at moment. */
	if ((msg->addr_type != ADDR_TYPE_MAC) || (msg->addr == 0)) {
		D2(NULL, "%s: invalid addr_type %x, or address 0x%llx\n",
		    __func__, msg->addr_type, msg->addr);
		return (1);
	}

	/*
	 * MAC address supplied by device should match that stored
	 * in the vsw-port OBP node. Need to decide what to do if they
	 * don't match, for the moment just warn but don't fail.
	 */
	vnet_macaddr_ultostr(msg->addr, ea.ether_addr_octet);
	if (ether_cmp(&ea, &port->p_macaddr) != 0) {
		DERR(NULL, "%s: device supplied address "
		    "0x%llx doesn't match node address 0x%llx\n",
		    __func__, msg->addr, port->p_macaddr);
	}

	/*
	 * Ack freq only makes sense in pkt mode; in shared
	 * mode the ring descriptors say whether or not to
	 * send back an ACK.
	 */
	if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
	    (msg->xfer_mode & VIO_DRING_MODE_V1_2)) ||
	    (VSW_VER_LT(ldcp, 1, 2) &&
	    (msg->xfer_mode == VIO_DRING_MODE_V1_0))) {
		if (msg->ack_freq > 0) {
			D2(NULL, "%s: non zero ack freq in SHM mode\n",
			    __func__);
			return (1);
		}
	}

	/*
	 * Process dring mode attribute.
	 */
	if (VSW_VER_GTEQ(ldcp, 1, 6)) {
		/*
		 * Versions >= 1.6:
		 * Though we are operating in v1.6 mode, it is possible that
		 * RxDringData mode has been disabled either on this guest or
		 * on the peer guest. If so, we revert to pre v1.6 behavior of
		 * TxDring mode. But this must be agreed upon in both
		 * directions of attr exchange. We first determine the mode
		 * that can be negotiated.
		 */
		if ((msg->options & VIO_RX_DRING_DATA) != 0 &&
		    vsw_mapin_avail(ldcp) == B_TRUE) {
			/*
			 * The peer is capable of handling RxDringData AND we
			 * are also capable of it; we enable RxDringData mode
			 * on this channel.
			 */
			dring_mode = VIO_RX_DRING_DATA;
		} else if ((msg->options & VIO_TX_DRING) != 0) {
			/*
			 * If the peer is capable of TxDring mode, we
			 * negotiate TxDring mode on this channel.
			 */
			dring_mode = VIO_TX_DRING;
		} else {
			/*
			 * We support only VIO_TX_DRING and VIO_RX_DRING_DATA
			 * modes. We don't support VIO_RX_DRING mode.
			 */
			return (1);
		}

		/*
		 * If we have received an ack for the attr info that we sent,
		 * then check if the dring mode matches what the peer had ack'd
		 * (saved in lane_out). If they don't match, we fail the
		 * handshake.
		 */
		if (lane_out->lstate & VSW_ATTR_ACK_RECV) {
			if (msg->options != lane_out->dring_mode) {
				/* send NACK */
				return (1);
			}
		} else {
			/*
			 * Save the negotiated dring mode in our attr
			 * parameters, so it gets sent in the attr info from us
			 * to the peer.
			 */
			lane_out->dring_mode = dring_mode;
		}

		/* save the negotiated dring mode in the msg to be replied */
		msg->options = dring_mode;
	}
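
	/*
	 * Illustrative outcomes of the dring mode negotiation above: a peer
	 * offering (VIO_TX_DRING | VIO_RX_DRING_DATA) gets VIO_RX_DRING_DATA
	 * back if vsw_mapin_avail() is true, and VIO_TX_DRING otherwise; a
	 * peer offering only VIO_RX_DRING is NACK'd as that mode is not
	 * supported.
	 */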

	/*
	 * Process MTU attribute.
	 */
	if (VSW_VER_GTEQ(ldcp, 1, 4)) {
		/*
		 * Versions >= 1.4:
		 * Validate mtu of the peer is at least ETHERMAX. Then, the mtu
		 * is negotiated down to the minimum of our mtu and peer's mtu.
		 */
		if (msg->mtu < ETHERMAX) {
			return (1);
		}

		mtu = MIN(msg->mtu, vswp->max_frame_size);

		/*
		 * If we have received an ack for the attr info
		 * that we sent, then check if the mtu computed
		 * above matches the mtu that the peer had ack'd
		 * (saved in local hparams). If they don't
		 * match, we fail the handshake.
		 */
		if (lane_out->lstate & VSW_ATTR_ACK_RECV) {
			if (mtu != lane_out->mtu) {
				/* send NACK */
				return (1);
			}
		} else {
			/*
			 * Save the mtu computed above in our
			 * attr parameters, so it gets sent in
			 * the attr info from us to the peer.
			 */
			lane_out->mtu = mtu;
		}

		/* save the MIN mtu in the msg to be replied */
		msg->mtu = mtu;
	} else {
		/* Versions < 1.4, mtu must match */
		if (msg->mtu != lane_out->mtu) {
			D2(NULL, "%s: invalid MTU (0x%llx)\n",
			    __func__, msg->mtu);
			return (1);
		}
	}
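
	/*
	 * Illustrative example of the mtu negotiation above (the jumbo value
	 * is made up): if our max_frame_size is ETHERMAX and the peer
	 * advertises an mtu of 9216, we ack back MIN(9216, ETHERMAX), i.e.
	 * ETHERMAX; the peer must then ack the same value in the other
	 * direction or the handshake fails.
	 */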

	/*
	 * Otherwise store attributes for this lane and update
	 * lane state.
	 */
	lane_in->mtu = msg->mtu;
	lane_in->addr = msg->addr;
	lane_in->addr_type = msg->addr_type;
	lane_in->xfer_mode = msg->xfer_mode;
	lane_in->ack_freq = msg->ack_freq;
	lane_in->physlink_update = msg->physlink_update;
	lane_in->dring_mode = msg->options;

	/*
	 * Check if the client has requested physlink state updates.
	 * If there is a physical device bound to this vswitch (L2
	 * mode), set the ack bits to indicate it is supported.
	 * Otherwise, set the nack bits.
	 */
	if (VSW_VER_GTEQ(ldcp, 1, 5)) {	/* Protocol ver >= 1.5 */

		/* Does the vnet need phys link state updates ? */
		if ((lane_in->physlink_update &
		    PHYSLINK_UPDATE_STATE_MASK) ==
		    PHYSLINK_UPDATE_STATE) {

			if (vswp->smode & VSW_LAYER2) {
				/* is a net-dev assigned to us ? */
				msg->physlink_update =
				    PHYSLINK_UPDATE_STATE_ACK;
				ldcp->pls_negotiated = B_TRUE;
			} else {
				/* not in L2 mode */
				msg->physlink_update =
				    PHYSLINK_UPDATE_STATE_NACK;
				ldcp->pls_negotiated = B_FALSE;
			}

		} else {
			msg->physlink_update =
			    PHYSLINK_UPDATE_NONE;
			ldcp->pls_negotiated = B_FALSE;
		}

	} else {
		/*
		 * physlink_update bits are ignored
		 * if set by clients < v1.5 protocol.
		 */
		msg->physlink_update = PHYSLINK_UPDATE_NONE;
		ldcp->pls_negotiated = B_FALSE;
	}

	macaddr = lane_in->addr;
	for (i = ETHERADDRL - 1; i >= 0; i--) {
		port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF;
		macaddr >>= 8;
	}

	/*
	 * Setup device specific xmit routines. Note this could be changed
	 * further in vsw_send_dring_info() for versions >= 1.6 if operating in
	 * RxDringData mode.
	 */
	mutex_enter(&port->tx_lock);

	if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
	    (lane_in->xfer_mode & VIO_DRING_MODE_V1_2)) ||
	    (VSW_VER_LT(ldcp, 1, 2) &&
	    (lane_in->xfer_mode == VIO_DRING_MODE_V1_0))) {
		D2(vswp, "%s: mode = VIO_DRING_MODE", __func__);
		port->transmit = vsw_dringsend;
	} else if (lane_in->xfer_mode == VIO_DESC_MODE) {
		D2(vswp, "%s: mode = VIO_DESC_MODE", __func__);
		vsw_create_privring(ldcp);
		port->transmit = vsw_descrsend;
		lane_out->xfer_mode = VIO_DESC_MODE;
	}

	/*
	 * HybridIO is supported only by vnet, not by OBP.
	 * So, set hio_capable to true only when in DRING mode.
	 */
	if (VSW_VER_GTEQ(ldcp, 1, 3) &&
	    (lane_in->xfer_mode != VIO_DESC_MODE)) {
		(void) atomic_swap_32(&port->p_hio_capable, B_TRUE);
	} else {
		(void) atomic_swap_32(&port->p_hio_capable, B_FALSE);
	}

	mutex_exit(&port->tx_lock);

	return (0);
}

static int
vsw_process_attr_ack(vsw_ldc_t *ldcp, vnet_attr_msg_t *msg)
{
	vsw_t	*vswp = ldcp->ldc_vswp;
	lane_t	*lane_out = &ldcp->lane_out;
	lane_t	*lane_in = &ldcp->lane_in;

	D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);

	if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV)) {
		return (1);
	}

	/*
	 * Process dring mode attribute.
	 */
	if (VSW_VER_GTEQ(ldcp, 1, 6)) {
		/*
		 * Versions >= 1.6:
		 * The ack msg sent by the peer contains the negotiated dring
		 * mode between our capability (that we had sent in our attr
		 * info) and the peer's capability.
		 */
		if (lane_in->lstate & VSW_ATTR_ACK_SENT) {
			/*
			 * If we have sent an ack for the attr info msg from
			 * the peer, check if the dring mode that was
			 * negotiated then (saved in lane_out) matches the
			 * mode that the peer has ack'd. If they don't match,
			 * we fail the handshake.
			 */
			if (lane_out->dring_mode != msg->options) {
				return (1);
			}
		} else {
			if ((msg->options & lane_out->dring_mode) == 0) {
				/*
				 * Peer ack'd with a mode that we don't
				 * support; we fail the handshake.
				 */
				return (1);
			}
			if ((msg->options & (VIO_TX_DRING|VIO_RX_DRING_DATA))
			    == (VIO_TX_DRING|VIO_RX_DRING_DATA)) {
				/*
				 * Peer must ack with only one negotiated mode.
				 * Otherwise fail handshake.
				 */
				return (1);
			}

			/*
			 * Save the negotiated mode, so we can validate it when
			 * we receive attr info from the peer.
			 */
			lane_out->dring_mode = msg->options;
		}
	}

	/*
	 * Process MTU attribute.
	 */
	if (VSW_VER_GTEQ(ldcp, 1, 4)) {
		/*
		 * Versions >= 1.4:
		 * The ack msg sent by the peer contains the minimum of
		 * our mtu (that we had sent in our attr info) and the
		 * peer's mtu.
		 *
		 * If we have sent an ack for the attr info msg from
		 * the peer, check if the mtu that was computed then
		 * (saved in lane_out params) matches the mtu that the
		 * peer has ack'd. If they don't match, we fail the
		 * handshake.
		 */
		if (lane_in->lstate & VSW_ATTR_ACK_SENT) {
			if (lane_out->mtu != msg->mtu) {
				return (1);
			}
		} else {
			/*
			 * If the mtu ack'd by the peer is > our mtu
			 * fail handshake. Otherwise, save the mtu, so
			 * we can validate it when we receive attr info
			 * from our peer.
			 */
			if (msg->mtu <= lane_out->mtu) {
				lane_out->mtu = msg->mtu;
			} else {
				return (1);
			}
		}
	}

	return (0);
}
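
/*
 * Note that the dring mode and mtu checks above mirror those in
 * vsw_process_attr_info(): whichever direction of the attr exchange
 * completes first records the negotiated value in lane_out, and the other
 * direction must then agree with it or the handshake fails. For example,
 * an ack carrying both VIO_TX_DRING and VIO_RX_DRING_DATA is rejected,
 * since exactly one mode must be selected.
 */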

/*
 * Process an attribute packet. We can end up here either because our peer
 * has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or our
 * peer has sent us an attribute INFO message
 *
 * If its an ACK we then move to the next stage of the handshake which
 * is to send our descriptor ring info to our peer. If its a NACK then
 * there is nothing more we can (currently) do.
 *
 * If we get a valid/acceptable INFO packet (and we have already negotiated
 * a version) we ACK back and set channel state to ATTR_RECV, otherwise we
 * NACK back and reset channel state to INACTIV.
 *
 * FUTURE: in time we will probably negotiate over attributes, but for
 * the moment unacceptable attributes are regarded as a fatal error.
 *
 */
void
vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt)
{
	vnet_attr_msg_t	*attr_pkt;
	vsw_t		*vswp = ldcp->ldc_vswp;
	lane_t		*lane_out = &ldcp->lane_out;
	lane_t		*lane_in = &ldcp->lane_in;
	int		rv;

	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);

	/*
	 * We know this is a ctrl/attr packet so
	 * cast it into the correct structure.
	 */
	attr_pkt = (vnet_attr_msg_t *)pkt;

	switch (attr_pkt->tag.vio_subtype) {
	case VIO_SUBTYPE_INFO:

		rv = vsw_process_attr_info(ldcp, attr_pkt);
		if (rv != 0) {
			vsw_free_lane_resources(ldcp, INBOUND);
			attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
			ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT;
		} else {
			attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
			lane_in->lstate |= VSW_ATTR_ACK_SENT;
		}
		attr_pkt->tag.vio_sid = ldcp->local_session;
		DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
		(void) vsw_send_msg(ldcp, (void *)attr_pkt,
		    sizeof (vnet_attr_msg_t), B_TRUE);
		vsw_next_milestone(ldcp);
		break;

	case VIO_SUBTYPE_ACK:

		rv = vsw_process_attr_ack(ldcp, attr_pkt);
		if (rv != 0) {
			return;
		}
		lane_out->lstate |= VSW_ATTR_ACK_RECV;
		vsw_next_milestone(ldcp);
		break;

	case VIO_SUBTYPE_NACK:
		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);

		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV))
			return;

		lane_out->lstate |= VSW_ATTR_NACK_RECV;
		vsw_next_milestone(ldcp);
		break;

	default:
		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
		    attr_pkt->tag.vio_subtype);
	}

	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
}

static int
vsw_process_dring_reg_info(vsw_ldc_t *ldcp, vio_msg_tag_t *tagp)
{
	int		rv;
	vsw_t		*vswp = ldcp->ldc_vswp;
	lane_t		*lp = &ldcp->lane_out;
	dring_info_t	*dp = NULL;

	D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);

	rv = vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV);
	if (rv != 0) {
		return (1);
	}

	if (VSW_VER_GTEQ(ldcp, 1, 6) &&
	    (lp->dring_mode != ((vio_dring_reg_msg_t *)tagp)->options)) {
		/*
		 * The earlier version of Solaris vnet driver doesn't set the
		 * option (VIO_TX_DRING in its case) correctly in its dring reg
		 * message. We work around that here by doing the check only
		 * for versions >= v1.6.
		 */
		DWARN(vswp, "%s(%lld): Rcvd dring reg option (%d), "
		    "negotiated mode (%d)\n", __func__, ldcp->ldc_id,
		    ((vio_dring_reg_msg_t *)tagp)->options, lp->dring_mode);
		return (1);
	}

	/*
	 * Map dring exported by the peer.
	 */
	dp = vsw_map_dring(ldcp, (void *)tagp);
	if (dp == NULL) {
		return (1);
	}

	/*
	 * Map data buffers exported by the peer if we are in RxDringData mode.
	 */
	if (lp->dring_mode == VIO_RX_DRING_DATA) {
		rv = vsw_map_data(ldcp, dp, (void *)tagp);
		if (rv != 0) {
			vsw_unmap_dring(ldcp);
			return (1);
		}
	}

	return (0);
}

static int
vsw_process_dring_reg_ack(vsw_ldc_t *ldcp, vio_msg_tag_t *tagp)
{
	vsw_t		*vswp = ldcp->ldc_vswp;
	dring_info_t	*dp;

	D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);

	if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV)) {
		return (1);
	}

	dp = ldcp->lane_out.dringp;

	/* save dring_ident acked by peer */
	dp->ident = ((vio_dring_reg_msg_t *)tagp)->dring_ident;

	return (0);
}

/*
 * Process a dring info packet. We can end up here either because our peer
 * has ACK/NACK'ed back to an earlier DRING msg we had sent it, or our
 * peer has sent us a dring INFO message.
 *
 * If we get a valid/acceptable INFO packet (and we have already negotiated
 * a version) we ACK back and update the lane state, otherwise we NACK back.
 *
 * FUTURE: nothing to stop a client from sending us info on multiple drings,
 * but for the moment we will just use the first one we are given.
 *
 */
void
vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt)
{
	int		rv;
	int		msgsize;
	dring_info_t	*dp;
	vio_msg_tag_t	*tagp = (vio_msg_tag_t *)pkt;
	vsw_t		*vswp = ldcp->ldc_vswp;
	lane_t		*lane_out = &ldcp->lane_out;
	lane_t		*lane_in = &ldcp->lane_in;

	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);

	switch (tagp->vio_subtype) {
	case VIO_SUBTYPE_INFO:
		rv = vsw_process_dring_reg_info(ldcp, tagp);
		if (rv != 0) {
			vsw_free_lane_resources(ldcp, INBOUND);
			tagp->vio_subtype = VIO_SUBTYPE_NACK;
			lane_in->lstate |= VSW_DRING_NACK_SENT;
		} else {
			tagp->vio_subtype = VIO_SUBTYPE_ACK;
			lane_in->lstate |= VSW_DRING_ACK_SENT;
		}
		tagp->vio_sid = ldcp->local_session;
		DUMP_TAG_PTR(tagp);
		if (lane_out->dring_mode == VIO_RX_DRING_DATA) {
			dp = lane_in->dringp;
			msgsize =
			    VNET_DRING_REG_EXT_MSG_SIZE(dp->data_ncookies);
		} else {
			msgsize = sizeof (vio_dring_reg_msg_t);
		}
		(void) vsw_send_msg(ldcp, (void *)tagp, msgsize, B_TRUE);
		vsw_next_milestone(ldcp);
		break;

	case VIO_SUBTYPE_ACK:
		rv = vsw_process_dring_reg_ack(ldcp, tagp);
		if (rv != 0) {
			return;
		}
		lane_out->lstate |= VSW_DRING_ACK_RECV;
		vsw_next_milestone(ldcp);
		break;

	case VIO_SUBTYPE_NACK:
		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);

		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV))
			return;

		lane_out->lstate |= VSW_DRING_NACK_RECV;
		vsw_next_milestone(ldcp);
		break;

	default:
		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
		    tagp->vio_subtype);
	}

	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
}

/*
 * Process a request from peer to unregister a dring.
 *
 * For the moment we just restart the handshake if our
 * peer endpoint attempts to unregister a dring.
 */
void
vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt)
{
	vsw_t			*vswp = ldcp->ldc_vswp;
	vio_dring_unreg_msg_t	*dring_pkt;

	/*
	 * We know this is a ctrl/dring packet so
	 * cast it into the correct structure.
	 */
	dring_pkt = (vio_dring_unreg_msg_t *)pkt;

	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

	switch (dring_pkt->tag.vio_subtype) {
	case VIO_SUBTYPE_INFO:
		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);

		DWARN(vswp, "%s: restarting handshake..", __func__);
		break;

	case VIO_SUBTYPE_ACK:
		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);

		DWARN(vswp, "%s: restarting handshake..", __func__);
		break;

	case VIO_SUBTYPE_NACK:
		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);

		DWARN(vswp, "%s: restarting handshake..", __func__);
		break;

	default:
		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
		    dring_pkt->tag.vio_subtype);
	}

	vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);

	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
}

#define	SND_MCST_NACK(ldcp, pkt) \
	pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \
	pkt->tag.vio_sid = ldcp->local_session; \
	(void) vsw_send_msg(ldcp, (void *)pkt, \
			sizeof (vnet_mcast_msg_t), B_TRUE);

/*
 * Process a multicast request from a vnet.
 *
 * Vnets specify a multicast address that they are interested in. This
 * address is used as a key into the hash table which forms the multicast
 * forwarding database (mFDB).
 *
 * The table keys are the multicast addresses, while the table entries
 * are pointers to lists of ports which wish to receive packets for the
 * specified multicast address.
 *
 * When a multicast packet is being switched we use the address as a key
 * into the hash table, and then walk the appropriate port list forwarding
 * the pkt to each port in turn.
 *
 * If a vnet is no longer interested in a particular multicast grouping
 * we simply find the correct location in the hash table and then delete
 * the relevant port from the port list.
 *
 * To deal with the case whereby a port is being deleted without first
 * removing itself from the lists in the hash table, we maintain a list
 * of multicast addresses the port has registered an interest in, within
 * the port structure itself. We then simply walk that list of addresses
 * using them as keys into the hash table and remove the port from the
 * appropriate lists.
 */
static void
vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt)
{
	vnet_mcast_msg_t	*mcst_pkt;
	vsw_port_t		*port = ldcp->ldc_port;
	vsw_t			*vswp = ldcp->ldc_vswp;
	int			i;

	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

	/*
	 * We know this is a ctrl/mcast packet so
	 * cast it into the correct structure.
	 */
	mcst_pkt = (vnet_mcast_msg_t *)pkt;

	switch (mcst_pkt->tag.vio_subtype) {
	case VIO_SUBTYPE_INFO:
		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);

		/*
		 * Check if in correct state to receive a multicast
		 * message (i.e. handshake complete). If not reset
		 * the handshake.
		 */
		if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV))
			return;

		/*
		 * Before attempting to add or remove address check
		 * that they are valid multicast addresses.
		 * If not, then NACK back.
		 */
		for (i = 0; i < mcst_pkt->count; i++) {
			if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) {
				DERR(vswp, "%s: invalid multicast address",
				    __func__);
				SND_MCST_NACK(ldcp, mcst_pkt);
				return;
			}
		}

		/*
		 * Now add/remove the addresses. If this fails we
		 * NACK back.
		 */
		if (vsw_add_rem_mcst(mcst_pkt, port) != 0) {
			SND_MCST_NACK(ldcp, mcst_pkt);
			return;
		}

		mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
		mcst_pkt->tag.vio_sid = ldcp->local_session;

		DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt);

		(void) vsw_send_msg(ldcp, (void *)mcst_pkt,
		    sizeof (vnet_mcast_msg_t), B_TRUE);
		break;

	case VIO_SUBTYPE_ACK:
		DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__);

		/*
		 * We shouldn't ever get a multicast ACK message as
		 * at the moment we never request multicast addresses
		 * to be set on some other device. This may change in
		 * the future if we have cascading switches.
		 */
		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV))
			return;

		/* Do nothing */
		break;

	case VIO_SUBTYPE_NACK:
		DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__);

		/*
		 * We shouldn't get a multicast NACK packet for the
		 * same reasons as we shouldn't get a ACK packet.
		 */
		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV))
			return;

		/* Do nothing */
		break;

	default:
		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
		    mcst_pkt->tag.vio_subtype);
	}

	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
}
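
/*
 * A minimal conceptual sketch of the mFDB usage described above the
 * function (the actual implementation is in vsw_add_rem_mcst() and the
 * switching code; the names below are illustrative only):
 *
 *	key  = multicast address from mcst_pkt->mca[i]
 *	add:	lookup(mfdb, key); append this port to the entry's port list
 *	remove:	lookup(mfdb, key); unlink this port from the entry's port list
 *	switch:	lookup(mfdb, dst addr); forward the pkt to each port in turn
 *	port delete: walk the list of addresses kept in the port structure,
 *	    removing the port from each mFDB entry it had registered for
 */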

static void
vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt)
{
	vio_rdx_msg_t	*rdx_pkt;
	vsw_t		*vswp = ldcp->ldc_vswp;

	/*
	 * We know this is a ctrl/rdx packet so
	 * cast it into the correct structure.
	 */
	rdx_pkt = (vio_rdx_msg_t *)pkt;

	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);

	switch (rdx_pkt->tag.vio_subtype) {
	case VIO_SUBTYPE_INFO:
		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);

		if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_INFO_RECV))
			return;

		rdx_pkt->tag.vio_sid = ldcp->local_session;
		rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;

		DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt);

		ldcp->lane_out.lstate |= VSW_RDX_ACK_SENT;

		(void) vsw_send_msg(ldcp, (void *)rdx_pkt,
		    sizeof (vio_rdx_msg_t), B_TRUE);

		vsw_next_milestone(ldcp);
		break;

	case VIO_SUBTYPE_ACK:
		/*
		 * Should be handled in-band by callback handler.
		 */
		DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__);
		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
		break;

	case VIO_SUBTYPE_NACK:
		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);

		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_NACK_RECV))
			return;

		ldcp->lane_in.lstate |= VSW_RDX_NACK_RECV;
		vsw_next_milestone(ldcp);
		break;

	default:
		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
		    rdx_pkt->tag.vio_subtype);
	}

	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
}

static void
vsw_process_physlink_msg(vsw_ldc_t *ldcp, void *pkt)
{
	vnet_physlink_msg_t	*msgp;
	vsw_t			*vswp = ldcp->ldc_vswp;

	msgp = (vnet_physlink_msg_t *)pkt;

	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);

	switch (msgp->tag.vio_subtype) {
	case VIO_SUBTYPE_INFO:

		/* vsw shouldn't recv physlink info */
		DWARN(vswp, "%s: Unexpected VIO_SUBTYPE_INFO", __func__);
		break;

	case VIO_SUBTYPE_ACK:

		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
		break;

	case VIO_SUBTYPE_NACK:

		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
		break;

	default:
		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
		    msgp->tag.vio_subtype);
	}

	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
}

static void
vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t *tagp,
	uint32_t msglen)
{
	uint16_t	env = tagp->vio_subtype_env;
	vsw_t		*vswp = ldcp->ldc_vswp;
	lane_t		*lp = &ldcp->lane_out;
	uint8_t		dring_mode = lp->dring_mode;

	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

	/* session id check */
	if (ldcp->session_status & VSW_PEER_SESSION) {
		if (ldcp->peer_session != tagp->vio_sid) {
			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
			    __func__, ldcp->ldc_id, tagp->vio_sid);
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			return;
		}
	}

	/*
	 * It is an error for us to be getting data packets
	 * before the handshake has completed.
	 */
	if (ldcp->hphase != VSW_MILESTONE4) {
		DERR(vswp, "%s: got data packet before handshake complete "
		    "hphase %d (%x: %x)", __func__, ldcp->hphase,
		    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
		DUMP_FLAGS(ldcp->lane_in.lstate);
		DUMP_FLAGS(ldcp->lane_out.lstate);
		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
		return;
	}
	if (dring_mode == VIO_TX_DRING) {
		/*
		 * To reduce the locking contention, release the ldc_cblock
		 * here and re-acquire it once we are done receiving packets.
		 * We do this only in TxDring mode to allow further callbacks to
		 * continue while the msg worker thread processes the messages.
		 * In RxDringData mode, we process the messages in the callback
		 * itself and wake up rcv worker thread to process only data
		 * info messages.
		 */
		mutex_exit(&ldcp->ldc_cblock);
		mutex_enter(&ldcp->ldc_rxlock);
	}

	/*
	 * Switch on the vio_subtype envelope, then let lower routines
	 * decide if it's an INFO, ACK or NACK packet.
	 */
	if (env == VIO_DRING_DATA) {
		ldcp->rx_dringdata(ldcp, dpkt);
	} else if (env == VIO_PKT_DATA) {
		ldcp->rx_pktdata(ldcp, dpkt, msglen);
	} else if (env == VIO_DESC_DATA) {
		vsw_process_data_ibnd_pkt(ldcp, dpkt);
	} else {
		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n",
		    __func__, env);
	}

	if (dring_mode == VIO_TX_DRING) {
		mutex_exit(&ldcp->ldc_rxlock);
		mutex_enter(&ldcp->ldc_cblock);
	}

	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
}

/*
 * dummy pkt data handler function for vnet protocol version 1.0
 */
static void
vsw_process_pkt_data_nop(void *arg1, void *arg2, uint32_t msglen)
{
	_NOTE(ARGUNUSED(arg1, arg2, msglen))
}

/*
 * This function handles raw pkt data messages received over the channel.
 * Currently, only priority-eth-type frames are received through this
 * mechanism. In this case, the frame (data) is present within the message
 * itself and is copied into an mblk before being switched.
 */
static void
vsw_process_pkt_data(void *arg1, void *arg2, uint32_t msglen)
{
	vsw_ldc_t		*ldcp = (vsw_ldc_t *)arg1;
	vio_raw_data_msg_t	*dpkt = (vio_raw_data_msg_t *)arg2;
	uint32_t		size;
	mblk_t			*mp;
	vio_mblk_t		*vmp;
	vsw_t			*vswp = ldcp->ldc_vswp;
	vgen_stats_t		*statsp = &ldcp->ldc_stats;
	lane_t			*lp = &ldcp->lane_out;

	size = msglen - VIO_PKT_DATA_HDRSIZE;
	if (size < ETHERMIN || size > lp->mtu) {
		(void) atomic_inc_32(&statsp->rx_pri_fail);
		DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__,
		    ldcp->ldc_id, size);
		return;
	}

	vmp = vio_multipool_allocb(&ldcp->vmp, size + VLAN_TAGSZ);
	if (vmp == NULL) {
		mp = allocb(size + VLAN_TAGSZ, BPRI_MED);
		if (mp == NULL) {
			(void) atomic_inc_32(&statsp->rx_pri_fail);
			DWARN(vswp, "%s(%lld) allocb failure, "
			    "unable to process priority frame\n", __func__,
			    ldcp->ldc_id);
			return;
		}
	} else {
		mp = vmp->mp;
	}

	/* skip over the extra space for vlan tag */
	mp->b_rptr += VLAN_TAGSZ;

	/* copy the frame from the payload of raw data msg into the mblk */
	bcopy(dpkt->data, mp->b_rptr, size);
	mp->b_wptr = mp->b_rptr + size;

	if (vmp != NULL) {
		vmp->state = VIO_MBLK_HAS_DATA;
	}

	/* update stats */
	(void) atomic_inc_64(&statsp->rx_pri_packets);
	(void) atomic_add_64(&statsp->rx_pri_bytes, size);

	/*
	 * VLAN_TAGSZ of extra space has been pre-alloc'd if tag is needed.
	 */
	(void) vsw_vlan_frame_pretag(ldcp->ldc_port, VSW_VNETPORT, mp);

	/* switch the frame to destination */
	vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT, ldcp->ldc_port, NULL);
}

/*
 * Process an in-band descriptor message (most likely from
 * OBP).
 */
static void
vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt)
{
	vnet_ibnd_desc_t	*ibnd_desc;
	dring_info_t		*dp = NULL;
	vsw_private_desc_t	*priv_addr = NULL;
	vsw_t			*vswp = ldcp->ldc_vswp;
	mblk_t			*mp = NULL;
	size_t			nbytes = 0;
	size_t			off = 0;
	uint64_t		idx = 0;
	uint32_t		num = 1, len, datalen = 0;
	uint64_t		ncookies = 0;
	int			i, rv;
	int			j = 0;

	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

	ibnd_desc = (vnet_ibnd_desc_t *)pkt;

	switch (ibnd_desc->hdr.tag.vio_subtype) {
	case VIO_SUBTYPE_INFO:
		D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__);

		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
			return;

		/*
		 * Data is padded to align on an 8 byte boundary;
		 * nbytes is the actual data length, i.e. minus that
		 * padding.
		 */
		datalen = ibnd_desc->nbytes;

		D2(vswp, "%s(%lld): processing inband desc : "
		    ": datalen 0x%lx", __func__, ldcp->ldc_id, datalen);

		ncookies = ibnd_desc->ncookies;

		/*
		 * allocb(9F) returns an aligned data block. We
		 * need to ensure that we ask ldc for an aligned
		 * number of bytes also.
		 */
		nbytes = datalen;
		if (nbytes & 0x7) {
			off = 8 - (nbytes & 0x7);
			nbytes += off;
		}

		/* alloc extra space for VLAN_TAG */
		mp = allocb(datalen + 8, BPRI_MED);
		if (mp == NULL) {
			DERR(vswp, "%s(%lld): allocb failed",
			    __func__, ldcp->ldc_id);
			ldcp->ldc_stats.rx_allocb_fail++;
			return;
		}

		/* skip over the extra space for VLAN_TAG */
		mp->b_rptr += 8;

		rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr,
		    0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies,
		    LDC_COPY_IN);

		if (rv != 0) {
			DERR(vswp, "%s(%d): unable to copy in data from "
			    "%d cookie(s)", __func__, ldcp->ldc_id, ncookies);
			freemsg(mp);
			ldcp->ldc_stats.ierrors++;
			return;
		}

		D2(vswp, "%s(%d): copied in %ld bytes using %d cookies",
		    __func__, ldcp->ldc_id, nbytes, ncookies);

		/* point to the actual end of data */
		mp->b_wptr = mp->b_rptr + datalen;
		ldcp->ldc_stats.ipackets++;
		ldcp->ldc_stats.rbytes += datalen;

		/*
		 * We ACK back every in-band descriptor message we process
		 */
		ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK;
		ibnd_desc->hdr.tag.vio_sid = ldcp->local_session;
		(void) vsw_send_msg(ldcp, (void *)ibnd_desc,
		    sizeof (vnet_ibnd_desc_t), B_TRUE);

		/*
		 * there is extra space alloc'd for VLAN_TAG
		 */
		(void) vsw_vlan_frame_pretag(ldcp->ldc_port, VSW_VNETPORT, mp);

		/* send the packet to be switched */
		vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT,
		    ldcp->ldc_port, NULL);

		break;

	case VIO_SUBTYPE_ACK:
		D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__);

		/* Verify the ACK is valid */
		idx = ibnd_desc->hdr.desc_handle;

		if (idx >= vsw_num_descriptors) {
			cmn_err(CE_WARN, "!vsw%d: corrupted ACK received "
			    "(idx %ld)", vswp->instance, idx);
			return;
		}

		if ((dp = ldcp->lane_out.dringp) == NULL) {
			DERR(vswp, "%s: no dring found", __func__);
			return;
		}

		len = dp->num_descriptors;
		/*
		 * If the descriptor we are being ACK'ed for is not the
		 * one we expected, then pkts were lost somewhere, either
		 * when we tried to send a msg, or a previous ACK msg from
		 * our peer. In either case we now reclaim the descriptors
		 * in the range from the last ACK we received up to the
		 * current ACK.
		 */
		if (idx != dp->last_ack_recv) {
			DWARN(vswp, "%s: dropped pkts detected, (%ld, %ld)",
			    __func__, dp->last_ack_recv, idx);
			num = idx >= dp->last_ack_recv ?
			    idx - dp->last_ack_recv + 1:
			    (len - dp->last_ack_recv + 1) + idx;
		}
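
		/*
		 * Illustrative example of the wrap-around case (numbers are
		 * made up): with len = 512, last_ack_recv = 510 and idx = 1,
		 * num = (512 - 510 + 1) + 1 = 4, so descriptors 510, 511, 0
		 * and 1 are reclaimed below.
		 */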

		/*
		 * When we sent the in-band message to our peer we
		 * marked the copy in our private ring as READY. We now
		 * check that the descriptor we are being ACK'ed for is in
		 * fact READY, i.e. it is one we have shared with our peer.
		 *
		 * If it's not we flag an error, but still reset the descr
		 * back to FREE.
		 */
		for (i = dp->last_ack_recv; j < num; i = (i + 1) % len, j++) {
			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
			mutex_enter(&priv_addr->dstate_lock);
			if (priv_addr->dstate != VIO_DESC_READY) {
				DERR(vswp, "%s: (%ld) desc at index %ld not "
				    "READY (0x%lx)", __func__,
				    ldcp->ldc_id, idx, priv_addr->dstate);
				DERR(vswp, "%s: bound %d: ncookies %ld : "
				    "datalen %ld", __func__,
				    priv_addr->bound, priv_addr->ncookies,
				    priv_addr->datalen);
			}
			D2(vswp, "%s: (%lld) freeing descp at %lld", __func__,
			    ldcp->ldc_id, idx);
			/* release resources associated with sent msg */
			priv_addr->datalen = 0;
			priv_addr->dstate = VIO_DESC_FREE;
			mutex_exit(&priv_addr->dstate_lock);
		}
		/* update to next expected value */
		dp->last_ack_recv = (idx + 1) % dp->num_descriptors;

		break;

	case VIO_SUBTYPE_NACK:
		DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__);

		/*
		 * We should only get a NACK if our peer doesn't like
		 * something about a message we have sent it. If this
		 * happens we just release the resources associated with
		 * the message. (We are relying on higher layers to decide
		 * whether or not to resend.)
		 */

		/* limit check */
		idx = ibnd_desc->hdr.desc_handle;

		if (idx >= vsw_num_descriptors) {
			DERR(vswp, "%s: corrupted NACK received (idx %lld)",
			    __func__, idx);
			return;
		}

		if ((dp = ldcp->lane_out.dringp) == NULL) {
			DERR(vswp, "%s: no dring found", __func__);
			return;
		}

		priv_addr = (vsw_private_desc_t *)dp->priv_addr;

		/* move to correct location in ring */
		priv_addr += idx;

		/* release resources associated with sent msg */
		mutex_enter(&priv_addr->dstate_lock);
		priv_addr->datalen = 0;
		priv_addr->dstate = VIO_DESC_FREE;
		mutex_exit(&priv_addr->dstate_lock);

		break;

	default:
		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
		    ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype);
	}

	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
}

static void
vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t *tagp)
{
	_NOTE(ARGUNUSED(epkt))

	vsw_t		*vswp = ldcp->ldc_vswp;
	uint16_t	env = tagp->vio_subtype_env;

	D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id);

	/*
	 * Error vio_subtypes have yet to be defined. So for
	 * the moment we can't do anything.
	 */
	D2(vswp, "%s: (%x) vio_subtype env", __func__, env);

	D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
}

/* transmit the packet over the given port */
int
vsw_portsend(vsw_port_t *port, mblk_t *mp)
{
	mblk_t		*mpt;
	int		count;
	vsw_ldc_t 	*ldcp = port->ldcp;
	int		status = 0;

	count = vsw_vlan_frame_untag(port, VSW_VNETPORT, &mp, &mpt);
	if (count != 0) {
		status = ldcp->tx(ldcp, mp, mpt, count);
	}
	return (status);
}

/*
 * Break up frames into 2 separate chains: normal and
 * priority, based on the frame type. The number of
 * priority frames is also counted and returned.
 *
 * Params:
 * 	vswp:	pointer to the instance of vsw
 *	np:	head of packet chain to be broken
 *	npt:	tail of packet chain to be broken
 *
 * Returns:
 *	np:	head of normal data packets
 *	npt:	tail of normal data packets
 *	hp:	head of high priority packets
 *	hpt:	tail of high priority packets
 */
static uint32_t
vsw_get_pri_packets(vsw_t *vswp, mblk_t **np, mblk_t **npt,
	mblk_t **hp, mblk_t **hpt)
{
	mblk_t			*tmp = NULL;
	mblk_t			*smp = NULL;
	mblk_t			*hmp = NULL;	/* high prio pkts head */
	mblk_t			*hmpt = NULL;	/* high prio pkts tail */
	mblk_t			*nmp = NULL;	/* normal pkts head */
	mblk_t			*nmpt = NULL;	/* normal pkts tail */
	uint32_t		count = 0;
	int			i;
	struct ether_header	*ehp;
	uint32_t		num_types;
	uint16_t		*types;

	tmp = *np;
	while (tmp != NULL) {

		smp = tmp;
		tmp = tmp->b_next;
		smp->b_next = NULL;
		smp->b_prev = NULL;

		ehp = (struct ether_header *)smp->b_rptr;
		num_types = vswp->pri_num_types;
		types = vswp->pri_types;
		for (i = 0; i < num_types; i++) {
			if (ehp->ether_type == types[i]) {
				/* high priority frame */

				if (hmp != NULL) {
					hmpt->b_next = smp;
					hmpt = smp;
				} else {
					hmp = hmpt = smp;
				}
				count++;
				break;
			}
		}
		if (i == num_types) {
			/* normal data frame */

			if (nmp != NULL) {
				nmpt->b_next = smp;
				nmpt = smp;
			} else {
				nmp = nmpt = smp;
			}
		}
	}

	*hp = hmp;
	*hpt = hmpt;
	*np = nmp;
	*npt = nmpt;

	return (count);
}

/*
 * Wrapper function to transmit normal and/or priority frames over the channel.
 */
static int
vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count)
{
	vsw_ldc_t 		*ldcp = (vsw_ldc_t *)arg;
	mblk_t			*tmp;
	mblk_t			*smp;
	mblk_t			*hmp;	/* high prio pkts head */
	mblk_t			*hmpt;	/* high prio pkts tail */
	mblk_t			*nmp;	/* normal pkts head */
	mblk_t			*nmpt;	/* normal pkts tail */
	uint32_t		n = 0;
	vsw_t			*vswp = ldcp->ldc_vswp;

	ASSERT(VSW_PRI_ETH_DEFINED(vswp));
	ASSERT(count != 0);

	nmp = mp;
	nmpt = mpt;

	/* gather any priority frames from the chain of packets */
	n = vsw_get_pri_packets(vswp, &nmp, &nmpt, &hmp, &hmpt);

	/* transmit priority frames */
	tmp = hmp;
	while (tmp != NULL) {
		smp = tmp;
		tmp = tmp->b_next;
		smp->b_next = NULL;
		vsw_ldcsend_pkt(ldcp, smp);
	}

	count -= n;

	if (count == 0) {
		/* no normal data frames to process */
		return (0);
	}

	return (vsw_ldctx(ldcp, nmp, nmpt, count));
}

/*
 * Wrapper function to transmit normal frames over the channel.
 */
static int
vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count)
{
	vsw_ldc_t 	*ldcp = (vsw_ldc_t *)arg;
	mblk_t		*tmp = NULL;

	ASSERT(count != 0);
	/*
	 * If the TX thread is enabled, then queue the
	 * ordinary frames and signal the tx thread.
	 */
	if (ldcp->tx_thread != NULL) {

		mutex_enter(&ldcp->tx_thr_lock);

		if ((ldcp->tx_cnt + count) >= vsw_max_tx_qcount) {
			/*
			 * If we reached queue limit,
			 * do not queue new packets,
			 * drop them.
			 */
			ldcp->ldc_stats.tx_qfull += count;
			mutex_exit(&ldcp->tx_thr_lock);
			freemsgchain(mp);
			goto exit;
		}
		if (ldcp->tx_mhead == NULL) {
			ldcp->tx_mhead = mp;
			ldcp->tx_mtail = mpt;
			cv_signal(&ldcp->tx_thr_cv);
		} else {
			ldcp->tx_mtail->b_next = mp;
			ldcp->tx_mtail = mpt;
		}
		ldcp->tx_cnt += count;
		mutex_exit(&ldcp->tx_thr_lock);
	} else {
		while (mp != NULL) {
			tmp = mp->b_next;
			mp->b_next = mp->b_prev = NULL;
			(void) vsw_ldcsend(ldcp, mp, 1);
			mp = tmp;
		}
	}

exit:
	return (0);
}

/*
 * This function transmits the frame in the payload of a raw data
 * (VIO_PKT_DATA) message. Thus, it provides an Out-Of-Band path to
 * send special frames with high priorities, without going through
 * the normal data path which uses the descriptor ring mechanism.
 */
static void
vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp)
{
	vio_raw_data_msg_t	*pkt;
	mblk_t			*bp;
	mblk_t			*nmp = NULL;
	vio_mblk_t		*vmp;
	caddr_t			dst;
	uint32_t		mblksz;
	uint32_t		size;
	uint32_t		nbytes;
	int			rv;
	vsw_t			*vswp = ldcp->ldc_vswp;
	vgen_stats_t		*statsp = &ldcp->ldc_stats;

	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
		(void) atomic_inc_32(&statsp->tx_pri_fail);
		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
		    "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
		    ldcp->lane_out.lstate);
		goto send_pkt_exit;
	}

	size = msgsize(mp);

	/* frame size bigger than available payload len of raw data msg ? */
	if (size > (size_t)(ldcp->msglen - VIO_PKT_DATA_HDRSIZE)) {
		(void) atomic_inc_32(&statsp->tx_pri_fail);
		DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__,
		    ldcp->ldc_id, size);
		goto send_pkt_exit;
	}

	if (size < ETHERMIN)
		size = ETHERMIN;

	/* alloc space for a raw data message */
	vmp = vio_allocb(vswp->pri_tx_vmp);
	if (vmp == NULL) {
		(void) atomic_inc_32(&statsp->tx_pri_fail);
		DWARN(vswp, "vio_allocb failed\n");
		goto send_pkt_exit;
	} else {
		nmp = vmp->mp;
	}
	pkt = (vio_raw_data_msg_t *)nmp->b_rptr;

	/* copy frame into the payload of raw data message */
	dst = (caddr_t)pkt->data;
	for (bp = mp; bp != NULL; bp = bp->b_cont) {
		mblksz = MBLKL(bp);
		bcopy(bp->b_rptr, dst, mblksz);
		dst += mblksz;
	}

	vmp->state = VIO_MBLK_HAS_DATA;

	/* setup the raw data msg */
	pkt->tag.vio_msgtype = VIO_TYPE_DATA;
	pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
	pkt->tag.vio_subtype_env = VIO_PKT_DATA;
	pkt->tag.vio_sid = ldcp->local_session;
	nbytes = VIO_PKT_DATA_HDRSIZE + size;

	/* send the msg over ldc */
	rv = vsw_send_msg(ldcp, (void *)pkt, nbytes, B_TRUE);
	if (rv != 0) {
		(void) atomic_inc_32(&statsp->tx_pri_fail);
		DWARN(vswp, "%s(%lld) Error sending priority frame\n", __func__,
		    ldcp->ldc_id);
		goto send_pkt_exit;
	}

	/* update stats */
	(void) atomic_inc_64(&statsp->tx_pri_packets);
	(void) atomic_add_64(&statsp->tx_pri_packets, size);

send_pkt_exit:
	if (nmp != NULL)
		freemsg(nmp);
	freemsg(mp);
}

/*
 * Transmit the packet over the given LDC channel.
 *
 * The 'retries' argument indicates how many times a packet
 * is retried before it is dropped. Note, the retry is done
 * only for a resource-related failure; for all other failures
 * the packet is dropped immediately.
 */
static int
vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries)
{
	int		i;
	int		rc;
	int		status = 0;
	vsw_port_t	*port = ldcp->ldc_port;
	dring_info_t	*dp = NULL;
	lane_t		*lp = &ldcp->lane_out;

	for (i = 0; i < retries; ) {
		/*
		 * Send the message out using the appropriate
		 * transmit function which will free mblock when it
		 * is finished with it.
		 */
		mutex_enter(&port->tx_lock);
		if (port->transmit != NULL) {
			status = (*port->transmit)(ldcp, mp);
		}
		if (status == LDC_TX_SUCCESS) {
			mutex_exit(&port->tx_lock);
			break;
		}
		i++;	/* increment the counter here */

		/* If its the last retry, then update the oerror */
		if ((i == retries) && (status == LDC_TX_NORESOURCES)) {
			ldcp->ldc_stats.oerrors++;
		}
		mutex_exit(&port->tx_lock);

		if (status != LDC_TX_NORESOURCES) {
			/*
			 * No retrying required for errors unrelated
			 * to resources.
			 */
			break;
		}
		if (((dp = ldcp->lane_out.dringp) != NULL) &&
		    ((VSW_VER_GTEQ(ldcp, 1, 2) &&
		    (ldcp->lane_out.xfer_mode & VIO_DRING_MODE_V1_2)) ||
		    ((VSW_VER_LT(ldcp, 1, 2) &&
		    (ldcp->lane_out.xfer_mode == VIO_DRING_MODE_V1_0))))) {

			/* Need to reclaim in TxDring mode. */
			if (lp->dring_mode == VIO_TX_DRING) {
				rc = vsw_reclaim_dring(dp, dp->end_idx);
			}

		} else {
			/*
			 * If there is no dring or the xfer_mode is
			 * set to DESC_MODE (i.e., OBP), then simply break here.
			 */
			break;
		}

		/*
		 * Delay only if none were reclaimed
		 * and its not the last retry.
		 */
		if ((rc == 0) && (i < retries)) {
			delay(drv_usectohz(vsw_ldc_tx_delay));
		}
	}
	freemsg(mp);
	return (status);
}
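
/*
 * Usage note: vsw_ldctx() above invokes this with retries = 1, so a frame
 * normally gets a single transmit attempt; retries are attempted only when
 * the failure was LDC_TX_NORESOURCES, and a delay of vsw_ldc_tx_delay is
 * inserted between attempts when no descriptors could be reclaimed.
 */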

/*
 * Send an in-band descriptor message over ldc.
 */
static int
vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp)
{
	vsw_t			*vswp = ldcp->ldc_vswp;
	vnet_ibnd_desc_t	ibnd_msg;
	vsw_private_desc_t	*priv_desc = NULL;
	dring_info_t		*dp = NULL;
	size_t			n, size = 0;
	caddr_t			bufp;
	mblk_t			*bp;
	int			idx, i;
	int			status = LDC_TX_SUCCESS;
	static int		warn_msg = 1;
	lane_t			*lp = &ldcp->lane_out;

	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

	ASSERT(mp != NULL);

	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
		DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt",
		    __func__, ldcp->ldc_id, ldcp->ldc_status,
		    ldcp->lane_out.lstate);
		ldcp->ldc_stats.oerrors++;
		return (LDC_TX_FAILURE);
	}

	/*
	 * The dring here is used as an internal buffer,
	 * rather than a transfer channel.
	 */
	if ((dp = ldcp->lane_out.dringp) == NULL) {
		DERR(vswp, "%s(%lld): no dring for outbound lane",
		    __func__, ldcp->ldc_id);
		DERR(vswp, "%s(%lld) status(%d) state (0x%llx)", __func__,
		    ldcp->ldc_id, ldcp->ldc_status, ldcp->lane_out.lstate);
		ldcp->ldc_stats.oerrors++;
		return (LDC_TX_FAILURE);
	}

	size = msgsize(mp);
	if (size > (size_t)lp->mtu) {
		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
		    ldcp->ldc_id, size);
		ldcp->ldc_stats.oerrors++;
		return (LDC_TX_FAILURE);
	}

	/*
	 * Find a free descriptor in our buffer ring
	 */
	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
		if (warn_msg) {
			DERR(vswp, "%s(%lld): no descriptor available for ring "
			    "at 0x%llx", __func__, ldcp->ldc_id, dp);
			warn_msg = 0;
		}

		/* nothing more we can do */
		status = LDC_TX_NORESOURCES;
		goto vsw_descrsend_free_exit;
	} else {
		D2(vswp, "%s(%lld): free private descriptor found at pos "
		    "%ld addr 0x%x\n", __func__, ldcp->ldc_id, idx, priv_desc);
		warn_msg = 1;
	}

	/* copy data into the descriptor */
	bufp = priv_desc->datap;
	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
		n = MBLKL(bp);
		bcopy(bp->b_rptr, bufp, n);
		bufp += n;
	}

	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;

	/* create and send the in-band descp msg */
	ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA;
	ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO;
	ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA;
	ibnd_msg.hdr.tag.vio_sid = ldcp->local_session;

	/*
	 * Copy the mem cookies describing the data from the
	 * private region of the descriptor ring into the inband
	 * descriptor.
	 */
	for (i = 0; i < priv_desc->ncookies; i++) {
		bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i],
		    sizeof (ldc_mem_cookie_t));
	}

	ibnd_msg.hdr.desc_handle = idx;
	ibnd_msg.ncookies = priv_desc->ncookies;
	ibnd_msg.nbytes = size;

	ldcp->ldc_stats.opackets++;
	ldcp->ldc_stats.obytes += size;

	(void) vsw_send_msg(ldcp, (void *)&ibnd_msg,
	    sizeof (vnet_ibnd_desc_t), B_TRUE);

vsw_descrsend_free_exit:

	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
	return (status);
}

static void
vsw_send_ver(void *arg)
{
	vsw_ldc_t	*ldcp = (vsw_ldc_t *)arg;
	vsw_t		*vswp = ldcp->ldc_vswp;
	lane_t		*lp = &ldcp->lane_out;
	vio_ver_msg_t	ver_msg;

	D1(vswp, "%s enter", __func__);

	ver_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
	ver_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
	ver_msg.tag.vio_subtype_env = VIO_VER_INFO;
	ver_msg.tag.vio_sid = ldcp->local_session;

	if (vsw_obp_ver_proto_workaround == B_FALSE) {
		ver_msg.ver_major = vsw_versions[0].ver_major;
		ver_msg.ver_minor = vsw_versions[0].ver_minor;
	} else {
		/* use the major,minor that we've ack'd */
		lane_t	*lpi = &ldcp->lane_in;
		ver_msg.ver_major = lpi->ver_major;
		ver_msg.ver_minor = lpi->ver_minor;
	}
	ver_msg.dev_class = VDEV_NETWORK_SWITCH;

	lp->lstate |= VSW_VER_INFO_SENT;
	lp->ver_major = ver_msg.ver_major;
	lp->ver_minor = ver_msg.ver_minor;

	DUMP_TAG(ver_msg.tag);

	(void) vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t), B_TRUE);

	D1(vswp, "%s (%d): exit", __func__, ldcp->ldc_id);
}

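/*
 * Send an ATTR_INFO message to the peer, advertising the default
 * attributes of our outbound lane (MTU, address type, MAC address,
 * transfer mode and dring mode).
 */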
static void
vsw_send_attr(vsw_ldc_t *ldcp)
{
	vsw_t			*vswp = ldcp->ldc_vswp;
	lane_t			*lp = &ldcp->lane_out;
	vnet_attr_msg_t		attr_msg;

	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);

	/*
	 * Subtype is set to INFO by default
	 */
	attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
	attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
	attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO;
	attr_msg.tag.vio_sid = ldcp->local_session;

	/* payload copied from default settings for lane */
	attr_msg.mtu = lp->mtu;
	attr_msg.addr_type = lp->addr_type;
	attr_msg.xfer_mode = lp->xfer_mode;
	attr_msg.ack_freq = lp->ack_freq;
	attr_msg.options = lp->dring_mode;

	READ_ENTER(&vswp->if_lockrw);
	attr_msg.addr = vnet_macaddr_strtoul((vswp->if_addr).ether_addr_octet);
	RW_EXIT(&vswp->if_lockrw);

	ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT;

	DUMP_TAG(attr_msg.tag);

	(void) vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t), B_TRUE);

	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
}

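/*
 * Register our descriptor ring with the peer by sending a DRING_INFO
 * message. The ring that is exported, the worker thread that is created
 * and the receive routine that is installed all depend on the dring mode
 * (TxDring vs RxDringData) negotiated during the attribute phase.
 */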
static void
vsw_send_dring_info(vsw_ldc_t *ldcp)
{
	int		msgsize;
	void		*msg;
	vsw_t		*vswp = ldcp->ldc_vswp;
	vsw_port_t	*port = ldcp->ldc_port;
	lane_t		*lp = &ldcp->lane_out;
	vgen_stats_t	*statsp = &ldcp->ldc_stats;

	D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id);

	/* dring mode has been negotiated in attr phase; save in stats */
	statsp->dring_mode = lp->dring_mode;

	if (lp->dring_mode == VIO_RX_DRING_DATA) {
		/*
		 * Change the transmit routine for RxDringData mode.
		 */
		port->transmit = vsw_dringsend_shm;
		msg = (void *) vsw_create_rx_dring_info(ldcp);
		if (msg == NULL) {
			return;
		}
		msgsize =
		    VNET_DRING_REG_EXT_MSG_SIZE(lp->dringp->data_ncookies);
		ldcp->rcv_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
		    vsw_ldc_rcv_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
		ldcp->rx_dringdata = vsw_process_dringdata_shm;
	} else {
		msg = (void *) vsw_create_tx_dring_info(ldcp);
		if (msg == NULL) {
			return;
		}
		msgsize = sizeof (vio_dring_reg_msg_t);
		ldcp->msg_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
		    vsw_ldc_msg_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
		ldcp->rx_dringdata = vsw_process_dringdata;
	}

	lp->lstate |= VSW_DRING_INFO_SENT;
	DUMP_TAG_PTR((vio_msg_tag_t *)msg);
	(void) vsw_send_msg(ldcp, msg, msgsize, B_TRUE);
	kmem_free(msg, msgsize);

	D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id);
}

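/*
 * Send an RDX message to the peer, indicating that we are ready to
 * exchange data, and mark RDX_INFO as sent on the inbound lane.
 */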
static void
vsw_send_rdx(vsw_ldc_t *ldcp)
{
	vsw_t		*vswp = ldcp->ldc_vswp;
	vio_rdx_msg_t	rdx_msg;

	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);

	rdx_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
	rdx_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
	rdx_msg.tag.vio_subtype_env = VIO_RDX;
	rdx_msg.tag.vio_sid = ldcp->local_session;

	ldcp->lane_in.lstate |= VSW_RDX_INFO_SENT;

	DUMP_TAG(rdx_msg.tag);

	(void) vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t), B_TRUE);

	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
}

/*
 * Remove the specified address from the list of addresses maintained
 * by the given port or vsw device node. The unlinked entry is returned
 * to the caller (or NULL if the address was not found).
 */
mcst_addr_t *
vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr)
{
	vsw_t		*vswp = NULL;
	vsw_port_t	*port = NULL;
	mcst_addr_t	*prev_p = NULL;
	mcst_addr_t	*curr_p = NULL;

	D1(NULL, "%s: enter : devtype %d : addr 0x%llx",
	    __func__, devtype, addr);

	if (devtype == VSW_VNETPORT) {
		port = (vsw_port_t *)arg;
		mutex_enter(&port->mca_lock);
		prev_p = curr_p = port->mcap;
	} else {
		vswp = (vsw_t *)arg;
		mutex_enter(&vswp->mca_lock);
		prev_p = curr_p = vswp->mcap;
	}

	while (curr_p != NULL) {
		if (curr_p->addr == addr) {
			D2(NULL, "%s: address found", __func__);
			/* match found */
			if (prev_p == curr_p) {
				/* list head */
				if (devtype == VSW_VNETPORT)
					port->mcap = curr_p->nextp;
				else
					vswp->mcap = curr_p->nextp;
			} else {
				prev_p->nextp = curr_p->nextp;
			}
			break;
		} else {
			prev_p = curr_p;
			curr_p = curr_p->nextp;
		}
	}

	if (devtype == VSW_VNETPORT)
		mutex_exit(&port->mca_lock);
	else
		mutex_exit(&vswp->mca_lock);

	D1(NULL, "%s: exit", __func__);

	return (curr_p);
}

/*
 * Create a ring consisting of just a private portion and attach
 * it to the outbound lane of the channel.
 *
 * This type of ring is used primarily for temporary data
 * storage (i.e. as data buffers).
 */
void
vsw_create_privring(vsw_ldc_t *ldcp)
{
	dring_info_t		*dp;
	vsw_t			*vswp = ldcp->ldc_vswp;

	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

	dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
	ldcp->lane_out.dringp = dp;

	/* no public section */
	dp->pub_addr = NULL;
	dp->priv_addr = kmem_zalloc(
	    (sizeof (vsw_private_desc_t) * vsw_num_descriptors), KM_SLEEP);
	dp->num_descriptors = vsw_num_descriptors;

	if (vsw_setup_tx_dring(ldcp, dp)) {
		DERR(vswp, "%s: setup of ring failed", __func__);
		vsw_destroy_tx_dring(ldcp);
		return;
	}

	/* haven't used any descriptors yet */
	dp->end_idx = 0;
	dp->restart_reqd = B_TRUE;

	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
}

/*
 * Set the default lane attributes. These are copied into
 * the attr msg we send to our peer. If they are not acceptable
 * then (currently) the handshake ends.
 */
static void
vsw_set_lane_attr(vsw_t *vswp, lane_t *lp)
{
	bzero(lp, sizeof (lane_t));

	READ_ENTER(&vswp->if_lockrw);
	ether_copy(&(vswp->if_addr), &(lp->addr));
	RW_EXIT(&vswp->if_lockrw);

	lp->mtu = vswp->max_frame_size;
	lp->addr_type = ADDR_TYPE_MAC;
	lp->xfer_mode = VIO_DRING_MODE_V1_0;
	lp->ack_freq = 0;	/* for shared mode */
	lp->seq_num = VNET_ISS;
}

/*
 * Map the descriptor ring exported by the peer.
 */
static dring_info_t *
vsw_map_dring(vsw_ldc_t *ldcp, void *pkt)
{
	dring_info_t	*dp = NULL;
	lane_t		*lp = &ldcp->lane_out;

	if (lp->dring_mode == VIO_RX_DRING_DATA) {
		/*
		 * In RxDringData mode, the dring that we map in
		 * becomes our transmit descriptor ring.
		 */
		dp =  vsw_map_tx_dring(ldcp, pkt);
	} else {
		/*
		 * In TxDring mode, the dring that we map in
		 * becomes our receive descriptor ring.
		 */
		dp =  vsw_map_rx_dring(ldcp, pkt);
	}
	return (dp);
}

/*
 * Common dring mapping function used in both TxDring and RxDringData modes.
 */
dring_info_t *
vsw_map_dring_cmn(vsw_ldc_t *ldcp, vio_dring_reg_msg_t *dring_pkt)
{
	int		rv;
	dring_info_t	*dp;
	ldc_mem_info_t	minfo;
	vsw_t		*vswp = ldcp->ldc_vswp;

	/*
	 * If the dring params are unacceptable then we NACK back.
	 */
	if ((dring_pkt->num_descriptors == 0) ||
	    (dring_pkt->descriptor_size == 0) ||
	    (dring_pkt->ncookies != 1)) {
		DERR(vswp, "%s (%lld): invalid dring info",
		    __func__, ldcp->ldc_id);
		return (NULL);
	}

	dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);

	dp->num_descriptors = dring_pkt->num_descriptors;
	dp->descriptor_size = dring_pkt->descriptor_size;
	dp->options = dring_pkt->options;
	dp->dring_ncookies = dring_pkt->ncookies;

	/*
	 * Note: should only get one cookie. Enforced in
	 * the ldc layer.
	 */
	bcopy(&dring_pkt->cookie[0], &dp->dring_cookie[0],
	    sizeof (ldc_mem_cookie_t));

	rv = ldc_mem_dring_map(ldcp->ldc_handle, &dp->dring_cookie[0],
	    dp->dring_ncookies, dp->num_descriptors, dp->descriptor_size,
	    LDC_DIRECT_MAP, &(dp->dring_handle));
	if (rv != 0) {
		goto fail;
	}

	rv = ldc_mem_dring_info(dp->dring_handle, &minfo);
	if (rv != 0) {
		goto fail;
	}
	/* store the address of the ring */
	dp->pub_addr = minfo.vaddr;

	/* cache the dring mtype */
	dp->dring_mtype = minfo.mtype;

	/* no private section as we are importing */
	dp->priv_addr = NULL;

	/*
	 * Use a simple monotonically increasing integer for the
	 * ident for now.
	 */
	dp->ident = ldcp->next_ident;
	ldcp->next_ident++;

	/*
	 * Acknowledge it; we send back a unique dring identifier that
	 * the sending side will use in the future to refer to this
	 * descriptor ring.
	 */
	dring_pkt->dring_ident = dp->ident;

	return (dp);
fail:
	if (dp->dring_handle != NULL) {
		(void) ldc_mem_dring_unmap(dp->dring_handle);
	}
	kmem_free(dp, sizeof (*dp));
	return (NULL);
}

/*
 * Unmap the descriptor ring exported by the peer.
 */
static void
vsw_unmap_dring(vsw_ldc_t *ldcp)
{
	lane_t	*lane_out = &ldcp->lane_out;

	if (lane_out->dring_mode == VIO_RX_DRING_DATA) {
		vsw_unmap_tx_dring(ldcp);
	} else {
		vsw_unmap_rx_dring(ldcp);
	}
}

/*
 * Map the shared memory data buffer area exported by the peer.
 * Used in RxDringData mode only.
 */
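/*
 * The registration message is parsed below assuming the following layout,
 * with the extended part immediately following the dring cookie array:
 *
 *	vio_dring_reg_msg_t			(fixed header, ncookies == 1)
 *	ldc_mem_cookie_t cookie[ncookies]	(dring cookies)
 *	vio_dring_reg_ext_msg_t			(data_ncookies, data_area_size,
 *						 data_cookie[data_ncookies])
 */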
static int
vsw_map_data(vsw_ldc_t *ldcp, dring_info_t *dp, void *pkt)
{
	int			rv;
	vio_dring_reg_ext_msg_t	*emsg;
	vio_dring_reg_msg_t	*msg = pkt;
	uint8_t			*buf = (uint8_t *)msg->cookie;
	vsw_t			*vswp = ldcp->ldc_vswp;
	ldc_mem_info_t		minfo;

	/* skip over dring cookies */
	ASSERT(msg->ncookies == 1);
	buf += (msg->ncookies * sizeof (ldc_mem_cookie_t));

	emsg = (vio_dring_reg_ext_msg_t *)buf;
	if (emsg->data_ncookies > VNET_DATA_AREA_COOKIES) {
		return (1);
	}

	/* save # of data area cookies */
	dp->data_ncookies = emsg->data_ncookies;

	/* save data area size */
	dp->data_sz = emsg->data_area_size;

	/* allocate ldc mem handle for data area */
	rv = ldc_mem_alloc_handle(ldcp->ldc_handle, &dp->data_handle);
	if (rv != 0) {
		cmn_err(CE_WARN, "ldc_mem_alloc_handle failed\n");
		DWARN(vswp, "%s (%lld) ldc_mem_alloc_handle() failed: %d\n",
		    __func__, ldcp->ldc_id, rv);
		return (1);
	}

	/* map the data area */
	rv = ldc_mem_map(dp->data_handle, emsg->data_cookie,
	    emsg->data_ncookies, LDC_DIRECT_MAP, LDC_MEM_R,
	    (caddr_t *)&dp->data_addr, NULL);
	if (rv != 0) {
		cmn_err(CE_WARN, "ldc_mem_map failed\n");
		DWARN(vswp, "%s (%lld) ldc_mem_map() failed: %d\n",
		    __func__, ldcp->ldc_id, rv);
		return (1);
	}

	/* get the map info */
	rv = ldc_mem_info(dp->data_handle, &minfo);
	if (rv != 0) {
		cmn_err(CE_WARN, "ldc_mem_info failed\n");
		DWARN(vswp, "%s (%lld) ldc_mem_info() failed: %d\n",
		    __func__, ldcp->ldc_id, rv);
		return (1);
	}

	if (minfo.mtype != LDC_DIRECT_MAP) {
		DWARN(vswp, "%s (%lld) mtype(%d) is not direct map\n",
		    __func__, ldcp->ldc_id, minfo.mtype);
		return (1);
	}

	/* allocate memory for data area cookies */
	dp->data_cookie = kmem_zalloc(emsg->data_ncookies *
	    sizeof (ldc_mem_cookie_t), KM_SLEEP);

	/* save data area cookies */
	bcopy(emsg->data_cookie, dp->data_cookie,
	    emsg->data_ncookies * sizeof (ldc_mem_cookie_t));

	return (0);
}

/*
 * Reset and free all the resources associated with the specified
 * lane (direction) of the channel.
 */
static void
vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir)
{
	lane_t	*lp;

	D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id);

	if (dir == INBOUND) {
		D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane"
		    " of channel %lld", __func__, ldcp->ldc_id);
		lp = &ldcp->lane_in;
	} else {
		D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane"
		    " of channel %lld", __func__, ldcp->ldc_id);
		lp = &ldcp->lane_out;
	}

	lp->lstate = VSW_LANE_INACTIV;
	lp->seq_num = VNET_ISS;

	if (dir == INBOUND) {
		/* Unmap the remote dring which is imported from the peer */
		vsw_unmap_dring(ldcp);
	} else {
		/* Destroy the local dring which is exported to the peer */
		vsw_destroy_dring(ldcp);
	}

	D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id);
}

/*
 * Destroy the descriptor ring.
 */
static void
vsw_destroy_dring(vsw_ldc_t *ldcp)
{
	lane_t	*lp = &ldcp->lane_out;

	if (lp->dring_mode == VIO_RX_DRING_DATA) {
		vsw_destroy_rx_dring(ldcp);
	} else {
		vsw_destroy_tx_dring(ldcp);
	}
}

/*
 * vsw_ldc_tx_worker -- A per-LDC worker thread to transmit data.
 * This thread is woken up by vsw_portsend() when there are
 * packets to transmit.
 */
static void
vsw_ldc_tx_worker(void *arg)
{
	callb_cpr_t	cprinfo;
	vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
	vsw_t *vswp = ldcp->ldc_vswp;
	mblk_t *mp;
	mblk_t *tmp;

	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
	CALLB_CPR_INIT(&cprinfo, &ldcp->tx_thr_lock, callb_generic_cpr,
	    "vnet_tx_thread");
	mutex_enter(&ldcp->tx_thr_lock);
	while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP)) {

		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		/*
		 * Wait until there is data to transmit or a stop
		 * request is received.
		 */
		while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP) &&
		    (ldcp->tx_mhead == NULL)) {
			cv_wait(&ldcp->tx_thr_cv, &ldcp->tx_thr_lock);
		}
		CALLB_CPR_SAFE_END(&cprinfo, &ldcp->tx_thr_lock);

		/*
		 * First process the stop request.
		 */
		if (ldcp->tx_thr_flags & VSW_WTHR_STOP) {
			D2(vswp, "%s(%lld):tx thread stopped\n",
			    __func__, ldcp->ldc_id);
			break;
		}
		mp = ldcp->tx_mhead;
		ldcp->tx_mhead = ldcp->tx_mtail = NULL;
		ldcp->tx_cnt = 0;
		mutex_exit(&ldcp->tx_thr_lock);
		D2(vswp, "%s(%lld):calling vsw_ldcsend\n",
		    __func__, ldcp->ldc_id);
		while (mp != NULL) {
			tmp = mp->b_next;
			mp->b_next = mp->b_prev = NULL;
			(void) vsw_ldcsend(ldcp, mp, vsw_ldc_tx_retries);
			mp = tmp;
		}
		mutex_enter(&ldcp->tx_thr_lock);
	}

	/*
	 * Update the run status and wake up the thread that
	 * sent the stop request.
	 */
	ldcp->tx_thr_flags &= ~VSW_WTHR_STOP;
	ldcp->tx_thread = NULL;
	CALLB_CPR_EXIT(&cprinfo);
	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
	thread_exit();
}

/* vsw_stop_tx_thread -- Co-ordinate with the transmit thread to stop it */
static void
vsw_stop_tx_thread(vsw_ldc_t *ldcp)
{
	kt_did_t	tid = 0;
	vsw_t		*vswp = ldcp->ldc_vswp;

	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
	/*
	 * Send a stop request by setting the stop flag and
	 * wait until the transmit thread stops.
	 */
	mutex_enter(&ldcp->tx_thr_lock);
	if (ldcp->tx_thread != NULL) {
		tid = ldcp->tx_thread->t_did;
		ldcp->tx_thr_flags |= VSW_WTHR_STOP;
		cv_signal(&ldcp->tx_thr_cv);
	}
	mutex_exit(&ldcp->tx_thr_lock);

	if (tid != 0) {
		thread_join(tid);
	}

	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
}

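/*
 * Check whether the channel provides enough direct shared memory map-in
 * space for the receive buffer area needed in RxDringData mode
 * (VSW_RXDRING_NRBUFS buffers of RXDRING_DBLK_SZ(max_frame_size) each).
 */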
static int
vsw_mapin_avail(vsw_ldc_t *ldcp)
{
	int		rv;
	ldc_info_t	info;
	uint64_t	mapin_sz_req;
	uint64_t	dblk_sz;
	vsw_t		*vswp = ldcp->ldc_vswp;

	rv = ldc_info(ldcp->ldc_handle, &info);
	if (rv != 0) {
		return (B_FALSE);
	}

	dblk_sz = RXDRING_DBLK_SZ(vswp->max_frame_size);
	mapin_sz_req = (VSW_RXDRING_NRBUFS * dblk_sz);

	if (info.direct_map_size_max >= mapin_sz_req) {
		return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Debugging routines
 */
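
/*
 * Dump the state of every vsw instance, its attached ports and channels,
 * including the handshake state of each lane.
 */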
static void
display_state(void)
{
	vsw_t		*vswp;
	vsw_port_list_t	*plist;
	vsw_port_t 	*port;
	vsw_ldc_t 	*ldcp;
	extern vsw_t 	*vsw_head;

	cmn_err(CE_NOTE, "***** system state *****");

	for (vswp = vsw_head; vswp; vswp = vswp->next) {
		plist = &vswp->plist;
		READ_ENTER(&plist->lockrw);
		cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n",
		    vswp->instance, plist->num_ports);

		for (port = plist->head; port != NULL; port = port->p_next) {
			cmn_err(CE_CONT, "port %d : %d ldcs attached\n",
			    port->p_instance, port->num_ldcs);
			ldcp = port->ldcp;
			cmn_err(CE_CONT, "chan %lu : dev %d : "
			    "status %d : phase %u\n",
			    ldcp->ldc_id, ldcp->dev_class,
			    ldcp->ldc_status, ldcp->hphase);
			cmn_err(CE_CONT, "chan %lu : lsession %lu : "
			    "psession %lu\n", ldcp->ldc_id,
			    ldcp->local_session, ldcp->peer_session);

			cmn_err(CE_CONT, "Inbound lane:\n");
			display_lane(&ldcp->lane_in);
			cmn_err(CE_CONT, "Outbound lane:\n");
			display_lane(&ldcp->lane_out);
		}
		RW_EXIT(&plist->lockrw);
	}
	cmn_err(CE_NOTE, "***** system state *****");
}

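/*
 * Dump the state of a single lane and its associated descriptor ring.
 */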
static void
display_lane(lane_t *lp)
{
	dring_info_t	*drp = lp->dringp;

	cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n",
	    lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu);
	cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n",
	    lp->addr_type, lp->addr, lp->xfer_mode);
	cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp);

	cmn_err(CE_CONT, "Dring info:\n");
	cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n",
	    drp->num_descriptors, drp->descriptor_size);
	cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->dring_handle);
	cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n",
	    (uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr);
	cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n",
	    drp->ident, drp->end_idx);
	display_ring(drp);
}

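/*
 * Walk the descriptor ring and report how many public and private
 * descriptors are currently in the FREE state.
 */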
static void
display_ring(dring_info_t *dringp)
{
	uint64_t		i;
	uint64_t		priv_count = 0;
	uint64_t		pub_count = 0;
	vnet_public_desc_t	*pub_addr = NULL;
	vsw_private_desc_t	*priv_addr = NULL;

	for (i = 0; i < vsw_num_descriptors; i++) {
		if (dringp->pub_addr != NULL) {
			pub_addr = (vnet_public_desc_t *)dringp->pub_addr + i;

			if (pub_addr->hdr.dstate == VIO_DESC_FREE)
				pub_count++;
		}

		if (dringp->priv_addr != NULL) {
			priv_addr = (vsw_private_desc_t *)dringp->priv_addr + i;

			if (priv_addr->dstate == VIO_DESC_FREE)
				priv_count++;
		}
	}
	cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n",
	    i, priv_count, pub_count);
}

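/*
 * Print the symbolic names of the handshake state flags that are set
 * in the given lane state value.
 */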
static void
dump_flags(uint64_t state)
{
	int	i;

	typedef struct flag_name {
		int	flag_val;
		char	*flag_name;
	} flag_name_t;

	flag_name_t	flags[] = {
		{ VSW_VER_INFO_SENT,	"VSW_VER_INFO_SENT" },
		{ VSW_VER_INFO_RECV,	"VSW_VER_INFO_RECV" },
		{ VSW_VER_ACK_RECV,	"VSW_VER_ACK_RECV" },
		{ VSW_VER_ACK_SENT,	"VSW_VER_ACK_SENT" },
		{ VSW_VER_NACK_RECV,	"VSW_VER_NACK_RECV" },
		{ VSW_VER_NACK_SENT,	"VSW_VER_NACK_SENT" },
		{ VSW_ATTR_INFO_SENT,	"VSW_ATTR_INFO_SENT" },
		{ VSW_ATTR_INFO_RECV,	"VSW_ATTR_INFO_RECV" },
		{ VSW_ATTR_ACK_SENT,	"VSW_ATTR_ACK_SENT" },
		{ VSW_ATTR_ACK_RECV,	"VSW_ATTR_ACK_RECV" },
		{ VSW_ATTR_NACK_SENT,	"VSW_ATTR_NACK_SENT" },
		{ VSW_ATTR_NACK_RECV,	"VSW_ATTR_NACK_RECV" },
		{ VSW_DRING_INFO_SENT,	"VSW_DRING_INFO_SENT" },
		{ VSW_DRING_INFO_RECV,	"VSW_DRING_INFO_RECV" },
		{ VSW_DRING_ACK_SENT,	"VSW_DRING_ACK_SENT" },
		{ VSW_DRING_ACK_RECV,	"VSW_DRING_ACK_RECV" },
		{ VSW_DRING_NACK_SENT,	"VSW_DRING_NACK_SENT" },
		{ VSW_DRING_NACK_RECV,	"VSW_DRING_NACK_RECV" },
		{ VSW_RDX_INFO_SENT,	"VSW_RDX_INFO_SENT" },
		{ VSW_RDX_INFO_RECV,	"VSW_RDX_INFO_RECV" },
		{ VSW_RDX_ACK_SENT,	"VSW_RDX_ACK_SENT" },
		{ VSW_RDX_ACK_RECV,	"VSW_RDX_ACK_RECV" },
		{ VSW_RDX_NACK_SENT,	"VSW_RDX_NACK_SENT" },
		{ VSW_RDX_NACK_RECV,	"VSW_RDX_NACK_RECV" },
		{ VSW_MCST_INFO_SENT,	"VSW_MCST_INFO_SENT" },
		{ VSW_MCST_INFO_RECV,	"VSW_MCST_INFO_RECV" },
		{ VSW_MCST_ACK_SENT,	"VSW_MCST_ACK_SENT" },
		{ VSW_MCST_ACK_RECV,	"VSW_MCST_ACK_RECV" },
		{ VSW_MCST_NACK_SENT,	"VSW_MCST_NACK_SENT" },
		{ VSW_MCST_NACK_RECV,	"VSW_MCST_NACK_RECV" },
		{ VSW_LANE_ACTIVE,	"VSW_LANE_ACTIVE" }
	};

	DERR(NULL, "DUMP_FLAGS: %llx\n", state);
	for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) {
		if (state & flags[i].flag_val)
			DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name);
	}
}