xref: /titanic_52/usr/src/uts/sun4v/io/vsw_ldc.c (revision 655967ab7780f789dd00eece4464be60cd9aff5e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/errno.h>
28 #include <sys/debug.h>
29 #include <sys/time.h>
30 #include <sys/sysmacros.h>
31 #include <sys/systm.h>
32 #include <sys/user.h>
33 #include <sys/stropts.h>
34 #include <sys/stream.h>
35 #include <sys/strlog.h>
36 #include <sys/strsubr.h>
37 #include <sys/cmn_err.h>
38 #include <sys/cpu.h>
39 #include <sys/kmem.h>
40 #include <sys/conf.h>
41 #include <sys/ddi.h>
42 #include <sys/sunddi.h>
43 #include <sys/ksynch.h>
44 #include <sys/stat.h>
45 #include <sys/kstat.h>
46 #include <sys/vtrace.h>
47 #include <sys/strsun.h>
48 #include <sys/dlpi.h>
49 #include <sys/ethernet.h>
50 #include <net/if.h>
51 #include <sys/varargs.h>
52 #include <sys/machsystm.h>
53 #include <sys/modctl.h>
54 #include <sys/modhash.h>
55 #include <sys/mac.h>
56 #include <sys/mac_ether.h>
57 #include <sys/taskq.h>
58 #include <sys/note.h>
59 #include <sys/mach_descrip.h>
60 #include <sys/mdeg.h>
61 #include <sys/ldc.h>
62 #include <sys/vsw_fdb.h>
63 #include <sys/vsw.h>
64 #include <sys/vio_mailbox.h>
65 #include <sys/vnet_mailbox.h>
66 #include <sys/vnet_common.h>
67 #include <sys/vio_util.h>
68 #include <sys/sdt.h>
69 #include <sys/atomic.h>
70 #include <sys/callb.h>
71 #include <sys/vlan.h>
72 
73 /* Port add/deletion/etc routines */
74 static	void vsw_port_delete(vsw_port_t *port);
75 static	int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id);
76 static	void vsw_ldc_detach(vsw_ldc_t *ldcp);
77 static	int vsw_ldc_init(vsw_ldc_t *ldcp);
78 static	void vsw_ldc_uninit(vsw_ldc_t *ldcp);
79 static	void vsw_ldc_drain(vsw_ldc_t *ldcp);
80 static	void vsw_drain_port_taskq(vsw_port_t *port);
81 static	void vsw_marker_task(void *);
82 static	int vsw_plist_del_node(vsw_t *, vsw_port_t *port);
83 void vsw_detach_ports(vsw_t *vswp);
84 int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
85 mcst_addr_t *vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr);
86 int vsw_port_detach(vsw_t *vswp, int p_instance);
87 int vsw_portsend(vsw_port_t *port, mblk_t *mp);
88 int vsw_port_attach(vsw_port_t *portp);
89 vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);
90 void vsw_vlan_unaware_port_reset(vsw_port_t *portp);
91 void vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate);
92 void vsw_reset_ports(vsw_t *vswp);
93 void vsw_port_reset(vsw_port_t *portp);
94 void vsw_physlink_update_ports(vsw_t *vswp);
95 static	void vsw_port_physlink_update(vsw_port_t *portp);
96 
97 /* Interrupt routines */
98 static	uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg);
99 
100 /* Handshake routines */
101 static	void vsw_ldc_reinit(vsw_ldc_t *);
102 static	void vsw_conn_task(void *);
103 static	int vsw_check_flag(vsw_ldc_t *, int, uint64_t);
104 static	void vsw_next_milestone(vsw_ldc_t *);
105 static	int vsw_supported_version(vio_ver_msg_t *);
106 static	void vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp);
107 static	void vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp);
108 void vsw_process_conn_evt(vsw_ldc_t *, uint16_t);
109 
110 /* Data processing routines */
111 void vsw_process_pkt(void *);
112 static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t *, int);
113 static void vsw_process_ctrl_pkt(void *);
114 static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *);
115 static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *);
116 static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *);
117 static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *);
118 static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *);
119 static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *);
120 static void vsw_process_physlink_msg(vsw_ldc_t *, void *);
121 static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *,
122 	uint32_t);
123 static void vsw_process_pkt_data_nop(void *, void *, uint32_t);
124 static void vsw_process_pkt_data(void *, void *, uint32_t);
125 static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *);
126 static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *);
127 static void vsw_process_evt_read(vsw_ldc_t *ldcp);
128 static void vsw_ldc_rcv(vsw_ldc_t *ldcp);
129 
130 /* Switching/data transmit routines */
131 static	int vsw_descrsend(vsw_ldc_t *, mblk_t *);
132 static void vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp);
133 static int vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries);
134 static int vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count);
135 static int vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count);
136 
137 /* Packet creation routines */
138 static void vsw_send_ver(void *);
139 static void vsw_send_attr(vsw_ldc_t *);
140 static void vsw_send_dring_info(vsw_ldc_t *);
141 static void vsw_send_rdx(vsw_ldc_t *);
142 static void vsw_send_physlink_msg(vsw_ldc_t *ldcp, link_state_t plink_state);
143 
144 /* Dring routines */
145 static void vsw_create_privring(vsw_ldc_t *);
146 static dring_info_t *vsw_map_dring(vsw_ldc_t *ldcp, void *pkt);
147 static void vsw_unmap_dring(vsw_ldc_t *ldcp);
148 static void vsw_destroy_dring(vsw_ldc_t *ldcp);
149 static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t);
150 static int vsw_map_data(vsw_ldc_t *ldcp, dring_info_t *dp, void *pkt);
151 static void vsw_set_lane_attr(vsw_t *, lane_t *);
152 dring_info_t *vsw_map_dring_cmn(vsw_ldc_t *ldcp,
153     vio_dring_reg_msg_t *dring_pkt);
154 static int vsw_mapin_avail(vsw_ldc_t *ldcp);
155 
156 /* tx/msg/rcv thread routines */
157 static void vsw_stop_tx_thread(vsw_ldc_t *ldcp);
158 static void vsw_ldc_tx_worker(void *arg);
159 
160 /* Misc support routines */
161 static void vsw_save_lmacaddr(vsw_t *vswp, uint64_t macaddr);
162 static int vsw_get_same_dest_list(struct ether_header *ehp,
163     mblk_t **rhead, mblk_t **rtail, mblk_t **mpp);
164 static mblk_t *vsw_dupmsgchain(mblk_t *mp);
165 
166 /* Debugging routines */
167 static void dump_flags(uint64_t);
168 static void display_state(void);
169 static void display_lane(lane_t *);
170 static void display_ring(dring_info_t *);
171 
172 /*
173  * Functions imported from other files.
174  */
175 extern int vsw_set_hw(vsw_t *, vsw_port_t *, int);
176 extern void vsw_unset_hw(vsw_t *, vsw_port_t *, int);
177 extern int vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port);
178 extern void vsw_del_mcst_port(vsw_port_t *port);
179 extern int vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
180 extern int vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
181 extern void vsw_fdbe_add(vsw_t *vswp, void *port);
182 extern void vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr);
183 extern void vsw_create_vlans(void *arg, int type);
184 extern void vsw_destroy_vlans(void *arg, int type);
185 extern void vsw_vlan_add_ids(void *arg, int type);
186 extern void vsw_vlan_remove_ids(void *arg, int type);
187 extern boolean_t vsw_frame_lookup_vid(void *arg, int caller,
188 	struct ether_header *ehp, uint16_t *vidp);
189 extern mblk_t *vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp);
190 extern uint32_t vsw_vlan_frame_untag(void *arg, int type, mblk_t **np,
191 	mblk_t **npt);
192 extern boolean_t vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid);
193 extern void vsw_hio_start(vsw_t *vswp, vsw_ldc_t *ldcp);
194 extern void vsw_hio_stop(vsw_t *vswp, vsw_ldc_t *ldcp);
195 extern void vsw_process_dds_msg(vsw_t *vswp, vsw_ldc_t *ldcp, void *msg);
196 extern void vsw_hio_stop_port(vsw_port_t *portp);
197 extern void vsw_publish_macaddr(vsw_t *vswp, vsw_port_t *portp);
198 extern int vsw_mac_client_init(vsw_t *vswp, vsw_port_t *port, int type);
199 extern void vsw_mac_client_cleanup(vsw_t *vswp, vsw_port_t *port, int type);
200 extern void vsw_destroy_rxpools(void *arg);
201 extern void vsw_stop_msg_thread(vsw_ldc_t *ldcp);
202 extern int vsw_send_msg(vsw_ldc_t *, void *, int, boolean_t);
203 extern int vsw_dringsend(vsw_ldc_t *, mblk_t *);
204 extern int vsw_reclaim_dring(dring_info_t *dp, int start);
205 extern int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **,
206     int *);
207 extern vio_dring_reg_msg_t *vsw_create_tx_dring_info(vsw_ldc_t *);
208 extern int vsw_setup_tx_dring(vsw_ldc_t *ldcp, dring_info_t *dp);
209 extern void vsw_destroy_tx_dring(vsw_ldc_t *ldcp);
210 extern dring_info_t *vsw_map_rx_dring(vsw_ldc_t *ldcp, void *pkt);
211 extern void vsw_unmap_rx_dring(vsw_ldc_t *ldcp);
212 extern void vsw_ldc_msg_worker(void *arg);
213 extern void vsw_process_dringdata(void *, void *);
214 extern vio_dring_reg_msg_t *vsw_create_rx_dring_info(vsw_ldc_t *);
215 extern void vsw_destroy_rx_dring(vsw_ldc_t *ldcp);
216 extern dring_info_t *vsw_map_tx_dring(vsw_ldc_t *ldcp, void *pkt);
217 extern void vsw_unmap_tx_dring(vsw_ldc_t *ldcp);
218 extern void vsw_ldc_rcv_worker(void *arg);
219 extern void vsw_stop_rcv_thread(vsw_ldc_t *ldcp);
220 extern int vsw_dringsend_shm(vsw_ldc_t *, mblk_t *);
221 extern void vsw_process_dringdata_shm(void *, void *);
222 
223 /*
224  * Tunables used in this file.
225  */
226 extern int vsw_num_handshakes;
227 extern int vsw_ldc_tx_delay;
228 extern int vsw_ldc_tx_retries;
229 extern int vsw_ldc_retries;
230 extern int vsw_ldc_delay;
231 extern boolean_t vsw_ldc_rxthr_enabled;
232 extern boolean_t vsw_ldc_txthr_enabled;
233 extern uint32_t vsw_num_descriptors;
234 extern uint8_t  vsw_dring_mode;
235 extern uint32_t vsw_max_tx_qcount;
236 extern boolean_t vsw_obp_ver_proto_workaround;
237 extern uint32_t vsw_publish_macaddr_count;
238 extern uint32_t vsw_nrbufs_factor;
239 
/*
 * Acquire / release all three per-channel locks as a unit.
 *
 * The locks are always taken in the order ldc_cblock -> ldc_rxlock ->
 * ldc_txlock and dropped in the reverse order.
 *
 * Each macro expands to exactly one statement (do { } while (0)) so it can
 * be used safely as the body of an unbraced if/else; invoke it like a
 * function call, with a trailing semicolon. Note that 'ldcp' is evaluated
 * three times, so it must not have side effects.
 */
#define	LDC_ENTER_LOCK(ldcp)	\
	do {						\
		mutex_enter(&((ldcp)->ldc_cblock));	\
		mutex_enter(&((ldcp)->ldc_rxlock));	\
		mutex_enter(&((ldcp)->ldc_txlock));	\
	} while (0)
#define	LDC_EXIT_LOCK(ldcp)	\
	do {						\
		mutex_exit(&((ldcp)->ldc_txlock));	\
		mutex_exit(&((ldcp)->ldc_rxlock));	\
		mutex_exit(&((ldcp)->ldc_cblock));	\
	} while (0)
248 
/*
 * Compare the protocol version recorded in a channel's outbound lane
 * (lane_out.ver_major / lane_out.ver_minor) against (major, minor).
 * Each macro evaluates to non-zero when the relation holds, 0 otherwise.
 * The major number is compared first; the minor number only matters when
 * the major numbers are equal.
 */
#define	VSW_VER_EQ(ldcp, major, minor)	\
	(((ldcp)->lane_out.ver_major == (major)) ?	\
	    ((ldcp)->lane_out.ver_minor == (minor)) : 0)

#define	VSW_VER_LT(ldcp, major, minor)	\
	(((ldcp)->lane_out.ver_major < (major)) ? 1 :	\
	    (((ldcp)->lane_out.ver_major == (major)) ?	\
	    ((ldcp)->lane_out.ver_minor < (minor)) : 0))

#define	VSW_VER_GTEQ(ldcp, major, minor)	\
	(((ldcp)->lane_out.ver_major > (major)) ? 1 :	\
	    (((ldcp)->lane_out.ver_major == (major)) ?	\
	    ((ldcp)->lane_out.ver_minor >= (minor)) : 0))

#define	VSW_VER_LTEQ(ldcp, major, minor)	\
	(((ldcp)->lane_out.ver_major < (major)) ? 1 :	\
	    (((ldcp)->lane_out.ver_major == (major)) ?	\
	    ((ldcp)->lane_out.ver_minor <= (minor)) : 0))
267 
/*
 * VIO Protocol Version Info:
 *
 * The version specified below represents the version of protocol currently
 * supported in the driver. It means the driver can negotiate with peers with
 * versions <= this version. Here is a summary of the feature(s) that are
 * supported at each version of the protocol:
 *
 * 1.0			Basic VIO protocol.
 * 1.1			vDisk protocol update (no virtual network update).
 * 1.2			Support for priority frames (priority-ether-types).
 * 1.3			VLAN and HybridIO support.
 * 1.4			Jumbo Frame support.
 * 1.5			Link State Notification support with optional support
 *			for Physical Link information.
 * 1.6			Support for RxDringData mode.
 *
 * The table holds a single {major, minor} entry: 1.6.
 * NOTE(review): presumably this table is consulted by
 * vsw_supported_version() during version negotiation - confirm there.
 */
static	ver_sup_t	vsw_versions[] = { {1, 6} };
286 
/*
 * For the moment the state dump routines have their own
 * private flag.
 *
 * DUMP_STATE is a compile-time switch: when it is non-zero the DUMP_* and
 * DISPLAY_STATE() macros below expand to real debug output, and when it is
 * 0 (the setting here) they all expand to nothing.
 */
#define	DUMP_STATE	0

#if DUMP_STATE

/*
 * Log the vio_msgtype, vio_subtype and vio_subtype_env fields of a tag
 * passed by value, using D1(). Expands to a braced block.
 */
#define	DUMP_TAG(tag) \
{			\
	D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype); \
	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype);	\
	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env);	\
}

/* Same as DUMP_TAG(), but for a tag passed by pointer. */
#define	DUMP_TAG_PTR(tag) \
{			\
	D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype); \
	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype);	\
	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env);	\
}

/*
 * Thin wrappers around dump_flags() and display_state().
 * NOTE(review): DUMP_FLAGS carries its own trailing ';' while
 * DISPLAY_STATE() does not; callers that add a ';' get an extra empty
 * statement from DUMP_FLAGS.
 */
#define	DUMP_FLAGS(flags) dump_flags(flags);
#define	DISPLAY_STATE()	display_state()

#else

/* State dumping disabled: every helper compiles away to nothing. */
#define	DUMP_TAG(tag)
#define	DUMP_TAG_PTR(tag)
#define	DUMP_FLAGS(state)
#define	DISPLAY_STATE()

#endif	/* DUMP_STATE */
320 
321 /*
322  * Attach the specified port.
323  *
324  * Returns 0 on success, 1 on failure.
325  */
326 int
327 vsw_port_attach(vsw_port_t *port)
328 {
329 	vsw_t			*vswp = port->p_vswp;
330 	vsw_port_list_t		*plist = &vswp->plist;
331 	vsw_port_t		*p, **pp;
332 	int			nids = port->num_ldcs;
333 	uint64_t		*ldcids;
334 	int			rv;
335 
336 	D1(vswp, "%s: enter : port %d", __func__, port->p_instance);
337 
338 	/* port already exists? */
339 	READ_ENTER(&plist->lockrw);
340 	for (p = plist->head; p != NULL; p = p->p_next) {
341 		if (p->p_instance == port->p_instance) {
342 			DWARN(vswp, "%s: port instance %d already attached",
343 			    __func__, p->p_instance);
344 			RW_EXIT(&plist->lockrw);
345 			return (1);
346 		}
347 	}
348 	RW_EXIT(&plist->lockrw);
349 
350 	mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL);
351 	mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL);
352 	rw_init(&port->maccl_rwlock, NULL, RW_DRIVER, NULL);
353 
354 	mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL);
355 	cv_init(&port->state_cv, NULL, CV_DRIVER, NULL);
356 	port->state = VSW_PORT_INIT;
357 
358 	D2(vswp, "%s: %d nids", __func__, nids);
359 	ldcids = port->ldc_ids;
360 	D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[0]);
361 	if (vsw_ldc_attach(port, (uint64_t)ldcids[0]) != 0) {
362 		DERR(vswp, "%s: ldc_attach failed", __func__);
363 		goto exit_error;
364 	}
365 
366 	if (vswp->switching_setup_done == B_TRUE) {
367 		/*
368 		 * If the underlying network device has been setup,
369 		 * then open a mac client and porgram the mac address
370 		 * for this port.
371 		 */
372 		rv = vsw_mac_client_init(vswp, port, VSW_VNETPORT);
373 		if (rv != 0) {
374 			goto exit_error;
375 		}
376 	}
377 
378 	/* create the fdb entry for this port/mac address */
379 	vsw_fdbe_add(vswp, port);
380 
381 	vsw_create_vlans(port, VSW_VNETPORT);
382 
383 	WRITE_ENTER(&plist->lockrw);
384 
385 	/* link it into the list of ports for this vsw instance */
386 	pp = (vsw_port_t **)(&plist->head);
387 	port->p_next = *pp;
388 	*pp = port;
389 	plist->num_ports++;
390 
391 	RW_EXIT(&plist->lockrw);
392 
393 	/*
394 	 * Initialise the port and any ldc's under it.
395 	 */
396 	(void) vsw_ldc_init(port->ldcp);
397 
398 	/* announce macaddr of vnet to the physical switch */
399 	if (vsw_publish_macaddr_count != 0) {	/* enabled */
400 		vsw_publish_macaddr(vswp, port);
401 	}
402 
403 	D1(vswp, "%s: exit", __func__);
404 	return (0);
405 
406 exit_error:
407 
408 	cv_destroy(&port->state_cv);
409 	mutex_destroy(&port->state_lock);
410 
411 	rw_destroy(&port->maccl_rwlock);
412 	mutex_destroy(&port->tx_lock);
413 	mutex_destroy(&port->mca_lock);
414 	kmem_free(port, sizeof (vsw_port_t));
415 	return (1);
416 }
417 
418 /*
419  * Detach the specified port.
420  *
421  * Returns 0 on success, 1 on failure.
422  */
423 int
424 vsw_port_detach(vsw_t *vswp, int p_instance)
425 {
426 	vsw_port_t	*port = NULL;
427 	vsw_port_list_t	*plist = &vswp->plist;
428 
429 	D1(vswp, "%s: enter: port id %d", __func__, p_instance);
430 
431 	WRITE_ENTER(&plist->lockrw);
432 
433 	if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) {
434 		RW_EXIT(&plist->lockrw);
435 		return (1);
436 	}
437 
438 	if (vsw_plist_del_node(vswp, port)) {
439 		RW_EXIT(&plist->lockrw);
440 		return (1);
441 	}
442 
443 	/* cleanup any HybridIO for this port */
444 	vsw_hio_stop_port(port);
445 
446 	/*
447 	 * No longer need to hold writer lock on port list now
448 	 * that we have unlinked the target port from the list.
449 	 */
450 	RW_EXIT(&plist->lockrw);
451 
452 	/* Cleanup and close the mac client */
453 	vsw_mac_client_cleanup(vswp, port, VSW_VNETPORT);
454 
455 	/* Remove the fdb entry for this port/mac address */
456 	vsw_fdbe_del(vswp, &(port->p_macaddr));
457 	vsw_destroy_vlans(port, VSW_VNETPORT);
458 
459 	/* Remove any multicast addresses.. */
460 	vsw_del_mcst_port(port);
461 
462 	vsw_port_delete(port);
463 
464 	D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance);
465 	return (0);
466 }
467 
468 /*
469  * Detach all active ports.
470  */
471 void
472 vsw_detach_ports(vsw_t *vswp)
473 {
474 	vsw_port_list_t 	*plist = &vswp->plist;
475 	vsw_port_t		*port = NULL;
476 
477 	D1(vswp, "%s: enter", __func__);
478 
479 	WRITE_ENTER(&plist->lockrw);
480 
481 	while ((port = plist->head) != NULL) {
482 		(void) vsw_plist_del_node(vswp, port);
483 
484 		/* cleanup any HybridIO for this port */
485 		vsw_hio_stop_port(port);
486 
487 		/* Cleanup and close the mac client */
488 		vsw_mac_client_cleanup(vswp, port, VSW_VNETPORT);
489 
490 		/* Remove the fdb entry for this port/mac address */
491 		vsw_fdbe_del(vswp, &(port->p_macaddr));
492 		vsw_destroy_vlans(port, VSW_VNETPORT);
493 
494 		/* Remove any multicast addresses.. */
495 		vsw_del_mcst_port(port);
496 
497 		/*
498 		 * No longer need to hold the lock on the port list
499 		 * now that we have unlinked the target port from the
500 		 * list.
501 		 */
502 		RW_EXIT(&plist->lockrw);
503 		vsw_port_delete(port);
504 		WRITE_ENTER(&plist->lockrw);
505 	}
506 	RW_EXIT(&plist->lockrw);
507 
508 	D1(vswp, "%s: exit", __func__);
509 }
510 
511 /*
512  * Delete the specified port.
513  */
514 static void
515 vsw_port_delete(vsw_port_t *port)
516 {
517 	vsw_t			*vswp = port->p_vswp;
518 
519 	D1(vswp, "%s: enter : port id %d", __func__, port->p_instance);
520 
521 	vsw_ldc_uninit(port->ldcp);
522 
523 	/*
524 	 * Wait for any pending ctrl msg tasks which reference this
525 	 * port to finish.
526 	 */
527 	vsw_drain_port_taskq(port);
528 
529 	/*
530 	 * Wait for any active callbacks to finish
531 	 */
532 	vsw_ldc_drain(port->ldcp);
533 
534 	vsw_ldc_detach(port->ldcp);
535 
536 	rw_destroy(&port->maccl_rwlock);
537 	mutex_destroy(&port->mca_lock);
538 	mutex_destroy(&port->tx_lock);
539 
540 	cv_destroy(&port->state_cv);
541 	mutex_destroy(&port->state_lock);
542 
543 	if (port->num_ldcs != 0) {
544 		kmem_free(port->ldc_ids, port->num_ldcs * sizeof (uint64_t));
545 		port->num_ldcs = 0;
546 	}
547 
548 	if (port->nvids != 0) {
549 		kmem_free(port->vids, sizeof (vsw_vlanid_t) * port->nvids);
550 	}
551 
552 	kmem_free(port, sizeof (vsw_port_t));
553 
554 	D1(vswp, "%s: exit", __func__);
555 }
556 
557 /*
558  * Attach a logical domain channel (ldc) under a specified port.
559  *
560  * Returns 0 on success, 1 on failure.
561  */
562 static int
563 vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id)
564 {
565 	vsw_t 		*vswp = port->p_vswp;
566 	vsw_ldc_t 	*ldcp = NULL;
567 	ldc_attr_t 	attr;
568 	ldc_status_t	istatus;
569 	int 		status = DDI_FAILURE;
570 	char		kname[MAXNAMELEN];
571 	enum		{ PROG_init = 0x0,
572 			    PROG_callback = 0x1,
573 			    PROG_tx_thread = 0x2}
574 			progress;
575 
576 	progress = PROG_init;
577 
578 	D1(vswp, "%s: enter", __func__);
579 
580 	ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP);
581 	if (ldcp == NULL) {
582 		DERR(vswp, "%s: kmem_zalloc failed", __func__);
583 		return (1);
584 	}
585 	ldcp->ldc_id = ldc_id;
586 
587 	mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL);
588 	mutex_init(&ldcp->ldc_rxlock, NULL, MUTEX_DRIVER, NULL);
589 	mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL);
590 	ldcp->msg_thr_flags = 0;
591 	mutex_init(&ldcp->msg_thr_lock, NULL, MUTEX_DRIVER, NULL);
592 	cv_init(&ldcp->msg_thr_cv, NULL, CV_DRIVER, NULL);
593 	ldcp->rcv_thr_flags = 0;
594 	mutex_init(&ldcp->rcv_thr_lock, NULL, MUTEX_DRIVER, NULL);
595 	cv_init(&ldcp->rcv_thr_cv, NULL, CV_DRIVER, NULL);
596 	mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL);
597 	cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL);
598 
599 	/* required for handshake with peer */
600 	ldcp->local_session = (uint64_t)ddi_get_lbolt();
601 	ldcp->peer_session = 0;
602 	ldcp->session_status = 0;
603 	ldcp->hss_id = 1;	/* Initial handshake session id */
604 	ldcp->hphase = VSW_MILESTONE0;
605 
606 	(void) atomic_swap_32(&port->p_hio_capable, B_FALSE);
607 
608 	/* only set for outbound lane, inbound set by peer */
609 	vsw_set_lane_attr(vswp, &ldcp->lane_out);
610 
611 	attr.devclass = LDC_DEV_NT_SVC;
612 	attr.instance = ddi_get_instance(vswp->dip);
613 	attr.mode = LDC_MODE_UNRELIABLE;
614 	attr.mtu = VSW_LDC_MTU;
615 	status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle);
616 	if (status != 0) {
617 		DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)",
618 		    __func__, ldc_id, status);
619 		goto ldc_attach_fail;
620 	}
621 
622 	if (vsw_ldc_txthr_enabled) {
623 		ldcp->tx_thr_flags = 0;
624 		ldcp->tx_mhead = ldcp->tx_mtail = NULL;
625 
626 		mutex_init(&ldcp->tx_thr_lock, NULL, MUTEX_DRIVER, NULL);
627 		cv_init(&ldcp->tx_thr_cv, NULL, CV_DRIVER, NULL);
628 		ldcp->tx_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
629 		    vsw_ldc_tx_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
630 
631 		progress |= PROG_tx_thread;
632 		if (ldcp->tx_thread == NULL) {
633 			DWARN(vswp, "%s(%lld): Failed to create worker thread",
634 			    __func__, ldc_id);
635 			goto ldc_attach_fail;
636 		}
637 	}
638 
639 	status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp);
640 	if (status != 0) {
641 		DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)",
642 		    __func__, ldc_id, status);
643 		(void) ldc_fini(ldcp->ldc_handle);
644 		goto ldc_attach_fail;
645 	}
646 	/*
647 	 * allocate a message for ldc_read()s, big enough to hold ctrl and
648 	 * data msgs, including raw data msgs used to recv priority frames.
649 	 */
650 	ldcp->msglen = VIO_PKT_DATA_HDRSIZE + vswp->max_frame_size;
651 	ldcp->ldcmsg = kmem_alloc(ldcp->msglen, KM_SLEEP);
652 
653 	progress |= PROG_callback;
654 
655 	mutex_init(&ldcp->status_lock, NULL, MUTEX_DRIVER, NULL);
656 
657 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
658 		DERR(vswp, "%s: ldc_status failed", __func__);
659 		mutex_destroy(&ldcp->status_lock);
660 		goto ldc_attach_fail;
661 	}
662 
663 	ldcp->ldc_status = istatus;
664 	ldcp->ldc_port = port;
665 	ldcp->ldc_vswp = vswp;
666 
667 	vsw_reset_vnet_proto_ops(ldcp);
668 
669 	(void) sprintf(kname, "%sldc0x%lx", DRV_NAME, ldcp->ldc_id);
670 	ldcp->ksp = vgen_setup_kstats(DRV_NAME, vswp->instance,
671 	    kname, &ldcp->ldc_stats);
672 	if (ldcp->ksp == NULL) {
673 		DERR(vswp, "%s: kstats setup failed", __func__);
674 		goto ldc_attach_fail;
675 	}
676 
677 	/* link it into this port */
678 	port->ldcp = ldcp;
679 
680 	D1(vswp, "%s: exit", __func__);
681 	return (0);
682 
683 ldc_attach_fail:
684 
685 	if (progress & PROG_callback) {
686 		(void) ldc_unreg_callback(ldcp->ldc_handle);
687 		kmem_free(ldcp->ldcmsg, ldcp->msglen);
688 	}
689 
690 	if (progress & PROG_tx_thread) {
691 		if (ldcp->tx_thread != NULL) {
692 			vsw_stop_tx_thread(ldcp);
693 		}
694 		mutex_destroy(&ldcp->tx_thr_lock);
695 		cv_destroy(&ldcp->tx_thr_cv);
696 	}
697 	if (ldcp->ksp != NULL) {
698 		vgen_destroy_kstats(ldcp->ksp);
699 	}
700 	mutex_destroy(&ldcp->msg_thr_lock);
701 	mutex_destroy(&ldcp->rcv_thr_lock);
702 	mutex_destroy(&ldcp->ldc_txlock);
703 	mutex_destroy(&ldcp->ldc_rxlock);
704 	mutex_destroy(&ldcp->ldc_cblock);
705 	mutex_destroy(&ldcp->drain_cv_lock);
706 	cv_destroy(&ldcp->msg_thr_cv);
707 	cv_destroy(&ldcp->rcv_thr_cv);
708 	cv_destroy(&ldcp->drain_cv);
709 
710 	kmem_free(ldcp, sizeof (vsw_ldc_t));
711 
712 	return (1);
713 }
714 
715 /*
716  * Detach a logical domain channel (ldc) belonging to a
717  * particular port.
718  */
719 static void
720 vsw_ldc_detach(vsw_ldc_t *ldcp)
721 {
722 	int 		rv;
723 	vsw_t 		*vswp = ldcp->ldc_port->p_vswp;
724 	int		retries = 0;
725 
726 	D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id);
727 
728 	/* Stop msg/rcv thread */
729 	if (ldcp->rcv_thread != NULL) {
730 		vsw_stop_rcv_thread(ldcp);
731 	} else if (ldcp->msg_thread != NULL) {
732 		vsw_stop_msg_thread(ldcp);
733 	}
734 	kmem_free(ldcp->ldcmsg, ldcp->msglen);
735 
736 	/* Stop the tx thread */
737 	if (ldcp->tx_thread != NULL) {
738 		vsw_stop_tx_thread(ldcp);
739 		mutex_destroy(&ldcp->tx_thr_lock);
740 		cv_destroy(&ldcp->tx_thr_cv);
741 		if (ldcp->tx_mhead != NULL) {
742 			freemsgchain(ldcp->tx_mhead);
743 			ldcp->tx_mhead = ldcp->tx_mtail = NULL;
744 			ldcp->tx_cnt = 0;
745 		}
746 	}
747 
748 	/* Destory kstats */
749 	vgen_destroy_kstats(ldcp->ksp);
750 
751 	/*
752 	 * Before we can close the channel we must release any mapped
753 	 * resources (e.g. drings).
754 	 */
755 	vsw_free_lane_resources(ldcp, INBOUND);
756 	vsw_free_lane_resources(ldcp, OUTBOUND);
757 
758 	/*
759 	 * Close the channel, retry on EAAGIN.
760 	 */
761 	while ((rv = ldc_close(ldcp->ldc_handle)) == EAGAIN) {
762 		if (++retries > vsw_ldc_retries) {
763 			break;
764 		}
765 		drv_usecwait(vsw_ldc_delay);
766 	}
767 	if (rv != 0) {
768 		cmn_err(CE_NOTE,
769 		    "!vsw%d: Error(%d) closing the channel(0x%lx)\n",
770 		    vswp->instance, rv, ldcp->ldc_id);
771 	}
772 
773 	(void) ldc_fini(ldcp->ldc_handle);
774 
775 	ldcp->ldc_status = LDC_INIT;
776 	ldcp->ldc_handle = NULL;
777 	ldcp->ldc_vswp = NULL;
778 
779 	mutex_destroy(&ldcp->msg_thr_lock);
780 	mutex_destroy(&ldcp->rcv_thr_lock);
781 	mutex_destroy(&ldcp->ldc_txlock);
782 	mutex_destroy(&ldcp->ldc_rxlock);
783 	mutex_destroy(&ldcp->ldc_cblock);
784 	mutex_destroy(&ldcp->drain_cv_lock);
785 	mutex_destroy(&ldcp->status_lock);
786 	cv_destroy(&ldcp->msg_thr_cv);
787 	cv_destroy(&ldcp->rcv_thr_cv);
788 	cv_destroy(&ldcp->drain_cv);
789 
790 	kmem_free(ldcp, sizeof (vsw_ldc_t));
791 }
792 
793 /*
794  * Open and attempt to bring up the channel. Note that channel
795  * can only be brought up if peer has also opened channel.
796  *
797  * Returns 0 if can open and bring up channel, otherwise
798  * returns 1.
799  */
800 static int
801 vsw_ldc_init(vsw_ldc_t *ldcp)
802 {
803 	vsw_t 		*vswp = ldcp->ldc_vswp;
804 	ldc_status_t	istatus = 0;
805 	int		rv;
806 
807 	D1(vswp, "%s: enter", __func__);
808 
809 	LDC_ENTER_LOCK(ldcp);
810 
811 	/* don't start at 0 in case clients don't like that */
812 	ldcp->next_ident = 1;
813 
814 	rv = ldc_open(ldcp->ldc_handle);
815 	if (rv != 0) {
816 		DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)",
817 		    __func__, ldcp->ldc_id, rv);
818 		LDC_EXIT_LOCK(ldcp);
819 		return (1);
820 	}
821 
822 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
823 		DERR(vswp, "%s: unable to get status", __func__);
824 		LDC_EXIT_LOCK(ldcp);
825 		return (1);
826 
827 	} else if (istatus != LDC_OPEN && istatus != LDC_READY) {
828 		DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY",
829 		    __func__, ldcp->ldc_id, istatus);
830 		LDC_EXIT_LOCK(ldcp);
831 		return (1);
832 	}
833 
834 	mutex_enter(&ldcp->status_lock);
835 	ldcp->ldc_status = istatus;
836 	mutex_exit(&ldcp->status_lock);
837 
838 	rv = ldc_up(ldcp->ldc_handle);
839 	if (rv != 0) {
840 		/*
841 		 * Not a fatal error for ldc_up() to fail, as peer
842 		 * end point may simply not be ready yet.
843 		 */
844 		D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__,
845 		    ldcp->ldc_id, rv);
846 		LDC_EXIT_LOCK(ldcp);
847 		return (1);
848 	}
849 
850 	/*
851 	 * ldc_up() call is non-blocking so need to explicitly
852 	 * check channel status to see if in fact the channel
853 	 * is UP.
854 	 */
855 	mutex_enter(&ldcp->status_lock);
856 	if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) {
857 		DERR(vswp, "%s: unable to get status", __func__);
858 		mutex_exit(&ldcp->status_lock);
859 		LDC_EXIT_LOCK(ldcp);
860 		return (1);
861 
862 	}
863 
864 	if (ldcp->ldc_status == LDC_UP) {
865 		D2(vswp, "%s: channel %ld now UP (%ld)", __func__,
866 		    ldcp->ldc_id, istatus);
867 		mutex_exit(&ldcp->status_lock);
868 		LDC_EXIT_LOCK(ldcp);
869 
870 		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
871 		return (0);
872 	}
873 
874 	mutex_exit(&ldcp->status_lock);
875 	LDC_EXIT_LOCK(ldcp);
876 
877 	D1(vswp, "%s: exit", __func__);
878 	return (0);
879 }
880 
881 /* disable callbacks on the channel */
882 static void
883 vsw_ldc_uninit(vsw_ldc_t *ldcp)
884 {
885 	vsw_t	*vswp = ldcp->ldc_vswp;
886 	int	rv;
887 
888 	D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id);
889 
890 	LDC_ENTER_LOCK(ldcp);
891 
892 	rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE);
893 	if (rv != 0) {
894 		cmn_err(CE_NOTE, "!vsw_ldc_uninit(%ld): error disabling "
895 		    "interrupts (rv = %d)\n", ldcp->ldc_id, rv);
896 	}
897 
898 	mutex_enter(&ldcp->status_lock);
899 	ldcp->ldc_status = LDC_INIT;
900 	mutex_exit(&ldcp->status_lock);
901 
902 	LDC_EXIT_LOCK(ldcp);
903 
904 	D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id);
905 }
906 
907 /*
908  * Wait until the callback(s) associated with the ldcs under the specified
909  * port have completed.
910  *
911  * Prior to this function being invoked each channel under this port
912  * should have been quiesced via ldc_set_cb_mode(DISABLE).
913  *
914  * A short explaination of what we are doing below..
915  *
916  * The simplest approach would be to have a reference counter in
917  * the ldc structure which is increment/decremented by the callbacks as
918  * they use the channel. The drain function could then simply disable any
919  * further callbacks and do a cv_wait for the ref to hit zero. Unfortunately
920  * there is a tiny window here - before the callback is able to get the lock
921  * on the channel it is interrupted and this function gets to execute. It
922  * sees that the ref count is zero and believes its free to delete the
923  * associated data structures.
924  *
925  * We get around this by taking advantage of the fact that before the ldc
926  * framework invokes a callback it sets a flag to indicate that there is a
927  * callback active (or about to become active). If when we attempt to
928  * unregister a callback when this active flag is set then the unregister
929  * will fail with EWOULDBLOCK.
930  *
931  * If the unregister fails we do a cv_timedwait. We will either be signaled
932  * by the callback as it is exiting (note we have to wait a short period to
933  * allow the callback to return fully to the ldc framework and it to clear
934  * the active flag), or by the timer expiring. In either case we again attempt
935  * the unregister. We repeat this until we can succesfully unregister the
936  * callback.
937  *
938  * The reason we use a cv_timedwait rather than a simple cv_wait is to catch
939  * the case where the callback has finished but the ldc framework has not yet
940  * cleared the active flag. In this case we would never get a cv_signal.
941  */
942 static void
943 vsw_ldc_drain(vsw_ldc_t *ldcp)
944 {
945 	vsw_t	*vswp = ldcp->ldc_port->p_vswp;
946 
947 	D1(vswp, "%s: enter", __func__);
948 
949 	/*
950 	 * If we can unregister the channel callback then we
951 	 * know that there is no callback either running or
952 	 * scheduled to run for this channel so move on to next
953 	 * channel in the list.
954 	 */
955 	mutex_enter(&ldcp->drain_cv_lock);
956 
957 	/* prompt active callbacks to quit */
958 	ldcp->drain_state = VSW_LDC_DRAINING;
959 
960 	if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) {
961 		D2(vswp, "%s: unreg callback for chan %ld", __func__,
962 		    ldcp->ldc_id);
963 		mutex_exit(&ldcp->drain_cv_lock);
964 	} else {
965 		/*
966 		 * If we end up here we know that either 1) a callback
967 		 * is currently executing, 2) is about to start (i.e.
968 		 * the ldc framework has set the active flag but
969 		 * has not actually invoked the callback yet, or 3)
970 		 * has finished and has returned to the ldc framework
971 		 * but the ldc framework has not yet cleared the
972 		 * active bit.
973 		 *
974 		 * Wait for it to finish.
975 		 */
976 		while (ldc_unreg_callback(ldcp->ldc_handle) == EWOULDBLOCK) {
977 			(void) cv_timedwait(&ldcp->drain_cv,
978 			    &ldcp->drain_cv_lock, ddi_get_lbolt() + hz);
979 		}
980 
981 		mutex_exit(&ldcp->drain_cv_lock);
982 		D2(vswp, "%s: unreg callback for chan %ld after "
983 		    "timeout", __func__, ldcp->ldc_id);
984 	}
985 
986 	D1(vswp, "%s: exit", __func__);
987 }
988 
989 /*
990  * Wait until all tasks which reference this port have completed.
991  *
992  * Prior to this function being invoked each channel under this port
993  * should have been quiesced via ldc_set_cb_mode(DISABLE).
994  */
995 static void
996 vsw_drain_port_taskq(vsw_port_t *port)
997 {
998 	vsw_t		*vswp = port->p_vswp;
999 
1000 	D1(vswp, "%s: enter", __func__);
1001 
1002 	/*
1003 	 * Mark the port as in the process of being detached, and
1004 	 * dispatch a marker task to the queue so we know when all
1005 	 * relevant tasks have completed.
1006 	 */
1007 	mutex_enter(&port->state_lock);
1008 	port->state = VSW_PORT_DETACHING;
1009 
1010 	if ((vswp->taskq_p == NULL) ||
1011 	    (ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task,
1012 	    port, DDI_NOSLEEP) != DDI_SUCCESS)) {
1013 		cmn_err(CE_NOTE, "!vsw%d: unable to dispatch marker task",
1014 		    vswp->instance);
1015 		mutex_exit(&port->state_lock);
1016 		return;
1017 	}
1018 
1019 	/*
1020 	 * Wait for the marker task to finish.
1021 	 */
1022 	while (port->state != VSW_PORT_DETACHABLE)
1023 		cv_wait(&port->state_cv, &port->state_lock);
1024 
1025 	mutex_exit(&port->state_lock);
1026 
1027 	D1(vswp, "%s: exit", __func__);
1028 }
1029 
1030 static void
1031 vsw_marker_task(void *arg)
1032 {
1033 	vsw_port_t	*port = arg;
1034 	vsw_t		*vswp = port->p_vswp;
1035 
1036 	D1(vswp, "%s: enter", __func__);
1037 
1038 	mutex_enter(&port->state_lock);
1039 
1040 	/*
1041 	 * No further tasks should be dispatched which reference
1042 	 * this port so ok to mark it as safe to detach.
1043 	 */
1044 	port->state = VSW_PORT_DETACHABLE;
1045 
1046 	cv_signal(&port->state_cv);
1047 
1048 	mutex_exit(&port->state_lock);
1049 
1050 	D1(vswp, "%s: exit", __func__);
1051 }
1052 
1053 vsw_port_t *
1054 vsw_lookup_port(vsw_t *vswp, int p_instance)
1055 {
1056 	vsw_port_list_t *plist = &vswp->plist;
1057 	vsw_port_t	*port;
1058 
1059 	for (port = plist->head; port != NULL; port = port->p_next) {
1060 		if (port->p_instance == p_instance) {
1061 			D2(vswp, "vsw_lookup_port: found p_instance\n");
1062 			return (port);
1063 		}
1064 	}
1065 
1066 	return (NULL);
1067 }
1068 
1069 void
1070 vsw_vlan_unaware_port_reset(vsw_port_t *portp)
1071 {
1072 	vsw_ldc_t	*ldcp = portp->ldcp;
1073 
1074 	mutex_enter(&ldcp->ldc_cblock);
1075 
1076 	/*
1077 	 * If the peer is vlan_unaware(ver < 1.3), reset channel and terminate
1078 	 * the connection. See comments in vsw_set_vnet_proto_ops().
1079 	 */
1080 	if (ldcp->hphase == VSW_MILESTONE4 && VSW_VER_LT(ldcp, 1, 3) &&
1081 	    portp->nvids != 0) {
1082 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1083 	}
1084 
1085 	mutex_exit(&ldcp->ldc_cblock);
1086 }
1087 
1088 void
1089 vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate)
1090 {
1091 	vsw_ldc_t	*ldcp = portp->ldcp;
1092 
1093 	mutex_enter(&ldcp->ldc_cblock);
1094 
1095 	/*
1096 	 * If the peer is HybridIO capable (ver >= 1.3), reset channel
1097 	 * to trigger re-negotiation, which inturn trigger HybridIO
1098 	 * setup/cleanup.
1099 	 */
1100 	if ((ldcp->hphase == VSW_MILESTONE4) &&
1101 	    (portp->p_hio_capable == B_TRUE)) {
1102 		if (immediate == B_TRUE) {
1103 			(void) ldc_down(ldcp->ldc_handle);
1104 		} else {
1105 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1106 		}
1107 	}
1108 
1109 	mutex_exit(&ldcp->ldc_cblock);
1110 }
1111 
1112 void
1113 vsw_port_reset(vsw_port_t *portp)
1114 {
1115 	vsw_ldc_t	*ldcp = portp->ldcp;
1116 
1117 	mutex_enter(&ldcp->ldc_cblock);
1118 
1119 	/*
1120 	 * reset channel and terminate the connection.
1121 	 */
1122 	vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1123 
1124 	mutex_exit(&ldcp->ldc_cblock);
1125 }
1126 
1127 void
1128 vsw_reset_ports(vsw_t *vswp)
1129 {
1130 	vsw_port_list_t	*plist = &vswp->plist;
1131 	vsw_port_t	*portp;
1132 
1133 	READ_ENTER(&plist->lockrw);
1134 	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
1135 		if ((portp->p_hio_capable) && (portp->p_hio_enabled)) {
1136 			vsw_hio_stop_port(portp);
1137 		}
1138 		vsw_port_reset(portp);
1139 	}
1140 	RW_EXIT(&plist->lockrw);
1141 }
1142 
1143 static void
1144 vsw_send_physlink_msg(vsw_ldc_t *ldcp, link_state_t plink_state)
1145 {
1146 	vnet_physlink_msg_t	msg;
1147 	vnet_physlink_msg_t	*msgp = &msg;
1148 	uint32_t		physlink_info = 0;
1149 
1150 	if (plink_state == LINK_STATE_UP) {
1151 		physlink_info |= VNET_PHYSLINK_STATE_UP;
1152 	} else {
1153 		physlink_info |= VNET_PHYSLINK_STATE_DOWN;
1154 	}
1155 
1156 	msgp->tag.vio_msgtype = VIO_TYPE_CTRL;
1157 	msgp->tag.vio_subtype = VIO_SUBTYPE_INFO;
1158 	msgp->tag.vio_subtype_env = VNET_PHYSLINK_INFO;
1159 	msgp->tag.vio_sid = ldcp->local_session;
1160 	msgp->physlink_info = physlink_info;
1161 
1162 	(void) vsw_send_msg(ldcp, msgp, sizeof (msg), B_TRUE);
1163 }
1164 
1165 static void
1166 vsw_port_physlink_update(vsw_port_t *portp)
1167 {
1168 	vsw_ldc_t	*ldcp;
1169 	vsw_t		*vswp;
1170 
1171 	vswp = portp->p_vswp;
1172 	ldcp = portp->ldcp;
1173 
1174 	mutex_enter(&ldcp->ldc_cblock);
1175 
1176 	/*
1177 	 * If handshake has completed successfully and if the vnet device
1178 	 * has negotiated to get physical link state updates, send a message
1179 	 * with the current state.
1180 	 */
1181 	if (ldcp->hphase == VSW_MILESTONE4 && ldcp->pls_negotiated == B_TRUE) {
1182 		vsw_send_physlink_msg(ldcp, vswp->phys_link_state);
1183 	}
1184 
1185 	mutex_exit(&ldcp->ldc_cblock);
1186 }
1187 
1188 void
1189 vsw_physlink_update_ports(vsw_t *vswp)
1190 {
1191 	vsw_port_list_t	*plist = &vswp->plist;
1192 	vsw_port_t	*portp;
1193 
1194 	READ_ENTER(&plist->lockrw);
1195 	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
1196 		vsw_port_physlink_update(portp);
1197 	}
1198 	RW_EXIT(&plist->lockrw);
1199 }
1200 
1201 /*
1202  * Search for and remove the specified port from the port
1203  * list. Returns 0 if able to locate and remove port, otherwise
1204  * returns 1.
1205  */
1206 static int
1207 vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port)
1208 {
1209 	vsw_port_list_t *plist = &vswp->plist;
1210 	vsw_port_t	*curr_p, *prev_p;
1211 
1212 	if (plist->head == NULL)
1213 		return (1);
1214 
1215 	curr_p = prev_p = plist->head;
1216 
1217 	while (curr_p != NULL) {
1218 		if (curr_p == port) {
1219 			if (prev_p == curr_p) {
1220 				plist->head = curr_p->p_next;
1221 			} else {
1222 				prev_p->p_next = curr_p->p_next;
1223 			}
1224 			plist->num_ports--;
1225 			break;
1226 		} else {
1227 			prev_p = curr_p;
1228 			curr_p = curr_p->p_next;
1229 		}
1230 	}
1231 	return (0);
1232 }
1233 
1234 /*
1235  * Interrupt handler for ldc messages.
1236  */
1237 static uint_t
1238 vsw_ldc_cb(uint64_t event, caddr_t arg)
1239 {
1240 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
1241 	vsw_t 		*vswp = ldcp->ldc_vswp;
1242 
1243 	D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
1244 
1245 	mutex_enter(&ldcp->ldc_cblock);
1246 	ldcp->ldc_stats.callbacks++;
1247 
1248 	mutex_enter(&ldcp->status_lock);
1249 	if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) {
1250 		mutex_exit(&ldcp->status_lock);
1251 		mutex_exit(&ldcp->ldc_cblock);
1252 		return (LDC_SUCCESS);
1253 	}
1254 	mutex_exit(&ldcp->status_lock);
1255 
1256 	if (event & LDC_EVT_UP) {
1257 		/*
1258 		 * Channel has come up.
1259 		 */
1260 		D2(vswp, "%s: id(%ld) event(%llx) UP: status(%ld)",
1261 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1262 
1263 		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
1264 
1265 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
1266 	}
1267 
1268 	if (event & LDC_EVT_READ) {
1269 		/*
1270 		 * Data available for reading.
1271 		 */
1272 		D2(vswp, "%s: id(ld) event(%llx) data READ",
1273 		    __func__, ldcp->ldc_id, event);
1274 
1275 		vsw_process_evt_read(ldcp);
1276 
1277 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
1278 
1279 		goto vsw_cb_exit;
1280 	}
1281 
1282 	if (event & (LDC_EVT_DOWN | LDC_EVT_RESET)) {
1283 		D2(vswp, "%s: id(%ld) event (%lx) DOWN/RESET: status(%ld)",
1284 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1285 
1286 		vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
1287 	}
1288 
1289 	/*
1290 	 * Catch either LDC_EVT_WRITE which we don't support or any
1291 	 * unknown event.
1292 	 */
1293 	if (event &
1294 	    ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ)) {
1295 		DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)",
1296 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1297 	}
1298 
1299 vsw_cb_exit:
1300 	mutex_exit(&ldcp->ldc_cblock);
1301 
1302 	/*
1303 	 * Let the drain function know we are finishing if it
1304 	 * is waiting.
1305 	 */
1306 	mutex_enter(&ldcp->drain_cv_lock);
1307 	if (ldcp->drain_state == VSW_LDC_DRAINING)
1308 		cv_signal(&ldcp->drain_cv);
1309 	mutex_exit(&ldcp->drain_cv_lock);
1310 
1311 	return (LDC_SUCCESS);
1312 }
1313 
1314 /*
1315  * Reinitialise data structures associated with the channel.
1316  */
1317 static void
1318 vsw_ldc_reinit(vsw_ldc_t *ldcp)
1319 {
1320 	vsw_t		*vswp = ldcp->ldc_vswp;
1321 	vsw_port_t	*port;
1322 
1323 	D1(vswp, "%s: enter", __func__);
1324 
1325 	port = ldcp->ldc_port;
1326 
1327 	D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__,
1328 	    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
1329 
1330 	vsw_free_lane_resources(ldcp, INBOUND);
1331 	vsw_free_lane_resources(ldcp, OUTBOUND);
1332 
1333 	ldcp->lane_in.lstate = 0;
1334 	ldcp->lane_out.lstate = 0;
1335 
1336 	/*
1337 	 * Remove parent port from any multicast groups
1338 	 * it may have registered with. Client must resend
1339 	 * multicast add command after handshake completes.
1340 	 */
1341 	vsw_del_mcst_port(port);
1342 
1343 	ldcp->peer_session = 0;
1344 	ldcp->session_status = 0;
1345 	ldcp->hcnt = 0;
1346 	ldcp->hphase = VSW_MILESTONE0;
1347 
1348 	vsw_reset_vnet_proto_ops(ldcp);
1349 
1350 	D1(vswp, "%s: exit", __func__);
1351 }
1352 
1353 /*
1354  * Process a connection event.
1355  */
1356 void
1357 vsw_process_conn_evt(vsw_ldc_t *ldcp, uint16_t evt)
1358 {
1359 	vsw_t		*vswp = ldcp->ldc_vswp;
1360 	vsw_conn_evt_t	*conn = NULL;
1361 
1362 	D1(vswp, "%s: enter", __func__);
1363 
1364 	/*
1365 	 * Check if either a reset or restart event is pending
1366 	 * or in progress. If so just return.
1367 	 *
1368 	 * A VSW_CONN_RESET event originates either with a LDC_RESET_EVT
1369 	 * being received by the callback handler, or a ECONNRESET error
1370 	 * code being returned from a ldc_read() or ldc_write() call.
1371 	 *
1372 	 * A VSW_CONN_RESTART event occurs when some error checking code
1373 	 * decides that there is a problem with data from the channel,
1374 	 * and that the handshake should be restarted.
1375 	 */
1376 	if (((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART)) &&
1377 	    (ldstub((uint8_t *)&ldcp->reset_active)))
1378 		return;
1379 
1380 	/*
1381 	 * If it is an LDC_UP event we first check the recorded
1382 	 * state of the channel. If this is UP then we know that
1383 	 * the channel moving to the UP state has already been dealt
1384 	 * with and don't need to dispatch a  new task.
1385 	 *
1386 	 * The reason for this check is that when we do a ldc_up(),
1387 	 * depending on the state of the peer, we may or may not get
1388 	 * a LDC_UP event. As we can't depend on getting a LDC_UP evt
1389 	 * every time we do ldc_up() we explicitly check the channel
1390 	 * status to see has it come up (ldc_up() is asynch and will
1391 	 * complete at some undefined time), and take the appropriate
1392 	 * action.
1393 	 *
1394 	 * The flip side of this is that we may get a LDC_UP event
1395 	 * when we have already seen that the channel is up and have
1396 	 * dealt with that.
1397 	 */
1398 	mutex_enter(&ldcp->status_lock);
1399 	if (evt == VSW_CONN_UP) {
1400 		if ((ldcp->ldc_status == LDC_UP) || (ldcp->reset_active != 0)) {
1401 			mutex_exit(&ldcp->status_lock);
1402 			return;
1403 		}
1404 	}
1405 	mutex_exit(&ldcp->status_lock);
1406 
1407 	/*
1408 	 * The transaction group id allows us to identify and discard
1409 	 * any tasks which are still pending on the taskq and refer
1410 	 * to the handshake session we are about to restart or reset.
1411 	 * These stale messages no longer have any real meaning.
1412 	 */
1413 	(void) atomic_inc_32(&ldcp->hss_id);
1414 
1415 	ASSERT(vswp->taskq_p != NULL);
1416 
1417 	if ((conn = kmem_zalloc(sizeof (vsw_conn_evt_t), KM_NOSLEEP)) == NULL) {
1418 		cmn_err(CE_WARN, "!vsw%d: unable to allocate memory for"
1419 		    " connection event", vswp->instance);
1420 		goto err_exit;
1421 	}
1422 
1423 	conn->evt = evt;
1424 	conn->ldcp = ldcp;
1425 
1426 	if (ddi_taskq_dispatch(vswp->taskq_p, vsw_conn_task, conn,
1427 	    DDI_NOSLEEP) != DDI_SUCCESS) {
1428 		cmn_err(CE_WARN, "!vsw%d: Can't dispatch connection task",
1429 		    vswp->instance);
1430 
1431 		kmem_free(conn, sizeof (vsw_conn_evt_t));
1432 		goto err_exit;
1433 	}
1434 
1435 	D1(vswp, "%s: exit", __func__);
1436 	return;
1437 
1438 err_exit:
1439 	/*
1440 	 * Have mostly likely failed due to memory shortage. Clear the flag so
1441 	 * that future requests will at least be attempted and will hopefully
1442 	 * succeed.
1443 	 */
1444 	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
1445 		ldcp->reset_active = 0;
1446 }
1447 
1448 /*
1449  * Deal with events relating to a connection. Invoked from a taskq.
1450  */
1451 static void
1452 vsw_conn_task(void *arg)
1453 {
1454 	vsw_conn_evt_t	*conn = (vsw_conn_evt_t *)arg;
1455 	vsw_ldc_t	*ldcp = NULL;
1456 	vsw_port_t	*portp;
1457 	vsw_t		*vswp = NULL;
1458 	uint16_t	evt;
1459 	ldc_status_t	curr_status;
1460 
1461 	ldcp = conn->ldcp;
1462 	evt = conn->evt;
1463 	vswp = ldcp->ldc_vswp;
1464 	portp = ldcp->ldc_port;
1465 
1466 	D1(vswp, "%s: enter", __func__);
1467 
1468 	/* can safely free now have copied out data */
1469 	kmem_free(conn, sizeof (vsw_conn_evt_t));
1470 
1471 	if (ldcp->rcv_thread != NULL) {
1472 		vsw_stop_rcv_thread(ldcp);
1473 	} else if (ldcp->msg_thread != NULL) {
1474 		vsw_stop_msg_thread(ldcp);
1475 	}
1476 
1477 	mutex_enter(&ldcp->status_lock);
1478 	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
1479 		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
1480 		    "channel %ld", vswp->instance, ldcp->ldc_id);
1481 		mutex_exit(&ldcp->status_lock);
1482 		return;
1483 	}
1484 
1485 	/*
1486 	 * If we wish to restart the handshake on this channel, then if
1487 	 * the channel is UP we bring it DOWN to flush the underlying
1488 	 * ldc queue.
1489 	 */
1490 	if ((evt == VSW_CONN_RESTART) && (curr_status == LDC_UP))
1491 		(void) ldc_down(ldcp->ldc_handle);
1492 
1493 	if ((portp->p_hio_capable) && (portp->p_hio_enabled)) {
1494 		vsw_hio_stop(vswp, ldcp);
1495 	}
1496 
1497 	/*
1498 	 * re-init all the associated data structures.
1499 	 */
1500 	vsw_ldc_reinit(ldcp);
1501 
1502 	/*
1503 	 * Bring the channel back up (note it does no harm to
1504 	 * do this even if the channel is already UP, Just
1505 	 * becomes effectively a no-op).
1506 	 */
1507 	(void) ldc_up(ldcp->ldc_handle);
1508 
1509 	/*
1510 	 * Check if channel is now UP. This will only happen if
1511 	 * peer has also done a ldc_up().
1512 	 */
1513 	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
1514 		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
1515 		    "channel %ld", vswp->instance, ldcp->ldc_id);
1516 		mutex_exit(&ldcp->status_lock);
1517 		return;
1518 	}
1519 
1520 	ldcp->ldc_status = curr_status;
1521 
1522 	/* channel UP so restart handshake by sending version info */
1523 	if (curr_status == LDC_UP) {
1524 		if (ldcp->hcnt++ > vsw_num_handshakes) {
1525 			cmn_err(CE_WARN, "!vsw%d: exceeded number of permitted"
1526 			    " handshake attempts (%d) on channel %ld",
1527 			    vswp->instance, ldcp->hcnt, ldcp->ldc_id);
1528 			mutex_exit(&ldcp->status_lock);
1529 			return;
1530 		}
1531 
1532 		if (vsw_obp_ver_proto_workaround == B_FALSE &&
1533 		    (ddi_taskq_dispatch(vswp->taskq_p, vsw_send_ver, ldcp,
1534 		    DDI_NOSLEEP) != DDI_SUCCESS)) {
1535 			cmn_err(CE_WARN, "!vsw%d: Can't dispatch version task",
1536 			    vswp->instance);
1537 
1538 			/*
1539 			 * Don't count as valid restart attempt if couldn't
1540 			 * send version msg.
1541 			 */
1542 			if (ldcp->hcnt > 0)
1543 				ldcp->hcnt--;
1544 		}
1545 	}
1546 
1547 	/*
1548 	 * Mark that the process is complete by clearing the flag.
1549 	 *
1550 	 * Note is it possible that the taskq dispatch above may have failed,
1551 	 * most likely due to memory shortage. We still clear the flag so
1552 	 * future attempts will at least be attempted and will hopefully
1553 	 * succeed.
1554 	 */
1555 	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
1556 		ldcp->reset_active = 0;
1557 
1558 	mutex_exit(&ldcp->status_lock);
1559 
1560 	D1(vswp, "%s: exit", __func__);
1561 }
1562 
1563 /*
1564  * returns 0 if legal for event signified by flag to have
1565  * occured at the time it did. Otherwise returns 1.
1566  */
1567 int
1568 vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag)
1569 {
1570 	vsw_t		*vswp = ldcp->ldc_vswp;
1571 	uint64_t	state;
1572 	uint64_t	phase;
1573 
1574 	if (dir == INBOUND)
1575 		state = ldcp->lane_in.lstate;
1576 	else
1577 		state = ldcp->lane_out.lstate;
1578 
1579 	phase = ldcp->hphase;
1580 
1581 	switch (flag) {
1582 	case VSW_VER_INFO_RECV:
1583 		if (phase > VSW_MILESTONE0) {
1584 			DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV"
1585 			    " when in state %d\n", ldcp->ldc_id, phase);
1586 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1587 			return (1);
1588 		}
1589 		break;
1590 
1591 	case VSW_VER_ACK_RECV:
1592 	case VSW_VER_NACK_RECV:
1593 		if (!(state & VSW_VER_INFO_SENT)) {
1594 			DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK or "
1595 			    "VER_NACK when in state %d\n", ldcp->ldc_id, phase);
1596 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1597 			return (1);
1598 		} else
1599 			state &= ~VSW_VER_INFO_SENT;
1600 		break;
1601 
1602 	case VSW_ATTR_INFO_RECV:
1603 		if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) {
1604 			DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV"
1605 			    " when in state %d\n", ldcp->ldc_id, phase);
1606 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1607 			return (1);
1608 		}
1609 		break;
1610 
1611 	case VSW_ATTR_ACK_RECV:
1612 	case VSW_ATTR_NACK_RECV:
1613 		if (!(state & VSW_ATTR_INFO_SENT)) {
1614 			DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK"
1615 			    " or ATTR_NACK when in state %d\n",
1616 			    ldcp->ldc_id, phase);
1617 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1618 			return (1);
1619 		} else
1620 			state &= ~VSW_ATTR_INFO_SENT;
1621 		break;
1622 
1623 	case VSW_DRING_INFO_RECV:
1624 		if (phase < VSW_MILESTONE1) {
1625 			DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV"
1626 			    " when in state %d\n", ldcp->ldc_id, phase);
1627 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1628 			return (1);
1629 		}
1630 		break;
1631 
1632 	case VSW_DRING_ACK_RECV:
1633 	case VSW_DRING_NACK_RECV:
1634 		if (!(state & VSW_DRING_INFO_SENT)) {
1635 			DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK "
1636 			    " or DRING_NACK when in state %d\n",
1637 			    ldcp->ldc_id, phase);
1638 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1639 			return (1);
1640 		} else
1641 			state &= ~VSW_DRING_INFO_SENT;
1642 		break;
1643 
1644 	case VSW_RDX_INFO_RECV:
1645 		if (phase < VSW_MILESTONE3) {
1646 			DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV"
1647 			    " when in state %d\n", ldcp->ldc_id, phase);
1648 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1649 			return (1);
1650 		}
1651 		break;
1652 
1653 	case VSW_RDX_ACK_RECV:
1654 	case VSW_RDX_NACK_RECV:
1655 		if (!(state & VSW_RDX_INFO_SENT)) {
1656 			DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK or "
1657 			    "RDX_NACK when in state %d\n", ldcp->ldc_id, phase);
1658 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1659 			return (1);
1660 		} else
1661 			state &= ~VSW_RDX_INFO_SENT;
1662 		break;
1663 
1664 	case VSW_MCST_INFO_RECV:
1665 		if (phase < VSW_MILESTONE3) {
1666 			DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV"
1667 			    " when in state %d\n", ldcp->ldc_id, phase);
1668 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1669 			return (1);
1670 		}
1671 		break;
1672 
1673 	default:
1674 		DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)",
1675 		    ldcp->ldc_id, flag);
1676 		return (1);
1677 	}
1678 
1679 	if (dir == INBOUND)
1680 		ldcp->lane_in.lstate = state;
1681 	else
1682 		ldcp->lane_out.lstate = state;
1683 
1684 	D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id);
1685 
1686 	return (0);
1687 }
1688 
1689 void
1690 vsw_next_milestone(vsw_ldc_t *ldcp)
1691 {
1692 	vsw_t		*vswp = ldcp->ldc_vswp;
1693 	vsw_port_t	*portp = ldcp->ldc_port;
1694 	lane_t		*lane_out = &ldcp->lane_out;
1695 	lane_t		*lane_in = &ldcp->lane_in;
1696 
1697 	D1(vswp, "%s (chan %lld): enter (phase %ld)", __func__,
1698 	    ldcp->ldc_id, ldcp->hphase);
1699 
1700 	DUMP_FLAGS(lane_in->lstate);
1701 	DUMP_FLAGS(lane_out->lstate);
1702 
1703 	switch (ldcp->hphase) {
1704 
1705 	case VSW_MILESTONE0:
1706 		/*
1707 		 * If we haven't started to handshake with our peer,
1708 		 * start to do so now.
1709 		 */
1710 		if (lane_out->lstate == 0) {
1711 			D2(vswp, "%s: (chan %lld) starting handshake "
1712 			    "with peer", __func__, ldcp->ldc_id);
1713 			vsw_process_conn_evt(ldcp, VSW_CONN_UP);
1714 		}
1715 
1716 		/*
1717 		 * Only way to pass this milestone is to have successfully
1718 		 * negotiated version info.
1719 		 */
1720 		if ((lane_in->lstate & VSW_VER_ACK_SENT) &&
1721 		    (lane_out->lstate & VSW_VER_ACK_RECV)) {
1722 
1723 			D2(vswp, "%s: (chan %lld) leaving milestone 0",
1724 			    __func__, ldcp->ldc_id);
1725 
1726 			vsw_set_vnet_proto_ops(ldcp);
1727 
1728 			/*
1729 			 * Next milestone is passed when attribute
1730 			 * information has been successfully exchanged.
1731 			 */
1732 			ldcp->hphase = VSW_MILESTONE1;
1733 			vsw_send_attr(ldcp);
1734 
1735 		}
1736 		break;
1737 
1738 	case VSW_MILESTONE1:
1739 		/*
1740 		 * Only way to pass this milestone is to have successfully
1741 		 * negotiated attribute information, in both directions.
1742 		 */
1743 		if (!((lane_in->lstate & VSW_ATTR_ACK_SENT) &&
1744 		    (lane_out->lstate & VSW_ATTR_ACK_RECV))) {
1745 			break;
1746 		}
1747 
1748 		ldcp->hphase = VSW_MILESTONE2;
1749 
1750 		/*
1751 		 * If the peer device has said it wishes to
1752 		 * use descriptor rings then we send it our ring
1753 		 * info, otherwise we just set up a private ring
1754 		 * which we use an internal buffer
1755 		 */
1756 		if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
1757 		    (lane_in->xfer_mode & VIO_DRING_MODE_V1_2)) ||
1758 		    (VSW_VER_LT(ldcp, 1, 2) &&
1759 		    (lane_in->xfer_mode == VIO_DRING_MODE_V1_0))) {
1760 			vsw_send_dring_info(ldcp);
1761 			break;
1762 		}
1763 
1764 		/*
1765 		 * The peer doesn't operate in dring mode; we
1766 		 * can simply fallthru to the RDX phase from
1767 		 * here.
1768 		 */
1769 		/*FALLTHRU*/
1770 
1771 	case VSW_MILESTONE2:
1772 		/*
1773 		 * If peer has indicated in its attribute message that
1774 		 * it wishes to use descriptor rings then the only way
1775 		 * to pass this milestone is for us to have received
1776 		 * valid dring info.
1777 		 *
1778 		 * If peer is not using descriptor rings then just fall
1779 		 * through.
1780 		 */
1781 		if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
1782 		    (lane_in->xfer_mode & VIO_DRING_MODE_V1_2)) ||
1783 		    (VSW_VER_LT(ldcp, 1, 2) &&
1784 		    (lane_in->xfer_mode ==
1785 		    VIO_DRING_MODE_V1_0))) {
1786 			if (!(lane_in->lstate & VSW_DRING_ACK_SENT))
1787 				break;
1788 		}
1789 
1790 		D2(vswp, "%s: (chan %lld) leaving milestone 2",
1791 		    __func__, ldcp->ldc_id);
1792 
1793 		ldcp->hphase = VSW_MILESTONE3;
1794 		vsw_send_rdx(ldcp);
1795 		break;
1796 
1797 	case VSW_MILESTONE3:
1798 		/*
1799 		 * Pass this milestone when all paramaters have been
1800 		 * successfully exchanged and RDX sent in both directions.
1801 		 *
1802 		 * Mark the relevant lane as available to transmit data. In
1803 		 * RxDringData mode, lane_in is associated with transmit and
1804 		 * lane_out is associated with receive. It is the reverse in
1805 		 * TxDring mode.
1806 		 */
1807 		if ((lane_out->lstate & VSW_RDX_ACK_SENT) &&
1808 		    (lane_in->lstate & VSW_RDX_ACK_RECV)) {
1809 
1810 			D2(vswp, "%s: (chan %lld) leaving milestone 3",
1811 			    __func__, ldcp->ldc_id);
1812 			D2(vswp, "%s: ** handshake complete (0x%llx : "
1813 			    "0x%llx) **", __func__, lane_in->lstate,
1814 			    lane_out->lstate);
1815 			if (lane_out->dring_mode == VIO_RX_DRING_DATA) {
1816 				lane_in->lstate |= VSW_LANE_ACTIVE;
1817 			} else {
1818 				lane_out->lstate |= VSW_LANE_ACTIVE;
1819 			}
1820 			ldcp->hphase = VSW_MILESTONE4;
1821 			ldcp->hcnt = 0;
1822 			DISPLAY_STATE();
1823 			/* Start HIO if enabled and capable */
1824 			if ((portp->p_hio_enabled) && (portp->p_hio_capable)) {
1825 				D2(vswp, "%s: start HybridIO setup", __func__);
1826 				vsw_hio_start(vswp, ldcp);
1827 			}
1828 
1829 			if (ldcp->pls_negotiated == B_TRUE) {
1830 				/*
1831 				 * The vnet device has negotiated to get phys
1832 				 * link updates. Now that the handshake with
1833 				 * the vnet device is complete, send an initial
1834 				 * update with the current physical link state.
1835 				 */
1836 				vsw_send_physlink_msg(ldcp,
1837 				    vswp->phys_link_state);
1838 			}
1839 
1840 		} else {
1841 			D2(vswp, "%s: still in milestone 3 (0x%llx : 0x%llx)",
1842 			    __func__, lane_in->lstate,
1843 			    lane_out->lstate);
1844 		}
1845 		break;
1846 
1847 	case VSW_MILESTONE4:
1848 		D2(vswp, "%s: (chan %lld) in milestone 4", __func__,
1849 		    ldcp->ldc_id);
1850 		break;
1851 
1852 	default:
1853 		DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__,
1854 		    ldcp->ldc_id, ldcp->hphase);
1855 	}
1856 
1857 	D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id,
1858 	    ldcp->hphase);
1859 }
1860 
1861 /*
1862  * Check if major version is supported.
1863  *
1864  * Returns 0 if finds supported major number, and if necessary
1865  * adjusts the minor field.
1866  *
1867  * Returns 1 if can't match major number exactly. Sets mjor/minor
1868  * to next lowest support values, or to zero if no other values possible.
1869  */
1870 static int
1871 vsw_supported_version(vio_ver_msg_t *vp)
1872 {
1873 	int	i;
1874 
1875 	D1(NULL, "vsw_supported_version: enter");
1876 
1877 	for (i = 0; i < VSW_NUM_VER; i++) {
1878 		if (vsw_versions[i].ver_major == vp->ver_major) {
1879 			/*
1880 			 * Matching or lower major version found. Update
1881 			 * minor number if necessary.
1882 			 */
1883 			if (vp->ver_minor > vsw_versions[i].ver_minor) {
1884 				D2(NULL, "%s: adjusting minor value from %d "
1885 				    "to %d", __func__, vp->ver_minor,
1886 				    vsw_versions[i].ver_minor);
1887 				vp->ver_minor = vsw_versions[i].ver_minor;
1888 			}
1889 
1890 			return (0);
1891 		}
1892 
1893 		/*
1894 		 * If the message contains a higher major version number, set
1895 		 * the message's major/minor versions to the current values
1896 		 * and return false, so this message will get resent with
1897 		 * these values.
1898 		 */
1899 		if (vsw_versions[i].ver_major < vp->ver_major) {
1900 			D2(NULL, "%s: adjusting major and minor "
1901 			    "values to %d, %d\n",
1902 			    __func__, vsw_versions[i].ver_major,
1903 			    vsw_versions[i].ver_minor);
1904 			vp->ver_major = vsw_versions[i].ver_major;
1905 			vp->ver_minor = vsw_versions[i].ver_minor;
1906 			return (1);
1907 		}
1908 	}
1909 
1910 	/* No match was possible, zero out fields */
1911 	vp->ver_major = 0;
1912 	vp->ver_minor = 0;
1913 
1914 	D1(NULL, "vsw_supported_version: exit");
1915 
1916 	return (1);
1917 }
1918 
1919 /*
1920  * Set vnet-protocol-version dependent functions based on version.
1921  */
1922 static void
1923 vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp)
1924 {
1925 	vsw_t	*vswp = ldcp->ldc_vswp;
1926 	lane_t	*lp = &ldcp->lane_out;
1927 
1928 	/*
1929 	 * Setup the appropriate dring data processing routine and any
1930 	 * associated thread based on the version.
1931 	 *
1932 	 * In versions < 1.6, we support only TxDring mode. In this mode, the
1933 	 * msg worker thread processes all types of VIO msgs (ctrl and data).
1934 	 *
1935 	 * In versions >= 1.6, we also support RxDringData mode. In this mode,
1936 	 * the rcv worker thread processes dring data messages (msgtype:
1937 	 * VIO_TYPE_DATA, subtype: VIO_SUBTYPE_INFO, env: VIO_DRING_DATA). The
1938 	 * rest of the data messages (including acks) and ctrl messages are
1939 	 * handled directly by the callback (intr) thread.
1940 	 *
1941 	 * However, for versions >= 1.6, we could still fallback to TxDring
1942 	 * mode. This could happen if RxDringData mode has been disabled (see
1943 	 * below) on this guest or on the peer guest. This info is determined
1944 	 * as part of attr exchange phase of handshake. Hence, we setup these
1945 	 * pointers for v1.6 after attr msg phase completes during handshake.
1946 	 */
1947 	if (VSW_VER_GTEQ(ldcp, 1, 6)) {
1948 		/*
1949 		 * Set data dring mode for vsw_send_attr(). We setup msg worker
1950 		 * thread in TxDring mode or rcv worker thread in RxDringData
1951 		 * mode when attr phase of handshake completes.
1952 		 */
1953 		if (vsw_mapin_avail(ldcp) == B_TRUE) {
1954 			lp->dring_mode = (VIO_RX_DRING_DATA | VIO_TX_DRING);
1955 		} else {
1956 			lp->dring_mode = VIO_TX_DRING;
1957 		}
1958 	} else {
1959 		lp->dring_mode = VIO_TX_DRING;
1960 	}
1961 
1962 	/*
1963 	 * Setup the MTU for attribute negotiation based on the version.
1964 	 */
1965 	if (VSW_VER_GTEQ(ldcp, 1, 4)) {
1966 		/*
1967 		 * If the version negotiated with peer is >= 1.4(Jumbo Frame
1968 		 * Support), set the mtu in our attributes to max_frame_size.
1969 		 */
1970 		lp->mtu = vswp->max_frame_size;
1971 	} else if (VSW_VER_EQ(ldcp, 1, 3)) {
1972 		/*
1973 		 * If the version negotiated with peer is == 1.3 (Vlan Tag
1974 		 * Support) set the attr.mtu to ETHERMAX + VLAN_TAGSZ.
1975 		 */
1976 		lp->mtu = ETHERMAX + VLAN_TAGSZ;
1977 	} else {
1978 		vsw_port_t	*portp = ldcp->ldc_port;
1979 		/*
1980 		 * Pre-1.3 peers expect max frame size of ETHERMAX.
1981 		 * We can negotiate that size with those peers provided only
1982 		 * pvid is defined for our peer and there are no vids. Then we
1983 		 * can send/recv only untagged frames of max size ETHERMAX.
1984 		 * Note that pvid of the peer can be different, as vsw has to
1985 		 * serve the vnet in that vlan even if itself is not assigned
1986 		 * to that vlan.
1987 		 */
1988 		if (portp->nvids == 0) {
1989 			lp->mtu = ETHERMAX;
1990 		}
1991 	}
1992 
1993 	/*
1994 	 * Setup version dependent data processing functions.
1995 	 */
1996 	if (VSW_VER_GTEQ(ldcp, 1, 2)) {
1997 		/* Versions >= 1.2 */
1998 
1999 		if (VSW_PRI_ETH_DEFINED(vswp)) {
2000 			/*
2001 			 * enable priority routines and pkt mode only if
2002 			 * at least one pri-eth-type is specified in MD.
2003 			 */
2004 			ldcp->tx = vsw_ldctx_pri;
2005 			ldcp->rx_pktdata = vsw_process_pkt_data;
2006 
2007 			/* set xfer mode for vsw_send_attr() */
2008 			lp->xfer_mode = VIO_PKT_MODE | VIO_DRING_MODE_V1_2;
2009 		} else {
2010 			/* no priority eth types defined in MD */
2011 
2012 			ldcp->tx = vsw_ldctx;
2013 			ldcp->rx_pktdata = vsw_process_pkt_data_nop;
2014 
2015 			/* set xfer mode for vsw_send_attr() */
2016 			lp->xfer_mode = VIO_DRING_MODE_V1_2;
2017 		}
2018 
2019 	} else {
2020 		/* Versions prior to 1.2  */
2021 
2022 		vsw_reset_vnet_proto_ops(ldcp);
2023 	}
2024 }
2025 
2026 /*
2027  * Reset vnet-protocol-version dependent functions to v1.0.
2028  */
2029 static void
2030 vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp)
2031 {
2032 	lane_t	*lp = &ldcp->lane_out;
2033 
2034 	ldcp->tx = vsw_ldctx;
2035 	ldcp->rx_pktdata = vsw_process_pkt_data_nop;
2036 
2037 	/* set xfer mode for vsw_send_attr() */
2038 	lp->xfer_mode = VIO_DRING_MODE_V1_0;
2039 }
2040 
2041 static void
2042 vsw_process_evt_read(vsw_ldc_t *ldcp)
2043 {
2044 	if (ldcp->msg_thread != NULL) {
2045 		/*
2046 		 * TxDring mode; wakeup message worker
2047 		 * thread to process the VIO messages.
2048 		 */
2049 		mutex_exit(&ldcp->ldc_cblock);
2050 		mutex_enter(&ldcp->msg_thr_lock);
2051 		if (!(ldcp->msg_thr_flags & VSW_WTHR_DATARCVD)) {
2052 			ldcp->msg_thr_flags |= VSW_WTHR_DATARCVD;
2053 			cv_signal(&ldcp->msg_thr_cv);
2054 		}
2055 		mutex_exit(&ldcp->msg_thr_lock);
2056 		mutex_enter(&ldcp->ldc_cblock);
2057 	} else {
2058 		/*
2059 		 * We invoke vsw_process_pkt() in the context of the LDC
2060 		 * callback (vsw_ldc_cb()) during handshake, until the dring
2061 		 * mode is negotiated. After the dring mode is negotiated, the
2062 		 * msgs are processed by the msg worker thread (above case) if
2063 		 * the dring mode is TxDring. Otherwise (in RxDringData mode)
2064 		 * we continue to process the msgs directly in the callback
2065 		 * context.
2066 		 */
2067 		vsw_process_pkt(ldcp);
2068 	}
2069 }
2070 
2071 /*
2072  * Main routine for processing messages received over LDC.
2073  */
2074 void
2075 vsw_process_pkt(void *arg)
2076 {
2077 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
2078 	vsw_t 		*vswp = ldcp->ldc_vswp;
2079 	size_t		msglen;
2080 	vio_msg_tag_t	*tagp;
2081 	uint64_t	*ldcmsg;
2082 	int 		rv = 0;
2083 
2084 
2085 	D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
2086 
2087 	ASSERT(MUTEX_HELD(&ldcp->ldc_cblock));
2088 
2089 	ldcmsg = ldcp->ldcmsg;
2090 	/*
2091 	 * If channel is up read messages until channel is empty.
2092 	 */
2093 	do {
2094 		msglen = ldcp->msglen;
2095 		rv = ldc_read(ldcp->ldc_handle, (caddr_t)ldcmsg, &msglen);
2096 
2097 		if (rv != 0) {
2098 			DERR(vswp, "%s :ldc_read err id(%lld) rv(%d) len(%d)\n",
2099 			    __func__, ldcp->ldc_id, rv, msglen);
2100 		}
2101 
2102 		/* channel has been reset */
2103 		if (rv == ECONNRESET) {
2104 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
2105 			break;
2106 		}
2107 
2108 		if (msglen == 0) {
2109 			D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__,
2110 			    ldcp->ldc_id);
2111 			break;
2112 		}
2113 
2114 		D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__,
2115 		    ldcp->ldc_id, msglen);
2116 
2117 		/*
2118 		 * Figure out what sort of packet we have gotten by
2119 		 * examining the msg tag, and then switch it appropriately.
2120 		 */
2121 		tagp = (vio_msg_tag_t *)ldcmsg;
2122 
2123 		switch (tagp->vio_msgtype) {
2124 		case VIO_TYPE_CTRL:
2125 			vsw_dispatch_ctrl_task(ldcp, ldcmsg, tagp, msglen);
2126 			break;
2127 		case VIO_TYPE_DATA:
2128 			vsw_process_data_pkt(ldcp, ldcmsg, tagp, msglen);
2129 			break;
2130 		case VIO_TYPE_ERR:
2131 			vsw_process_err_pkt(ldcp, ldcmsg, tagp);
2132 			break;
2133 		default:
2134 			DERR(vswp, "%s: Unknown tag(%lx) ", __func__,
2135 			    "id(%lx)\n", tagp->vio_msgtype, ldcp->ldc_id);
2136 			break;
2137 		}
2138 	} while (msglen);
2139 
2140 	D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id);
2141 }
2142 
2143 /*
2144  * Dispatch a task to process a VIO control message.
2145  */
2146 static void
2147 vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t *tagp,
2148 	int msglen)
2149 {
2150 	vsw_ctrl_task_t		*ctaskp = NULL;
2151 	vsw_port_t		*port = ldcp->ldc_port;
2152 	vsw_t			*vswp = port->p_vswp;
2153 
2154 	D1(vswp, "%s: enter", __func__);
2155 
2156 	/*
2157 	 * We need to handle RDX ACK messages in-band as once they
2158 	 * are exchanged it is possible that we will get an
2159 	 * immediate (legitimate) data packet.
2160 	 */
2161 	if ((tagp->vio_subtype_env == VIO_RDX) &&
2162 	    (tagp->vio_subtype == VIO_SUBTYPE_ACK)) {
2163 
2164 		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_ACK_RECV))
2165 			return;
2166 
2167 		ldcp->lane_in.lstate |= VSW_RDX_ACK_RECV;
2168 		D2(vswp, "%s (%ld) handling RDX_ACK in place "
2169 		    "(ostate 0x%llx : hphase %d)", __func__,
2170 		    ldcp->ldc_id, ldcp->lane_in.lstate, ldcp->hphase);
2171 		vsw_next_milestone(ldcp);
2172 		return;
2173 	}
2174 
2175 	ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP);
2176 
2177 	if (ctaskp == NULL) {
2178 		DERR(vswp, "%s: unable to alloc space for ctrl msg", __func__);
2179 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2180 		return;
2181 	}
2182 
2183 	ctaskp->ldcp = ldcp;
2184 	bcopy((def_msg_t *)cpkt, &ctaskp->pktp, msglen);
2185 	ctaskp->hss_id = ldcp->hss_id;
2186 
2187 	/*
2188 	 * Dispatch task to processing taskq if port is not in
2189 	 * the process of being detached.
2190 	 */
2191 	mutex_enter(&port->state_lock);
2192 	if (port->state == VSW_PORT_INIT) {
2193 		if ((vswp->taskq_p == NULL) ||
2194 		    (ddi_taskq_dispatch(vswp->taskq_p, vsw_process_ctrl_pkt,
2195 		    ctaskp, DDI_NOSLEEP) != DDI_SUCCESS)) {
2196 			mutex_exit(&port->state_lock);
2197 			DERR(vswp, "%s: unable to dispatch task to taskq",
2198 			    __func__);
2199 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2200 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2201 			return;
2202 		}
2203 	} else {
2204 		kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2205 		DWARN(vswp, "%s: port %d detaching, not dispatching "
2206 		    "task", __func__, port->p_instance);
2207 	}
2208 
2209 	mutex_exit(&port->state_lock);
2210 
2211 	D2(vswp, "%s: dispatched task to taskq for chan %d", __func__,
2212 	    ldcp->ldc_id);
2213 	D1(vswp, "%s: exit", __func__);
2214 }
2215 
2216 /*
2217  * Process a VIO ctrl message. Invoked from taskq.
2218  */
2219 static void
2220 vsw_process_ctrl_pkt(void *arg)
2221 {
2222 	vsw_ctrl_task_t	*ctaskp = (vsw_ctrl_task_t *)arg;
2223 	vsw_ldc_t	*ldcp = ctaskp->ldcp;
2224 	vsw_t 		*vswp = ldcp->ldc_vswp;
2225 	vio_msg_tag_t	tag;
2226 	uint16_t	env;
2227 
2228 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2229 
2230 	bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t));
2231 	env = tag.vio_subtype_env;
2232 
2233 	/* stale pkt check */
2234 	if (ctaskp->hss_id < ldcp->hss_id) {
2235 		DWARN(vswp, "%s: discarding stale packet belonging to earlier"
2236 		    " (%ld) handshake session", __func__, ctaskp->hss_id);
2237 		kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2238 		return;
2239 	}
2240 
2241 	/* session id check */
2242 	if (ldcp->session_status & VSW_PEER_SESSION) {
2243 		if (ldcp->peer_session != tag.vio_sid) {
2244 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
2245 			    __func__, ldcp->ldc_id, tag.vio_sid);
2246 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2247 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2248 			return;
2249 		}
2250 	}
2251 
2252 	/*
2253 	 * Switch on vio_subtype envelope, then let lower routines
2254 	 * decide if its an INFO, ACK or NACK packet.
2255 	 */
2256 	switch (env) {
2257 	case VIO_VER_INFO:
2258 		vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp);
2259 		break;
2260 	case VIO_DRING_REG:
2261 		vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp);
2262 		break;
2263 	case VIO_DRING_UNREG:
2264 		vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp);
2265 		break;
2266 	case VIO_ATTR_INFO:
2267 		vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp);
2268 		break;
2269 	case VNET_MCAST_INFO:
2270 		vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp);
2271 		break;
2272 	case VIO_RDX:
2273 		vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp);
2274 		break;
2275 	case VIO_DDS_INFO:
2276 		vsw_process_dds_msg(vswp, ldcp, &ctaskp->pktp);
2277 		break;
2278 
2279 	case VNET_PHYSLINK_INFO:
2280 		vsw_process_physlink_msg(ldcp, &ctaskp->pktp);
2281 		break;
2282 	default:
2283 		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env);
2284 	}
2285 
2286 	kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2287 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
2288 }
2289 
2290 /*
2291  * Version negotiation. We can end up here either because our peer
2292  * has responded to a handshake message we have sent it, or our peer
2293  * has initiated a handshake with us. If its the former then can only
2294  * be ACK or NACK, if its the later can only be INFO.
2295  *
2296  * If its an ACK we move to the next stage of the handshake, namely
2297  * attribute exchange. If its a NACK we see if we can specify another
2298  * version, if we can't we stop.
2299  *
2300  * If it is an INFO we reset all params associated with communication
2301  * in that direction over this channel (remember connection is
2302  * essentially 2 independent simplex channels).
2303  */
2304 void
2305 vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt)
2306 {
2307 	vio_ver_msg_t	*ver_pkt;
2308 	vsw_t 		*vswp = ldcp->ldc_vswp;
2309 
2310 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2311 
2312 	/*
2313 	 * We know this is a ctrl/version packet so
2314 	 * cast it into the correct structure.
2315 	 */
2316 	ver_pkt = (vio_ver_msg_t *)pkt;
2317 
2318 	switch (ver_pkt->tag.vio_subtype) {
2319 	case VIO_SUBTYPE_INFO:
2320 		D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n");
2321 
2322 		/*
2323 		 * Record the session id, which we will use from now
2324 		 * until we see another VER_INFO msg. Even then the
2325 		 * session id in most cases will be unchanged, execpt
2326 		 * if channel was reset.
2327 		 */
2328 		if ((ldcp->session_status & VSW_PEER_SESSION) &&
2329 		    (ldcp->peer_session != ver_pkt->tag.vio_sid)) {
2330 			DERR(vswp, "%s: updating session id for chan %lld "
2331 			    "from %llx to %llx", __func__, ldcp->ldc_id,
2332 			    ldcp->peer_session, ver_pkt->tag.vio_sid);
2333 		}
2334 
2335 		ldcp->peer_session = ver_pkt->tag.vio_sid;
2336 		ldcp->session_status |= VSW_PEER_SESSION;
2337 
2338 		/* Legal message at this time ? */
2339 		if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV))
2340 			return;
2341 
2342 		/*
2343 		 * First check the device class. Currently only expect
2344 		 * to be talking to a network device. In the future may
2345 		 * also talk to another switch.
2346 		 */
2347 		if (ver_pkt->dev_class != VDEV_NETWORK) {
2348 			DERR(vswp, "%s: illegal device class %d", __func__,
2349 			    ver_pkt->dev_class);
2350 
2351 			ver_pkt->tag.vio_sid = ldcp->local_session;
2352 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2353 
2354 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2355 
2356 			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2357 			    sizeof (vio_ver_msg_t), B_TRUE);
2358 
2359 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
2360 			vsw_next_milestone(ldcp);
2361 			return;
2362 		} else {
2363 			ldcp->dev_class = ver_pkt->dev_class;
2364 		}
2365 
2366 		/*
2367 		 * Now check the version.
2368 		 */
2369 		if (vsw_supported_version(ver_pkt) == 0) {
2370 			/*
2371 			 * Support this major version and possibly
2372 			 * adjusted minor version.
2373 			 */
2374 
2375 			D2(vswp, "%s: accepted ver %d:%d", __func__,
2376 			    ver_pkt->ver_major, ver_pkt->ver_minor);
2377 
2378 			/* Store accepted values */
2379 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
2380 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
2381 
2382 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2383 
2384 			ldcp->lane_in.lstate |= VSW_VER_ACK_SENT;
2385 
2386 			if (vsw_obp_ver_proto_workaround == B_TRUE) {
2387 				/*
2388 				 * Send a version info message
2389 				 * using the accepted version that
2390 				 * we are about to ack. Also note that
2391 				 * we send our ver info before we ack.
2392 				 * Otherwise, as soon as receiving the
2393 				 * ack, obp sends attr info msg, which
2394 				 * breaks vsw_check_flag() invoked
2395 				 * from vsw_process_ctrl_attr_pkt();
2396 				 * as we also need VSW_VER_ACK_RECV to
2397 				 * be set in lane_out.lstate, before
2398 				 * we can receive attr info.
2399 				 */
2400 				vsw_send_ver(ldcp);
2401 			}
2402 		} else {
2403 			/*
2404 			 * NACK back with the next lower major/minor
2405 			 * pairing we support (if don't suuport any more
2406 			 * versions then they will be set to zero.
2407 			 */
2408 
2409 			D2(vswp, "%s: replying with ver %d:%d", __func__,
2410 			    ver_pkt->ver_major, ver_pkt->ver_minor);
2411 
2412 			/* Store updated values */
2413 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
2414 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
2415 
2416 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2417 
2418 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
2419 		}
2420 
2421 		DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2422 		ver_pkt->tag.vio_sid = ldcp->local_session;
2423 		(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2424 		    sizeof (vio_ver_msg_t), B_TRUE);
2425 
2426 		vsw_next_milestone(ldcp);
2427 		break;
2428 
2429 	case VIO_SUBTYPE_ACK:
2430 		D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__);
2431 
2432 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV))
2433 			return;
2434 
2435 		/* Store updated values */
2436 		ldcp->lane_out.ver_major = ver_pkt->ver_major;
2437 		ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
2438 
2439 		ldcp->lane_out.lstate |= VSW_VER_ACK_RECV;
2440 		vsw_next_milestone(ldcp);
2441 
2442 		break;
2443 
2444 	case VIO_SUBTYPE_NACK:
2445 		D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__);
2446 
2447 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV))
2448 			return;
2449 
2450 		/*
2451 		 * If our peer sent us a NACK with the ver fields set to
2452 		 * zero then there is nothing more we can do. Otherwise see
2453 		 * if we support either the version suggested, or a lesser
2454 		 * one.
2455 		 */
2456 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
2457 			DERR(vswp, "%s: peer unable to negotiate any "
2458 			    "further.", __func__);
2459 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
2460 			vsw_next_milestone(ldcp);
2461 			return;
2462 		}
2463 
2464 		/*
2465 		 * Check to see if we support this major version or
2466 		 * a lower one. If we don't then maj/min will be set
2467 		 * to zero.
2468 		 */
2469 		(void) vsw_supported_version(ver_pkt);
2470 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
2471 			/* Nothing more we can do */
2472 			DERR(vswp, "%s: version negotiation failed.\n",
2473 			    __func__);
2474 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
2475 			vsw_next_milestone(ldcp);
2476 		} else {
2477 			/* found a supported major version */
2478 			ldcp->lane_out.ver_major = ver_pkt->ver_major;
2479 			ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
2480 
2481 			D2(vswp, "%s: resending with updated values (%x, %x)",
2482 			    __func__, ver_pkt->ver_major, ver_pkt->ver_minor);
2483 
2484 			ldcp->lane_out.lstate |= VSW_VER_INFO_SENT;
2485 			ver_pkt->tag.vio_sid = ldcp->local_session;
2486 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
2487 
2488 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2489 
2490 			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2491 			    sizeof (vio_ver_msg_t), B_TRUE);
2492 
2493 			vsw_next_milestone(ldcp);
2494 
2495 		}
2496 		break;
2497 
2498 	default:
2499 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
2500 		    ver_pkt->tag.vio_subtype);
2501 	}
2502 
2503 	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
2504 }
2505 
2506 static int
2507 vsw_process_attr_info(vsw_ldc_t *ldcp, vnet_attr_msg_t *msg)
2508 {
2509 	vsw_t			*vswp = ldcp->ldc_vswp;
2510 	vsw_port_t		*port = ldcp->ldc_port;
2511 	struct ether_addr	ea;
2512 	uint64_t		macaddr = 0;
2513 	lane_t			*lane_out = &ldcp->lane_out;
2514 	lane_t			*lane_in = &ldcp->lane_in;
2515 	uint32_t		mtu;
2516 	int			i;
2517 	uint8_t			dring_mode;
2518 
2519 	D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2520 
2521 	if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV)) {
2522 		return (1);
2523 	}
2524 
2525 	if ((msg->xfer_mode != VIO_DESC_MODE) &&
2526 	    (msg->xfer_mode != lane_out->xfer_mode)) {
2527 		D2(NULL, "%s: unknown mode %x\n", __func__, msg->xfer_mode);
2528 		return (1);
2529 	}
2530 
2531 	/* Only support MAC addresses at moment. */
2532 	if ((msg->addr_type != ADDR_TYPE_MAC) || (msg->addr == 0)) {
2533 		D2(NULL, "%s: invalid addr_type %x, or address 0x%llx\n",
2534 		    __func__, msg->addr_type, msg->addr);
2535 		return (1);
2536 	}
2537 
2538 	/*
2539 	 * MAC address supplied by device should match that stored
2540 	 * in the vsw-port OBP node. Need to decide what to do if they
2541 	 * don't match, for the moment just warn but don't fail.
2542 	 */
2543 	vnet_macaddr_ultostr(msg->addr, ea.ether_addr_octet);
2544 	if (ether_cmp(&ea, &port->p_macaddr) != 0) {
2545 		DERR(NULL, "%s: device supplied address "
2546 		    "0x%llx doesn't match node address 0x%llx\n",
2547 		    __func__, msg->addr, port->p_macaddr);
2548 	}
2549 
2550 	/*
2551 	 * Ack freq only makes sense in pkt mode, in shared
2552 	 * mode the ring descriptors say whether or not to
2553 	 * send back an ACK.
2554 	 */
2555 	if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
2556 	    (msg->xfer_mode & VIO_DRING_MODE_V1_2)) ||
2557 	    (VSW_VER_LT(ldcp, 1, 2) &&
2558 	    (msg->xfer_mode == VIO_DRING_MODE_V1_0))) {
2559 		if (msg->ack_freq > 0) {
2560 			D2(NULL, "%s: non zero ack freq in SHM mode\n",
2561 			    __func__);
2562 			return (1);
2563 		}
2564 	}
2565 
2566 	/*
2567 	 * Process dring mode attribute.
2568 	 */
2569 	if (VSW_VER_GTEQ(ldcp, 1, 6)) {
2570 		/*
2571 		 * Versions >= 1.6:
2572 		 * Though we are operating in v1.6 mode, it is possible that
2573 		 * RxDringData mode has been disabled either on this guest or
2574 		 * on the peer guest. If so, we revert to pre v1.6 behavior of
2575 		 * TxDring mode. But this must be agreed upon in both
2576 		 * directions of attr exchange. We first determine the mode
2577 		 * that can be negotiated.
2578 		 */
2579 		if ((msg->options & VIO_RX_DRING_DATA) != 0 &&
2580 		    vsw_mapin_avail(ldcp) == B_TRUE) {
2581 			/*
2582 			 * The peer is capable of handling RxDringData AND we
2583 			 * are also capable of it; we enable RxDringData mode
2584 			 * on this channel.
2585 			 */
2586 			dring_mode = VIO_RX_DRING_DATA;
2587 		} else if ((msg->options & VIO_TX_DRING) != 0) {
2588 			/*
2589 			 * If the peer is capable of TxDring mode, we
2590 			 * negotiate TxDring mode on this channel.
2591 			 */
2592 			dring_mode = VIO_TX_DRING;
2593 		} else {
2594 			/*
2595 			 * We support only VIO_TX_DRING and VIO_RX_DRING_DATA
2596 			 * modes. We don't support VIO_RX_DRING mode.
2597 			 */
2598 			return (1);
2599 		}
2600 
2601 		/*
2602 		 * If we have received an ack for the attr info that we sent,
2603 		 * then check if the dring mode matches what the peer had ack'd
2604 		 * (saved in lane_out). If they don't match, we fail the
2605 		 * handshake.
2606 		 */
2607 		if (lane_out->lstate & VSW_ATTR_ACK_RECV) {
2608 			if (msg->options != lane_out->dring_mode) {
2609 				/* send NACK */
2610 				return (1);
2611 			}
2612 		} else {
2613 			/*
2614 			 * Save the negotiated dring mode in our attr
2615 			 * parameters, so it gets sent in the attr info from us
2616 			 * to the peer.
2617 			 */
2618 			lane_out->dring_mode = dring_mode;
2619 		}
2620 
2621 		/* save the negotiated dring mode in the msg to be replied */
2622 		msg->options = dring_mode;
2623 	}
2624 
2625 	/*
2626 	 * Process MTU attribute.
2627 	 */
2628 	if (VSW_VER_GTEQ(ldcp, 1, 4)) {
2629 		/*
2630 		 * Versions >= 1.4:
2631 		 * Validate mtu of the peer is at least ETHERMAX. Then, the mtu
2632 		 * is negotiated down to the minimum of our mtu and peer's mtu.
2633 		 */
2634 		if (msg->mtu < ETHERMAX) {
2635 			return (1);
2636 		}
2637 
2638 		mtu = MIN(msg->mtu, vswp->max_frame_size);
2639 
2640 		/*
2641 		 * If we have received an ack for the attr info
2642 		 * that we sent, then check if the mtu computed
2643 		 * above matches the mtu that the peer had ack'd
2644 		 * (saved in local hparams). If they don't
2645 		 * match, we fail the handshake.
2646 		 */
2647 		if (lane_out->lstate & VSW_ATTR_ACK_RECV) {
2648 			if (mtu != lane_out->mtu) {
2649 				/* send NACK */
2650 				return (1);
2651 			}
2652 		} else {
2653 			/*
2654 			 * Save the mtu computed above in our
2655 			 * attr parameters, so it gets sent in
2656 			 * the attr info from us to the peer.
2657 			 */
2658 			lane_out->mtu = mtu;
2659 		}
2660 
2661 		/* save the MIN mtu in the msg to be replied */
2662 		msg->mtu = mtu;
2663 	} else {
2664 		/* Versions < 1.4, mtu must match */
2665 		if (msg->mtu != lane_out->mtu) {
2666 			D2(NULL, "%s: invalid MTU (0x%llx)\n",
2667 			    __func__, msg->mtu);
2668 			return (1);
2669 		}
2670 	}
2671 
2672 	/*
2673 	 * Otherwise store attributes for this lane and update
2674 	 * lane state.
2675 	 */
2676 	lane_in->mtu = msg->mtu;
2677 	lane_in->addr = msg->addr;
2678 	lane_in->addr_type = msg->addr_type;
2679 	lane_in->xfer_mode = msg->xfer_mode;
2680 	lane_in->ack_freq = msg->ack_freq;
2681 	lane_in->physlink_update = msg->physlink_update;
2682 	lane_in->dring_mode = msg->options;
2683 
2684 	/*
2685 	 * Check if the client has requested physlink state updates.
2686 	 * If there is a physical device bound to this vswitch (L2
2687 	 * mode), set the ack bits to indicate it is supported.
2688 	 * Otherwise, set the nack bits.
2689 	 */
2690 	if (VSW_VER_GTEQ(ldcp, 1, 5)) {	/* Protocol ver >= 1.5 */
2691 
2692 		/* Does the vnet need phys link state updates ? */
2693 		if ((lane_in->physlink_update &
2694 		    PHYSLINK_UPDATE_STATE_MASK) ==
2695 		    PHYSLINK_UPDATE_STATE) {
2696 
2697 			if (vswp->smode & VSW_LAYER2) {
2698 				/* is a net-dev assigned to us ? */
2699 				msg->physlink_update =
2700 				    PHYSLINK_UPDATE_STATE_ACK;
2701 				ldcp->pls_negotiated = B_TRUE;
2702 			} else {
2703 				/* not in L2 mode */
2704 				msg->physlink_update =
2705 				    PHYSLINK_UPDATE_STATE_NACK;
2706 				ldcp->pls_negotiated = B_FALSE;
2707 			}
2708 
2709 		} else {
2710 			msg->physlink_update =
2711 			    PHYSLINK_UPDATE_NONE;
2712 			ldcp->pls_negotiated = B_FALSE;
2713 		}
2714 
2715 	} else {
2716 		/*
2717 		 * physlink_update bits are ignored
2718 		 * if set by clients < v1.5 protocol.
2719 		 */
2720 		msg->physlink_update = PHYSLINK_UPDATE_NONE;
2721 		ldcp->pls_negotiated = B_FALSE;
2722 	}
2723 
2724 	macaddr = lane_in->addr;
2725 	for (i = ETHERADDRL - 1; i >= 0; i--) {
2726 		port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF;
2727 		macaddr >>= 8;
2728 	}
2729 
2730 	/*
2731 	 * Setup device specific xmit routines. Note this could be changed
2732 	 * further in vsw_send_dring_info() for versions >= 1.6 if operating in
2733 	 * RxDringData mode.
2734 	 */
2735 	mutex_enter(&port->tx_lock);
2736 
2737 	if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
2738 	    (lane_in->xfer_mode & VIO_DRING_MODE_V1_2)) ||
2739 	    (VSW_VER_LT(ldcp, 1, 2) &&
2740 	    (lane_in->xfer_mode == VIO_DRING_MODE_V1_0))) {
2741 		D2(vswp, "%s: mode = VIO_DRING_MODE", __func__);
2742 		port->transmit = vsw_dringsend;
2743 	} else if (lane_in->xfer_mode == VIO_DESC_MODE) {
2744 		D2(vswp, "%s: mode = VIO_DESC_MODE", __func__);
2745 		vsw_create_privring(ldcp);
2746 		port->transmit = vsw_descrsend;
2747 		lane_out->xfer_mode = VIO_DESC_MODE;
2748 	}
2749 
2750 	/*
2751 	 * HybridIO is supported only vnet, not by OBP.
2752 	 * So, set hio_capable to true only when in DRING mode.
2753 	 */
2754 	if (VSW_VER_GTEQ(ldcp, 1, 3) &&
2755 	    (lane_in->xfer_mode != VIO_DESC_MODE)) {
2756 		(void) atomic_swap_32(&port->p_hio_capable, B_TRUE);
2757 	} else {
2758 		(void) atomic_swap_32(&port->p_hio_capable, B_FALSE);
2759 	}
2760 
2761 	mutex_exit(&port->tx_lock);
2762 
2763 	return (0);
2764 }
2765 
2766 static int
2767 vsw_process_attr_ack(vsw_ldc_t *ldcp, vnet_attr_msg_t *msg)
2768 {
2769 	vsw_t	*vswp = ldcp->ldc_vswp;
2770 	lane_t	*lane_out = &ldcp->lane_out;
2771 	lane_t	*lane_in = &ldcp->lane_in;
2772 
2773 	D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
2774 
2775 	if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV)) {
2776 		return (1);
2777 	}
2778 
2779 	/*
2780 	 * Process dring mode attribute.
2781 	 */
2782 	if (VSW_VER_GTEQ(ldcp, 1, 6)) {
2783 		/*
2784 		 * Versions >= 1.6:
2785 		 * The ack msg sent by the peer contains the negotiated dring
2786 		 * mode between our capability (that we had sent in our attr
2787 		 * info) and the peer's capability.
2788 		 */
2789 		if (lane_in->lstate & VSW_ATTR_ACK_SENT) {
2790 			/*
2791 			 * If we have sent an ack for the attr info msg from
2792 			 * the peer, check if the dring mode that was
2793 			 * negotiated then (saved in lane_out) matches the
2794 			 * mode that the peer has ack'd. If they don't match,
2795 			 * we fail the handshake.
2796 			 */
2797 			if (lane_out->dring_mode != msg->options) {
2798 				return (1);
2799 			}
2800 		} else {
2801 			if ((msg->options & lane_out->dring_mode) == 0) {
2802 				/*
2803 				 * Peer ack'd with a mode that we don't
2804 				 * support; we fail the handshake.
2805 				 */
2806 				return (1);
2807 			}
2808 			if ((msg->options & (VIO_TX_DRING|VIO_RX_DRING_DATA))
2809 			    == (VIO_TX_DRING|VIO_RX_DRING_DATA)) {
2810 				/*
2811 				 * Peer must ack with only one negotiated mode.
2812 				 * Otherwise fail handshake.
2813 				 */
2814 				return (1);
2815 			}
2816 
2817 			/*
2818 			 * Save the negotiated mode, so we can validate it when
2819 			 * we receive attr info from the peer.
2820 			 */
2821 			lane_out->dring_mode = msg->options;
2822 		}
2823 	}
2824 
2825 	/*
2826 	 * Process MTU attribute.
2827 	 */
2828 	if (VSW_VER_GTEQ(ldcp, 1, 4)) {
2829 		/*
2830 		 * Versions >= 1.4:
2831 		 * The ack msg sent by the peer contains the minimum of
2832 		 * our mtu (that we had sent in our attr info) and the
2833 		 * peer's mtu.
2834 		 *
2835 		 * If we have sent an ack for the attr info msg from
2836 		 * the peer, check if the mtu that was computed then
2837 		 * (saved in lane_out params) matches the mtu that the
2838 		 * peer has ack'd. If they don't match, we fail the
2839 		 * handshake.
2840 		 */
2841 		if (lane_in->lstate & VSW_ATTR_ACK_SENT) {
2842 			if (lane_out->mtu != msg->mtu) {
2843 				return (1);
2844 			}
2845 		} else {
2846 			/*
2847 			 * If the mtu ack'd by the peer is > our mtu
2848 			 * fail handshake. Otherwise, save the mtu, so
2849 			 * we can validate it when we receive attr info
2850 			 * from our peer.
2851 			 */
2852 			if (msg->mtu <= lane_out->mtu) {
2853 				lane_out->mtu = msg->mtu;
2854 			} else {
2855 				return (1);
2856 			}
2857 		}
2858 	}
2859 
2860 	return (0);
2861 }
2862 
2863 /*
2864  * Process an attribute packet. We can end up here either because our peer
2865  * has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or our
2866  * peer has sent us an attribute INFO message
2867  *
2868  * If its an ACK we then move to the next stage of the handshake which
2869  * is to send our descriptor ring info to our peer. If its a NACK then
2870  * there is nothing more we can (currently) do.
2871  *
2872  * If we get a valid/acceptable INFO packet (and we have already negotiated
2873  * a version) we ACK back and set channel state to ATTR_RECV, otherwise we
2874  * NACK back and reset channel state to INACTIV.
2875  *
2876  * FUTURE: in time we will probably negotiate over attributes, but for
2877  * the moment unacceptable attributes are regarded as a fatal error.
2878  *
2879  */
2880 void
2881 vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt)
2882 {
2883 	vnet_attr_msg_t	*attr_pkt;
2884 	vsw_t		*vswp = ldcp->ldc_vswp;
2885 	lane_t		*lane_out = &ldcp->lane_out;
2886 	lane_t		*lane_in = &ldcp->lane_in;
2887 	int		rv;
2888 
2889 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
2890 
2891 	/*
2892 	 * We know this is a ctrl/attr packet so
2893 	 * cast it into the correct structure.
2894 	 */
2895 	attr_pkt = (vnet_attr_msg_t *)pkt;
2896 
2897 	switch (attr_pkt->tag.vio_subtype) {
2898 	case VIO_SUBTYPE_INFO:
2899 
2900 		rv = vsw_process_attr_info(ldcp, attr_pkt);
2901 		if (rv != 0) {
2902 			vsw_free_lane_resources(ldcp, INBOUND);
2903 			attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2904 			ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT;
2905 		} else {
2906 			attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2907 			lane_in->lstate |= VSW_ATTR_ACK_SENT;
2908 		}
2909 		attr_pkt->tag.vio_sid = ldcp->local_session;
2910 		DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
2911 		(void) vsw_send_msg(ldcp, (void *)attr_pkt,
2912 		    sizeof (vnet_attr_msg_t), B_TRUE);
2913 		vsw_next_milestone(ldcp);
2914 		break;
2915 
2916 	case VIO_SUBTYPE_ACK:
2917 
2918 		rv = vsw_process_attr_ack(ldcp, attr_pkt);
2919 		if (rv != 0) {
2920 			return;
2921 		}
2922 		lane_out->lstate |= VSW_ATTR_ACK_RECV;
2923 		vsw_next_milestone(ldcp);
2924 		break;
2925 
2926 	case VIO_SUBTYPE_NACK:
2927 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
2928 
2929 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV))
2930 			return;
2931 
2932 		lane_out->lstate |= VSW_ATTR_NACK_RECV;
2933 		vsw_next_milestone(ldcp);
2934 		break;
2935 
2936 	default:
2937 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
2938 		    attr_pkt->tag.vio_subtype);
2939 	}
2940 
2941 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
2942 }
2943 
2944 static int
2945 vsw_process_dring_reg_info(vsw_ldc_t *ldcp, vio_msg_tag_t *tagp)
2946 {
2947 	int		rv;
2948 	vsw_t		*vswp = ldcp->ldc_vswp;
2949 	lane_t		*lp = &ldcp->lane_out;
2950 	dring_info_t	*dp = NULL;
2951 
2952 	D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2953 
2954 	rv = vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV);
2955 	if (rv != 0) {
2956 		return (1);
2957 	}
2958 
2959 	if (VSW_VER_GTEQ(ldcp, 1, 6) &&
2960 	    (lp->dring_mode != ((vio_dring_reg_msg_t *)tagp)->options)) {
2961 		/*
2962 		 * The earlier version of Solaris vnet driver doesn't set the
2963 		 * option (VIO_TX_DRING in its case) correctly in its dring reg
2964 		 * message. We workaround that here by doing the check only
2965 		 * for versions >= v1.6.
2966 		 */
2967 		DWARN(vswp, "%s(%lld): Rcvd dring reg option (%d), "
2968 		    "negotiated mode (%d)\n", __func__, ldcp->ldc_id,
2969 		    ((vio_dring_reg_msg_t *)tagp)->options, lp->dring_mode);
2970 		return (1);
2971 	}
2972 
2973 	/*
2974 	 * Map dring exported by the peer.
2975 	 */
2976 	dp = vsw_map_dring(ldcp, (void *)tagp);
2977 	if (dp == NULL) {
2978 		return (1);
2979 	}
2980 
2981 	/*
2982 	 * Map data buffers exported by the peer if we are in RxDringData mode.
2983 	 */
2984 	if (lp->dring_mode == VIO_RX_DRING_DATA) {
2985 		rv = vsw_map_data(ldcp, dp, (void *)tagp);
2986 		if (rv != 0) {
2987 			vsw_unmap_dring(ldcp);
2988 			return (1);
2989 		}
2990 	}
2991 
2992 	return (0);
2993 }
2994 
2995 static int
2996 vsw_process_dring_reg_ack(vsw_ldc_t *ldcp, vio_msg_tag_t *tagp)
2997 {
2998 	vsw_t		*vswp = ldcp->ldc_vswp;
2999 	dring_info_t	*dp;
3000 
3001 	D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3002 
3003 	if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV)) {
3004 		return (1);
3005 	}
3006 
3007 	dp = ldcp->lane_out.dringp;
3008 
3009 	/* save dring_ident acked by peer */
3010 	dp->ident = ((vio_dring_reg_msg_t *)tagp)->dring_ident;
3011 
3012 	return (0);
3013 }
3014 
3015 /*
3016  * Process a dring info packet. We can end up here either because our peer
3017  * has ACK/NACK'ed back to an earlier DRING msg we had sent it, or our
3018  * peer has sent us a dring INFO message.
3019  *
3020  * If we get a valid/acceptable INFO packet (and we have already negotiated
3021  * a version) we ACK back and update the lane state, otherwise we NACK back.
3022  *
3023  * FUTURE: nothing to stop client from sending us info on multiple dring's
3024  * but for the moment we will just use the first one we are given.
3025  *
3026  */
3027 void
3028 vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt)
3029 {
3030 	int		rv;
3031 	int		msgsize;
3032 	dring_info_t	*dp;
3033 	vio_msg_tag_t	*tagp = (vio_msg_tag_t *)pkt;
3034 	vsw_t		*vswp = ldcp->ldc_vswp;
3035 	lane_t		*lane_out = &ldcp->lane_out;
3036 	lane_t		*lane_in = &ldcp->lane_in;
3037 
3038 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
3039 
3040 	switch (tagp->vio_subtype) {
3041 	case VIO_SUBTYPE_INFO:
3042 		rv = vsw_process_dring_reg_info(ldcp, tagp);
3043 		if (rv != 0) {
3044 			vsw_free_lane_resources(ldcp, INBOUND);
3045 			tagp->vio_subtype = VIO_SUBTYPE_NACK;
3046 			lane_in->lstate |= VSW_DRING_NACK_SENT;
3047 		} else {
3048 			tagp->vio_subtype = VIO_SUBTYPE_ACK;
3049 			lane_in->lstate |= VSW_DRING_ACK_SENT;
3050 		}
3051 		tagp->vio_sid = ldcp->local_session;
3052 		DUMP_TAG_PTR(tagp);
3053 		if (lane_out->dring_mode == VIO_RX_DRING_DATA) {
3054 			dp = lane_in->dringp;
3055 			msgsize =
3056 			    VNET_DRING_REG_EXT_MSG_SIZE(dp->data_ncookies);
3057 		} else {
3058 			msgsize = sizeof (vio_dring_reg_msg_t);
3059 		}
3060 		(void) vsw_send_msg(ldcp, (void *)tagp, msgsize, B_TRUE);
3061 		vsw_next_milestone(ldcp);
3062 		break;
3063 
3064 	case VIO_SUBTYPE_ACK:
3065 		rv = vsw_process_dring_reg_ack(ldcp, tagp);
3066 		if (rv != 0) {
3067 			return;
3068 		}
3069 		lane_out->lstate |= VSW_DRING_ACK_RECV;
3070 		vsw_next_milestone(ldcp);
3071 		break;
3072 
3073 	case VIO_SUBTYPE_NACK:
3074 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3075 
3076 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV))
3077 			return;
3078 
3079 		lane_out->lstate |= VSW_DRING_NACK_RECV;
3080 		vsw_next_milestone(ldcp);
3081 		break;
3082 
3083 	default:
3084 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3085 		    tagp->vio_subtype);
3086 	}
3087 
3088 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
3089 }
3090 
3091 /*
3092  * Process a request from peer to unregister a dring.
3093  *
3094  * For the moment we just restart the handshake if our
3095  * peer endpoint attempts to unregister a dring.
3096  */
3097 void
3098 vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt)
3099 {
3100 	vsw_t			*vswp = ldcp->ldc_vswp;
3101 	vio_dring_unreg_msg_t	*dring_pkt;
3102 
3103 	/*
3104 	 * We know this is a ctrl/dring packet so
3105 	 * cast it into the correct structure.
3106 	 */
3107 	dring_pkt = (vio_dring_unreg_msg_t *)pkt;
3108 
3109 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3110 
3111 	switch (dring_pkt->tag.vio_subtype) {
3112 	case VIO_SUBTYPE_INFO:
3113 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3114 
3115 		DWARN(vswp, "%s: restarting handshake..", __func__);
3116 		break;
3117 
3118 	case VIO_SUBTYPE_ACK:
3119 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3120 
3121 		DWARN(vswp, "%s: restarting handshake..", __func__);
3122 		break;
3123 
3124 	case VIO_SUBTYPE_NACK:
3125 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3126 
3127 		DWARN(vswp, "%s: restarting handshake..", __func__);
3128 		break;
3129 
3130 	default:
3131 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3132 		    dring_pkt->tag.vio_subtype);
3133 	}
3134 
3135 	vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3136 
3137 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3138 }
3139 
/*
 * Turn the multicast message 'pkt' into a NACK and send it back to the peer
 * over 'ldcp'. Wrapped in do { } while (0) so that the macro expands to a
 * single statement and is safe under an unbraced if/else; the arguments are
 * parenthesized so that arbitrary pointer expressions may be passed.
 */
#define	SND_MCST_NACK(ldcp, pkt) \
do { \
	(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
	(pkt)->tag.vio_sid = (ldcp)->local_session; \
	(void) vsw_send_msg((ldcp), (void *)(pkt), \
	    sizeof (vnet_mcast_msg_t), B_TRUE); \
_NOTE(CONSTCOND) } while (0)
3145 
3146 /*
3147  * Process a multicast request from a vnet.
3148  *
3149  * Vnet's specify a multicast address that they are interested in. This
3150  * address is used as a key into the hash table which forms the multicast
3151  * forwarding database (mFDB).
3152  *
3153  * The table keys are the multicast addresses, while the table entries
3154  * are pointers to lists of ports which wish to receive packets for the
3155  * specified multicast address.
3156  *
3157  * When a multicast packet is being switched we use the address as a key
3158  * into the hash table, and then walk the appropriate port list forwarding
3159  * the pkt to each port in turn.
3160  *
3161  * If a vnet is no longer interested in a particular multicast grouping
3162  * we simply find the correct location in the hash table and then delete
3163  * the relevant port from the port list.
3164  *
3165  * To deal with the case whereby a port is being deleted without first
3166  * removing itself from the lists in the hash table, we maintain a list
3167  * of multicast addresses the port has registered an interest in, within
3168  * the port structure itself. We then simply walk that list of addresses
3169  * using them as keys into the hash table and remove the port from the
3170  * appropriate lists.
3171  */
3172 static void
3173 vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt)
3174 {
3175 	vnet_mcast_msg_t	*mcst_pkt;
3176 	vsw_port_t		*port = ldcp->ldc_port;
3177 	vsw_t			*vswp = ldcp->ldc_vswp;
3178 	int			i;
3179 
3180 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3181 
3182 	/*
3183 	 * We know this is a ctrl/mcast packet so
3184 	 * cast it into the correct structure.
3185 	 */
3186 	mcst_pkt = (vnet_mcast_msg_t *)pkt;
3187 
3188 	switch (mcst_pkt->tag.vio_subtype) {
3189 	case VIO_SUBTYPE_INFO:
3190 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3191 
3192 		/*
3193 		 * Check if in correct state to receive a multicast
3194 		 * message (i.e. handshake complete). If not reset
3195 		 * the handshake.
3196 		 */
3197 		if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV))
3198 			return;
3199 
3200 		/*
3201 		 * Before attempting to add or remove address check
3202 		 * that they are valid multicast addresses.
3203 		 * If not, then NACK back.
3204 		 */
3205 		for (i = 0; i < mcst_pkt->count; i++) {
3206 			if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) {
3207 				DERR(vswp, "%s: invalid multicast address",
3208 				    __func__);
3209 				SND_MCST_NACK(ldcp, mcst_pkt);
3210 				return;
3211 			}
3212 		}
3213 
3214 		/*
3215 		 * Now add/remove the addresses. If this fails we
3216 		 * NACK back.
3217 		 */
3218 		if (vsw_add_rem_mcst(mcst_pkt, port) != 0) {
3219 			SND_MCST_NACK(ldcp, mcst_pkt);
3220 			return;
3221 		}
3222 
3223 		mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3224 		mcst_pkt->tag.vio_sid = ldcp->local_session;
3225 
3226 		DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt);
3227 
3228 		(void) vsw_send_msg(ldcp, (void *)mcst_pkt,
3229 		    sizeof (vnet_mcast_msg_t), B_TRUE);
3230 		break;
3231 
3232 	case VIO_SUBTYPE_ACK:
3233 		DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3234 
3235 		/*
3236 		 * We shouldn't ever get a multicast ACK message as
3237 		 * at the moment we never request multicast addresses
3238 		 * to be set on some other device. This may change in
3239 		 * the future if we have cascading switches.
3240 		 */
3241 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV))
3242 			return;
3243 
3244 				/* Do nothing */
3245 		break;
3246 
3247 	case VIO_SUBTYPE_NACK:
3248 		DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3249 
3250 		/*
3251 		 * We shouldn't get a multicast NACK packet for the
3252 		 * same reasons as we shouldn't get a ACK packet.
3253 		 */
3254 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV))
3255 			return;
3256 
3257 				/* Do nothing */
3258 		break;
3259 
3260 	default:
3261 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
3262 		    mcst_pkt->tag.vio_subtype);
3263 	}
3264 
3265 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3266 }
3267 
3268 static void
3269 vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt)
3270 {
3271 	vio_rdx_msg_t	*rdx_pkt;
3272 	vsw_t		*vswp = ldcp->ldc_vswp;
3273 
3274 	/*
3275 	 * We know this is a ctrl/rdx packet so
3276 	 * cast it into the correct structure.
3277 	 */
3278 	rdx_pkt = (vio_rdx_msg_t *)pkt;
3279 
3280 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
3281 
3282 	switch (rdx_pkt->tag.vio_subtype) {
3283 	case VIO_SUBTYPE_INFO:
3284 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3285 
3286 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_INFO_RECV))
3287 			return;
3288 
3289 		rdx_pkt->tag.vio_sid = ldcp->local_session;
3290 		rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3291 
3292 		DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt);
3293 
3294 		ldcp->lane_out.lstate |= VSW_RDX_ACK_SENT;
3295 
3296 		(void) vsw_send_msg(ldcp, (void *)rdx_pkt,
3297 		    sizeof (vio_rdx_msg_t), B_TRUE);
3298 
3299 		vsw_next_milestone(ldcp);
3300 		break;
3301 
3302 	case VIO_SUBTYPE_ACK:
3303 		/*
3304 		 * Should be handled in-band by callback handler.
3305 		 */
3306 		DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__);
3307 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3308 		break;
3309 
3310 	case VIO_SUBTYPE_NACK:
3311 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3312 
3313 		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_NACK_RECV))
3314 			return;
3315 
3316 		ldcp->lane_in.lstate |= VSW_RDX_NACK_RECV;
3317 		vsw_next_milestone(ldcp);
3318 		break;
3319 
3320 	default:
3321 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3322 		    rdx_pkt->tag.vio_subtype);
3323 	}
3324 
3325 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3326 }
3327 
3328 static void
3329 vsw_process_physlink_msg(vsw_ldc_t *ldcp, void *pkt)
3330 {
3331 	vnet_physlink_msg_t	*msgp;
3332 	vsw_t			*vswp = ldcp->ldc_vswp;
3333 
3334 	msgp = (vnet_physlink_msg_t *)pkt;
3335 
3336 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
3337 
3338 	switch (msgp->tag.vio_subtype) {
3339 	case VIO_SUBTYPE_INFO:
3340 
3341 		/* vsw shouldn't recv physlink info */
3342 		DWARN(vswp, "%s: Unexpected VIO_SUBTYPE_INFO", __func__);
3343 		break;
3344 
3345 	case VIO_SUBTYPE_ACK:
3346 
3347 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3348 		break;
3349 
3350 	case VIO_SUBTYPE_NACK:
3351 
3352 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3353 		break;
3354 
3355 	default:
3356 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3357 		    msgp->tag.vio_subtype);
3358 	}
3359 
3360 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3361 }
3362 
3363 static void
3364 vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t *tagp,
3365 	uint32_t msglen)
3366 {
3367 	uint16_t	env = tagp->vio_subtype_env;
3368 	vsw_t		*vswp = ldcp->ldc_vswp;
3369 	lane_t		*lp = &ldcp->lane_out;
3370 	uint8_t		dring_mode = lp->dring_mode;
3371 
3372 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3373 
3374 	/* session id check */
3375 	if (ldcp->session_status & VSW_PEER_SESSION) {
3376 		if (ldcp->peer_session != tagp->vio_sid) {
3377 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
3378 			    __func__, ldcp->ldc_id, tagp->vio_sid);
3379 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3380 			return;
3381 		}
3382 	}
3383 
3384 	/*
3385 	 * It is an error for us to be getting data packets
3386 	 * before the handshake has completed.
3387 	 */
3388 	if (ldcp->hphase != VSW_MILESTONE4) {
3389 		DERR(vswp, "%s: got data packet before handshake complete "
3390 		    "hphase %d (%x: %x)", __func__, ldcp->hphase,
3391 		    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
3392 		DUMP_FLAGS(ldcp->lane_in.lstate);
3393 		DUMP_FLAGS(ldcp->lane_out.lstate);
3394 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3395 		return;
3396 	}
3397 	if (dring_mode == VIO_TX_DRING) {
3398 		/*
3399 		 * To reduce the locking contention, release the ldc_cblock
3400 		 * here and re-acquire it once we are done receiving packets.
3401 		 * We do this only in TxDring mode to allow further callbaks to
3402 		 * continue while the msg worker thread processes the messages.
3403 		 * In RxDringData mode, we process the messages in the callback
3404 		 * itself and wake up rcv worker thread to process only data
3405 		 * info messages.
3406 		 */
3407 		mutex_exit(&ldcp->ldc_cblock);
3408 		mutex_enter(&ldcp->ldc_rxlock);
3409 	}
3410 
3411 	/*
3412 	 * Switch on vio_subtype envelope, then let lower routines
3413 	 * decide if its an INFO, ACK or NACK packet.
3414 	 */
3415 	if (env == VIO_DRING_DATA) {
3416 		ldcp->rx_dringdata(ldcp, dpkt);
3417 	} else if (env == VIO_PKT_DATA) {
3418 		ldcp->rx_pktdata(ldcp, dpkt, msglen);
3419 	} else if (env == VIO_DESC_DATA) {
3420 		vsw_process_data_ibnd_pkt(ldcp, dpkt);
3421 	} else {
3422 		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n",
3423 		    __func__, env);
3424 	}
3425 
3426 	if (dring_mode == VIO_TX_DRING) {
3427 		mutex_exit(&ldcp->ldc_rxlock);
3428 		mutex_enter(&ldcp->ldc_cblock);
3429 	}
3430 
3431 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3432 }
3433 
3434 /*
3435  * dummy pkt data handler function for vnet protocol version 1.0
3436  */
3437 static void
3438 vsw_process_pkt_data_nop(void *arg1, void *arg2, uint32_t msglen)
3439 {
3440 	_NOTE(ARGUNUSED(arg1, arg2, msglen))
3441 }
3442 
3443 /*
3444  * This function handles raw pkt data messages received over the channel.
3445  * Currently, only priority-eth-type frames are received through this mechanism.
3446  * In this case, the frame(data) is present within the message itself which
3447  * is copied into an mblk before switching it.
3448  */
3449 static void
3450 vsw_process_pkt_data(void *arg1, void *arg2, uint32_t msglen)
3451 {
3452 	vsw_ldc_t		*ldcp = (vsw_ldc_t *)arg1;
3453 	vio_raw_data_msg_t	*dpkt = (vio_raw_data_msg_t *)arg2;
3454 	uint32_t		size;
3455 	mblk_t			*mp;
3456 	vio_mblk_t		*vmp;
3457 	vsw_t			*vswp = ldcp->ldc_vswp;
3458 	vgen_stats_t		*statsp = &ldcp->ldc_stats;
3459 	lane_t			*lp = &ldcp->lane_out;
3460 
3461 	size = msglen - VIO_PKT_DATA_HDRSIZE;
3462 	if (size < ETHERMIN || size > lp->mtu) {
3463 		(void) atomic_inc_32(&statsp->rx_pri_fail);
3464 		DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__,
3465 		    ldcp->ldc_id, size);
3466 		return;
3467 	}
3468 
3469 	vmp = vio_multipool_allocb(&ldcp->vmp, size + VLAN_TAGSZ);
3470 	if (vmp == NULL) {
3471 		mp = allocb(size + VLAN_TAGSZ, BPRI_MED);
3472 		if (mp == NULL) {
3473 			(void) atomic_inc_32(&statsp->rx_pri_fail);
3474 			DWARN(vswp, "%s(%lld) allocb failure, "
3475 			    "unable to process priority frame\n", __func__,
3476 			    ldcp->ldc_id);
3477 			return;
3478 		}
3479 	} else {
3480 		mp = vmp->mp;
3481 	}
3482 
3483 	/* skip over the extra space for vlan tag */
3484 	mp->b_rptr += VLAN_TAGSZ;
3485 
3486 	/* copy the frame from the payload of raw data msg into the mblk */
3487 	bcopy(dpkt->data, mp->b_rptr, size);
3488 	mp->b_wptr = mp->b_rptr + size;
3489 
3490 	if (vmp != NULL) {
3491 		vmp->state = VIO_MBLK_HAS_DATA;
3492 	}
3493 
3494 	/* update stats */
3495 	(void) atomic_inc_64(&statsp->rx_pri_packets);
3496 	(void) atomic_add_64(&statsp->rx_pri_bytes, size);
3497 
3498 	/*
3499 	 * VLAN_TAGSZ of extra space has been pre-alloc'd if tag is needed.
3500 	 */
3501 	(void) vsw_vlan_frame_pretag(ldcp->ldc_port, VSW_VNETPORT, mp);
3502 
3503 	/* switch the frame to destination */
3504 	vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT, ldcp->ldc_port, NULL);
3505 }
3506 
3507 /*
3508  * Process an in-band descriptor message (most likely from
3509  * OBP).
3510  */
3511 static void
3512 vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt)
3513 {
3514 	vnet_ibnd_desc_t	*ibnd_desc;
3515 	dring_info_t		*dp = NULL;
3516 	vsw_private_desc_t	*priv_addr = NULL;
3517 	vsw_t			*vswp = ldcp->ldc_vswp;
3518 	mblk_t			*mp = NULL;
3519 	size_t			nbytes = 0;
3520 	size_t			off = 0;
3521 	uint64_t		idx = 0;
3522 	uint32_t		num = 1, len, datalen = 0;
3523 	uint64_t		ncookies = 0;
3524 	int			i, rv;
3525 	int			j = 0;
3526 
3527 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3528 
3529 	ibnd_desc = (vnet_ibnd_desc_t *)pkt;
3530 
3531 	switch (ibnd_desc->hdr.tag.vio_subtype) {
3532 	case VIO_SUBTYPE_INFO:
3533 		D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3534 
3535 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
3536 			return;
3537 
3538 		/*
3539 		 * Data is padded to align on a 8 byte boundary,
3540 		 * nbytes is actual data length, i.e. minus that
3541 		 * padding.
3542 		 */
3543 		datalen = ibnd_desc->nbytes;
3544 
3545 		D2(vswp, "%s(%lld): processing inband desc : "
3546 		    ": datalen 0x%lx", __func__, ldcp->ldc_id, datalen);
3547 
3548 		ncookies = ibnd_desc->ncookies;
3549 
3550 		/*
3551 		 * allocb(9F) returns an aligned data block. We
3552 		 * need to ensure that we ask ldc for an aligned
3553 		 * number of bytes also.
3554 		 */
3555 		nbytes = datalen;
3556 		if (nbytes & 0x7) {
3557 			off = 8 - (nbytes & 0x7);
3558 			nbytes += off;
3559 		}
3560 
3561 		/* alloc extra space for VLAN_TAG */
3562 		mp = allocb(datalen + 8, BPRI_MED);
3563 		if (mp == NULL) {
3564 			DERR(vswp, "%s(%lld): allocb failed",
3565 			    __func__, ldcp->ldc_id);
3566 			ldcp->ldc_stats.rx_allocb_fail++;
3567 			return;
3568 		}
3569 
3570 		/* skip over the extra space for VLAN_TAG */
3571 		mp->b_rptr += 8;
3572 
3573 		rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr,
3574 		    0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies,
3575 		    LDC_COPY_IN);
3576 
3577 		if (rv != 0) {
3578 			DERR(vswp, "%s(%d): unable to copy in data from "
3579 			    "%d cookie(s)", __func__, ldcp->ldc_id, ncookies);
3580 			freemsg(mp);
3581 			ldcp->ldc_stats.ierrors++;
3582 			return;
3583 		}
3584 
3585 		D2(vswp, "%s(%d): copied in %ld bytes using %d cookies",
3586 		    __func__, ldcp->ldc_id, nbytes, ncookies);
3587 
3588 		/* point to the actual end of data */
3589 		mp->b_wptr = mp->b_rptr + datalen;
3590 		ldcp->ldc_stats.ipackets++;
3591 		ldcp->ldc_stats.rbytes += datalen;
3592 
3593 		/*
3594 		 * We ACK back every in-band descriptor message we process
3595 		 */
3596 		ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK;
3597 		ibnd_desc->hdr.tag.vio_sid = ldcp->local_session;
3598 		(void) vsw_send_msg(ldcp, (void *)ibnd_desc,
3599 		    sizeof (vnet_ibnd_desc_t), B_TRUE);
3600 
3601 		/*
3602 		 * there is extra space alloc'd for VLAN_TAG
3603 		 */
3604 		(void) vsw_vlan_frame_pretag(ldcp->ldc_port, VSW_VNETPORT, mp);
3605 
3606 		/* send the packet to be switched */
3607 		vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT,
3608 		    ldcp->ldc_port, NULL);
3609 
3610 		break;
3611 
3612 	case VIO_SUBTYPE_ACK:
3613 		D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3614 
3615 		/* Verify the ACK is valid */
3616 		idx = ibnd_desc->hdr.desc_handle;
3617 
3618 		if (idx >= vsw_num_descriptors) {
3619 			cmn_err(CE_WARN, "!vsw%d: corrupted ACK received "
3620 			    "(idx %ld)", vswp->instance, idx);
3621 			return;
3622 		}
3623 
3624 		if ((dp = ldcp->lane_out.dringp) == NULL) {
3625 			DERR(vswp, "%s: no dring found", __func__);
3626 			return;
3627 		}
3628 
3629 		len = dp->num_descriptors;
3630 		/*
3631 		 * If the descriptor we are being ACK'ed for is not the
3632 		 * one we expected, then pkts were lost somwhere, either
3633 		 * when we tried to send a msg, or a previous ACK msg from
3634 		 * our peer. In either case we now reclaim the descriptors
3635 		 * in the range from the last ACK we received up to the
3636 		 * current ACK.
3637 		 */
3638 		if (idx != dp->last_ack_recv) {
3639 			DWARN(vswp, "%s: dropped pkts detected, (%ld, %ld)",
3640 			    __func__, dp->last_ack_recv, idx);
3641 			num = idx >= dp->last_ack_recv ?
3642 			    idx - dp->last_ack_recv + 1:
3643 			    (len - dp->last_ack_recv + 1) + idx;
3644 		}
3645 
3646 		/*
3647 		 * When we sent the in-band message to our peer we
3648 		 * marked the copy in our private ring as READY. We now
3649 		 * check that the descriptor we are being ACK'ed for is in
3650 		 * fact READY, i.e. it is one we have shared with our peer.
3651 		 *
3652 		 * If its not we flag an error, but still reset the descr
3653 		 * back to FREE.
3654 		 */
3655 		for (i = dp->last_ack_recv; j < num; i = (i + 1) % len, j++) {
3656 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
3657 			mutex_enter(&priv_addr->dstate_lock);
3658 			if (priv_addr->dstate != VIO_DESC_READY) {
3659 				DERR(vswp, "%s: (%ld) desc at index %ld not "
3660 				    "READY (0x%lx)", __func__,
3661 				    ldcp->ldc_id, idx, priv_addr->dstate);
3662 				DERR(vswp, "%s: bound %d: ncookies %ld : "
3663 				    "datalen %ld", __func__,
3664 				    priv_addr->bound, priv_addr->ncookies,
3665 				    priv_addr->datalen);
3666 			}
3667 			D2(vswp, "%s: (%lld) freeing descp at %lld", __func__,
3668 			    ldcp->ldc_id, idx);
3669 			/* release resources associated with sent msg */
3670 			priv_addr->datalen = 0;
3671 			priv_addr->dstate = VIO_DESC_FREE;
3672 			mutex_exit(&priv_addr->dstate_lock);
3673 		}
3674 		/* update to next expected value */
3675 		dp->last_ack_recv = (idx + 1) % dp->num_descriptors;
3676 
3677 		break;
3678 
3679 	case VIO_SUBTYPE_NACK:
3680 		DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3681 
3682 		/*
3683 		 * We should only get a NACK if our peer doesn't like
3684 		 * something about a message we have sent it. If this
3685 		 * happens we just release the resources associated with
3686 		 * the message. (We are relying on higher layers to decide
3687 		 * whether or not to resend.
3688 		 */
3689 
3690 		/* limit check */
3691 		idx = ibnd_desc->hdr.desc_handle;
3692 
3693 		if (idx >= vsw_num_descriptors) {
3694 			DERR(vswp, "%s: corrupted NACK received (idx %lld)",
3695 			    __func__, idx);
3696 			return;
3697 		}
3698 
3699 		if ((dp = ldcp->lane_out.dringp) == NULL) {
3700 			DERR(vswp, "%s: no dring found", __func__);
3701 			return;
3702 		}
3703 
3704 		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
3705 
3706 		/* move to correct location in ring */
3707 		priv_addr += idx;
3708 
3709 		/* release resources associated with sent msg */
3710 		mutex_enter(&priv_addr->dstate_lock);
3711 		priv_addr->datalen = 0;
3712 		priv_addr->dstate = VIO_DESC_FREE;
3713 		mutex_exit(&priv_addr->dstate_lock);
3714 
3715 		break;
3716 
3717 	default:
3718 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
3719 		    ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype);
3720 	}
3721 
3722 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
3723 }
3724 
3725 static void
3726 vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t *tagp)
3727 {
3728 	_NOTE(ARGUNUSED(epkt))
3729 
3730 	vsw_t		*vswp = ldcp->ldc_vswp;
3731 	uint16_t	env = tagp->vio_subtype_env;
3732 
3733 	D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
3734 
3735 	/*
3736 	 * Error vio_subtypes have yet to be defined. So for
3737 	 * the moment we can't do anything.
3738 	 */
3739 	D2(vswp, "%s: (%x) vio_subtype env", __func__, env);
3740 
3741 	D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
3742 }
3743 
3744 /* transmit the packet over the given port */
3745 int
3746 vsw_portsend(vsw_port_t *port, mblk_t *mp)
3747 {
3748 	mblk_t		*mpt;
3749 	int		count;
3750 	vsw_ldc_t 	*ldcp = port->ldcp;
3751 	int		status = 0;
3752 
3753 	count = vsw_vlan_frame_untag(port, VSW_VNETPORT, &mp, &mpt);
3754 	if (count != 0) {
3755 		status = ldcp->tx(ldcp, mp, mpt, count);
3756 	}
3757 	return (status);
3758 }
3759 
3760 /*
3761  * Break up frames into 2 seperate chains: normal and
3762  * priority, based on the frame type. The number of
3763  * priority frames is also counted and returned.
3764  *
3765  * Params:
3766  * 	vswp:	pointer to the instance of vsw
3767  *	np:	head of packet chain to be broken
3768  *	npt:	tail of packet chain to be broken
3769  *
3770  * Returns:
3771  *	np:	head of normal data packets
3772  *	npt:	tail of normal data packets
3773  *	hp:	head of high priority packets
3774  *	hpt:	tail of high priority packets
3775  */
3776 static uint32_t
3777 vsw_get_pri_packets(vsw_t *vswp, mblk_t **np, mblk_t **npt,
3778 	mblk_t **hp, mblk_t **hpt)
3779 {
3780 	mblk_t			*tmp = NULL;
3781 	mblk_t			*smp = NULL;
3782 	mblk_t			*hmp = NULL;	/* high prio pkts head */
3783 	mblk_t			*hmpt = NULL;	/* high prio pkts tail */
3784 	mblk_t			*nmp = NULL;	/* normal pkts head */
3785 	mblk_t			*nmpt = NULL;	/* normal pkts tail */
3786 	uint32_t		count = 0;
3787 	int			i;
3788 	struct ether_header	*ehp;
3789 	uint32_t		num_types;
3790 	uint16_t		*types;
3791 
3792 	tmp = *np;
3793 	while (tmp != NULL) {
3794 
3795 		smp = tmp;
3796 		tmp = tmp->b_next;
3797 		smp->b_next = NULL;
3798 		smp->b_prev = NULL;
3799 
3800 		ehp = (struct ether_header *)smp->b_rptr;
3801 		num_types = vswp->pri_num_types;
3802 		types = vswp->pri_types;
3803 		for (i = 0; i < num_types; i++) {
3804 			if (ehp->ether_type == types[i]) {
3805 				/* high priority frame */
3806 
3807 				if (hmp != NULL) {
3808 					hmpt->b_next = smp;
3809 					hmpt = smp;
3810 				} else {
3811 					hmp = hmpt = smp;
3812 				}
3813 				count++;
3814 				break;
3815 			}
3816 		}
3817 		if (i == num_types) {
3818 			/* normal data frame */
3819 
3820 			if (nmp != NULL) {
3821 				nmpt->b_next = smp;
3822 				nmpt = smp;
3823 			} else {
3824 				nmp = nmpt = smp;
3825 			}
3826 		}
3827 	}
3828 
3829 	*hp = hmp;
3830 	*hpt = hmpt;
3831 	*np = nmp;
3832 	*npt = nmpt;
3833 
3834 	return (count);
3835 }
3836 
3837 /*
3838  * Wrapper function to transmit normal and/or priority frames over the channel.
3839  */
3840 static int
3841 vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count)
3842 {
3843 	vsw_ldc_t 		*ldcp = (vsw_ldc_t *)arg;
3844 	mblk_t			*tmp;
3845 	mblk_t			*smp;
3846 	mblk_t			*hmp;	/* high prio pkts head */
3847 	mblk_t			*hmpt;	/* high prio pkts tail */
3848 	mblk_t			*nmp;	/* normal pkts head */
3849 	mblk_t			*nmpt;	/* normal pkts tail */
3850 	uint32_t		n = 0;
3851 	vsw_t			*vswp = ldcp->ldc_vswp;
3852 
3853 	ASSERT(VSW_PRI_ETH_DEFINED(vswp));
3854 	ASSERT(count != 0);
3855 
3856 	nmp = mp;
3857 	nmpt = mpt;
3858 
3859 	/* gather any priority frames from the chain of packets */
3860 	n = vsw_get_pri_packets(vswp, &nmp, &nmpt, &hmp, &hmpt);
3861 
3862 	/* transmit priority frames */
3863 	tmp = hmp;
3864 	while (tmp != NULL) {
3865 		smp = tmp;
3866 		tmp = tmp->b_next;
3867 		smp->b_next = NULL;
3868 		vsw_ldcsend_pkt(ldcp, smp);
3869 	}
3870 
3871 	count -= n;
3872 
3873 	if (count == 0) {
3874 		/* no normal data frames to process */
3875 		return (0);
3876 	}
3877 
3878 	return (vsw_ldctx(ldcp, nmp, nmpt, count));
3879 }
3880 
3881 /*
3882  * Wrapper function to transmit normal frames over the channel.
3883  */
3884 static int
3885 vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count)
3886 {
3887 	vsw_ldc_t 	*ldcp = (vsw_ldc_t *)arg;
3888 	mblk_t		*tmp = NULL;
3889 
3890 	ASSERT(count != 0);
3891 	/*
3892 	 * If the TX thread is enabled, then queue the
3893 	 * ordinary frames and signal the tx thread.
3894 	 */
3895 	if (ldcp->tx_thread != NULL) {
3896 
3897 		mutex_enter(&ldcp->tx_thr_lock);
3898 
3899 		if ((ldcp->tx_cnt + count) >= vsw_max_tx_qcount) {
3900 			/*
3901 			 * If we reached queue limit,
3902 			 * do not queue new packets,
3903 			 * drop them.
3904 			 */
3905 			ldcp->ldc_stats.tx_qfull += count;
3906 			mutex_exit(&ldcp->tx_thr_lock);
3907 			freemsgchain(mp);
3908 			goto exit;
3909 		}
3910 		if (ldcp->tx_mhead == NULL) {
3911 			ldcp->tx_mhead = mp;
3912 			ldcp->tx_mtail = mpt;
3913 			cv_signal(&ldcp->tx_thr_cv);
3914 		} else {
3915 			ldcp->tx_mtail->b_next = mp;
3916 			ldcp->tx_mtail = mpt;
3917 		}
3918 		ldcp->tx_cnt += count;
3919 		mutex_exit(&ldcp->tx_thr_lock);
3920 	} else {
3921 		while (mp != NULL) {
3922 			tmp = mp->b_next;
3923 			mp->b_next = mp->b_prev = NULL;
3924 			(void) vsw_ldcsend(ldcp, mp, 1);
3925 			mp = tmp;
3926 		}
3927 	}
3928 
3929 exit:
3930 	return (0);
3931 }
3932 
3933 /*
3934  * This function transmits the frame in the payload of a raw data
3935  * (VIO_PKT_DATA) message. Thus, it provides an Out-Of-Band path to
3936  * send special frames with high priorities, without going through
3937  * the normal data path which uses descriptor ring mechanism.
3938  */
3939 static void
3940 vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp)
3941 {
3942 	vio_raw_data_msg_t	*pkt;
3943 	mblk_t			*bp;
3944 	mblk_t			*nmp = NULL;
3945 	vio_mblk_t		*vmp;
3946 	caddr_t			dst;
3947 	uint32_t		mblksz;
3948 	uint32_t		size;
3949 	uint32_t		nbytes;
3950 	int			rv;
3951 	vsw_t			*vswp = ldcp->ldc_vswp;
3952 	vgen_stats_t		*statsp = &ldcp->ldc_stats;
3953 
3954 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
3955 	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
3956 		(void) atomic_inc_32(&statsp->tx_pri_fail);
3957 		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
3958 		    "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
3959 		    ldcp->lane_out.lstate);
3960 		goto send_pkt_exit;
3961 	}
3962 
3963 	size = msgsize(mp);
3964 
3965 	/* frame size bigger than available payload len of raw data msg ? */
3966 	if (size > (size_t)(ldcp->msglen - VIO_PKT_DATA_HDRSIZE)) {
3967 		(void) atomic_inc_32(&statsp->tx_pri_fail);
3968 		DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__,
3969 		    ldcp->ldc_id, size);
3970 		goto send_pkt_exit;
3971 	}
3972 
3973 	if (size < ETHERMIN)
3974 		size = ETHERMIN;
3975 
3976 	/* alloc space for a raw data message */
3977 	vmp = vio_allocb(vswp->pri_tx_vmp);
3978 	if (vmp == NULL) {
3979 		(void) atomic_inc_32(&statsp->tx_pri_fail);
3980 		DWARN(vswp, "vio_allocb failed\n");
3981 		goto send_pkt_exit;
3982 	} else {
3983 		nmp = vmp->mp;
3984 	}
3985 	pkt = (vio_raw_data_msg_t *)nmp->b_rptr;
3986 
3987 	/* copy frame into the payload of raw data message */
3988 	dst = (caddr_t)pkt->data;
3989 	for (bp = mp; bp != NULL; bp = bp->b_cont) {
3990 		mblksz = MBLKL(bp);
3991 		bcopy(bp->b_rptr, dst, mblksz);
3992 		dst += mblksz;
3993 	}
3994 
3995 	vmp->state = VIO_MBLK_HAS_DATA;
3996 
3997 	/* setup the raw data msg */
3998 	pkt->tag.vio_msgtype = VIO_TYPE_DATA;
3999 	pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
4000 	pkt->tag.vio_subtype_env = VIO_PKT_DATA;
4001 	pkt->tag.vio_sid = ldcp->local_session;
4002 	nbytes = VIO_PKT_DATA_HDRSIZE + size;
4003 
4004 	/* send the msg over ldc */
4005 	rv = vsw_send_msg(ldcp, (void *)pkt, nbytes, B_TRUE);
4006 	if (rv != 0) {
4007 		(void) atomic_inc_32(&statsp->tx_pri_fail);
4008 		DWARN(vswp, "%s(%lld) Error sending priority frame\n", __func__,
4009 		    ldcp->ldc_id);
4010 		goto send_pkt_exit;
4011 	}
4012 
4013 	/* update stats */
4014 	(void) atomic_inc_64(&statsp->tx_pri_packets);
4015 	(void) atomic_add_64(&statsp->tx_pri_packets, size);
4016 
4017 send_pkt_exit:
4018 	if (nmp != NULL)
4019 		freemsg(nmp);
4020 	freemsg(mp);
4021 }
4022 
4023 /*
4024  * Transmit the packet over the given LDC channel.
4025  *
4026  * The 'retries' argument indicates how many times a packet
4027  * is retried before it is dropped. Note, the retry is done
4028  * only for a resource related failure, for all other failures
4029  * the packet is dropped immediately.
4030  */
4031 static int
4032 vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries)
4033 {
4034 	int		i;
4035 	int		rc;
4036 	int		status = 0;
4037 	vsw_port_t	*port = ldcp->ldc_port;
4038 	dring_info_t	*dp = NULL;
4039 	lane_t		*lp = &ldcp->lane_out;
4040 
4041 	for (i = 0; i < retries; ) {
4042 		/*
4043 		 * Send the message out using the appropriate
4044 		 * transmit function which will free mblock when it
4045 		 * is finished with it.
4046 		 */
4047 		mutex_enter(&port->tx_lock);
4048 		if (port->transmit != NULL) {
4049 			status = (*port->transmit)(ldcp, mp);
4050 		}
4051 		if (status == LDC_TX_SUCCESS) {
4052 			mutex_exit(&port->tx_lock);
4053 			break;
4054 		}
4055 		i++;	/* increment the counter here */
4056 
4057 		/* If its the last retry, then update the oerror */
4058 		if ((i == retries) && (status == LDC_TX_NORESOURCES)) {
4059 			ldcp->ldc_stats.oerrors++;
4060 		}
4061 		mutex_exit(&port->tx_lock);
4062 
4063 		if (status != LDC_TX_NORESOURCES) {
4064 			/*
4065 			 * No retrying required for errors un-related
4066 			 * to resources.
4067 			 */
4068 			break;
4069 		}
4070 		if (((dp = ldcp->lane_out.dringp) != NULL) &&
4071 		    ((VSW_VER_GTEQ(ldcp, 1, 2) &&
4072 		    (ldcp->lane_out.xfer_mode & VIO_DRING_MODE_V1_2)) ||
4073 		    ((VSW_VER_LT(ldcp, 1, 2) &&
4074 		    (ldcp->lane_out.xfer_mode == VIO_DRING_MODE_V1_0))))) {
4075 
4076 			/* Need to reclaim in TxDring mode. */
4077 			if (lp->dring_mode == VIO_TX_DRING) {
4078 				rc = vsw_reclaim_dring(dp, dp->end_idx);
4079 			}
4080 
4081 		} else {
4082 			/*
4083 			 * If there is no dring or the xfer_mode is
4084 			 * set to DESC_MODE(ie., OBP), then simply break here.
4085 			 */
4086 			break;
4087 		}
4088 
4089 		/*
4090 		 * Delay only if none were reclaimed
4091 		 * and its not the last retry.
4092 		 */
4093 		if ((rc == 0) && (i < retries)) {
4094 			delay(drv_usectohz(vsw_ldc_tx_delay));
4095 		}
4096 	}
4097 	freemsg(mp);
4098 	return (status);
4099 }
4100 
4101 /*
4102  * Send an in-band descriptor message over ldc.
4103  */
4104 static int
4105 vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp)
4106 {
4107 	vsw_t			*vswp = ldcp->ldc_vswp;
4108 	vnet_ibnd_desc_t	ibnd_msg;
4109 	vsw_private_desc_t	*priv_desc = NULL;
4110 	dring_info_t		*dp = NULL;
4111 	size_t			n, size = 0;
4112 	caddr_t			bufp;
4113 	mblk_t			*bp;
4114 	int			idx, i;
4115 	int			status = LDC_TX_SUCCESS;
4116 	static int		warn_msg = 1;
4117 	lane_t			*lp = &ldcp->lane_out;
4118 
4119 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4120 
4121 	ASSERT(mp != NULL);
4122 
4123 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
4124 	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
4125 		DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt",
4126 		    __func__, ldcp->ldc_id, ldcp->ldc_status,
4127 		    ldcp->lane_out.lstate);
4128 		ldcp->ldc_stats.oerrors++;
4129 		return (LDC_TX_FAILURE);
4130 	}
4131 
4132 	/*
4133 	 * The dring here is as an internal buffer,
4134 	 * rather than a transfer channel.
4135 	 */
4136 	if ((dp = ldcp->lane_out.dringp) == NULL) {
4137 		DERR(vswp, "%s(%lld): no dring for outbound lane",
4138 		    __func__, ldcp->ldc_id);
4139 		DERR(vswp, "%s(%lld) status(%d) state (0x%llx)", __func__,
4140 		    ldcp->ldc_id, ldcp->ldc_status, ldcp->lane_out.lstate);
4141 		ldcp->ldc_stats.oerrors++;
4142 		return (LDC_TX_FAILURE);
4143 	}
4144 
4145 	size = msgsize(mp);
4146 	if (size > (size_t)lp->mtu) {
4147 		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
4148 		    ldcp->ldc_id, size);
4149 		ldcp->ldc_stats.oerrors++;
4150 		return (LDC_TX_FAILURE);
4151 	}
4152 
4153 	/*
4154 	 * Find a free descriptor in our buffer ring
4155 	 */
4156 	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
4157 		if (warn_msg) {
4158 			DERR(vswp, "%s(%lld): no descriptor available for ring "
4159 			    "at 0x%llx", __func__, ldcp->ldc_id, dp);
4160 			warn_msg = 0;
4161 		}
4162 
4163 		/* nothing more we can do */
4164 		status = LDC_TX_NORESOURCES;
4165 		goto vsw_descrsend_free_exit;
4166 	} else {
4167 		D2(vswp, "%s(%lld): free private descriptor found at pos "
4168 		    "%ld addr 0x%x\n", __func__, ldcp->ldc_id, idx, priv_desc);
4169 		warn_msg = 1;
4170 	}
4171 
4172 	/* copy data into the descriptor */
4173 	bufp = priv_desc->datap;
4174 	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
4175 		n = MBLKL(bp);
4176 		bcopy(bp->b_rptr, bufp, n);
4177 		bufp += n;
4178 	}
4179 
4180 	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
4181 
4182 	/* create and send the in-band descp msg */
4183 	ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA;
4184 	ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO;
4185 	ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA;
4186 	ibnd_msg.hdr.tag.vio_sid = ldcp->local_session;
4187 
4188 	/*
4189 	 * Copy the mem cookies describing the data from the
4190 	 * private region of the descriptor ring into the inband
4191 	 * descriptor.
4192 	 */
4193 	for (i = 0; i < priv_desc->ncookies; i++) {
4194 		bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i],
4195 		    sizeof (ldc_mem_cookie_t));
4196 	}
4197 
4198 	ibnd_msg.hdr.desc_handle = idx;
4199 	ibnd_msg.ncookies = priv_desc->ncookies;
4200 	ibnd_msg.nbytes = size;
4201 
4202 	ldcp->ldc_stats.opackets++;
4203 	ldcp->ldc_stats.obytes += size;
4204 
4205 	(void) vsw_send_msg(ldcp, (void *)&ibnd_msg,
4206 	    sizeof (vnet_ibnd_desc_t), B_TRUE);
4207 
4208 vsw_descrsend_free_exit:
4209 
4210 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4211 	return (status);
4212 }
4213 
4214 static void
4215 vsw_send_ver(void *arg)
4216 {
4217 	vsw_ldc_t	*ldcp = (vsw_ldc_t *)arg;
4218 	vsw_t		*vswp = ldcp->ldc_vswp;
4219 	lane_t		*lp = &ldcp->lane_out;
4220 	vio_ver_msg_t	ver_msg;
4221 
4222 	D1(vswp, "%s enter", __func__);
4223 
4224 	ver_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4225 	ver_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4226 	ver_msg.tag.vio_subtype_env = VIO_VER_INFO;
4227 	ver_msg.tag.vio_sid = ldcp->local_session;
4228 
4229 	if (vsw_obp_ver_proto_workaround == B_FALSE) {
4230 		ver_msg.ver_major = vsw_versions[0].ver_major;
4231 		ver_msg.ver_minor = vsw_versions[0].ver_minor;
4232 	} else {
4233 		/* use the major,minor that we've ack'd */
4234 		lane_t	*lpi = &ldcp->lane_in;
4235 		ver_msg.ver_major = lpi->ver_major;
4236 		ver_msg.ver_minor = lpi->ver_minor;
4237 	}
4238 	ver_msg.dev_class = VDEV_NETWORK_SWITCH;
4239 
4240 	lp->lstate |= VSW_VER_INFO_SENT;
4241 	lp->ver_major = ver_msg.ver_major;
4242 	lp->ver_minor = ver_msg.ver_minor;
4243 
4244 	DUMP_TAG(ver_msg.tag);
4245 
4246 	(void) vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t), B_TRUE);
4247 
4248 	D1(vswp, "%s (%d): exit", __func__, ldcp->ldc_id);
4249 }
4250 
4251 static void
4252 vsw_send_attr(vsw_ldc_t *ldcp)
4253 {
4254 	vsw_t			*vswp = ldcp->ldc_vswp;
4255 	lane_t			*lp = &ldcp->lane_out;
4256 	vnet_attr_msg_t		attr_msg;
4257 
4258 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
4259 
4260 	/*
4261 	 * Subtype is set to INFO by default
4262 	 */
4263 	attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4264 	attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4265 	attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO;
4266 	attr_msg.tag.vio_sid = ldcp->local_session;
4267 
4268 	/* payload copied from default settings for lane */
4269 	attr_msg.mtu = lp->mtu;
4270 	attr_msg.addr_type = lp->addr_type;
4271 	attr_msg.xfer_mode = lp->xfer_mode;
4272 	attr_msg.ack_freq = lp->xfer_mode;
4273 	attr_msg.options = lp->dring_mode;
4274 
4275 	READ_ENTER(&vswp->if_lockrw);
4276 	attr_msg.addr = vnet_macaddr_strtoul((vswp->if_addr).ether_addr_octet);
4277 	RW_EXIT(&vswp->if_lockrw);
4278 
4279 	ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT;
4280 
4281 	DUMP_TAG(attr_msg.tag);
4282 
4283 	(void) vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t), B_TRUE);
4284 
4285 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
4286 }
4287 
4288 static void
4289 vsw_send_dring_info(vsw_ldc_t *ldcp)
4290 {
4291 	int		msgsize;
4292 	void		*msg;
4293 	vsw_t		*vswp = ldcp->ldc_vswp;
4294 	vsw_port_t	*port = ldcp->ldc_port;
4295 	lane_t		*lp = &ldcp->lane_out;
4296 	vgen_stats_t	*statsp = &ldcp->ldc_stats;
4297 
4298 	D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id);
4299 
4300 	/* dring mode has been negotiated in attr phase; save in stats */
4301 	statsp->dring_mode = lp->dring_mode;
4302 
4303 	if (lp->dring_mode == VIO_RX_DRING_DATA) {
4304 		/*
4305 		 * Change the transmit routine for RxDringData mode.
4306 		 */
4307 		port->transmit = vsw_dringsend_shm;
4308 		msg = (void *) vsw_create_rx_dring_info(ldcp);
4309 		if (msg == NULL) {
4310 			return;
4311 		}
4312 		msgsize =
4313 		    VNET_DRING_REG_EXT_MSG_SIZE(lp->dringp->data_ncookies);
4314 		ldcp->rcv_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
4315 		    vsw_ldc_rcv_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
4316 		ldcp->rx_dringdata = vsw_process_dringdata_shm;
4317 	} else {
4318 		msg = (void *) vsw_create_tx_dring_info(ldcp);
4319 		if (msg == NULL) {
4320 			return;
4321 		}
4322 		msgsize = sizeof (vio_dring_reg_msg_t);
4323 		ldcp->msg_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
4324 		    vsw_ldc_msg_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
4325 		ldcp->rx_dringdata = vsw_process_dringdata;
4326 	}
4327 
4328 	lp->lstate |= VSW_DRING_INFO_SENT;
4329 	DUMP_TAG_PTR((vio_msg_tag_t *)msg);
4330 	(void) vsw_send_msg(ldcp, msg, msgsize, B_TRUE);
4331 	kmem_free(msg, msgsize);
4332 
4333 	D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id);
4334 }
4335 
4336 static void
4337 vsw_send_rdx(vsw_ldc_t *ldcp)
4338 {
4339 	vsw_t		*vswp = ldcp->ldc_vswp;
4340 	vio_rdx_msg_t	rdx_msg;
4341 
4342 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
4343 
4344 	rdx_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4345 	rdx_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4346 	rdx_msg.tag.vio_subtype_env = VIO_RDX;
4347 	rdx_msg.tag.vio_sid = ldcp->local_session;
4348 
4349 	ldcp->lane_in.lstate |= VSW_RDX_INFO_SENT;
4350 
4351 	DUMP_TAG(rdx_msg.tag);
4352 
4353 	(void) vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t), B_TRUE);
4354 
4355 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
4356 }
4357 
4358 /*
4359  * Remove the specified address from the list of address maintained
4360  * in this port node.
4361  */
4362 mcst_addr_t *
4363 vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr)
4364 {
4365 	vsw_t		*vswp = NULL;
4366 	vsw_port_t	*port = NULL;
4367 	mcst_addr_t	*prev_p = NULL;
4368 	mcst_addr_t	*curr_p = NULL;
4369 
4370 	D1(NULL, "%s: enter : devtype %d : addr 0x%llx",
4371 	    __func__, devtype, addr);
4372 
4373 	if (devtype == VSW_VNETPORT) {
4374 		port = (vsw_port_t *)arg;
4375 		mutex_enter(&port->mca_lock);
4376 		prev_p = curr_p = port->mcap;
4377 	} else {
4378 		vswp = (vsw_t *)arg;
4379 		mutex_enter(&vswp->mca_lock);
4380 		prev_p = curr_p = vswp->mcap;
4381 	}
4382 
4383 	while (curr_p != NULL) {
4384 		if (curr_p->addr == addr) {
4385 			D2(NULL, "%s: address found", __func__);
4386 			/* match found */
4387 			if (prev_p == curr_p) {
4388 				/* list head */
4389 				if (devtype == VSW_VNETPORT)
4390 					port->mcap = curr_p->nextp;
4391 				else
4392 					vswp->mcap = curr_p->nextp;
4393 			} else {
4394 				prev_p->nextp = curr_p->nextp;
4395 			}
4396 			break;
4397 		} else {
4398 			prev_p = curr_p;
4399 			curr_p = curr_p->nextp;
4400 		}
4401 	}
4402 
4403 	if (devtype == VSW_VNETPORT)
4404 		mutex_exit(&port->mca_lock);
4405 	else
4406 		mutex_exit(&vswp->mca_lock);
4407 
4408 	D1(NULL, "%s: exit", __func__);
4409 
4410 	return (curr_p);
4411 }
4412 
4413 /*
4414  * Create a ring consisting of just a private portion and link
4415  * it into the list of rings for the outbound lane.
4416  *
4417  * These type of rings are used primarily for temporary data
4418  * storage (i.e. as data buffers).
4419  */
4420 void
4421 vsw_create_privring(vsw_ldc_t *ldcp)
4422 {
4423 	dring_info_t		*dp;
4424 	vsw_t			*vswp = ldcp->ldc_vswp;
4425 
4426 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4427 
4428 	dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
4429 	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
4430 	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
4431 	ldcp->lane_out.dringp = dp;
4432 
4433 	/* no public section */
4434 	dp->pub_addr = NULL;
4435 	dp->priv_addr = kmem_zalloc(
4436 	    (sizeof (vsw_private_desc_t) * vsw_num_descriptors), KM_SLEEP);
4437 	dp->num_descriptors = vsw_num_descriptors;
4438 
4439 	if (vsw_setup_tx_dring(ldcp, dp)) {
4440 		DERR(vswp, "%s: setup of ring failed", __func__);
4441 		vsw_destroy_tx_dring(ldcp);
4442 		return;
4443 	}
4444 
4445 	/* haven't used any descriptors yet */
4446 	dp->end_idx = 0;
4447 	dp->restart_reqd = B_TRUE;
4448 
4449 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4450 }
4451 
4452 /*
4453  * Set the default lane attributes. These are copied into
4454  * the attr msg we send to our peer. If they are not acceptable
4455  * then (currently) the handshake ends.
4456  */
4457 static void
4458 vsw_set_lane_attr(vsw_t *vswp, lane_t *lp)
4459 {
4460 	bzero(lp, sizeof (lane_t));
4461 
4462 	READ_ENTER(&vswp->if_lockrw);
4463 	ether_copy(&(vswp->if_addr), &(lp->addr));
4464 	RW_EXIT(&vswp->if_lockrw);
4465 
4466 	lp->mtu = vswp->max_frame_size;
4467 	lp->addr_type = ADDR_TYPE_MAC;
4468 	lp->xfer_mode = VIO_DRING_MODE_V1_0;
4469 	lp->ack_freq = 0;	/* for shared mode */
4470 	lp->seq_num = VNET_ISS;
4471 }
4472 
4473 /*
4474  * Map the descriptor ring exported by the peer.
4475  */
4476 static dring_info_t *
4477 vsw_map_dring(vsw_ldc_t *ldcp, void *pkt)
4478 {
4479 	dring_info_t	*dp = NULL;
4480 	lane_t		*lp = &ldcp->lane_out;
4481 
4482 	if (lp->dring_mode == VIO_RX_DRING_DATA) {
4483 		/*
4484 		 * In RxDringData mode, dring that we map in
4485 		 * becomes our transmit descriptor ring.
4486 		 */
4487 		dp =  vsw_map_tx_dring(ldcp, pkt);
4488 	} else {
4489 		/*
4490 		 * In TxDring mode, dring that we map in
4491 		 * becomes our receive descriptor ring.
4492 		 */
4493 		dp =  vsw_map_rx_dring(ldcp, pkt);
4494 	}
4495 	return (dp);
4496 }
4497 
4498 /*
4499  * Common dring mapping function used in both TxDring and RxDringData modes.
4500  */
4501 dring_info_t *
4502 vsw_map_dring_cmn(vsw_ldc_t *ldcp, vio_dring_reg_msg_t *dring_pkt)
4503 {
4504 	int		rv;
4505 	dring_info_t	*dp;
4506 	ldc_mem_info_t	minfo;
4507 	vsw_t		*vswp = ldcp->ldc_vswp;
4508 
4509 	/*
4510 	 * If the dring params are unacceptable then we NACK back.
4511 	 */
4512 	if ((dring_pkt->num_descriptors == 0) ||
4513 	    (dring_pkt->descriptor_size == 0) ||
4514 	    (dring_pkt->ncookies != 1)) {
4515 		DERR(vswp, "%s (%lld): invalid dring info",
4516 		    __func__, ldcp->ldc_id);
4517 		return (NULL);
4518 	}
4519 
4520 	dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
4521 
4522 	dp->num_descriptors = dring_pkt->num_descriptors;
4523 	dp->descriptor_size = dring_pkt->descriptor_size;
4524 	dp->options = dring_pkt->options;
4525 	dp->dring_ncookies = dring_pkt->ncookies;
4526 
4527 	/*
4528 	 * Note: should only get one cookie. Enforced in
4529 	 * the ldc layer.
4530 	 */
4531 	bcopy(&dring_pkt->cookie[0], &dp->dring_cookie[0],
4532 	    sizeof (ldc_mem_cookie_t));
4533 
4534 	rv = ldc_mem_dring_map(ldcp->ldc_handle, &dp->dring_cookie[0],
4535 	    dp->dring_ncookies, dp->num_descriptors, dp->descriptor_size,
4536 	    LDC_DIRECT_MAP, &(dp->dring_handle));
4537 	if (rv != 0) {
4538 		goto fail;
4539 	}
4540 
4541 	rv = ldc_mem_dring_info(dp->dring_handle, &minfo);
4542 	if (rv != 0) {
4543 		goto fail;
4544 	}
4545 	/* store the address of the ring */
4546 	dp->pub_addr = minfo.vaddr;
4547 
4548 	/* cache the dring mtype */
4549 	dp->dring_mtype = minfo.mtype;
4550 
4551 	/* no private section as we are importing */
4552 	dp->priv_addr = NULL;
4553 
4554 	/*
4555 	 * Using simple mono increasing int for ident at the moment.
4556 	 */
4557 	dp->ident = ldcp->next_ident;
4558 	ldcp->next_ident++;
4559 
4560 	/*
4561 	 * Acknowledge it; we send back a unique dring identifier that
4562 	 * the sending side will use in future to refer to this
4563 	 * descriptor ring.
4564 	 */
4565 	dring_pkt->dring_ident = dp->ident;
4566 
4567 	return (dp);
4568 fail:
4569 	if (dp->dring_handle != NULL) {
4570 		(void) ldc_mem_dring_unmap(dp->dring_handle);
4571 	}
4572 	kmem_free(dp, sizeof (*dp));
4573 	return (NULL);
4574 }
4575 
4576 /*
4577  * Unmap the descriptor ring exported by the peer.
4578  */
4579 static void
4580 vsw_unmap_dring(vsw_ldc_t *ldcp)
4581 {
4582 	lane_t	*lane_out = &ldcp->lane_out;
4583 
4584 	if (lane_out->dring_mode == VIO_RX_DRING_DATA) {
4585 		vsw_unmap_tx_dring(ldcp);
4586 	} else {
4587 		vsw_unmap_rx_dring(ldcp);
4588 	}
4589 }
4590 
4591 /*
4592  * Map the shared memory data buffer area exported by the peer.
4593  * Used in RxDringData mode only.
4594  */
4595 static int
4596 vsw_map_data(vsw_ldc_t *ldcp, dring_info_t *dp, void *pkt)
4597 {
4598 	int			rv;
4599 	vio_dring_reg_ext_msg_t	*emsg;
4600 	vio_dring_reg_msg_t	*msg = pkt;
4601 	uint8_t			*buf = (uint8_t *)msg->cookie;
4602 	vsw_t			*vswp = ldcp->ldc_vswp;
4603 	ldc_mem_info_t		minfo;
4604 
4605 	/* skip over dring cookies */
4606 	ASSERT(msg->ncookies == 1);
4607 	buf += (msg->ncookies * sizeof (ldc_mem_cookie_t));
4608 
4609 	emsg = (vio_dring_reg_ext_msg_t *)buf;
4610 	if (emsg->data_ncookies > VNET_DATA_AREA_COOKIES) {
4611 		return (1);
4612 	}
4613 
4614 	/* save # of data area cookies */
4615 	dp->data_ncookies = emsg->data_ncookies;
4616 
4617 	/* save data area size */
4618 	dp->data_sz = emsg->data_area_size;
4619 
4620 	/* allocate ldc mem handle for data area */
4621 	rv = ldc_mem_alloc_handle(ldcp->ldc_handle, &dp->data_handle);
4622 	if (rv != 0) {
4623 		cmn_err(CE_WARN, "ldc_mem_alloc_handle failed\n");
4624 		DWARN(vswp, "%s (%lld) ldc_mem_alloc_handle() failed: %d\n",
4625 		    __func__, ldcp->ldc_id, rv);
4626 		return (1);
4627 	}
4628 
4629 	/* map the data area */
4630 	rv = ldc_mem_map(dp->data_handle, emsg->data_cookie,
4631 	    emsg->data_ncookies, LDC_DIRECT_MAP, LDC_MEM_R,
4632 	    (caddr_t *)&dp->data_addr, NULL);
4633 	if (rv != 0) {
4634 		cmn_err(CE_WARN, "ldc_mem_map failed\n");
4635 		DWARN(vswp, "%s (%lld) ldc_mem_map() failed: %d\n",
4636 		    __func__, ldcp->ldc_id, rv);
4637 		return (1);
4638 	}
4639 
4640 	/* get the map info */
4641 	rv = ldc_mem_info(dp->data_handle, &minfo);
4642 	if (rv != 0) {
4643 		cmn_err(CE_WARN, "ldc_mem_info failed\n");
4644 		DWARN(vswp, "%s (%lld) ldc_mem_info() failed: %d\n",
4645 		    __func__, ldcp->ldc_id, rv);
4646 		return (1);
4647 	}
4648 
4649 	if (minfo.mtype != LDC_DIRECT_MAP) {
4650 		DWARN(vswp, "%s (%lld) mtype(%d) is not direct map\n",
4651 		    __func__, ldcp->ldc_id, minfo.mtype);
4652 		return (1);
4653 	}
4654 
4655 	/* allocate memory for data area cookies */
4656 	dp->data_cookie = kmem_zalloc(emsg->data_ncookies *
4657 	    sizeof (ldc_mem_cookie_t), KM_SLEEP);
4658 
4659 	/* save data area cookies */
4660 	bcopy(emsg->data_cookie, dp->data_cookie,
4661 	    emsg->data_ncookies * sizeof (ldc_mem_cookie_t));
4662 
4663 	return (0);
4664 }
4665 
4666 /*
4667  * Reset and free all the resources associated with the channel.
4668  */
4669 static void
4670 vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir)
4671 {
4672 	lane_t	*lp;
4673 
4674 	D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id);
4675 
4676 	if (dir == INBOUND) {
4677 		D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane"
4678 		    " of channel %lld", __func__, ldcp->ldc_id);
4679 		lp = &ldcp->lane_in;
4680 	} else {
4681 		D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane"
4682 		    " of channel %lld", __func__, ldcp->ldc_id);
4683 		lp = &ldcp->lane_out;
4684 	}
4685 
4686 	lp->lstate = VSW_LANE_INACTIV;
4687 	lp->seq_num = VNET_ISS;
4688 
4689 	if (dir == INBOUND) {
4690 		/* Unmap the remote dring which is imported from the peer */
4691 		vsw_unmap_dring(ldcp);
4692 	} else {
4693 		/* Destroy the local dring which is exported to the peer */
4694 		vsw_destroy_dring(ldcp);
4695 	}
4696 
4697 	D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id);
4698 }
4699 
4700 /*
4701  * Destroy the descriptor ring.
4702  */
4703 static void
4704 vsw_destroy_dring(vsw_ldc_t *ldcp)
4705 {
4706 	lane_t	*lp = &ldcp->lane_out;
4707 
4708 	if (lp->dring_mode == VIO_RX_DRING_DATA) {
4709 		vsw_destroy_rx_dring(ldcp);
4710 	} else {
4711 		vsw_destroy_tx_dring(ldcp);
4712 	}
4713 }
4714 
4715 /*
4716  * vsw_ldc_tx_worker -- A per LDC worker thread to transmit data.
4717  * This thread is woken up by the vsw_portsend to transmit
4718  * packets.
4719  */
4720 static void
4721 vsw_ldc_tx_worker(void *arg)
4722 {
4723 	callb_cpr_t	cprinfo;
4724 	vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
4725 	vsw_t *vswp = ldcp->ldc_vswp;
4726 	mblk_t *mp;
4727 	mblk_t *tmp;
4728 
4729 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
4730 	CALLB_CPR_INIT(&cprinfo, &ldcp->tx_thr_lock, callb_generic_cpr,
4731 	    "vnet_tx_thread");
4732 	mutex_enter(&ldcp->tx_thr_lock);
4733 	while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP)) {
4734 
4735 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
4736 		/*
4737 		 * Wait until the data is received or a stop
4738 		 * request is received.
4739 		 */
4740 		while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP) &&
4741 		    (ldcp->tx_mhead == NULL)) {
4742 			cv_wait(&ldcp->tx_thr_cv, &ldcp->tx_thr_lock);
4743 		}
4744 		CALLB_CPR_SAFE_END(&cprinfo, &ldcp->tx_thr_lock)
4745 
4746 		/*
4747 		 * First process the stop request.
4748 		 */
4749 		if (ldcp->tx_thr_flags & VSW_WTHR_STOP) {
4750 			D2(vswp, "%s(%lld):tx thread stopped\n",
4751 			    __func__, ldcp->ldc_id);
4752 			break;
4753 		}
4754 		mp = ldcp->tx_mhead;
4755 		ldcp->tx_mhead = ldcp->tx_mtail = NULL;
4756 		ldcp->tx_cnt = 0;
4757 		mutex_exit(&ldcp->tx_thr_lock);
4758 		D2(vswp, "%s(%lld):calling vsw_ldcsend\n",
4759 		    __func__, ldcp->ldc_id);
4760 		while (mp != NULL) {
4761 			tmp = mp->b_next;
4762 			mp->b_next = mp->b_prev = NULL;
4763 			(void) vsw_ldcsend(ldcp, mp, vsw_ldc_tx_retries);
4764 			mp = tmp;
4765 		}
4766 		mutex_enter(&ldcp->tx_thr_lock);
4767 	}
4768 
4769 	/*
4770 	 * Update the run status and wakeup the thread that
4771 	 * has sent the stop request.
4772 	 */
4773 	ldcp->tx_thr_flags &= ~VSW_WTHR_STOP;
4774 	ldcp->tx_thread = NULL;
4775 	CALLB_CPR_EXIT(&cprinfo);
4776 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
4777 	thread_exit();
4778 }
4779 
4780 /* vsw_stop_tx_thread -- Co-ordinate with receive thread to stop it */
4781 static void
4782 vsw_stop_tx_thread(vsw_ldc_t *ldcp)
4783 {
4784 	kt_did_t	tid = 0;
4785 	vsw_t		*vswp = ldcp->ldc_vswp;
4786 
4787 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
4788 	/*
4789 	 * Send a stop request by setting the stop flag and
4790 	 * wait until the receive thread stops.
4791 	 */
4792 	mutex_enter(&ldcp->tx_thr_lock);
4793 	if (ldcp->tx_thread != NULL) {
4794 		tid = ldcp->tx_thread->t_did;
4795 		ldcp->tx_thr_flags |= VSW_WTHR_STOP;
4796 		cv_signal(&ldcp->tx_thr_cv);
4797 	}
4798 	mutex_exit(&ldcp->tx_thr_lock);
4799 
4800 	if (tid != 0) {
4801 		thread_join(tid);
4802 	}
4803 
4804 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
4805 }
4806 
4807 static int
4808 vsw_mapin_avail(vsw_ldc_t *ldcp)
4809 {
4810 	int		rv;
4811 	ldc_info_t	info;
4812 	uint64_t	mapin_sz_req;
4813 	uint64_t	dblk_sz;
4814 	vsw_t		*vswp = ldcp->ldc_vswp;
4815 
4816 	rv = ldc_info(ldcp->ldc_handle, &info);
4817 	if (rv != 0) {
4818 		return (B_FALSE);
4819 	}
4820 
4821 	dblk_sz = RXDRING_DBLK_SZ(vswp->max_frame_size);
4822 	mapin_sz_req = (VSW_RXDRING_NRBUFS * dblk_sz);
4823 
4824 	if (info.direct_map_size_max >= mapin_sz_req) {
4825 		return (B_TRUE);
4826 	}
4827 
4828 	return (B_FALSE);
4829 }
4830 
4831 /*
4832  * Debugging routines
4833  */
4834 static void
4835 display_state(void)
4836 {
4837 	vsw_t		*vswp;
4838 	vsw_port_list_t	*plist;
4839 	vsw_port_t 	*port;
4840 	vsw_ldc_t 	*ldcp;
4841 	extern vsw_t 	*vsw_head;
4842 
4843 	cmn_err(CE_NOTE, "***** system state *****");
4844 
4845 	for (vswp = vsw_head; vswp; vswp = vswp->next) {
4846 		plist = &vswp->plist;
4847 		READ_ENTER(&plist->lockrw);
4848 		cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n",
4849 		    vswp->instance, plist->num_ports);
4850 
4851 		for (port = plist->head; port != NULL; port = port->p_next) {
4852 			cmn_err(CE_CONT, "port %d : %d ldcs attached\n",
4853 			    port->p_instance, port->num_ldcs);
4854 			ldcp = port->ldcp;
4855 			cmn_err(CE_CONT, "chan %lu : dev %d : "
4856 			    "status %d : phase %u\n",
4857 			    ldcp->ldc_id, ldcp->dev_class,
4858 			    ldcp->ldc_status, ldcp->hphase);
4859 			cmn_err(CE_CONT, "chan %lu : lsession %lu : "
4860 			    "psession %lu\n", ldcp->ldc_id,
4861 			    ldcp->local_session, ldcp->peer_session);
4862 
4863 			cmn_err(CE_CONT, "Inbound lane:\n");
4864 			display_lane(&ldcp->lane_in);
4865 			cmn_err(CE_CONT, "Outbound lane:\n");
4866 			display_lane(&ldcp->lane_out);
4867 		}
4868 		RW_EXIT(&plist->lockrw);
4869 	}
4870 	cmn_err(CE_NOTE, "***** system state *****");
4871 }
4872 
4873 static void
4874 display_lane(lane_t *lp)
4875 {
4876 	dring_info_t	*drp = lp->dringp;
4877 
4878 	cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n",
4879 	    lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu);
4880 	cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n",
4881 	    lp->addr_type, lp->addr, lp->xfer_mode);
4882 	cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp);
4883 
4884 	cmn_err(CE_CONT, "Dring info:\n");
4885 	cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n",
4886 	    drp->num_descriptors, drp->descriptor_size);
4887 	cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->dring_handle);
4888 	cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n",
4889 	    (uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr);
4890 	cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n",
4891 	    drp->ident, drp->end_idx);
4892 	display_ring(drp);
4893 }
4894 
4895 static void
4896 display_ring(dring_info_t *dringp)
4897 {
4898 	uint64_t		i;
4899 	uint64_t		priv_count = 0;
4900 	uint64_t		pub_count = 0;
4901 	vnet_public_desc_t	*pub_addr = NULL;
4902 	vsw_private_desc_t	*priv_addr = NULL;
4903 
4904 	for (i = 0; i < vsw_num_descriptors; i++) {
4905 		if (dringp->pub_addr != NULL) {
4906 			pub_addr = (vnet_public_desc_t *)dringp->pub_addr + i;
4907 
4908 			if (pub_addr->hdr.dstate == VIO_DESC_FREE)
4909 				pub_count++;
4910 		}
4911 
4912 		if (dringp->priv_addr != NULL) {
4913 			priv_addr = (vsw_private_desc_t *)dringp->priv_addr + i;
4914 
4915 			if (priv_addr->dstate == VIO_DESC_FREE)
4916 				priv_count++;
4917 		}
4918 	}
4919 	cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n",
4920 	    i, priv_count, pub_count);
4921 }
4922 
4923 static void
4924 dump_flags(uint64_t state)
4925 {
4926 	int	i;
4927 
4928 	typedef struct flag_name {
4929 		int	flag_val;
4930 		char	*flag_name;
4931 	} flag_name_t;
4932 
4933 	flag_name_t	flags[] = {
4934 		VSW_VER_INFO_SENT, "VSW_VER_INFO_SENT",
4935 		VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV",
4936 		VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV",
4937 		VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT",
4938 		VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV",
4939 		VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT",
4940 		VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT",
4941 		VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV",
4942 		VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT",
4943 		VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV",
4944 		VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT",
4945 		VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV",
4946 		VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT",
4947 		VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV",
4948 		VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT",
4949 		VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV",
4950 		VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT",
4951 		VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV",
4952 		VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT",
4953 		VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV",
4954 		VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT",
4955 		VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV",
4956 		VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT",
4957 		VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV",
4958 		VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT",
4959 		VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV",
4960 		VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT",
4961 		VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV",
4962 		VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT",
4963 		VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV",
4964 		VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE"};
4965 
4966 	DERR(NULL, "DUMP_FLAGS: %llx\n", state);
4967 	for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) {
4968 		if (state & flags[i].flag_val)
4969 			DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name);
4970 	}
4971 }
4972