xref: /illumos-gate/usr/src/uts/sun4v/io/vsw_ldc.c (revision 17a2b317610f531d565bf4e940433aab2d9e6985)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/errno.h>
28 #include <sys/debug.h>
29 #include <sys/time.h>
30 #include <sys/sysmacros.h>
31 #include <sys/systm.h>
32 #include <sys/user.h>
33 #include <sys/stropts.h>
34 #include <sys/stream.h>
35 #include <sys/strlog.h>
36 #include <sys/strsubr.h>
37 #include <sys/cmn_err.h>
38 #include <sys/cpu.h>
39 #include <sys/kmem.h>
40 #include <sys/conf.h>
41 #include <sys/ddi.h>
42 #include <sys/sunddi.h>
43 #include <sys/ksynch.h>
44 #include <sys/stat.h>
45 #include <sys/kstat.h>
46 #include <sys/vtrace.h>
47 #include <sys/strsun.h>
48 #include <sys/dlpi.h>
49 #include <sys/ethernet.h>
50 #include <net/if.h>
51 #include <sys/varargs.h>
52 #include <sys/machsystm.h>
53 #include <sys/modctl.h>
54 #include <sys/modhash.h>
55 #include <sys/mac.h>
56 #include <sys/mac_ether.h>
57 #include <sys/taskq.h>
58 #include <sys/note.h>
59 #include <sys/mach_descrip.h>
60 #include <sys/mdeg.h>
61 #include <sys/ldc.h>
62 #include <sys/vsw_fdb.h>
63 #include <sys/vsw.h>
64 #include <sys/vio_mailbox.h>
65 #include <sys/vnet_mailbox.h>
66 #include <sys/vnet_common.h>
67 #include <sys/vio_util.h>
68 #include <sys/sdt.h>
69 #include <sys/atomic.h>
70 #include <sys/callb.h>
71 #include <sys/vlan.h>
72 
73 /* Port add/deletion/etc routines */
74 static	void vsw_port_delete(vsw_port_t *port);
75 static	int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id);
76 static	void vsw_ldc_detach(vsw_ldc_t *ldcp);
77 static	int vsw_ldc_init(vsw_ldc_t *ldcp);
78 static	void vsw_ldc_uninit(vsw_ldc_t *ldcp);
79 static	void vsw_ldc_drain(vsw_ldc_t *ldcp);
80 static	void vsw_drain_port_taskq(vsw_port_t *port);
81 static	void vsw_marker_task(void *);
82 static	int vsw_plist_del_node(vsw_t *, vsw_port_t *port);
83 void vsw_detach_ports(vsw_t *vswp);
84 int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
85 mcst_addr_t *vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr);
86 int vsw_port_detach(vsw_t *vswp, int p_instance);
87 int vsw_portsend(vsw_port_t *port, mblk_t *mp);
88 int vsw_port_attach(vsw_port_t *portp);
89 vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);
90 void vsw_vlan_unaware_port_reset(vsw_port_t *portp);
91 void vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate);
92 void vsw_reset_ports(vsw_t *vswp);
93 void vsw_port_reset(vsw_port_t *portp);
94 void vsw_physlink_update_ports(vsw_t *vswp);
95 static	void vsw_port_physlink_update(vsw_port_t *portp);
96 
97 /* Interrupt routines */
98 static	uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg);
99 
100 /* Handshake routines */
101 static	void vsw_ldc_reinit(vsw_ldc_t *);
102 static	void vsw_conn_task(void *);
103 static	int vsw_check_flag(vsw_ldc_t *, int, uint64_t);
104 static	void vsw_next_milestone(vsw_ldc_t *);
105 static	int vsw_supported_version(vio_ver_msg_t *);
106 static	void vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp);
107 static	void vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp);
108 void vsw_process_conn_evt(vsw_ldc_t *, uint16_t);
109 
110 /* Data processing routines */
111 void vsw_process_pkt(void *);
112 static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t *, int);
113 static void vsw_process_ctrl_pkt(void *);
114 static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *);
115 static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *);
116 static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *);
117 static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *);
118 static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *);
119 static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *);
120 static void vsw_process_physlink_msg(vsw_ldc_t *, void *);
121 static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *,
122 	uint32_t);
123 static void vsw_process_pkt_data_nop(void *, void *, uint32_t);
124 static void vsw_process_pkt_data(void *, void *, uint32_t);
125 static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *);
126 static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *);
127 static void vsw_process_evt_read(vsw_ldc_t *ldcp);
128 static void vsw_ldc_rcv(vsw_ldc_t *ldcp);
129 
130 /* Switching/data transmit routines */
131 static	int vsw_descrsend(vsw_ldc_t *, mblk_t *);
132 static void vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp);
133 static int vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries);
134 static int vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count);
135 static int vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count);
136 
137 /* Packet creation routines */
138 static void vsw_send_ver(void *);
139 static void vsw_send_attr(vsw_ldc_t *);
140 static void vsw_send_dring_info(vsw_ldc_t *);
141 static void vsw_send_rdx(vsw_ldc_t *);
142 static void vsw_send_physlink_msg(vsw_ldc_t *ldcp, link_state_t plink_state);
143 
144 /* Dring routines */
145 static void vsw_create_privring(vsw_ldc_t *);
146 static dring_info_t *vsw_map_dring(vsw_ldc_t *ldcp, void *pkt);
147 static void vsw_unmap_dring(vsw_ldc_t *ldcp);
148 static void vsw_destroy_dring(vsw_ldc_t *ldcp);
149 static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t);
150 static int vsw_map_data(vsw_ldc_t *ldcp, dring_info_t *dp, void *pkt);
151 static void vsw_set_lane_attr(vsw_t *, lane_t *);
152 dring_info_t *vsw_map_dring_cmn(vsw_ldc_t *ldcp,
153     vio_dring_reg_msg_t *dring_pkt);
154 
155 /* tx/msg/rcv thread routines */
156 static void vsw_stop_tx_thread(vsw_ldc_t *ldcp);
157 static void vsw_ldc_tx_worker(void *arg);
158 
159 /* Misc support routines */
160 static void vsw_save_lmacaddr(vsw_t *vswp, uint64_t macaddr);
161 static int vsw_get_same_dest_list(struct ether_header *ehp,
162     mblk_t **rhead, mblk_t **rtail, mblk_t **mpp);
163 static mblk_t *vsw_dupmsgchain(mblk_t *mp);
164 
165 /* Debugging routines */
166 static void dump_flags(uint64_t);
167 static void display_state(void);
168 static void display_lane(lane_t *);
169 static void display_ring(dring_info_t *);
170 
171 /*
172  * Functions imported from other files.
173  */
174 extern int vsw_set_hw(vsw_t *, vsw_port_t *, int);
175 extern void vsw_unset_hw(vsw_t *, vsw_port_t *, int);
176 extern int vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port);
177 extern void vsw_del_mcst_port(vsw_port_t *port);
178 extern int vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
179 extern int vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
180 extern void vsw_fdbe_add(vsw_t *vswp, void *port);
181 extern void vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr);
182 extern void vsw_create_vlans(void *arg, int type);
183 extern void vsw_destroy_vlans(void *arg, int type);
184 extern void vsw_vlan_add_ids(void *arg, int type);
185 extern void vsw_vlan_remove_ids(void *arg, int type);
186 extern boolean_t vsw_frame_lookup_vid(void *arg, int caller,
187 	struct ether_header *ehp, uint16_t *vidp);
188 extern mblk_t *vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp);
189 extern uint32_t vsw_vlan_frame_untag(void *arg, int type, mblk_t **np,
190 	mblk_t **npt);
191 extern boolean_t vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid);
192 extern void vsw_hio_start(vsw_t *vswp, vsw_ldc_t *ldcp);
193 extern void vsw_hio_stop(vsw_t *vswp, vsw_ldc_t *ldcp);
194 extern void vsw_process_dds_msg(vsw_t *vswp, vsw_ldc_t *ldcp, void *msg);
195 extern void vsw_hio_stop_port(vsw_port_t *portp);
196 extern void vsw_publish_macaddr(vsw_t *vswp, vsw_port_t *portp);
197 extern int vsw_mac_client_init(vsw_t *vswp, vsw_port_t *port, int type);
198 extern void vsw_mac_client_cleanup(vsw_t *vswp, vsw_port_t *port, int type);
199 extern void vsw_destroy_rxpools(void *arg);
200 extern void vsw_stop_msg_thread(vsw_ldc_t *ldcp);
201 extern int vsw_send_msg(vsw_ldc_t *, void *, int, boolean_t);
202 extern int vsw_dringsend(vsw_ldc_t *, mblk_t *);
203 extern int vsw_reclaim_dring(dring_info_t *dp, int start);
204 extern int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **,
205     int *);
206 extern vio_dring_reg_msg_t *vsw_create_tx_dring_info(vsw_ldc_t *);
207 extern int vsw_setup_tx_dring(vsw_ldc_t *ldcp, dring_info_t *dp);
208 extern void vsw_destroy_tx_dring(vsw_ldc_t *ldcp);
209 extern dring_info_t *vsw_map_rx_dring(vsw_ldc_t *ldcp, void *pkt);
210 extern void vsw_unmap_rx_dring(vsw_ldc_t *ldcp);
211 extern void vsw_ldc_msg_worker(void *arg);
212 extern void vsw_process_dringdata(void *, void *);
213 extern vio_dring_reg_msg_t *vsw_create_rx_dring_info(vsw_ldc_t *);
214 extern void vsw_destroy_rx_dring(vsw_ldc_t *ldcp);
215 extern dring_info_t *vsw_map_tx_dring(vsw_ldc_t *ldcp, void *pkt);
216 extern void vsw_unmap_tx_dring(vsw_ldc_t *ldcp);
217 extern void vsw_ldc_rcv_worker(void *arg);
218 extern void vsw_stop_rcv_thread(vsw_ldc_t *ldcp);
219 extern int vsw_dringsend_shm(vsw_ldc_t *, mblk_t *);
220 extern void vsw_process_dringdata_shm(void *, void *);
221 
222 /*
223  * Tunables used in this file.
224  */
225 extern int vsw_num_handshakes;
226 extern int vsw_ldc_tx_delay;
227 extern int vsw_ldc_tx_retries;
228 extern int vsw_ldc_retries;
229 extern int vsw_ldc_delay;
230 extern boolean_t vsw_ldc_rxthr_enabled;
231 extern boolean_t vsw_ldc_txthr_enabled;
232 extern uint32_t vsw_num_descriptors;
233 extern uint8_t  vsw_dring_mode;
234 extern uint32_t vsw_max_tx_qcount;
235 extern boolean_t vsw_obp_ver_proto_workaround;
236 extern uint32_t vsw_publish_macaddr_count;
237 
/*
 * Acquire / release all three per-channel locks.
 *
 * The locks are taken in the order callback -> rx -> tx and dropped in
 * the reverse order.  Each macro expands to exactly one statement
 * (do { } while (0)), so it may be used as the body of an unbraced
 * if/else; invoke it with a trailing semicolon like a function call.
 */
#define	LDC_ENTER_LOCK(ldcp)	\
	do {	\
		mutex_enter(&((ldcp)->ldc_cblock));	\
		mutex_enter(&((ldcp)->ldc_rxlock));	\
		mutex_enter(&((ldcp)->ldc_txlock));	\
	} while (0)
#define	LDC_EXIT_LOCK(ldcp)	\
	do {	\
		mutex_exit(&((ldcp)->ldc_txlock));	\
		mutex_exit(&((ldcp)->ldc_rxlock));	\
		mutex_exit(&((ldcp)->ldc_cblock));	\
	} while (0)
246 
/*
 * Compare the protocol version negotiated on the outbound lane of 'ldcp'
 * against (major, minor).  Versions are ordered by major number first
 * and by minor number when the major numbers are equal.  Each macro
 * evaluates to 1 when the relation holds and to 0 otherwise.
 */
#define	VSW_VER_EQ(ldcp, major, minor)	\
	(!((ldcp)->lane_out.ver_major != (major) ||	\
	    (ldcp)->lane_out.ver_minor != (minor)))

#define	VSW_VER_LT(ldcp, major, minor)	\
	((ldcp)->lane_out.ver_major != (major) ?	\
	    ((ldcp)->lane_out.ver_major < (major)) :	\
	    ((ldcp)->lane_out.ver_minor < (minor)))

#define	VSW_VER_GTEQ(ldcp, major, minor)	\
	((ldcp)->lane_out.ver_major != (major) ?	\
	    ((ldcp)->lane_out.ver_major > (major)) :	\
	    ((ldcp)->lane_out.ver_minor >= (minor)))

#define	VSW_VER_LTEQ(ldcp, major, minor)	\
	((ldcp)->lane_out.ver_major != (major) ?	\
	    ((ldcp)->lane_out.ver_major < (major)) :	\
	    ((ldcp)->lane_out.ver_minor <= (minor)))
265 
266 /*
267  * VIO Protocol Version Info:
268  *
269  * The version specified below represents the version of protocol currently
270  * supported in the driver. It means the driver can negotiate with peers with
271  * versions <= this version. Here is a summary of the feature(s) that are
272  * supported at each version of the protocol:
273  *
274  * 1.0			Basic VIO protocol.
275  * 1.1			vDisk protocol update (no virtual network update).
276  * 1.2			Support for priority frames (priority-ether-types).
277  * 1.3			VLAN and HybridIO support.
278  * 1.4			Jumbo Frame support.
279  * 1.5			Link State Notification support with optional support
280  * 			for Physical Link information.
281  * 1.6			Support for RxDringData mode.
282  */
283 static	ver_sup_t	vsw_versions[] = { {1, 6} };
284 
/*
 * For the moment the state dump routines have their own private
 * compile-time flag; set DUMP_STATE to a non-zero value to enable them.
 *
 * When enabled, every macro below expands to a single statement without
 * a trailing semicolon, so each must be invoked like a function call
 * (with a terminating ';').  When disabled they expand to nothing.
 */
#define	DUMP_STATE	0

#if DUMP_STATE

/* Dump the type fields of a message tag passed by value. */
#define	DUMP_TAG(tag) \
do {			\
	D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype); \
	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype);	\
	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env);	\
} while (0)

/* Dump the type fields of a message tag passed by pointer. */
#define	DUMP_TAG_PTR(tag) \
do {			\
	D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype); \
	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype);	\
	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env);	\
} while (0)

#define	DUMP_FLAGS(flags) dump_flags(flags)
#define	DISPLAY_STATE()	display_state()

#else

#define	DUMP_TAG(tag)
#define	DUMP_TAG_PTR(tag)
#define	DUMP_FLAGS(state)
#define	DISPLAY_STATE()

#endif	/* DUMP_STATE */
318 
319 /*
320  * Attach the specified port.
321  *
322  * Returns 0 on success, 1 on failure.
323  */
324 int
325 vsw_port_attach(vsw_port_t *port)
326 {
327 	vsw_t			*vswp = port->p_vswp;
328 	vsw_port_list_t		*plist = &vswp->plist;
329 	vsw_port_t		*p, **pp;
330 	int			nids = port->num_ldcs;
331 	uint64_t		*ldcids;
332 	int			rv;
333 
334 	D1(vswp, "%s: enter : port %d", __func__, port->p_instance);
335 
336 	/* port already exists? */
337 	READ_ENTER(&plist->lockrw);
338 	for (p = plist->head; p != NULL; p = p->p_next) {
339 		if (p->p_instance == port->p_instance) {
340 			DWARN(vswp, "%s: port instance %d already attached",
341 			    __func__, p->p_instance);
342 			RW_EXIT(&plist->lockrw);
343 			return (1);
344 		}
345 	}
346 	RW_EXIT(&plist->lockrw);
347 
348 	mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL);
349 	mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL);
350 	rw_init(&port->maccl_rwlock, NULL, RW_DRIVER, NULL);
351 
352 	mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL);
353 	cv_init(&port->state_cv, NULL, CV_DRIVER, NULL);
354 	port->state = VSW_PORT_INIT;
355 
356 	D2(vswp, "%s: %d nids", __func__, nids);
357 	ldcids = port->ldc_ids;
358 	D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[0]);
359 	if (vsw_ldc_attach(port, (uint64_t)ldcids[0]) != 0) {
360 		DERR(vswp, "%s: ldc_attach failed", __func__);
361 		goto exit_error;
362 	}
363 
364 	if (vswp->switching_setup_done == B_TRUE) {
365 		/*
366 		 * If the underlying network device has been setup,
367 		 * then open a mac client and porgram the mac address
368 		 * for this port.
369 		 */
370 		rv = vsw_mac_client_init(vswp, port, VSW_VNETPORT);
371 		if (rv != 0) {
372 			goto exit_error;
373 		}
374 	}
375 
376 	/* create the fdb entry for this port/mac address */
377 	vsw_fdbe_add(vswp, port);
378 
379 	vsw_create_vlans(port, VSW_VNETPORT);
380 
381 	WRITE_ENTER(&plist->lockrw);
382 
383 	/* link it into the list of ports for this vsw instance */
384 	pp = (vsw_port_t **)(&plist->head);
385 	port->p_next = *pp;
386 	*pp = port;
387 	plist->num_ports++;
388 
389 	RW_EXIT(&plist->lockrw);
390 
391 	/*
392 	 * Initialise the port and any ldc's under it.
393 	 */
394 	(void) vsw_ldc_init(port->ldcp);
395 
396 	/* announce macaddr of vnet to the physical switch */
397 	if (vsw_publish_macaddr_count != 0) {	/* enabled */
398 		vsw_publish_macaddr(vswp, port);
399 	}
400 
401 	D1(vswp, "%s: exit", __func__);
402 	return (0);
403 
404 exit_error:
405 
406 	cv_destroy(&port->state_cv);
407 	mutex_destroy(&port->state_lock);
408 
409 	rw_destroy(&port->maccl_rwlock);
410 	mutex_destroy(&port->tx_lock);
411 	mutex_destroy(&port->mca_lock);
412 	kmem_free(port, sizeof (vsw_port_t));
413 	return (1);
414 }
415 
416 /*
417  * Detach the specified port.
418  *
419  * Returns 0 on success, 1 on failure.
420  */
421 int
422 vsw_port_detach(vsw_t *vswp, int p_instance)
423 {
424 	vsw_port_t	*port = NULL;
425 	vsw_port_list_t	*plist = &vswp->plist;
426 
427 	D1(vswp, "%s: enter: port id %d", __func__, p_instance);
428 
429 	WRITE_ENTER(&plist->lockrw);
430 
431 	if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) {
432 		RW_EXIT(&plist->lockrw);
433 		return (1);
434 	}
435 
436 	if (vsw_plist_del_node(vswp, port)) {
437 		RW_EXIT(&plist->lockrw);
438 		return (1);
439 	}
440 
441 	/* cleanup any HybridIO for this port */
442 	vsw_hio_stop_port(port);
443 
444 	/*
445 	 * No longer need to hold writer lock on port list now
446 	 * that we have unlinked the target port from the list.
447 	 */
448 	RW_EXIT(&plist->lockrw);
449 
450 	/* Cleanup and close the mac client */
451 	vsw_mac_client_cleanup(vswp, port, VSW_VNETPORT);
452 
453 	/* Remove the fdb entry for this port/mac address */
454 	vsw_fdbe_del(vswp, &(port->p_macaddr));
455 	vsw_destroy_vlans(port, VSW_VNETPORT);
456 
457 	/* Remove any multicast addresses.. */
458 	vsw_del_mcst_port(port);
459 
460 	vsw_port_delete(port);
461 
462 	D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance);
463 	return (0);
464 }
465 
466 /*
467  * Detach all active ports.
468  */
469 void
470 vsw_detach_ports(vsw_t *vswp)
471 {
472 	vsw_port_list_t 	*plist = &vswp->plist;
473 	vsw_port_t		*port = NULL;
474 
475 	D1(vswp, "%s: enter", __func__);
476 
477 	WRITE_ENTER(&plist->lockrw);
478 
479 	while ((port = plist->head) != NULL) {
480 		(void) vsw_plist_del_node(vswp, port);
481 
482 		/* cleanup any HybridIO for this port */
483 		vsw_hio_stop_port(port);
484 
485 		/* Cleanup and close the mac client */
486 		vsw_mac_client_cleanup(vswp, port, VSW_VNETPORT);
487 
488 		/* Remove the fdb entry for this port/mac address */
489 		vsw_fdbe_del(vswp, &(port->p_macaddr));
490 		vsw_destroy_vlans(port, VSW_VNETPORT);
491 
492 		/* Remove any multicast addresses.. */
493 		vsw_del_mcst_port(port);
494 
495 		/*
496 		 * No longer need to hold the lock on the port list
497 		 * now that we have unlinked the target port from the
498 		 * list.
499 		 */
500 		RW_EXIT(&plist->lockrw);
501 		vsw_port_delete(port);
502 		WRITE_ENTER(&plist->lockrw);
503 	}
504 	RW_EXIT(&plist->lockrw);
505 
506 	D1(vswp, "%s: exit", __func__);
507 }
508 
509 /*
510  * Delete the specified port.
511  */
512 static void
513 vsw_port_delete(vsw_port_t *port)
514 {
515 	vsw_t			*vswp = port->p_vswp;
516 
517 	D1(vswp, "%s: enter : port id %d", __func__, port->p_instance);
518 
519 	vsw_ldc_uninit(port->ldcp);
520 
521 	/*
522 	 * Wait for any pending ctrl msg tasks which reference this
523 	 * port to finish.
524 	 */
525 	vsw_drain_port_taskq(port);
526 
527 	/*
528 	 * Wait for any active callbacks to finish
529 	 */
530 	vsw_ldc_drain(port->ldcp);
531 
532 	vsw_ldc_detach(port->ldcp);
533 
534 	rw_destroy(&port->maccl_rwlock);
535 	mutex_destroy(&port->mca_lock);
536 	mutex_destroy(&port->tx_lock);
537 
538 	cv_destroy(&port->state_cv);
539 	mutex_destroy(&port->state_lock);
540 
541 	if (port->num_ldcs != 0) {
542 		kmem_free(port->ldc_ids, port->num_ldcs * sizeof (uint64_t));
543 		port->num_ldcs = 0;
544 	}
545 
546 	if (port->nvids != 0) {
547 		kmem_free(port->vids, sizeof (vsw_vlanid_t) * port->nvids);
548 	}
549 
550 	kmem_free(port, sizeof (vsw_port_t));
551 
552 	D1(vswp, "%s: exit", __func__);
553 }
554 
555 /*
556  * Attach a logical domain channel (ldc) under a specified port.
557  *
558  * Returns 0 on success, 1 on failure.
559  */
560 static int
561 vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id)
562 {
563 	vsw_t 		*vswp = port->p_vswp;
564 	vsw_ldc_t 	*ldcp = NULL;
565 	ldc_attr_t 	attr;
566 	ldc_status_t	istatus;
567 	int 		status = DDI_FAILURE;
568 	char		kname[MAXNAMELEN];
569 	enum		{ PROG_init = 0x0,
570 			    PROG_callback = 0x1,
571 			    PROG_tx_thread = 0x2}
572 			progress;
573 
574 	progress = PROG_init;
575 
576 	D1(vswp, "%s: enter", __func__);
577 
578 	ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP);
579 	if (ldcp == NULL) {
580 		DERR(vswp, "%s: kmem_zalloc failed", __func__);
581 		return (1);
582 	}
583 	ldcp->ldc_id = ldc_id;
584 
585 	mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL);
586 	mutex_init(&ldcp->ldc_rxlock, NULL, MUTEX_DRIVER, NULL);
587 	mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL);
588 	ldcp->msg_thr_flags = 0;
589 	mutex_init(&ldcp->msg_thr_lock, NULL, MUTEX_DRIVER, NULL);
590 	cv_init(&ldcp->msg_thr_cv, NULL, CV_DRIVER, NULL);
591 	ldcp->rcv_thr_flags = 0;
592 	mutex_init(&ldcp->rcv_thr_lock, NULL, MUTEX_DRIVER, NULL);
593 	cv_init(&ldcp->rcv_thr_cv, NULL, CV_DRIVER, NULL);
594 	mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL);
595 	cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL);
596 
597 	/* required for handshake with peer */
598 	ldcp->local_session = (uint64_t)ddi_get_lbolt();
599 	ldcp->peer_session = 0;
600 	ldcp->session_status = 0;
601 	ldcp->hss_id = 1;	/* Initial handshake session id */
602 	ldcp->hphase = VSW_MILESTONE0;
603 
604 	(void) atomic_swap_32(&port->p_hio_capable, B_FALSE);
605 
606 	/* only set for outbound lane, inbound set by peer */
607 	vsw_set_lane_attr(vswp, &ldcp->lane_out);
608 
609 	attr.devclass = LDC_DEV_NT_SVC;
610 	attr.instance = ddi_get_instance(vswp->dip);
611 	attr.mode = LDC_MODE_UNRELIABLE;
612 	attr.mtu = VSW_LDC_MTU;
613 	status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle);
614 	if (status != 0) {
615 		DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)",
616 		    __func__, ldc_id, status);
617 		goto ldc_attach_fail;
618 	}
619 
620 	if (vsw_ldc_txthr_enabled) {
621 		ldcp->tx_thr_flags = 0;
622 		ldcp->tx_mhead = ldcp->tx_mtail = NULL;
623 
624 		mutex_init(&ldcp->tx_thr_lock, NULL, MUTEX_DRIVER, NULL);
625 		cv_init(&ldcp->tx_thr_cv, NULL, CV_DRIVER, NULL);
626 		ldcp->tx_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
627 		    vsw_ldc_tx_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
628 
629 		progress |= PROG_tx_thread;
630 		if (ldcp->tx_thread == NULL) {
631 			DWARN(vswp, "%s(%lld): Failed to create worker thread",
632 			    __func__, ldc_id);
633 			goto ldc_attach_fail;
634 		}
635 	}
636 
637 	status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp);
638 	if (status != 0) {
639 		DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)",
640 		    __func__, ldc_id, status);
641 		(void) ldc_fini(ldcp->ldc_handle);
642 		goto ldc_attach_fail;
643 	}
644 	/*
645 	 * allocate a message for ldc_read()s, big enough to hold ctrl and
646 	 * data msgs, including raw data msgs used to recv priority frames.
647 	 */
648 	ldcp->msglen = VIO_PKT_DATA_HDRSIZE + vswp->max_frame_size;
649 	ldcp->ldcmsg = kmem_alloc(ldcp->msglen, KM_SLEEP);
650 
651 	progress |= PROG_callback;
652 
653 	mutex_init(&ldcp->status_lock, NULL, MUTEX_DRIVER, NULL);
654 
655 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
656 		DERR(vswp, "%s: ldc_status failed", __func__);
657 		mutex_destroy(&ldcp->status_lock);
658 		goto ldc_attach_fail;
659 	}
660 
661 	ldcp->ldc_status = istatus;
662 	ldcp->ldc_port = port;
663 	ldcp->ldc_vswp = vswp;
664 
665 	vsw_reset_vnet_proto_ops(ldcp);
666 
667 	(void) sprintf(kname, "%sldc0x%lx", DRV_NAME, ldcp->ldc_id);
668 	ldcp->ksp = vgen_setup_kstats(DRV_NAME, vswp->instance,
669 	    kname, &ldcp->ldc_stats);
670 	if (ldcp->ksp == NULL) {
671 		DERR(vswp, "%s: kstats setup failed", __func__);
672 		goto ldc_attach_fail;
673 	}
674 
675 	/* link it into this port */
676 	port->ldcp = ldcp;
677 
678 	D1(vswp, "%s: exit", __func__);
679 	return (0);
680 
681 ldc_attach_fail:
682 
683 	if (progress & PROG_callback) {
684 		(void) ldc_unreg_callback(ldcp->ldc_handle);
685 		kmem_free(ldcp->ldcmsg, ldcp->msglen);
686 	}
687 
688 	if (progress & PROG_tx_thread) {
689 		if (ldcp->tx_thread != NULL) {
690 			vsw_stop_tx_thread(ldcp);
691 		}
692 		mutex_destroy(&ldcp->tx_thr_lock);
693 		cv_destroy(&ldcp->tx_thr_cv);
694 	}
695 	if (ldcp->ksp != NULL) {
696 		vgen_destroy_kstats(ldcp->ksp);
697 	}
698 	mutex_destroy(&ldcp->msg_thr_lock);
699 	mutex_destroy(&ldcp->rcv_thr_lock);
700 	mutex_destroy(&ldcp->ldc_txlock);
701 	mutex_destroy(&ldcp->ldc_rxlock);
702 	mutex_destroy(&ldcp->ldc_cblock);
703 	mutex_destroy(&ldcp->drain_cv_lock);
704 	cv_destroy(&ldcp->msg_thr_cv);
705 	cv_destroy(&ldcp->rcv_thr_cv);
706 	cv_destroy(&ldcp->drain_cv);
707 
708 	kmem_free(ldcp, sizeof (vsw_ldc_t));
709 
710 	return (1);
711 }
712 
713 /*
714  * Detach a logical domain channel (ldc) belonging to a
715  * particular port.
716  */
717 static void
718 vsw_ldc_detach(vsw_ldc_t *ldcp)
719 {
720 	int 		rv;
721 	vsw_t 		*vswp = ldcp->ldc_port->p_vswp;
722 	int		retries = 0;
723 
724 	D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id);
725 
726 	/* Stop msg/rcv thread */
727 	if (ldcp->rcv_thread != NULL) {
728 		vsw_stop_rcv_thread(ldcp);
729 	} else if (ldcp->msg_thread != NULL) {
730 		vsw_stop_msg_thread(ldcp);
731 	}
732 	kmem_free(ldcp->ldcmsg, ldcp->msglen);
733 
734 	/* Stop the tx thread */
735 	if (ldcp->tx_thread != NULL) {
736 		vsw_stop_tx_thread(ldcp);
737 		mutex_destroy(&ldcp->tx_thr_lock);
738 		cv_destroy(&ldcp->tx_thr_cv);
739 		if (ldcp->tx_mhead != NULL) {
740 			freemsgchain(ldcp->tx_mhead);
741 			ldcp->tx_mhead = ldcp->tx_mtail = NULL;
742 			ldcp->tx_cnt = 0;
743 		}
744 	}
745 
746 	/* Destory kstats */
747 	vgen_destroy_kstats(ldcp->ksp);
748 
749 	/*
750 	 * Before we can close the channel we must release any mapped
751 	 * resources (e.g. drings).
752 	 */
753 	vsw_free_lane_resources(ldcp, INBOUND);
754 	vsw_free_lane_resources(ldcp, OUTBOUND);
755 
756 	/*
757 	 * Close the channel, retry on EAAGIN.
758 	 */
759 	while ((rv = ldc_close(ldcp->ldc_handle)) == EAGAIN) {
760 		if (++retries > vsw_ldc_retries) {
761 			break;
762 		}
763 		drv_usecwait(vsw_ldc_delay);
764 	}
765 	if (rv != 0) {
766 		cmn_err(CE_NOTE,
767 		    "!vsw%d: Error(%d) closing the channel(0x%lx)\n",
768 		    vswp->instance, rv, ldcp->ldc_id);
769 	}
770 
771 	(void) ldc_fini(ldcp->ldc_handle);
772 
773 	ldcp->ldc_status = LDC_INIT;
774 	ldcp->ldc_handle = NULL;
775 	ldcp->ldc_vswp = NULL;
776 
777 	mutex_destroy(&ldcp->msg_thr_lock);
778 	mutex_destroy(&ldcp->rcv_thr_lock);
779 	mutex_destroy(&ldcp->ldc_txlock);
780 	mutex_destroy(&ldcp->ldc_rxlock);
781 	mutex_destroy(&ldcp->ldc_cblock);
782 	mutex_destroy(&ldcp->drain_cv_lock);
783 	mutex_destroy(&ldcp->status_lock);
784 	cv_destroy(&ldcp->msg_thr_cv);
785 	cv_destroy(&ldcp->rcv_thr_cv);
786 	cv_destroy(&ldcp->drain_cv);
787 
788 	kmem_free(ldcp, sizeof (vsw_ldc_t));
789 }
790 
791 /*
792  * Open and attempt to bring up the channel. Note that channel
793  * can only be brought up if peer has also opened channel.
794  *
795  * Returns 0 if can open and bring up channel, otherwise
796  * returns 1.
797  */
798 static int
799 vsw_ldc_init(vsw_ldc_t *ldcp)
800 {
801 	vsw_t 		*vswp = ldcp->ldc_vswp;
802 	ldc_status_t	istatus = 0;
803 	int		rv;
804 
805 	D1(vswp, "%s: enter", __func__);
806 
807 	LDC_ENTER_LOCK(ldcp);
808 
809 	/* don't start at 0 in case clients don't like that */
810 	ldcp->next_ident = 1;
811 
812 	rv = ldc_open(ldcp->ldc_handle);
813 	if (rv != 0) {
814 		DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)",
815 		    __func__, ldcp->ldc_id, rv);
816 		LDC_EXIT_LOCK(ldcp);
817 		return (1);
818 	}
819 
820 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
821 		DERR(vswp, "%s: unable to get status", __func__);
822 		LDC_EXIT_LOCK(ldcp);
823 		return (1);
824 
825 	} else if (istatus != LDC_OPEN && istatus != LDC_READY) {
826 		DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY",
827 		    __func__, ldcp->ldc_id, istatus);
828 		LDC_EXIT_LOCK(ldcp);
829 		return (1);
830 	}
831 
832 	mutex_enter(&ldcp->status_lock);
833 	ldcp->ldc_status = istatus;
834 	mutex_exit(&ldcp->status_lock);
835 
836 	rv = ldc_up(ldcp->ldc_handle);
837 	if (rv != 0) {
838 		/*
839 		 * Not a fatal error for ldc_up() to fail, as peer
840 		 * end point may simply not be ready yet.
841 		 */
842 		D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__,
843 		    ldcp->ldc_id, rv);
844 		LDC_EXIT_LOCK(ldcp);
845 		return (1);
846 	}
847 
848 	/*
849 	 * ldc_up() call is non-blocking so need to explicitly
850 	 * check channel status to see if in fact the channel
851 	 * is UP.
852 	 */
853 	mutex_enter(&ldcp->status_lock);
854 	if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) {
855 		DERR(vswp, "%s: unable to get status", __func__);
856 		mutex_exit(&ldcp->status_lock);
857 		LDC_EXIT_LOCK(ldcp);
858 		return (1);
859 
860 	}
861 
862 	if (ldcp->ldc_status == LDC_UP) {
863 		D2(vswp, "%s: channel %ld now UP (%ld)", __func__,
864 		    ldcp->ldc_id, istatus);
865 		mutex_exit(&ldcp->status_lock);
866 		LDC_EXIT_LOCK(ldcp);
867 
868 		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
869 		return (0);
870 	}
871 
872 	mutex_exit(&ldcp->status_lock);
873 	LDC_EXIT_LOCK(ldcp);
874 
875 	D1(vswp, "%s: exit", __func__);
876 	return (0);
877 }
878 
879 /* disable callbacks on the channel */
880 static void
881 vsw_ldc_uninit(vsw_ldc_t *ldcp)
882 {
883 	vsw_t	*vswp = ldcp->ldc_vswp;
884 	int	rv;
885 
886 	D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id);
887 
888 	LDC_ENTER_LOCK(ldcp);
889 
890 	rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE);
891 	if (rv != 0) {
892 		cmn_err(CE_NOTE, "!vsw_ldc_uninit(%ld): error disabling "
893 		    "interrupts (rv = %d)\n", ldcp->ldc_id, rv);
894 	}
895 
896 	mutex_enter(&ldcp->status_lock);
897 	ldcp->ldc_status = LDC_INIT;
898 	mutex_exit(&ldcp->status_lock);
899 
900 	LDC_EXIT_LOCK(ldcp);
901 
902 	D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id);
903 }
904 
905 /*
906  * Wait until the callback(s) associated with the ldcs under the specified
907  * port have completed.
908  *
909  * Prior to this function being invoked each channel under this port
910  * should have been quiesced via ldc_set_cb_mode(DISABLE).
911  *
912  * A short explaination of what we are doing below..
913  *
914  * The simplest approach would be to have a reference counter in
915  * the ldc structure which is increment/decremented by the callbacks as
916  * they use the channel. The drain function could then simply disable any
917  * further callbacks and do a cv_wait for the ref to hit zero. Unfortunately
918  * there is a tiny window here - before the callback is able to get the lock
919  * on the channel it is interrupted and this function gets to execute. It
920  * sees that the ref count is zero and believes its free to delete the
921  * associated data structures.
922  *
923  * We get around this by taking advantage of the fact that before the ldc
924  * framework invokes a callback it sets a flag to indicate that there is a
925  * callback active (or about to become active). If when we attempt to
926  * unregister a callback when this active flag is set then the unregister
927  * will fail with EWOULDBLOCK.
928  *
929  * If the unregister fails we do a cv_timedwait. We will either be signaled
930  * by the callback as it is exiting (note we have to wait a short period to
931  * allow the callback to return fully to the ldc framework and it to clear
932  * the active flag), or by the timer expiring. In either case we again attempt
933  * the unregister. We repeat this until we can succesfully unregister the
934  * callback.
935  *
936  * The reason we use a cv_timedwait rather than a simple cv_wait is to catch
937  * the case where the callback has finished but the ldc framework has not yet
938  * cleared the active flag. In this case we would never get a cv_signal.
939  */
940 static void
941 vsw_ldc_drain(vsw_ldc_t *ldcp)
942 {
943 	vsw_t	*vswp = ldcp->ldc_port->p_vswp;
944 
945 	D1(vswp, "%s: enter", __func__);
946 
947 	/*
948 	 * If we can unregister the channel callback then we
949 	 * know that there is no callback either running or
950 	 * scheduled to run for this channel so move on to next
951 	 * channel in the list.
952 	 */
953 	mutex_enter(&ldcp->drain_cv_lock);
954 
955 	/* prompt active callbacks to quit */
956 	ldcp->drain_state = VSW_LDC_DRAINING;
957 
958 	if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) {
959 		D2(vswp, "%s: unreg callback for chan %ld", __func__,
960 		    ldcp->ldc_id);
961 		mutex_exit(&ldcp->drain_cv_lock);
962 	} else {
963 		/*
964 		 * If we end up here we know that either 1) a callback
965 		 * is currently executing, 2) is about to start (i.e.
966 		 * the ldc framework has set the active flag but
967 		 * has not actually invoked the callback yet, or 3)
968 		 * has finished and has returned to the ldc framework
969 		 * but the ldc framework has not yet cleared the
970 		 * active bit.
971 		 *
972 		 * Wait for it to finish.
973 		 */
974 		while (ldc_unreg_callback(ldcp->ldc_handle) == EWOULDBLOCK) {
975 			(void) cv_timedwait(&ldcp->drain_cv,
976 			    &ldcp->drain_cv_lock, ddi_get_lbolt() + hz);
977 		}
978 
979 		mutex_exit(&ldcp->drain_cv_lock);
980 		D2(vswp, "%s: unreg callback for chan %ld after "
981 		    "timeout", __func__, ldcp->ldc_id);
982 	}
983 
984 	D1(vswp, "%s: exit", __func__);
985 }
986 
987 /*
988  * Wait until all tasks which reference this port have completed.
989  *
990  * Prior to this function being invoked each channel under this port
991  * should have been quiesced via ldc_set_cb_mode(DISABLE).
992  */
993 static void
994 vsw_drain_port_taskq(vsw_port_t *port)
995 {
996 	vsw_t		*vswp = port->p_vswp;
997 
998 	D1(vswp, "%s: enter", __func__);
999 
1000 	/*
1001 	 * Mark the port as in the process of being detached, and
1002 	 * dispatch a marker task to the queue so we know when all
1003 	 * relevant tasks have completed.
1004 	 */
1005 	mutex_enter(&port->state_lock);
1006 	port->state = VSW_PORT_DETACHING;
1007 
1008 	if ((vswp->taskq_p == NULL) ||
1009 	    (ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task,
1010 	    port, DDI_NOSLEEP) != DDI_SUCCESS)) {
1011 		cmn_err(CE_NOTE, "!vsw%d: unable to dispatch marker task",
1012 		    vswp->instance);
1013 		mutex_exit(&port->state_lock);
1014 		return;
1015 	}
1016 
1017 	/*
1018 	 * Wait for the marker task to finish.
1019 	 */
1020 	while (port->state != VSW_PORT_DETACHABLE)
1021 		cv_wait(&port->state_cv, &port->state_lock);
1022 
1023 	mutex_exit(&port->state_lock);
1024 
1025 	D1(vswp, "%s: exit", __func__);
1026 }
1027 
1028 static void
1029 vsw_marker_task(void *arg)
1030 {
1031 	vsw_port_t	*port = arg;
1032 	vsw_t		*vswp = port->p_vswp;
1033 
1034 	D1(vswp, "%s: enter", __func__);
1035 
1036 	mutex_enter(&port->state_lock);
1037 
1038 	/*
1039 	 * No further tasks should be dispatched which reference
1040 	 * this port so ok to mark it as safe to detach.
1041 	 */
1042 	port->state = VSW_PORT_DETACHABLE;
1043 
1044 	cv_signal(&port->state_cv);
1045 
1046 	mutex_exit(&port->state_lock);
1047 
1048 	D1(vswp, "%s: exit", __func__);
1049 }
1050 
1051 vsw_port_t *
1052 vsw_lookup_port(vsw_t *vswp, int p_instance)
1053 {
1054 	vsw_port_list_t *plist = &vswp->plist;
1055 	vsw_port_t	*port;
1056 
1057 	for (port = plist->head; port != NULL; port = port->p_next) {
1058 		if (port->p_instance == p_instance) {
1059 			D2(vswp, "vsw_lookup_port: found p_instance\n");
1060 			return (port);
1061 		}
1062 	}
1063 
1064 	return (NULL);
1065 }
1066 
1067 void
1068 vsw_vlan_unaware_port_reset(vsw_port_t *portp)
1069 {
1070 	vsw_ldc_t	*ldcp = portp->ldcp;
1071 
1072 	mutex_enter(&ldcp->ldc_cblock);
1073 
1074 	/*
1075 	 * If the peer is vlan_unaware(ver < 1.3), reset channel and terminate
1076 	 * the connection. See comments in vsw_set_vnet_proto_ops().
1077 	 */
1078 	if (ldcp->hphase == VSW_MILESTONE4 && VSW_VER_LT(ldcp, 1, 3) &&
1079 	    portp->nvids != 0) {
1080 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1081 	}
1082 
1083 	mutex_exit(&ldcp->ldc_cblock);
1084 }
1085 
1086 void
1087 vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate)
1088 {
1089 	vsw_ldc_t	*ldcp = portp->ldcp;
1090 
1091 	mutex_enter(&ldcp->ldc_cblock);
1092 
1093 	/*
1094 	 * If the peer is HybridIO capable (ver >= 1.3), reset channel
1095 	 * to trigger re-negotiation, which inturn trigger HybridIO
1096 	 * setup/cleanup.
1097 	 */
1098 	if ((ldcp->hphase == VSW_MILESTONE4) &&
1099 	    (portp->p_hio_capable == B_TRUE)) {
1100 		if (immediate == B_TRUE) {
1101 			(void) ldc_down(ldcp->ldc_handle);
1102 		} else {
1103 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1104 		}
1105 	}
1106 
1107 	mutex_exit(&ldcp->ldc_cblock);
1108 }
1109 
1110 void
1111 vsw_port_reset(vsw_port_t *portp)
1112 {
1113 	vsw_ldc_t	*ldcp = portp->ldcp;
1114 
1115 	mutex_enter(&ldcp->ldc_cblock);
1116 
1117 	/*
1118 	 * reset channel and terminate the connection.
1119 	 */
1120 	vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1121 
1122 	mutex_exit(&ldcp->ldc_cblock);
1123 }
1124 
1125 void
1126 vsw_reset_ports(vsw_t *vswp)
1127 {
1128 	vsw_port_list_t	*plist = &vswp->plist;
1129 	vsw_port_t	*portp;
1130 
1131 	READ_ENTER(&plist->lockrw);
1132 	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
1133 		if ((portp->p_hio_capable) && (portp->p_hio_enabled)) {
1134 			vsw_hio_stop_port(portp);
1135 		}
1136 		vsw_port_reset(portp);
1137 	}
1138 	RW_EXIT(&plist->lockrw);
1139 }
1140 
1141 static void
1142 vsw_send_physlink_msg(vsw_ldc_t *ldcp, link_state_t plink_state)
1143 {
1144 	vnet_physlink_msg_t	msg;
1145 	vnet_physlink_msg_t	*msgp = &msg;
1146 	uint32_t		physlink_info = 0;
1147 
1148 	if (plink_state == LINK_STATE_UP) {
1149 		physlink_info |= VNET_PHYSLINK_STATE_UP;
1150 	} else {
1151 		physlink_info |= VNET_PHYSLINK_STATE_DOWN;
1152 	}
1153 
1154 	msgp->tag.vio_msgtype = VIO_TYPE_CTRL;
1155 	msgp->tag.vio_subtype = VIO_SUBTYPE_INFO;
1156 	msgp->tag.vio_subtype_env = VNET_PHYSLINK_INFO;
1157 	msgp->tag.vio_sid = ldcp->local_session;
1158 	msgp->physlink_info = physlink_info;
1159 
1160 	(void) vsw_send_msg(ldcp, msgp, sizeof (msg), B_TRUE);
1161 }
1162 
1163 static void
1164 vsw_port_physlink_update(vsw_port_t *portp)
1165 {
1166 	vsw_ldc_t	*ldcp;
1167 	vsw_t		*vswp;
1168 
1169 	vswp = portp->p_vswp;
1170 	ldcp = portp->ldcp;
1171 
1172 	mutex_enter(&ldcp->ldc_cblock);
1173 
1174 	/*
1175 	 * If handshake has completed successfully and if the vnet device
1176 	 * has negotiated to get physical link state updates, send a message
1177 	 * with the current state.
1178 	 */
1179 	if (ldcp->hphase == VSW_MILESTONE4 && ldcp->pls_negotiated == B_TRUE) {
1180 		vsw_send_physlink_msg(ldcp, vswp->phys_link_state);
1181 	}
1182 
1183 	mutex_exit(&ldcp->ldc_cblock);
1184 }
1185 
1186 void
1187 vsw_physlink_update_ports(vsw_t *vswp)
1188 {
1189 	vsw_port_list_t	*plist = &vswp->plist;
1190 	vsw_port_t	*portp;
1191 
1192 	READ_ENTER(&plist->lockrw);
1193 	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
1194 		vsw_port_physlink_update(portp);
1195 	}
1196 	RW_EXIT(&plist->lockrw);
1197 }
1198 
1199 /*
1200  * Search for and remove the specified port from the port
1201  * list. Returns 0 if able to locate and remove port, otherwise
1202  * returns 1.
1203  */
1204 static int
1205 vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port)
1206 {
1207 	vsw_port_list_t *plist = &vswp->plist;
1208 	vsw_port_t	*curr_p, *prev_p;
1209 
1210 	if (plist->head == NULL)
1211 		return (1);
1212 
1213 	curr_p = prev_p = plist->head;
1214 
1215 	while (curr_p != NULL) {
1216 		if (curr_p == port) {
1217 			if (prev_p == curr_p) {
1218 				plist->head = curr_p->p_next;
1219 			} else {
1220 				prev_p->p_next = curr_p->p_next;
1221 			}
1222 			plist->num_ports--;
1223 			break;
1224 		} else {
1225 			prev_p = curr_p;
1226 			curr_p = curr_p->p_next;
1227 		}
1228 	}
1229 	return (0);
1230 }
1231 
1232 /*
1233  * Interrupt handler for ldc messages.
1234  */
1235 static uint_t
1236 vsw_ldc_cb(uint64_t event, caddr_t arg)
1237 {
1238 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
1239 	vsw_t 		*vswp = ldcp->ldc_vswp;
1240 
1241 	D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
1242 
1243 	mutex_enter(&ldcp->ldc_cblock);
1244 	ldcp->ldc_stats.callbacks++;
1245 
1246 	mutex_enter(&ldcp->status_lock);
1247 	if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) {
1248 		mutex_exit(&ldcp->status_lock);
1249 		mutex_exit(&ldcp->ldc_cblock);
1250 		return (LDC_SUCCESS);
1251 	}
1252 	mutex_exit(&ldcp->status_lock);
1253 
1254 	if (event & LDC_EVT_UP) {
1255 		/*
1256 		 * Channel has come up.
1257 		 */
1258 		D2(vswp, "%s: id(%ld) event(%llx) UP: status(%ld)",
1259 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1260 
1261 		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
1262 
1263 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
1264 	}
1265 
1266 	if (event & LDC_EVT_READ) {
1267 		/*
1268 		 * Data available for reading.
1269 		 */
1270 		D2(vswp, "%s: id(ld) event(%llx) data READ",
1271 		    __func__, ldcp->ldc_id, event);
1272 
1273 		vsw_process_evt_read(ldcp);
1274 
1275 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
1276 
1277 		goto vsw_cb_exit;
1278 	}
1279 
1280 	if (event & (LDC_EVT_DOWN | LDC_EVT_RESET)) {
1281 		D2(vswp, "%s: id(%ld) event (%lx) DOWN/RESET: status(%ld)",
1282 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1283 
1284 		vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
1285 	}
1286 
1287 	/*
1288 	 * Catch either LDC_EVT_WRITE which we don't support or any
1289 	 * unknown event.
1290 	 */
1291 	if (event &
1292 	    ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ)) {
1293 		DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)",
1294 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1295 	}
1296 
1297 vsw_cb_exit:
1298 	mutex_exit(&ldcp->ldc_cblock);
1299 
1300 	/*
1301 	 * Let the drain function know we are finishing if it
1302 	 * is waiting.
1303 	 */
1304 	mutex_enter(&ldcp->drain_cv_lock);
1305 	if (ldcp->drain_state == VSW_LDC_DRAINING)
1306 		cv_signal(&ldcp->drain_cv);
1307 	mutex_exit(&ldcp->drain_cv_lock);
1308 
1309 	return (LDC_SUCCESS);
1310 }
1311 
1312 /*
1313  * Reinitialise data structures associated with the channel.
1314  */
1315 static void
1316 vsw_ldc_reinit(vsw_ldc_t *ldcp)
1317 {
1318 	vsw_t		*vswp = ldcp->ldc_vswp;
1319 	vsw_port_t	*port;
1320 
1321 	D1(vswp, "%s: enter", __func__);
1322 
1323 	port = ldcp->ldc_port;
1324 
1325 	D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__,
1326 	    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
1327 
1328 	vsw_free_lane_resources(ldcp, INBOUND);
1329 	vsw_free_lane_resources(ldcp, OUTBOUND);
1330 
1331 	ldcp->lane_in.lstate = 0;
1332 	ldcp->lane_out.lstate = 0;
1333 
1334 	/*
1335 	 * Remove parent port from any multicast groups
1336 	 * it may have registered with. Client must resend
1337 	 * multicast add command after handshake completes.
1338 	 */
1339 	vsw_del_mcst_port(port);
1340 
1341 	ldcp->peer_session = 0;
1342 	ldcp->session_status = 0;
1343 	ldcp->hcnt = 0;
1344 	ldcp->hphase = VSW_MILESTONE0;
1345 
1346 	vsw_reset_vnet_proto_ops(ldcp);
1347 
1348 	D1(vswp, "%s: exit", __func__);
1349 }
1350 
1351 /*
1352  * Process a connection event.
1353  */
1354 void
1355 vsw_process_conn_evt(vsw_ldc_t *ldcp, uint16_t evt)
1356 {
1357 	vsw_t		*vswp = ldcp->ldc_vswp;
1358 	vsw_conn_evt_t	*conn = NULL;
1359 
1360 	D1(vswp, "%s: enter", __func__);
1361 
1362 	/*
1363 	 * Check if either a reset or restart event is pending
1364 	 * or in progress. If so just return.
1365 	 *
1366 	 * A VSW_CONN_RESET event originates either with a LDC_RESET_EVT
1367 	 * being received by the callback handler, or a ECONNRESET error
1368 	 * code being returned from a ldc_read() or ldc_write() call.
1369 	 *
1370 	 * A VSW_CONN_RESTART event occurs when some error checking code
1371 	 * decides that there is a problem with data from the channel,
1372 	 * and that the handshake should be restarted.
1373 	 */
1374 	if (((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART)) &&
1375 	    (ldstub((uint8_t *)&ldcp->reset_active)))
1376 		return;
1377 
1378 	/*
1379 	 * If it is an LDC_UP event we first check the recorded
1380 	 * state of the channel. If this is UP then we know that
1381 	 * the channel moving to the UP state has already been dealt
1382 	 * with and don't need to dispatch a  new task.
1383 	 *
1384 	 * The reason for this check is that when we do a ldc_up(),
1385 	 * depending on the state of the peer, we may or may not get
1386 	 * a LDC_UP event. As we can't depend on getting a LDC_UP evt
1387 	 * every time we do ldc_up() we explicitly check the channel
1388 	 * status to see has it come up (ldc_up() is asynch and will
1389 	 * complete at some undefined time), and take the appropriate
1390 	 * action.
1391 	 *
1392 	 * The flip side of this is that we may get a LDC_UP event
1393 	 * when we have already seen that the channel is up and have
1394 	 * dealt with that.
1395 	 */
1396 	mutex_enter(&ldcp->status_lock);
1397 	if (evt == VSW_CONN_UP) {
1398 		if ((ldcp->ldc_status == LDC_UP) || (ldcp->reset_active != 0)) {
1399 			mutex_exit(&ldcp->status_lock);
1400 			return;
1401 		}
1402 	}
1403 	mutex_exit(&ldcp->status_lock);
1404 
1405 	/*
1406 	 * The transaction group id allows us to identify and discard
1407 	 * any tasks which are still pending on the taskq and refer
1408 	 * to the handshake session we are about to restart or reset.
1409 	 * These stale messages no longer have any real meaning.
1410 	 */
1411 	(void) atomic_inc_32(&ldcp->hss_id);
1412 
1413 	ASSERT(vswp->taskq_p != NULL);
1414 
1415 	if ((conn = kmem_zalloc(sizeof (vsw_conn_evt_t), KM_NOSLEEP)) == NULL) {
1416 		cmn_err(CE_WARN, "!vsw%d: unable to allocate memory for"
1417 		    " connection event", vswp->instance);
1418 		goto err_exit;
1419 	}
1420 
1421 	conn->evt = evt;
1422 	conn->ldcp = ldcp;
1423 
1424 	if (ddi_taskq_dispatch(vswp->taskq_p, vsw_conn_task, conn,
1425 	    DDI_NOSLEEP) != DDI_SUCCESS) {
1426 		cmn_err(CE_WARN, "!vsw%d: Can't dispatch connection task",
1427 		    vswp->instance);
1428 
1429 		kmem_free(conn, sizeof (vsw_conn_evt_t));
1430 		goto err_exit;
1431 	}
1432 
1433 	D1(vswp, "%s: exit", __func__);
1434 	return;
1435 
1436 err_exit:
1437 	/*
1438 	 * Have mostly likely failed due to memory shortage. Clear the flag so
1439 	 * that future requests will at least be attempted and will hopefully
1440 	 * succeed.
1441 	 */
1442 	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
1443 		ldcp->reset_active = 0;
1444 }
1445 
1446 /*
1447  * Deal with events relating to a connection. Invoked from a taskq.
1448  */
1449 static void
1450 vsw_conn_task(void *arg)
1451 {
1452 	vsw_conn_evt_t	*conn = (vsw_conn_evt_t *)arg;
1453 	vsw_ldc_t	*ldcp = NULL;
1454 	vsw_port_t	*portp;
1455 	vsw_t		*vswp = NULL;
1456 	uint16_t	evt;
1457 	ldc_status_t	curr_status;
1458 
1459 	ldcp = conn->ldcp;
1460 	evt = conn->evt;
1461 	vswp = ldcp->ldc_vswp;
1462 	portp = ldcp->ldc_port;
1463 
1464 	D1(vswp, "%s: enter", __func__);
1465 
1466 	/* can safely free now have copied out data */
1467 	kmem_free(conn, sizeof (vsw_conn_evt_t));
1468 
1469 	if (ldcp->rcv_thread != NULL) {
1470 		vsw_stop_rcv_thread(ldcp);
1471 	} else if (ldcp->msg_thread != NULL) {
1472 		vsw_stop_msg_thread(ldcp);
1473 	}
1474 
1475 	mutex_enter(&ldcp->status_lock);
1476 	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
1477 		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
1478 		    "channel %ld", vswp->instance, ldcp->ldc_id);
1479 		mutex_exit(&ldcp->status_lock);
1480 		return;
1481 	}
1482 
1483 	/*
1484 	 * If we wish to restart the handshake on this channel, then if
1485 	 * the channel is UP we bring it DOWN to flush the underlying
1486 	 * ldc queue.
1487 	 */
1488 	if ((evt == VSW_CONN_RESTART) && (curr_status == LDC_UP))
1489 		(void) ldc_down(ldcp->ldc_handle);
1490 
1491 	if ((portp->p_hio_capable) && (portp->p_hio_enabled)) {
1492 		vsw_hio_stop(vswp, ldcp);
1493 	}
1494 
1495 	/*
1496 	 * re-init all the associated data structures.
1497 	 */
1498 	vsw_ldc_reinit(ldcp);
1499 
1500 	/*
1501 	 * Bring the channel back up (note it does no harm to
1502 	 * do this even if the channel is already UP, Just
1503 	 * becomes effectively a no-op).
1504 	 */
1505 	(void) ldc_up(ldcp->ldc_handle);
1506 
1507 	/*
1508 	 * Check if channel is now UP. This will only happen if
1509 	 * peer has also done a ldc_up().
1510 	 */
1511 	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
1512 		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
1513 		    "channel %ld", vswp->instance, ldcp->ldc_id);
1514 		mutex_exit(&ldcp->status_lock);
1515 		return;
1516 	}
1517 
1518 	ldcp->ldc_status = curr_status;
1519 
1520 	/* channel UP so restart handshake by sending version info */
1521 	if (curr_status == LDC_UP) {
1522 		if (ldcp->hcnt++ > vsw_num_handshakes) {
1523 			cmn_err(CE_WARN, "!vsw%d: exceeded number of permitted"
1524 			    " handshake attempts (%d) on channel %ld",
1525 			    vswp->instance, ldcp->hcnt, ldcp->ldc_id);
1526 			mutex_exit(&ldcp->status_lock);
1527 			return;
1528 		}
1529 
1530 		if (vsw_obp_ver_proto_workaround == B_FALSE &&
1531 		    (ddi_taskq_dispatch(vswp->taskq_p, vsw_send_ver, ldcp,
1532 		    DDI_NOSLEEP) != DDI_SUCCESS)) {
1533 			cmn_err(CE_WARN, "!vsw%d: Can't dispatch version task",
1534 			    vswp->instance);
1535 
1536 			/*
1537 			 * Don't count as valid restart attempt if couldn't
1538 			 * send version msg.
1539 			 */
1540 			if (ldcp->hcnt > 0)
1541 				ldcp->hcnt--;
1542 		}
1543 	}
1544 
1545 	/*
1546 	 * Mark that the process is complete by clearing the flag.
1547 	 *
1548 	 * Note is it possible that the taskq dispatch above may have failed,
1549 	 * most likely due to memory shortage. We still clear the flag so
1550 	 * future attempts will at least be attempted and will hopefully
1551 	 * succeed.
1552 	 */
1553 	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
1554 		ldcp->reset_active = 0;
1555 
1556 	mutex_exit(&ldcp->status_lock);
1557 
1558 	D1(vswp, "%s: exit", __func__);
1559 }
1560 
1561 /*
1562  * returns 0 if legal for event signified by flag to have
1563  * occured at the time it did. Otherwise returns 1.
1564  */
1565 int
1566 vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag)
1567 {
1568 	vsw_t		*vswp = ldcp->ldc_vswp;
1569 	uint64_t	state;
1570 	uint64_t	phase;
1571 
1572 	if (dir == INBOUND)
1573 		state = ldcp->lane_in.lstate;
1574 	else
1575 		state = ldcp->lane_out.lstate;
1576 
1577 	phase = ldcp->hphase;
1578 
1579 	switch (flag) {
1580 	case VSW_VER_INFO_RECV:
1581 		if (phase > VSW_MILESTONE0) {
1582 			DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV"
1583 			    " when in state %d\n", ldcp->ldc_id, phase);
1584 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1585 			return (1);
1586 		}
1587 		break;
1588 
1589 	case VSW_VER_ACK_RECV:
1590 	case VSW_VER_NACK_RECV:
1591 		if (!(state & VSW_VER_INFO_SENT)) {
1592 			DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK or "
1593 			    "VER_NACK when in state %d\n", ldcp->ldc_id, phase);
1594 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1595 			return (1);
1596 		} else
1597 			state &= ~VSW_VER_INFO_SENT;
1598 		break;
1599 
1600 	case VSW_ATTR_INFO_RECV:
1601 		if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) {
1602 			DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV"
1603 			    " when in state %d\n", ldcp->ldc_id, phase);
1604 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1605 			return (1);
1606 		}
1607 		break;
1608 
1609 	case VSW_ATTR_ACK_RECV:
1610 	case VSW_ATTR_NACK_RECV:
1611 		if (!(state & VSW_ATTR_INFO_SENT)) {
1612 			DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK"
1613 			    " or ATTR_NACK when in state %d\n",
1614 			    ldcp->ldc_id, phase);
1615 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1616 			return (1);
1617 		} else
1618 			state &= ~VSW_ATTR_INFO_SENT;
1619 		break;
1620 
1621 	case VSW_DRING_INFO_RECV:
1622 		if (phase < VSW_MILESTONE1) {
1623 			DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV"
1624 			    " when in state %d\n", ldcp->ldc_id, phase);
1625 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1626 			return (1);
1627 		}
1628 		break;
1629 
1630 	case VSW_DRING_ACK_RECV:
1631 	case VSW_DRING_NACK_RECV:
1632 		if (!(state & VSW_DRING_INFO_SENT)) {
1633 			DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK "
1634 			    " or DRING_NACK when in state %d\n",
1635 			    ldcp->ldc_id, phase);
1636 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1637 			return (1);
1638 		} else
1639 			state &= ~VSW_DRING_INFO_SENT;
1640 		break;
1641 
1642 	case VSW_RDX_INFO_RECV:
1643 		if (phase < VSW_MILESTONE3) {
1644 			DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV"
1645 			    " when in state %d\n", ldcp->ldc_id, phase);
1646 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1647 			return (1);
1648 		}
1649 		break;
1650 
1651 	case VSW_RDX_ACK_RECV:
1652 	case VSW_RDX_NACK_RECV:
1653 		if (!(state & VSW_RDX_INFO_SENT)) {
1654 			DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK or "
1655 			    "RDX_NACK when in state %d\n", ldcp->ldc_id, phase);
1656 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1657 			return (1);
1658 		} else
1659 			state &= ~VSW_RDX_INFO_SENT;
1660 		break;
1661 
1662 	case VSW_MCST_INFO_RECV:
1663 		if (phase < VSW_MILESTONE3) {
1664 			DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV"
1665 			    " when in state %d\n", ldcp->ldc_id, phase);
1666 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1667 			return (1);
1668 		}
1669 		break;
1670 
1671 	default:
1672 		DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)",
1673 		    ldcp->ldc_id, flag);
1674 		return (1);
1675 	}
1676 
1677 	if (dir == INBOUND)
1678 		ldcp->lane_in.lstate = state;
1679 	else
1680 		ldcp->lane_out.lstate = state;
1681 
1682 	D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id);
1683 
1684 	return (0);
1685 }
1686 
1687 void
1688 vsw_next_milestone(vsw_ldc_t *ldcp)
1689 {
1690 	vsw_t		*vswp = ldcp->ldc_vswp;
1691 	vsw_port_t	*portp = ldcp->ldc_port;
1692 	lane_t		*lane_out = &ldcp->lane_out;
1693 	lane_t		*lane_in = &ldcp->lane_in;
1694 
1695 	D1(vswp, "%s (chan %lld): enter (phase %ld)", __func__,
1696 	    ldcp->ldc_id, ldcp->hphase);
1697 
1698 	DUMP_FLAGS(lane_in->lstate);
1699 	DUMP_FLAGS(lane_out->lstate);
1700 
1701 	switch (ldcp->hphase) {
1702 
1703 	case VSW_MILESTONE0:
1704 		/*
1705 		 * If we haven't started to handshake with our peer,
1706 		 * start to do so now.
1707 		 */
1708 		if (lane_out->lstate == 0) {
1709 			D2(vswp, "%s: (chan %lld) starting handshake "
1710 			    "with peer", __func__, ldcp->ldc_id);
1711 			vsw_process_conn_evt(ldcp, VSW_CONN_UP);
1712 		}
1713 
1714 		/*
1715 		 * Only way to pass this milestone is to have successfully
1716 		 * negotiated version info.
1717 		 */
1718 		if ((lane_in->lstate & VSW_VER_ACK_SENT) &&
1719 		    (lane_out->lstate & VSW_VER_ACK_RECV)) {
1720 
1721 			D2(vswp, "%s: (chan %lld) leaving milestone 0",
1722 			    __func__, ldcp->ldc_id);
1723 
1724 			vsw_set_vnet_proto_ops(ldcp);
1725 
1726 			/*
1727 			 * Next milestone is passed when attribute
1728 			 * information has been successfully exchanged.
1729 			 */
1730 			ldcp->hphase = VSW_MILESTONE1;
1731 			vsw_send_attr(ldcp);
1732 
1733 		}
1734 		break;
1735 
1736 	case VSW_MILESTONE1:
1737 		/*
1738 		 * Only way to pass this milestone is to have successfully
1739 		 * negotiated attribute information, in both directions.
1740 		 */
1741 		if (!((lane_in->lstate & VSW_ATTR_ACK_SENT) &&
1742 		    (lane_out->lstate & VSW_ATTR_ACK_RECV))) {
1743 			break;
1744 		}
1745 
1746 		ldcp->hphase = VSW_MILESTONE2;
1747 
1748 		/*
1749 		 * If the peer device has said it wishes to
1750 		 * use descriptor rings then we send it our ring
1751 		 * info, otherwise we just set up a private ring
1752 		 * which we use an internal buffer
1753 		 */
1754 		if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
1755 		    (lane_in->xfer_mode & VIO_DRING_MODE_V1_2)) ||
1756 		    (VSW_VER_LT(ldcp, 1, 2) &&
1757 		    (lane_in->xfer_mode == VIO_DRING_MODE_V1_0))) {
1758 			vsw_send_dring_info(ldcp);
1759 			break;
1760 		}
1761 
1762 		/*
1763 		 * The peer doesn't operate in dring mode; we
1764 		 * can simply fallthru to the RDX phase from
1765 		 * here.
1766 		 */
1767 		/*FALLTHRU*/
1768 
1769 	case VSW_MILESTONE2:
1770 		/*
1771 		 * If peer has indicated in its attribute message that
1772 		 * it wishes to use descriptor rings then the only way
1773 		 * to pass this milestone is for us to have received
1774 		 * valid dring info.
1775 		 *
1776 		 * If peer is not using descriptor rings then just fall
1777 		 * through.
1778 		 */
1779 		if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
1780 		    (lane_in->xfer_mode & VIO_DRING_MODE_V1_2)) ||
1781 		    (VSW_VER_LT(ldcp, 1, 2) &&
1782 		    (lane_in->xfer_mode ==
1783 		    VIO_DRING_MODE_V1_0))) {
1784 			if (!(lane_in->lstate & VSW_DRING_ACK_SENT))
1785 				break;
1786 		}
1787 
1788 		D2(vswp, "%s: (chan %lld) leaving milestone 2",
1789 		    __func__, ldcp->ldc_id);
1790 
1791 		ldcp->hphase = VSW_MILESTONE3;
1792 		vsw_send_rdx(ldcp);
1793 		break;
1794 
1795 	case VSW_MILESTONE3:
1796 		/*
1797 		 * Pass this milestone when all paramaters have been
1798 		 * successfully exchanged and RDX sent in both directions.
1799 		 *
1800 		 * Mark the relevant lane as available to transmit data. In
1801 		 * RxDringData mode, lane_in is associated with transmit and
1802 		 * lane_out is associated with receive. It is the reverse in
1803 		 * TxDring mode.
1804 		 */
1805 		if ((lane_out->lstate & VSW_RDX_ACK_SENT) &&
1806 		    (lane_in->lstate & VSW_RDX_ACK_RECV)) {
1807 
1808 			D2(vswp, "%s: (chan %lld) leaving milestone 3",
1809 			    __func__, ldcp->ldc_id);
1810 			D2(vswp, "%s: ** handshake complete (0x%llx : "
1811 			    "0x%llx) **", __func__, lane_in->lstate,
1812 			    lane_out->lstate);
1813 			if (lane_out->dring_mode == VIO_RX_DRING_DATA) {
1814 				lane_in->lstate |= VSW_LANE_ACTIVE;
1815 			} else {
1816 				lane_out->lstate |= VSW_LANE_ACTIVE;
1817 			}
1818 			ldcp->hphase = VSW_MILESTONE4;
1819 			ldcp->hcnt = 0;
1820 			DISPLAY_STATE();
1821 			/* Start HIO if enabled and capable */
1822 			if ((portp->p_hio_enabled) && (portp->p_hio_capable)) {
1823 				D2(vswp, "%s: start HybridIO setup", __func__);
1824 				vsw_hio_start(vswp, ldcp);
1825 			}
1826 
1827 			if (ldcp->pls_negotiated == B_TRUE) {
1828 				/*
1829 				 * The vnet device has negotiated to get phys
1830 				 * link updates. Now that the handshake with
1831 				 * the vnet device is complete, send an initial
1832 				 * update with the current physical link state.
1833 				 */
1834 				vsw_send_physlink_msg(ldcp,
1835 				    vswp->phys_link_state);
1836 			}
1837 
1838 		} else {
1839 			D2(vswp, "%s: still in milestone 3 (0x%llx : 0x%llx)",
1840 			    __func__, lane_in->lstate,
1841 			    lane_out->lstate);
1842 		}
1843 		break;
1844 
1845 	case VSW_MILESTONE4:
1846 		D2(vswp, "%s: (chan %lld) in milestone 4", __func__,
1847 		    ldcp->ldc_id);
1848 		break;
1849 
1850 	default:
1851 		DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__,
1852 		    ldcp->ldc_id, ldcp->hphase);
1853 	}
1854 
1855 	D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id,
1856 	    ldcp->hphase);
1857 }
1858 
1859 /*
1860  * Check if major version is supported.
1861  *
1862  * Returns 0 if finds supported major number, and if necessary
1863  * adjusts the minor field.
1864  *
1865  * Returns 1 if can't match major number exactly. Sets mjor/minor
1866  * to next lowest support values, or to zero if no other values possible.
1867  */
1868 static int
1869 vsw_supported_version(vio_ver_msg_t *vp)
1870 {
1871 	int	i;
1872 
1873 	D1(NULL, "vsw_supported_version: enter");
1874 
1875 	for (i = 0; i < VSW_NUM_VER; i++) {
1876 		if (vsw_versions[i].ver_major == vp->ver_major) {
1877 			/*
1878 			 * Matching or lower major version found. Update
1879 			 * minor number if necessary.
1880 			 */
1881 			if (vp->ver_minor > vsw_versions[i].ver_minor) {
1882 				D2(NULL, "%s: adjusting minor value from %d "
1883 				    "to %d", __func__, vp->ver_minor,
1884 				    vsw_versions[i].ver_minor);
1885 				vp->ver_minor = vsw_versions[i].ver_minor;
1886 			}
1887 
1888 			return (0);
1889 		}
1890 
1891 		/*
1892 		 * If the message contains a higher major version number, set
1893 		 * the message's major/minor versions to the current values
1894 		 * and return false, so this message will get resent with
1895 		 * these values.
1896 		 */
1897 		if (vsw_versions[i].ver_major < vp->ver_major) {
1898 			D2(NULL, "%s: adjusting major and minor "
1899 			    "values to %d, %d\n",
1900 			    __func__, vsw_versions[i].ver_major,
1901 			    vsw_versions[i].ver_minor);
1902 			vp->ver_major = vsw_versions[i].ver_major;
1903 			vp->ver_minor = vsw_versions[i].ver_minor;
1904 			return (1);
1905 		}
1906 	}
1907 
1908 	/* No match was possible, zero out fields */
1909 	vp->ver_major = 0;
1910 	vp->ver_minor = 0;
1911 
1912 	D1(NULL, "vsw_supported_version: exit");
1913 
1914 	return (1);
1915 }
1916 
1917 /*
1918  * Set vnet-protocol-version dependent functions based on version.
1919  */
1920 static void
1921 vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp)
1922 {
1923 	vsw_t	*vswp = ldcp->ldc_vswp;
1924 	lane_t	*lp = &ldcp->lane_out;
1925 
1926 	/*
1927 	 * Setup the appropriate dring data processing routine and any
1928 	 * associated thread based on the version.
1929 	 *
1930 	 * In versions < 1.6, we support only TxDring mode. In this mode, the
1931 	 * msg worker thread processes all types of VIO msgs (ctrl and data).
1932 	 *
1933 	 * In versions >= 1.6, we also support RxDringData mode. In this mode,
1934 	 * the rcv worker thread processes dring data messages (msgtype:
1935 	 * VIO_TYPE_DATA, subtype: VIO_SUBTYPE_INFO, env: VIO_DRING_DATA). The
1936 	 * rest of the data messages (including acks) and ctrl messages are
1937 	 * handled directly by the callback (intr) thread.
1938 	 *
1939 	 * However, for versions >= 1.6, we could still fallback to TxDring
1940 	 * mode. This could happen if RxDringData mode has been disabled (see
1941 	 * vsw_dring_mode) on this guest or on the peer guest. This info is
1942 	 * determined as part of attr exchange phase of handshake. Hence, we
1943 	 * setup these pointers for v1.6 after attr msg phase completes during
1944 	 * handshake.
1945 	 */
1946 	if (VSW_VER_GTEQ(ldcp, 1, 6)) {
1947 		/*
1948 		 * Set data dring mode for vsw_send_attr(). We setup msg worker
1949 		 * thread in TxDring mode or rcv worker thread in RxDringData
1950 		 * mode when attr phase of handshake completes.
1951 		 */
1952 		if (vsw_dring_mode == VIO_RX_DRING_DATA) {
1953 			lp->dring_mode = (VIO_RX_DRING_DATA | VIO_TX_DRING);
1954 		} else {
1955 			lp->dring_mode = VIO_TX_DRING;
1956 		}
1957 	} else {
1958 		lp->dring_mode = VIO_TX_DRING;
1959 	}
1960 
1961 	/*
1962 	 * Setup the MTU for attribute negotiation based on the version.
1963 	 */
1964 	if (VSW_VER_GTEQ(ldcp, 1, 4)) {
1965 		/*
1966 		 * If the version negotiated with peer is >= 1.4(Jumbo Frame
1967 		 * Support), set the mtu in our attributes to max_frame_size.
1968 		 */
1969 		lp->mtu = vswp->max_frame_size;
1970 	} else if (VSW_VER_EQ(ldcp, 1, 3)) {
1971 		/*
1972 		 * If the version negotiated with peer is == 1.3 (Vlan Tag
1973 		 * Support) set the attr.mtu to ETHERMAX + VLAN_TAGSZ.
1974 		 */
1975 		lp->mtu = ETHERMAX + VLAN_TAGSZ;
1976 	} else {
1977 		vsw_port_t	*portp = ldcp->ldc_port;
1978 		/*
1979 		 * Pre-1.3 peers expect max frame size of ETHERMAX.
1980 		 * We can negotiate that size with those peers provided only
1981 		 * pvid is defined for our peer and there are no vids. Then we
1982 		 * can send/recv only untagged frames of max size ETHERMAX.
1983 		 * Note that pvid of the peer can be different, as vsw has to
1984 		 * serve the vnet in that vlan even if itself is not assigned
1985 		 * to that vlan.
1986 		 */
1987 		if (portp->nvids == 0) {
1988 			lp->mtu = ETHERMAX;
1989 		}
1990 	}
1991 
1992 	/*
1993 	 * Setup version dependent data processing functions.
1994 	 */
1995 	if (VSW_VER_GTEQ(ldcp, 1, 2)) {
1996 		/* Versions >= 1.2 */
1997 
1998 		if (VSW_PRI_ETH_DEFINED(vswp)) {
1999 			/*
2000 			 * enable priority routines and pkt mode only if
2001 			 * at least one pri-eth-type is specified in MD.
2002 			 */
2003 			ldcp->tx = vsw_ldctx_pri;
2004 			ldcp->rx_pktdata = vsw_process_pkt_data;
2005 
2006 			/* set xfer mode for vsw_send_attr() */
2007 			lp->xfer_mode = VIO_PKT_MODE | VIO_DRING_MODE_V1_2;
2008 		} else {
2009 			/* no priority eth types defined in MD */
2010 
2011 			ldcp->tx = vsw_ldctx;
2012 			ldcp->rx_pktdata = vsw_process_pkt_data_nop;
2013 
2014 			/* set xfer mode for vsw_send_attr() */
2015 			lp->xfer_mode = VIO_DRING_MODE_V1_2;
2016 		}
2017 
2018 	} else {
2019 		/* Versions prior to 1.2  */
2020 
2021 		vsw_reset_vnet_proto_ops(ldcp);
2022 	}
2023 }
2024 
2025 /*
2026  * Reset vnet-protocol-version dependent functions to v1.0.
2027  */
2028 static void
2029 vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp)
2030 {
2031 	lane_t	*lp = &ldcp->lane_out;
2032 
2033 	ldcp->tx = vsw_ldctx;
2034 	ldcp->rx_pktdata = vsw_process_pkt_data_nop;
2035 
2036 	/* set xfer mode for vsw_send_attr() */
2037 	lp->xfer_mode = VIO_DRING_MODE_V1_0;
2038 }
2039 
2040 static void
2041 vsw_process_evt_read(vsw_ldc_t *ldcp)
2042 {
2043 	if (ldcp->msg_thread != NULL) {
2044 		/*
2045 		 * TxDring mode; wakeup message worker
2046 		 * thread to process the VIO messages.
2047 		 */
2048 		mutex_exit(&ldcp->ldc_cblock);
2049 		mutex_enter(&ldcp->msg_thr_lock);
2050 		if (!(ldcp->msg_thr_flags & VSW_WTHR_DATARCVD)) {
2051 			ldcp->msg_thr_flags |= VSW_WTHR_DATARCVD;
2052 			cv_signal(&ldcp->msg_thr_cv);
2053 		}
2054 		mutex_exit(&ldcp->msg_thr_lock);
2055 		mutex_enter(&ldcp->ldc_cblock);
2056 	} else {
2057 		/*
2058 		 * We invoke vsw_process_pkt() in the context of the LDC
2059 		 * callback (vsw_ldc_cb()) during handshake, until the dring
2060 		 * mode is negotiated. After the dring mode is negotiated, the
2061 		 * msgs are processed by the msg worker thread (above case) if
2062 		 * the dring mode is TxDring. Otherwise (in RxDringData mode)
2063 		 * we continue to process the msgs directly in the callback
2064 		 * context.
2065 		 */
2066 		vsw_process_pkt(ldcp);
2067 	}
2068 }
2069 
2070 /*
2071  * Main routine for processing messages received over LDC.
2072  */
2073 void
2074 vsw_process_pkt(void *arg)
2075 {
2076 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
2077 	vsw_t 		*vswp = ldcp->ldc_vswp;
2078 	size_t		msglen;
2079 	vio_msg_tag_t	*tagp;
2080 	uint64_t	*ldcmsg;
2081 	int 		rv = 0;
2082 
2083 
2084 	D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
2085 
2086 	ASSERT(MUTEX_HELD(&ldcp->ldc_cblock));
2087 
2088 	ldcmsg = ldcp->ldcmsg;
2089 	/*
2090 	 * If channel is up read messages until channel is empty.
2091 	 */
2092 	do {
2093 		msglen = ldcp->msglen;
2094 		rv = ldc_read(ldcp->ldc_handle, (caddr_t)ldcmsg, &msglen);
2095 
2096 		if (rv != 0) {
2097 			DERR(vswp, "%s :ldc_read err id(%lld) rv(%d) len(%d)\n",
2098 			    __func__, ldcp->ldc_id, rv, msglen);
2099 		}
2100 
2101 		/* channel has been reset */
2102 		if (rv == ECONNRESET) {
2103 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
2104 			break;
2105 		}
2106 
2107 		if (msglen == 0) {
2108 			D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__,
2109 			    ldcp->ldc_id);
2110 			break;
2111 		}
2112 
2113 		D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__,
2114 		    ldcp->ldc_id, msglen);
2115 
2116 		/*
2117 		 * Figure out what sort of packet we have gotten by
2118 		 * examining the msg tag, and then switch it appropriately.
2119 		 */
2120 		tagp = (vio_msg_tag_t *)ldcmsg;
2121 
2122 		switch (tagp->vio_msgtype) {
2123 		case VIO_TYPE_CTRL:
2124 			vsw_dispatch_ctrl_task(ldcp, ldcmsg, tagp, msglen);
2125 			break;
2126 		case VIO_TYPE_DATA:
2127 			vsw_process_data_pkt(ldcp, ldcmsg, tagp, msglen);
2128 			break;
2129 		case VIO_TYPE_ERR:
2130 			vsw_process_err_pkt(ldcp, ldcmsg, tagp);
2131 			break;
2132 		default:
2133 			DERR(vswp, "%s: Unknown tag(%lx) ", __func__,
2134 			    "id(%lx)\n", tagp->vio_msgtype, ldcp->ldc_id);
2135 			break;
2136 		}
2137 	} while (msglen);
2138 
2139 	D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id);
2140 }
2141 
2142 /*
2143  * Dispatch a task to process a VIO control message.
2144  */
2145 static void
2146 vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t *tagp,
2147 	int msglen)
2148 {
2149 	vsw_ctrl_task_t		*ctaskp = NULL;
2150 	vsw_port_t		*port = ldcp->ldc_port;
2151 	vsw_t			*vswp = port->p_vswp;
2152 
2153 	D1(vswp, "%s: enter", __func__);
2154 
2155 	/*
2156 	 * We need to handle RDX ACK messages in-band as once they
2157 	 * are exchanged it is possible that we will get an
2158 	 * immediate (legitimate) data packet.
2159 	 */
2160 	if ((tagp->vio_subtype_env == VIO_RDX) &&
2161 	    (tagp->vio_subtype == VIO_SUBTYPE_ACK)) {
2162 
2163 		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_ACK_RECV))
2164 			return;
2165 
2166 		ldcp->lane_in.lstate |= VSW_RDX_ACK_RECV;
2167 		D2(vswp, "%s (%ld) handling RDX_ACK in place "
2168 		    "(ostate 0x%llx : hphase %d)", __func__,
2169 		    ldcp->ldc_id, ldcp->lane_in.lstate, ldcp->hphase);
2170 		vsw_next_milestone(ldcp);
2171 		return;
2172 	}
2173 
2174 	ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP);
2175 
2176 	if (ctaskp == NULL) {
2177 		DERR(vswp, "%s: unable to alloc space for ctrl msg", __func__);
2178 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2179 		return;
2180 	}
2181 
2182 	ctaskp->ldcp = ldcp;
2183 	bcopy((def_msg_t *)cpkt, &ctaskp->pktp, msglen);
2184 	ctaskp->hss_id = ldcp->hss_id;
2185 
2186 	/*
2187 	 * Dispatch task to processing taskq if port is not in
2188 	 * the process of being detached.
2189 	 */
2190 	mutex_enter(&port->state_lock);
2191 	if (port->state == VSW_PORT_INIT) {
2192 		if ((vswp->taskq_p == NULL) ||
2193 		    (ddi_taskq_dispatch(vswp->taskq_p, vsw_process_ctrl_pkt,
2194 		    ctaskp, DDI_NOSLEEP) != DDI_SUCCESS)) {
2195 			mutex_exit(&port->state_lock);
2196 			DERR(vswp, "%s: unable to dispatch task to taskq",
2197 			    __func__);
2198 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2199 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2200 			return;
2201 		}
2202 	} else {
2203 		kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2204 		DWARN(vswp, "%s: port %d detaching, not dispatching "
2205 		    "task", __func__, port->p_instance);
2206 	}
2207 
2208 	mutex_exit(&port->state_lock);
2209 
2210 	D2(vswp, "%s: dispatched task to taskq for chan %d", __func__,
2211 	    ldcp->ldc_id);
2212 	D1(vswp, "%s: exit", __func__);
2213 }
2214 
2215 /*
2216  * Process a VIO ctrl message. Invoked from taskq.
2217  */
2218 static void
2219 vsw_process_ctrl_pkt(void *arg)
2220 {
2221 	vsw_ctrl_task_t	*ctaskp = (vsw_ctrl_task_t *)arg;
2222 	vsw_ldc_t	*ldcp = ctaskp->ldcp;
2223 	vsw_t 		*vswp = ldcp->ldc_vswp;
2224 	vio_msg_tag_t	tag;
2225 	uint16_t	env;
2226 
2227 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2228 
2229 	bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t));
2230 	env = tag.vio_subtype_env;
2231 
2232 	/* stale pkt check */
2233 	if (ctaskp->hss_id < ldcp->hss_id) {
2234 		DWARN(vswp, "%s: discarding stale packet belonging to earlier"
2235 		    " (%ld) handshake session", __func__, ctaskp->hss_id);
2236 		kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2237 		return;
2238 	}
2239 
2240 	/* session id check */
2241 	if (ldcp->session_status & VSW_PEER_SESSION) {
2242 		if (ldcp->peer_session != tag.vio_sid) {
2243 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
2244 			    __func__, ldcp->ldc_id, tag.vio_sid);
2245 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2246 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2247 			return;
2248 		}
2249 	}
2250 
2251 	/*
2252 	 * Switch on vio_subtype envelope, then let lower routines
2253 	 * decide if its an INFO, ACK or NACK packet.
2254 	 */
2255 	switch (env) {
2256 	case VIO_VER_INFO:
2257 		vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp);
2258 		break;
2259 	case VIO_DRING_REG:
2260 		vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp);
2261 		break;
2262 	case VIO_DRING_UNREG:
2263 		vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp);
2264 		break;
2265 	case VIO_ATTR_INFO:
2266 		vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp);
2267 		break;
2268 	case VNET_MCAST_INFO:
2269 		vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp);
2270 		break;
2271 	case VIO_RDX:
2272 		vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp);
2273 		break;
2274 	case VIO_DDS_INFO:
2275 		vsw_process_dds_msg(vswp, ldcp, &ctaskp->pktp);
2276 		break;
2277 
2278 	case VNET_PHYSLINK_INFO:
2279 		vsw_process_physlink_msg(ldcp, &ctaskp->pktp);
2280 		break;
2281 	default:
2282 		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env);
2283 	}
2284 
2285 	kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2286 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
2287 }
2288 
2289 /*
2290  * Version negotiation. We can end up here either because our peer
2291  * has responded to a handshake message we have sent it, or our peer
2292  * has initiated a handshake with us. If its the former then can only
2293  * be ACK or NACK, if its the later can only be INFO.
2294  *
2295  * If its an ACK we move to the next stage of the handshake, namely
2296  * attribute exchange. If its a NACK we see if we can specify another
2297  * version, if we can't we stop.
2298  *
2299  * If it is an INFO we reset all params associated with communication
2300  * in that direction over this channel (remember connection is
2301  * essentially 2 independent simplex channels).
2302  */
2303 void
2304 vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt)
2305 {
2306 	vio_ver_msg_t	*ver_pkt;
2307 	vsw_t 		*vswp = ldcp->ldc_vswp;
2308 
2309 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2310 
2311 	/*
2312 	 * We know this is a ctrl/version packet so
2313 	 * cast it into the correct structure.
2314 	 */
2315 	ver_pkt = (vio_ver_msg_t *)pkt;
2316 
2317 	switch (ver_pkt->tag.vio_subtype) {
2318 	case VIO_SUBTYPE_INFO:
2319 		D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n");
2320 
2321 		/*
2322 		 * Record the session id, which we will use from now
2323 		 * until we see another VER_INFO msg. Even then the
2324 		 * session id in most cases will be unchanged, execpt
2325 		 * if channel was reset.
2326 		 */
2327 		if ((ldcp->session_status & VSW_PEER_SESSION) &&
2328 		    (ldcp->peer_session != ver_pkt->tag.vio_sid)) {
2329 			DERR(vswp, "%s: updating session id for chan %lld "
2330 			    "from %llx to %llx", __func__, ldcp->ldc_id,
2331 			    ldcp->peer_session, ver_pkt->tag.vio_sid);
2332 		}
2333 
2334 		ldcp->peer_session = ver_pkt->tag.vio_sid;
2335 		ldcp->session_status |= VSW_PEER_SESSION;
2336 
2337 		/* Legal message at this time ? */
2338 		if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV))
2339 			return;
2340 
2341 		/*
2342 		 * First check the device class. Currently only expect
2343 		 * to be talking to a network device. In the future may
2344 		 * also talk to another switch.
2345 		 */
2346 		if (ver_pkt->dev_class != VDEV_NETWORK) {
2347 			DERR(vswp, "%s: illegal device class %d", __func__,
2348 			    ver_pkt->dev_class);
2349 
2350 			ver_pkt->tag.vio_sid = ldcp->local_session;
2351 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2352 
2353 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2354 
2355 			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2356 			    sizeof (vio_ver_msg_t), B_TRUE);
2357 
2358 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
2359 			vsw_next_milestone(ldcp);
2360 			return;
2361 		} else {
2362 			ldcp->dev_class = ver_pkt->dev_class;
2363 		}
2364 
2365 		/*
2366 		 * Now check the version.
2367 		 */
2368 		if (vsw_supported_version(ver_pkt) == 0) {
2369 			/*
2370 			 * Support this major version and possibly
2371 			 * adjusted minor version.
2372 			 */
2373 
2374 			D2(vswp, "%s: accepted ver %d:%d", __func__,
2375 			    ver_pkt->ver_major, ver_pkt->ver_minor);
2376 
2377 			/* Store accepted values */
2378 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
2379 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
2380 
2381 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2382 
2383 			ldcp->lane_in.lstate |= VSW_VER_ACK_SENT;
2384 
2385 			if (vsw_obp_ver_proto_workaround == B_TRUE) {
2386 				/*
2387 				 * Send a version info message
2388 				 * using the accepted version that
2389 				 * we are about to ack. Also note that
2390 				 * we send our ver info before we ack.
2391 				 * Otherwise, as soon as receiving the
2392 				 * ack, obp sends attr info msg, which
2393 				 * breaks vsw_check_flag() invoked
2394 				 * from vsw_process_ctrl_attr_pkt();
2395 				 * as we also need VSW_VER_ACK_RECV to
2396 				 * be set in lane_out.lstate, before
2397 				 * we can receive attr info.
2398 				 */
2399 				vsw_send_ver(ldcp);
2400 			}
2401 		} else {
2402 			/*
2403 			 * NACK back with the next lower major/minor
2404 			 * pairing we support (if don't suuport any more
2405 			 * versions then they will be set to zero.
2406 			 */
2407 
2408 			D2(vswp, "%s: replying with ver %d:%d", __func__,
2409 			    ver_pkt->ver_major, ver_pkt->ver_minor);
2410 
2411 			/* Store updated values */
2412 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
2413 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
2414 
2415 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2416 
2417 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
2418 		}
2419 
2420 		DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2421 		ver_pkt->tag.vio_sid = ldcp->local_session;
2422 		(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2423 		    sizeof (vio_ver_msg_t), B_TRUE);
2424 
2425 		vsw_next_milestone(ldcp);
2426 		break;
2427 
2428 	case VIO_SUBTYPE_ACK:
2429 		D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__);
2430 
2431 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV))
2432 			return;
2433 
2434 		/* Store updated values */
2435 		ldcp->lane_out.ver_major = ver_pkt->ver_major;
2436 		ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
2437 
2438 		ldcp->lane_out.lstate |= VSW_VER_ACK_RECV;
2439 		vsw_next_milestone(ldcp);
2440 
2441 		break;
2442 
2443 	case VIO_SUBTYPE_NACK:
2444 		D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__);
2445 
2446 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV))
2447 			return;
2448 
2449 		/*
2450 		 * If our peer sent us a NACK with the ver fields set to
2451 		 * zero then there is nothing more we can do. Otherwise see
2452 		 * if we support either the version suggested, or a lesser
2453 		 * one.
2454 		 */
2455 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
2456 			DERR(vswp, "%s: peer unable to negotiate any "
2457 			    "further.", __func__);
2458 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
2459 			vsw_next_milestone(ldcp);
2460 			return;
2461 		}
2462 
2463 		/*
2464 		 * Check to see if we support this major version or
2465 		 * a lower one. If we don't then maj/min will be set
2466 		 * to zero.
2467 		 */
2468 		(void) vsw_supported_version(ver_pkt);
2469 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
2470 			/* Nothing more we can do */
2471 			DERR(vswp, "%s: version negotiation failed.\n",
2472 			    __func__);
2473 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
2474 			vsw_next_milestone(ldcp);
2475 		} else {
2476 			/* found a supported major version */
2477 			ldcp->lane_out.ver_major = ver_pkt->ver_major;
2478 			ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
2479 
2480 			D2(vswp, "%s: resending with updated values (%x, %x)",
2481 			    __func__, ver_pkt->ver_major, ver_pkt->ver_minor);
2482 
2483 			ldcp->lane_out.lstate |= VSW_VER_INFO_SENT;
2484 			ver_pkt->tag.vio_sid = ldcp->local_session;
2485 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
2486 
2487 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2488 
2489 			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2490 			    sizeof (vio_ver_msg_t), B_TRUE);
2491 
2492 			vsw_next_milestone(ldcp);
2493 
2494 		}
2495 		break;
2496 
2497 	default:
2498 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
2499 		    ver_pkt->tag.vio_subtype);
2500 	}
2501 
2502 	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
2503 }
2504 
2505 static int
2506 vsw_process_attr_info(vsw_ldc_t *ldcp, vnet_attr_msg_t *msg)
2507 {
2508 	vsw_t			*vswp = ldcp->ldc_vswp;
2509 	vsw_port_t		*port = ldcp->ldc_port;
2510 	struct ether_addr	ea;
2511 	uint64_t		macaddr = 0;
2512 	lane_t			*lane_out = &ldcp->lane_out;
2513 	lane_t			*lane_in = &ldcp->lane_in;
2514 	uint32_t		mtu;
2515 	int			i;
2516 	uint8_t			dring_mode;
2517 
2518 	D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2519 
2520 	if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV)) {
2521 		return (1);
2522 	}
2523 
2524 	if ((msg->xfer_mode != VIO_DESC_MODE) &&
2525 	    (msg->xfer_mode != lane_out->xfer_mode)) {
2526 		D2(NULL, "%s: unknown mode %x\n", __func__, msg->xfer_mode);
2527 		return (1);
2528 	}
2529 
2530 	/* Only support MAC addresses at moment. */
2531 	if ((msg->addr_type != ADDR_TYPE_MAC) || (msg->addr == 0)) {
2532 		D2(NULL, "%s: invalid addr_type %x, or address 0x%llx\n",
2533 		    __func__, msg->addr_type, msg->addr);
2534 		return (1);
2535 	}
2536 
2537 	/*
2538 	 * MAC address supplied by device should match that stored
2539 	 * in the vsw-port OBP node. Need to decide what to do if they
2540 	 * don't match, for the moment just warn but don't fail.
2541 	 */
2542 	vnet_macaddr_ultostr(msg->addr, ea.ether_addr_octet);
2543 	if (ether_cmp(&ea, &port->p_macaddr) != 0) {
2544 		DERR(NULL, "%s: device supplied address "
2545 		    "0x%llx doesn't match node address 0x%llx\n",
2546 		    __func__, msg->addr, port->p_macaddr);
2547 	}
2548 
2549 	/*
2550 	 * Ack freq only makes sense in pkt mode, in shared
2551 	 * mode the ring descriptors say whether or not to
2552 	 * send back an ACK.
2553 	 */
2554 	if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
2555 	    (msg->xfer_mode & VIO_DRING_MODE_V1_2)) ||
2556 	    (VSW_VER_LT(ldcp, 1, 2) &&
2557 	    (msg->xfer_mode == VIO_DRING_MODE_V1_0))) {
2558 		if (msg->ack_freq > 0) {
2559 			D2(NULL, "%s: non zero ack freq in SHM mode\n",
2560 			    __func__);
2561 			return (1);
2562 		}
2563 	}
2564 
2565 	/*
2566 	 * Process dring mode attribute.
2567 	 */
2568 	if (VSW_VER_GTEQ(ldcp, 1, 6)) {
2569 		/*
2570 		 * Versions >= 1.6:
2571 		 * Though we are operating in v1.6 mode, it is possible that
2572 		 * RxDringData mode has been disabled either on this guest or
2573 		 * on the peer guest. If so, we revert to pre v1.6 behavior of
2574 		 * TxDring mode. But this must be agreed upon in both
2575 		 * directions of attr exchange. We first determine the mode
2576 		 * that can be negotiated.
2577 		 */
2578 		if ((msg->options & VIO_RX_DRING_DATA) != 0 &&
2579 		    vsw_dring_mode == VIO_RX_DRING_DATA) {
2580 			/*
2581 			 * The peer is capable of handling RxDringData AND we
2582 			 * are also capable of it; we enable RxDringData mode
2583 			 * on this channel.
2584 			 */
2585 			dring_mode = VIO_RX_DRING_DATA;
2586 		} else if ((msg->options & VIO_TX_DRING) != 0) {
2587 			/*
2588 			 * If the peer is capable of TxDring mode, we
2589 			 * negotiate TxDring mode on this channel.
2590 			 */
2591 			dring_mode = VIO_TX_DRING;
2592 		} else {
2593 			/*
2594 			 * We support only VIO_TX_DRING and VIO_RX_DRING_DATA
2595 			 * modes. We don't support VIO_RX_DRING mode.
2596 			 */
2597 			return (1);
2598 		}
2599 
2600 		/*
2601 		 * If we have received an ack for the attr info that we sent,
2602 		 * then check if the dring mode matches what the peer had ack'd
2603 		 * (saved in lane_out). If they don't match, we fail the
2604 		 * handshake.
2605 		 */
2606 		if (lane_out->lstate & VSW_ATTR_ACK_RECV) {
2607 			if (msg->options != lane_out->dring_mode) {
2608 				/* send NACK */
2609 				return (1);
2610 			}
2611 		} else {
2612 			/*
2613 			 * Save the negotiated dring mode in our attr
2614 			 * parameters, so it gets sent in the attr info from us
2615 			 * to the peer.
2616 			 */
2617 			lane_out->dring_mode = dring_mode;
2618 		}
2619 
2620 		/* save the negotiated dring mode in the msg to be replied */
2621 		msg->options = dring_mode;
2622 	}
2623 
2624 	/*
2625 	 * Process MTU attribute.
2626 	 */
2627 	if (VSW_VER_GTEQ(ldcp, 1, 4)) {
2628 		/*
2629 		 * Versions >= 1.4:
2630 		 * Validate mtu of the peer is at least ETHERMAX. Then, the mtu
2631 		 * is negotiated down to the minimum of our mtu and peer's mtu.
2632 		 */
2633 		if (msg->mtu < ETHERMAX) {
2634 			return (1);
2635 		}
2636 
2637 		mtu = MIN(msg->mtu, vswp->max_frame_size);
2638 
2639 		/*
2640 		 * If we have received an ack for the attr info
2641 		 * that we sent, then check if the mtu computed
2642 		 * above matches the mtu that the peer had ack'd
2643 		 * (saved in local hparams). If they don't
2644 		 * match, we fail the handshake.
2645 		 */
2646 		if (lane_out->lstate & VSW_ATTR_ACK_RECV) {
2647 			if (mtu != lane_out->mtu) {
2648 				/* send NACK */
2649 				return (1);
2650 			}
2651 		} else {
2652 			/*
2653 			 * Save the mtu computed above in our
2654 			 * attr parameters, so it gets sent in
2655 			 * the attr info from us to the peer.
2656 			 */
2657 			lane_out->mtu = mtu;
2658 		}
2659 
2660 		/* save the MIN mtu in the msg to be replied */
2661 		msg->mtu = mtu;
2662 	} else {
2663 		/* Versions < 1.4, mtu must match */
2664 		if (msg->mtu != lane_out->mtu) {
2665 			D2(NULL, "%s: invalid MTU (0x%llx)\n",
2666 			    __func__, msg->mtu);
2667 			return (1);
2668 		}
2669 	}
2670 
2671 	/*
2672 	 * Otherwise store attributes for this lane and update
2673 	 * lane state.
2674 	 */
2675 	lane_in->mtu = msg->mtu;
2676 	lane_in->addr = msg->addr;
2677 	lane_in->addr_type = msg->addr_type;
2678 	lane_in->xfer_mode = msg->xfer_mode;
2679 	lane_in->ack_freq = msg->ack_freq;
2680 	lane_in->physlink_update = msg->physlink_update;
2681 	lane_in->dring_mode = msg->options;
2682 
2683 	/*
2684 	 * Check if the client has requested physlink state updates.
2685 	 * If there is a physical device bound to this vswitch (L2
2686 	 * mode), set the ack bits to indicate it is supported.
2687 	 * Otherwise, set the nack bits.
2688 	 */
2689 	if (VSW_VER_GTEQ(ldcp, 1, 5)) {	/* Protocol ver >= 1.5 */
2690 
2691 		/* Does the vnet need phys link state updates ? */
2692 		if ((lane_in->physlink_update &
2693 		    PHYSLINK_UPDATE_STATE_MASK) ==
2694 		    PHYSLINK_UPDATE_STATE) {
2695 
2696 			if (vswp->smode & VSW_LAYER2) {
2697 				/* is a net-dev assigned to us ? */
2698 				msg->physlink_update =
2699 				    PHYSLINK_UPDATE_STATE_ACK;
2700 				ldcp->pls_negotiated = B_TRUE;
2701 			} else {
2702 				/* not in L2 mode */
2703 				msg->physlink_update =
2704 				    PHYSLINK_UPDATE_STATE_NACK;
2705 				ldcp->pls_negotiated = B_FALSE;
2706 			}
2707 
2708 		} else {
2709 			msg->physlink_update =
2710 			    PHYSLINK_UPDATE_NONE;
2711 			ldcp->pls_negotiated = B_FALSE;
2712 		}
2713 
2714 	} else {
2715 		/*
2716 		 * physlink_update bits are ignored
2717 		 * if set by clients < v1.5 protocol.
2718 		 */
2719 		msg->physlink_update = PHYSLINK_UPDATE_NONE;
2720 		ldcp->pls_negotiated = B_FALSE;
2721 	}
2722 
2723 	macaddr = lane_in->addr;
2724 	for (i = ETHERADDRL - 1; i >= 0; i--) {
2725 		port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF;
2726 		macaddr >>= 8;
2727 	}
2728 
2729 	/*
2730 	 * Setup device specific xmit routines. Note this could be changed
2731 	 * further in vsw_send_dring_info() for versions >= 1.6 if operating in
2732 	 * RxDringData mode.
2733 	 */
2734 	mutex_enter(&port->tx_lock);
2735 
2736 	if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
2737 	    (lane_in->xfer_mode & VIO_DRING_MODE_V1_2)) ||
2738 	    (VSW_VER_LT(ldcp, 1, 2) &&
2739 	    (lane_in->xfer_mode == VIO_DRING_MODE_V1_0))) {
2740 		D2(vswp, "%s: mode = VIO_DRING_MODE", __func__);
2741 		port->transmit = vsw_dringsend;
2742 	} else if (lane_in->xfer_mode == VIO_DESC_MODE) {
2743 		D2(vswp, "%s: mode = VIO_DESC_MODE", __func__);
2744 		vsw_create_privring(ldcp);
2745 		port->transmit = vsw_descrsend;
2746 		lane_out->xfer_mode = VIO_DESC_MODE;
2747 	}
2748 
2749 	/*
2750 	 * HybridIO is supported only vnet, not by OBP.
2751 	 * So, set hio_capable to true only when in DRING mode.
2752 	 */
2753 	if (VSW_VER_GTEQ(ldcp, 1, 3) &&
2754 	    (lane_in->xfer_mode != VIO_DESC_MODE)) {
2755 		(void) atomic_swap_32(&port->p_hio_capable, B_TRUE);
2756 	} else {
2757 		(void) atomic_swap_32(&port->p_hio_capable, B_FALSE);
2758 	}
2759 
2760 	mutex_exit(&port->tx_lock);
2761 
2762 	return (0);
2763 }
2764 
2765 static int
2766 vsw_process_attr_ack(vsw_ldc_t *ldcp, vnet_attr_msg_t *msg)
2767 {
2768 	vsw_t	*vswp = ldcp->ldc_vswp;
2769 	lane_t	*lane_out = &ldcp->lane_out;
2770 	lane_t	*lane_in = &ldcp->lane_in;
2771 
2772 	D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
2773 
2774 	if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV)) {
2775 		return (1);
2776 	}
2777 
2778 	/*
2779 	 * Process dring mode attribute.
2780 	 */
2781 	if (VSW_VER_GTEQ(ldcp, 1, 6)) {
2782 		/*
2783 		 * Versions >= 1.6:
2784 		 * The ack msg sent by the peer contains the negotiated dring
2785 		 * mode between our capability (that we had sent in our attr
2786 		 * info) and the peer's capability.
2787 		 */
2788 		if (lane_in->lstate & VSW_ATTR_ACK_SENT) {
2789 			/*
2790 			 * If we have sent an ack for the attr info msg from
2791 			 * the peer, check if the dring mode that was
2792 			 * negotiated then (saved in lane_out) matches the
2793 			 * mode that the peer has ack'd. If they don't match,
2794 			 * we fail the handshake.
2795 			 */
2796 			if (lane_out->dring_mode != msg->options) {
2797 				return (1);
2798 			}
2799 		} else {
2800 			if ((msg->options & lane_out->dring_mode) == 0) {
2801 				/*
2802 				 * Peer ack'd with a mode that we don't
2803 				 * support; we fail the handshake.
2804 				 */
2805 				return (1);
2806 			}
2807 			if ((msg->options & (VIO_TX_DRING|VIO_RX_DRING_DATA))
2808 			    == (VIO_TX_DRING|VIO_RX_DRING_DATA)) {
2809 				/*
2810 				 * Peer must ack with only one negotiated mode.
2811 				 * Otherwise fail handshake.
2812 				 */
2813 				return (1);
2814 			}
2815 
2816 			/*
2817 			 * Save the negotiated mode, so we can validate it when
2818 			 * we receive attr info from the peer.
2819 			 */
2820 			lane_out->dring_mode = msg->options;
2821 		}
2822 	}
2823 
2824 	/*
2825 	 * Process MTU attribute.
2826 	 */
2827 	if (VSW_VER_GTEQ(ldcp, 1, 4)) {
2828 		/*
2829 		 * Versions >= 1.4:
2830 		 * The ack msg sent by the peer contains the minimum of
2831 		 * our mtu (that we had sent in our attr info) and the
2832 		 * peer's mtu.
2833 		 *
2834 		 * If we have sent an ack for the attr info msg from
2835 		 * the peer, check if the mtu that was computed then
2836 		 * (saved in lane_out params) matches the mtu that the
2837 		 * peer has ack'd. If they don't match, we fail the
2838 		 * handshake.
2839 		 */
2840 		if (lane_in->lstate & VSW_ATTR_ACK_SENT) {
2841 			if (lane_out->mtu != msg->mtu) {
2842 				return (1);
2843 			}
2844 		} else {
2845 			/*
2846 			 * If the mtu ack'd by the peer is > our mtu
2847 			 * fail handshake. Otherwise, save the mtu, so
2848 			 * we can validate it when we receive attr info
2849 			 * from our peer.
2850 			 */
2851 			if (msg->mtu <= lane_out->mtu) {
2852 				lane_out->mtu = msg->mtu;
2853 			} else {
2854 				return (1);
2855 			}
2856 		}
2857 	}
2858 
2859 	return (0);
2860 }
2861 
2862 /*
2863  * Process an attribute packet. We can end up here either because our peer
2864  * has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or our
2865  * peer has sent us an attribute INFO message
2866  *
2867  * If its an ACK we then move to the next stage of the handshake which
2868  * is to send our descriptor ring info to our peer. If its a NACK then
2869  * there is nothing more we can (currently) do.
2870  *
2871  * If we get a valid/acceptable INFO packet (and we have already negotiated
2872  * a version) we ACK back and set channel state to ATTR_RECV, otherwise we
2873  * NACK back and reset channel state to INACTIV.
2874  *
2875  * FUTURE: in time we will probably negotiate over attributes, but for
2876  * the moment unacceptable attributes are regarded as a fatal error.
2877  *
2878  */
2879 void
2880 vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt)
2881 {
2882 	vnet_attr_msg_t	*attr_pkt;
2883 	vsw_t		*vswp = ldcp->ldc_vswp;
2884 	lane_t		*lane_out = &ldcp->lane_out;
2885 	lane_t		*lane_in = &ldcp->lane_in;
2886 	int		rv;
2887 
2888 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
2889 
2890 	/*
2891 	 * We know this is a ctrl/attr packet so
2892 	 * cast it into the correct structure.
2893 	 */
2894 	attr_pkt = (vnet_attr_msg_t *)pkt;
2895 
2896 	switch (attr_pkt->tag.vio_subtype) {
2897 	case VIO_SUBTYPE_INFO:
2898 
2899 		rv = vsw_process_attr_info(ldcp, attr_pkt);
2900 		if (rv != 0) {
2901 			vsw_free_lane_resources(ldcp, INBOUND);
2902 			attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2903 			ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT;
2904 		} else {
2905 			attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2906 			lane_in->lstate |= VSW_ATTR_ACK_SENT;
2907 		}
2908 		attr_pkt->tag.vio_sid = ldcp->local_session;
2909 		DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
2910 		(void) vsw_send_msg(ldcp, (void *)attr_pkt,
2911 		    sizeof (vnet_attr_msg_t), B_TRUE);
2912 		vsw_next_milestone(ldcp);
2913 		break;
2914 
2915 	case VIO_SUBTYPE_ACK:
2916 
2917 		rv = vsw_process_attr_ack(ldcp, attr_pkt);
2918 		if (rv != 0) {
2919 			return;
2920 		}
2921 		lane_out->lstate |= VSW_ATTR_ACK_RECV;
2922 		vsw_next_milestone(ldcp);
2923 		break;
2924 
2925 	case VIO_SUBTYPE_NACK:
2926 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
2927 
2928 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV))
2929 			return;
2930 
2931 		lane_out->lstate |= VSW_ATTR_NACK_RECV;
2932 		vsw_next_milestone(ldcp);
2933 		break;
2934 
2935 	default:
2936 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
2937 		    attr_pkt->tag.vio_subtype);
2938 	}
2939 
2940 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
2941 }
2942 
2943 static int
2944 vsw_process_dring_reg_info(vsw_ldc_t *ldcp, vio_msg_tag_t *tagp)
2945 {
2946 	int		rv;
2947 	vsw_t		*vswp = ldcp->ldc_vswp;
2948 	lane_t		*lp = &ldcp->lane_out;
2949 	dring_info_t	*dp = NULL;
2950 
2951 	D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2952 
2953 	rv = vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV);
2954 	if (rv != 0) {
2955 		return (1);
2956 	}
2957 
2958 	if (VSW_VER_GTEQ(ldcp, 1, 6) &&
2959 	    (lp->dring_mode != ((vio_dring_reg_msg_t *)tagp)->options)) {
2960 		/*
2961 		 * The earlier version of Solaris vnet driver doesn't set the
2962 		 * option (VIO_TX_DRING in its case) correctly in its dring reg
2963 		 * message. We workaround that here by doing the check only
2964 		 * for versions >= v1.6.
2965 		 */
2966 		DWARN(vswp, "%s(%lld): Rcvd dring reg option (%d), "
2967 		    "negotiated mode (%d)\n", __func__, ldcp->ldc_id,
2968 		    ((vio_dring_reg_msg_t *)tagp)->options, lp->dring_mode);
2969 		return (1);
2970 	}
2971 
2972 	/*
2973 	 * Map dring exported by the peer.
2974 	 */
2975 	dp = vsw_map_dring(ldcp, (void *)tagp);
2976 	if (dp == NULL) {
2977 		return (1);
2978 	}
2979 
2980 	/*
2981 	 * Map data buffers exported by the peer if we are in RxDringData mode.
2982 	 */
2983 	if (lp->dring_mode == VIO_RX_DRING_DATA) {
2984 		rv = vsw_map_data(ldcp, dp, (void *)tagp);
2985 		if (rv != 0) {
2986 			vsw_unmap_dring(ldcp);
2987 			return (1);
2988 		}
2989 	}
2990 
2991 	return (0);
2992 }
2993 
2994 static int
2995 vsw_process_dring_reg_ack(vsw_ldc_t *ldcp, vio_msg_tag_t *tagp)
2996 {
2997 	vsw_t		*vswp = ldcp->ldc_vswp;
2998 	dring_info_t	*dp;
2999 
3000 	D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3001 
3002 	if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV)) {
3003 		return (1);
3004 	}
3005 
3006 	dp = ldcp->lane_out.dringp;
3007 
3008 	/* save dring_ident acked by peer */
3009 	dp->ident = ((vio_dring_reg_msg_t *)tagp)->dring_ident;
3010 
3011 	return (0);
3012 }
3013 
3014 /*
3015  * Process a dring info packet. We can end up here either because our peer
3016  * has ACK/NACK'ed back to an earlier DRING msg we had sent it, or our
3017  * peer has sent us a dring INFO message.
3018  *
3019  * If we get a valid/acceptable INFO packet (and we have already negotiated
3020  * a version) we ACK back and update the lane state, otherwise we NACK back.
3021  *
3022  * FUTURE: nothing to stop client from sending us info on multiple dring's
3023  * but for the moment we will just use the first one we are given.
3024  *
3025  */
3026 void
3027 vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt)
3028 {
3029 	int		rv;
3030 	int		msgsize;
3031 	dring_info_t	*dp;
3032 	vio_msg_tag_t	*tagp = (vio_msg_tag_t *)pkt;
3033 	vsw_t		*vswp = ldcp->ldc_vswp;
3034 	lane_t		*lane_out = &ldcp->lane_out;
3035 	lane_t		*lane_in = &ldcp->lane_in;
3036 
3037 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
3038 
3039 	switch (tagp->vio_subtype) {
3040 	case VIO_SUBTYPE_INFO:
3041 		rv = vsw_process_dring_reg_info(ldcp, tagp);
3042 		if (rv != 0) {
3043 			vsw_free_lane_resources(ldcp, INBOUND);
3044 			tagp->vio_subtype = VIO_SUBTYPE_NACK;
3045 			lane_in->lstate |= VSW_DRING_NACK_SENT;
3046 		} else {
3047 			tagp->vio_subtype = VIO_SUBTYPE_ACK;
3048 			lane_in->lstate |= VSW_DRING_ACK_SENT;
3049 		}
3050 		tagp->vio_sid = ldcp->local_session;
3051 		DUMP_TAG_PTR(tagp);
3052 		if (lane_out->dring_mode == VIO_RX_DRING_DATA) {
3053 			dp = lane_in->dringp;
3054 			msgsize =
3055 			    VNET_DRING_REG_EXT_MSG_SIZE(dp->data_ncookies);
3056 		} else {
3057 			msgsize = sizeof (vio_dring_reg_msg_t);
3058 		}
3059 		(void) vsw_send_msg(ldcp, (void *)tagp, msgsize, B_TRUE);
3060 		vsw_next_milestone(ldcp);
3061 		break;
3062 
3063 	case VIO_SUBTYPE_ACK:
3064 		rv = vsw_process_dring_reg_ack(ldcp, tagp);
3065 		if (rv != 0) {
3066 			return;
3067 		}
3068 		lane_out->lstate |= VSW_DRING_ACK_RECV;
3069 		vsw_next_milestone(ldcp);
3070 		break;
3071 
3072 	case VIO_SUBTYPE_NACK:
3073 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3074 
3075 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV))
3076 			return;
3077 
3078 		lane_out->lstate |= VSW_DRING_NACK_RECV;
3079 		vsw_next_milestone(ldcp);
3080 		break;
3081 
3082 	default:
3083 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3084 		    tagp->vio_subtype);
3085 	}
3086 
3087 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
3088 }
3089 
3090 /*
3091  * Process a request from peer to unregister a dring.
3092  *
3093  * For the moment we just restart the handshake if our
3094  * peer endpoint attempts to unregister a dring.
3095  */
3096 void
3097 vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt)
3098 {
3099 	vsw_t			*vswp = ldcp->ldc_vswp;
3100 	vio_dring_unreg_msg_t	*dring_pkt;
3101 
3102 	/*
3103 	 * We know this is a ctrl/dring packet so
3104 	 * cast it into the correct structure.
3105 	 */
3106 	dring_pkt = (vio_dring_unreg_msg_t *)pkt;
3107 
3108 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3109 
3110 	switch (dring_pkt->tag.vio_subtype) {
3111 	case VIO_SUBTYPE_INFO:
3112 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3113 
3114 		DWARN(vswp, "%s: restarting handshake..", __func__);
3115 		break;
3116 
3117 	case VIO_SUBTYPE_ACK:
3118 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3119 
3120 		DWARN(vswp, "%s: restarting handshake..", __func__);
3121 		break;
3122 
3123 	case VIO_SUBTYPE_NACK:
3124 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3125 
3126 		DWARN(vswp, "%s: restarting handshake..", __func__);
3127 		break;
3128 
3129 	default:
3130 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3131 		    dring_pkt->tag.vio_subtype);
3132 	}
3133 
3134 	vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3135 
3136 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3137 }
3138 
/*
 * Turn the multicast message 'pkt' into a NACK carrying our session id and
 * send it back over 'ldcp'.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement (safe inside an unbraced if/else), and the arguments are
 * parenthesized so that expressions can be passed safely.
 */
#define	SND_MCST_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		(void) vsw_send_msg((ldcp), (void *)(pkt), \
		    sizeof (vnet_mcast_msg_t), B_TRUE); \
	_NOTE(CONSTCOND) } while (0)
3144 
3145 /*
3146  * Process a multicast request from a vnet.
3147  *
3148  * Vnet's specify a multicast address that they are interested in. This
3149  * address is used as a key into the hash table which forms the multicast
3150  * forwarding database (mFDB).
3151  *
3152  * The table keys are the multicast addresses, while the table entries
3153  * are pointers to lists of ports which wish to receive packets for the
3154  * specified multicast address.
3155  *
3156  * When a multicast packet is being switched we use the address as a key
3157  * into the hash table, and then walk the appropriate port list forwarding
3158  * the pkt to each port in turn.
3159  *
3160  * If a vnet is no longer interested in a particular multicast grouping
3161  * we simply find the correct location in the hash table and then delete
3162  * the relevant port from the port list.
3163  *
3164  * To deal with the case whereby a port is being deleted without first
3165  * removing itself from the lists in the hash table, we maintain a list
3166  * of multicast addresses the port has registered an interest in, within
3167  * the port structure itself. We then simply walk that list of addresses
3168  * using them as keys into the hash table and remove the port from the
3169  * appropriate lists.
3170  */
3171 static void
3172 vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt)
3173 {
3174 	vnet_mcast_msg_t	*mcst_pkt;
3175 	vsw_port_t		*port = ldcp->ldc_port;
3176 	vsw_t			*vswp = ldcp->ldc_vswp;
3177 	int			i;
3178 
3179 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3180 
3181 	/*
3182 	 * We know this is a ctrl/mcast packet so
3183 	 * cast it into the correct structure.
3184 	 */
3185 	mcst_pkt = (vnet_mcast_msg_t *)pkt;
3186 
3187 	switch (mcst_pkt->tag.vio_subtype) {
3188 	case VIO_SUBTYPE_INFO:
3189 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3190 
3191 		/*
3192 		 * Check if in correct state to receive a multicast
3193 		 * message (i.e. handshake complete). If not reset
3194 		 * the handshake.
3195 		 */
3196 		if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV))
3197 			return;
3198 
3199 		/*
3200 		 * Before attempting to add or remove address check
3201 		 * that they are valid multicast addresses.
3202 		 * If not, then NACK back.
3203 		 */
3204 		for (i = 0; i < mcst_pkt->count; i++) {
3205 			if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) {
3206 				DERR(vswp, "%s: invalid multicast address",
3207 				    __func__);
3208 				SND_MCST_NACK(ldcp, mcst_pkt);
3209 				return;
3210 			}
3211 		}
3212 
3213 		/*
3214 		 * Now add/remove the addresses. If this fails we
3215 		 * NACK back.
3216 		 */
3217 		if (vsw_add_rem_mcst(mcst_pkt, port) != 0) {
3218 			SND_MCST_NACK(ldcp, mcst_pkt);
3219 			return;
3220 		}
3221 
3222 		mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3223 		mcst_pkt->tag.vio_sid = ldcp->local_session;
3224 
3225 		DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt);
3226 
3227 		(void) vsw_send_msg(ldcp, (void *)mcst_pkt,
3228 		    sizeof (vnet_mcast_msg_t), B_TRUE);
3229 		break;
3230 
3231 	case VIO_SUBTYPE_ACK:
3232 		DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3233 
3234 		/*
3235 		 * We shouldn't ever get a multicast ACK message as
3236 		 * at the moment we never request multicast addresses
3237 		 * to be set on some other device. This may change in
3238 		 * the future if we have cascading switches.
3239 		 */
3240 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV))
3241 			return;
3242 
3243 				/* Do nothing */
3244 		break;
3245 
3246 	case VIO_SUBTYPE_NACK:
3247 		DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3248 
3249 		/*
3250 		 * We shouldn't get a multicast NACK packet for the
3251 		 * same reasons as we shouldn't get a ACK packet.
3252 		 */
3253 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV))
3254 			return;
3255 
3256 				/* Do nothing */
3257 		break;
3258 
3259 	default:
3260 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
3261 		    mcst_pkt->tag.vio_subtype);
3262 	}
3263 
3264 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3265 }
3266 
3267 static void
3268 vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt)
3269 {
3270 	vio_rdx_msg_t	*rdx_pkt;
3271 	vsw_t		*vswp = ldcp->ldc_vswp;
3272 
3273 	/*
3274 	 * We know this is a ctrl/rdx packet so
3275 	 * cast it into the correct structure.
3276 	 */
3277 	rdx_pkt = (vio_rdx_msg_t *)pkt;
3278 
3279 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
3280 
3281 	switch (rdx_pkt->tag.vio_subtype) {
3282 	case VIO_SUBTYPE_INFO:
3283 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3284 
3285 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_INFO_RECV))
3286 			return;
3287 
3288 		rdx_pkt->tag.vio_sid = ldcp->local_session;
3289 		rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3290 
3291 		DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt);
3292 
3293 		ldcp->lane_out.lstate |= VSW_RDX_ACK_SENT;
3294 
3295 		(void) vsw_send_msg(ldcp, (void *)rdx_pkt,
3296 		    sizeof (vio_rdx_msg_t), B_TRUE);
3297 
3298 		vsw_next_milestone(ldcp);
3299 		break;
3300 
3301 	case VIO_SUBTYPE_ACK:
3302 		/*
3303 		 * Should be handled in-band by callback handler.
3304 		 */
3305 		DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__);
3306 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3307 		break;
3308 
3309 	case VIO_SUBTYPE_NACK:
3310 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3311 
3312 		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_NACK_RECV))
3313 			return;
3314 
3315 		ldcp->lane_in.lstate |= VSW_RDX_NACK_RECV;
3316 		vsw_next_milestone(ldcp);
3317 		break;
3318 
3319 	default:
3320 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3321 		    rdx_pkt->tag.vio_subtype);
3322 	}
3323 
3324 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3325 }
3326 
3327 static void
3328 vsw_process_physlink_msg(vsw_ldc_t *ldcp, void *pkt)
3329 {
3330 	vnet_physlink_msg_t	*msgp;
3331 	vsw_t			*vswp = ldcp->ldc_vswp;
3332 
3333 	msgp = (vnet_physlink_msg_t *)pkt;
3334 
3335 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
3336 
3337 	switch (msgp->tag.vio_subtype) {
3338 	case VIO_SUBTYPE_INFO:
3339 
3340 		/* vsw shouldn't recv physlink info */
3341 		DWARN(vswp, "%s: Unexpected VIO_SUBTYPE_INFO", __func__);
3342 		break;
3343 
3344 	case VIO_SUBTYPE_ACK:
3345 
3346 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3347 		break;
3348 
3349 	case VIO_SUBTYPE_NACK:
3350 
3351 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3352 		break;
3353 
3354 	default:
3355 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3356 		    msgp->tag.vio_subtype);
3357 	}
3358 
3359 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3360 }
3361 
3362 static void
3363 vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t *tagp,
3364 	uint32_t msglen)
3365 {
3366 	uint16_t	env = tagp->vio_subtype_env;
3367 	vsw_t		*vswp = ldcp->ldc_vswp;
3368 	lane_t		*lp = &ldcp->lane_out;
3369 	uint8_t		dring_mode = lp->dring_mode;
3370 
3371 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3372 
3373 	/* session id check */
3374 	if (ldcp->session_status & VSW_PEER_SESSION) {
3375 		if (ldcp->peer_session != tagp->vio_sid) {
3376 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
3377 			    __func__, ldcp->ldc_id, tagp->vio_sid);
3378 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3379 			return;
3380 		}
3381 	}
3382 
3383 	/*
3384 	 * It is an error for us to be getting data packets
3385 	 * before the handshake has completed.
3386 	 */
3387 	if (ldcp->hphase != VSW_MILESTONE4) {
3388 		DERR(vswp, "%s: got data packet before handshake complete "
3389 		    "hphase %d (%x: %x)", __func__, ldcp->hphase,
3390 		    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
3391 		DUMP_FLAGS(ldcp->lane_in.lstate);
3392 		DUMP_FLAGS(ldcp->lane_out.lstate);
3393 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3394 		return;
3395 	}
3396 	if (dring_mode == VIO_TX_DRING) {
3397 		/*
3398 		 * To reduce the locking contention, release the ldc_cblock
3399 		 * here and re-acquire it once we are done receiving packets.
3400 		 * We do this only in TxDring mode to allow further callbaks to
3401 		 * continue while the msg worker thread processes the messages.
3402 		 * In RxDringData mode, we process the messages in the callback
3403 		 * itself and wake up rcv worker thread to process only data
3404 		 * info messages.
3405 		 */
3406 		mutex_exit(&ldcp->ldc_cblock);
3407 		mutex_enter(&ldcp->ldc_rxlock);
3408 	}
3409 
3410 	/*
3411 	 * Switch on vio_subtype envelope, then let lower routines
3412 	 * decide if its an INFO, ACK or NACK packet.
3413 	 */
3414 	if (env == VIO_DRING_DATA) {
3415 		ldcp->rx_dringdata(ldcp, dpkt);
3416 	} else if (env == VIO_PKT_DATA) {
3417 		ldcp->rx_pktdata(ldcp, dpkt, msglen);
3418 	} else if (env == VIO_DESC_DATA) {
3419 		vsw_process_data_ibnd_pkt(ldcp, dpkt);
3420 	} else {
3421 		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n",
3422 		    __func__, env);
3423 	}
3424 
3425 	if (dring_mode == VIO_TX_DRING) {
3426 		mutex_exit(&ldcp->ldc_rxlock);
3427 		mutex_enter(&ldcp->ldc_cblock);
3428 	}
3429 
3430 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3431 }
3432 
/*
 * Dummy pkt data handler function for vnet protocol version 1.0.
 *
 * It has the same signature as vsw_process_pkt_data() but deliberately
 * ignores all of its arguments and does nothing.
 */
static void
vsw_process_pkt_data_nop(void *arg1, void *arg2, uint32_t msglen)
{
	_NOTE(ARGUNUSED(arg1, arg2, msglen))
}
3441 
3442 /*
3443  * This function handles raw pkt data messages received over the channel.
3444  * Currently, only priority-eth-type frames are received through this mechanism.
3445  * In this case, the frame(data) is present within the message itself which
3446  * is copied into an mblk before switching it.
3447  */
3448 static void
3449 vsw_process_pkt_data(void *arg1, void *arg2, uint32_t msglen)
3450 {
3451 	vsw_ldc_t		*ldcp = (vsw_ldc_t *)arg1;
3452 	vio_raw_data_msg_t	*dpkt = (vio_raw_data_msg_t *)arg2;
3453 	uint32_t		size;
3454 	mblk_t			*mp;
3455 	vio_mblk_t		*vmp;
3456 	vsw_t			*vswp = ldcp->ldc_vswp;
3457 	vgen_stats_t		*statsp = &ldcp->ldc_stats;
3458 	lane_t			*lp = &ldcp->lane_out;
3459 
3460 	size = msglen - VIO_PKT_DATA_HDRSIZE;
3461 	if (size < ETHERMIN || size > lp->mtu) {
3462 		(void) atomic_inc_32(&statsp->rx_pri_fail);
3463 		DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__,
3464 		    ldcp->ldc_id, size);
3465 		return;
3466 	}
3467 
3468 	vmp = vio_multipool_allocb(&ldcp->vmp, size + VLAN_TAGSZ);
3469 	if (vmp == NULL) {
3470 		mp = allocb(size + VLAN_TAGSZ, BPRI_MED);
3471 		if (mp == NULL) {
3472 			(void) atomic_inc_32(&statsp->rx_pri_fail);
3473 			DWARN(vswp, "%s(%lld) allocb failure, "
3474 			    "unable to process priority frame\n", __func__,
3475 			    ldcp->ldc_id);
3476 			return;
3477 		}
3478 	} else {
3479 		mp = vmp->mp;
3480 	}
3481 
3482 	/* skip over the extra space for vlan tag */
3483 	mp->b_rptr += VLAN_TAGSZ;
3484 
3485 	/* copy the frame from the payload of raw data msg into the mblk */
3486 	bcopy(dpkt->data, mp->b_rptr, size);
3487 	mp->b_wptr = mp->b_rptr + size;
3488 
3489 	if (vmp != NULL) {
3490 		vmp->state = VIO_MBLK_HAS_DATA;
3491 	}
3492 
3493 	/* update stats */
3494 	(void) atomic_inc_64(&statsp->rx_pri_packets);
3495 	(void) atomic_add_64(&statsp->rx_pri_bytes, size);
3496 
3497 	/*
3498 	 * VLAN_TAGSZ of extra space has been pre-alloc'd if tag is needed.
3499 	 */
3500 	(void) vsw_vlan_frame_pretag(ldcp->ldc_port, VSW_VNETPORT, mp);
3501 
3502 	/* switch the frame to destination */
3503 	vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT, ldcp->ldc_port, NULL);
3504 }
3505 
3506 /*
3507  * Process an in-band descriptor message (most likely from
3508  * OBP).
3509  */
3510 static void
3511 vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt)
3512 {
3513 	vnet_ibnd_desc_t	*ibnd_desc;
3514 	dring_info_t		*dp = NULL;
3515 	vsw_private_desc_t	*priv_addr = NULL;
3516 	vsw_t			*vswp = ldcp->ldc_vswp;
3517 	mblk_t			*mp = NULL;
3518 	size_t			nbytes = 0;
3519 	size_t			off = 0;
3520 	uint64_t		idx = 0;
3521 	uint32_t		num = 1, len, datalen = 0;
3522 	uint64_t		ncookies = 0;
3523 	int			i, rv;
3524 	int			j = 0;
3525 
3526 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3527 
3528 	ibnd_desc = (vnet_ibnd_desc_t *)pkt;
3529 
3530 	switch (ibnd_desc->hdr.tag.vio_subtype) {
3531 	case VIO_SUBTYPE_INFO:
3532 		D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3533 
3534 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
3535 			return;
3536 
3537 		/*
3538 		 * Data is padded to align on a 8 byte boundary,
3539 		 * nbytes is actual data length, i.e. minus that
3540 		 * padding.
3541 		 */
3542 		datalen = ibnd_desc->nbytes;
3543 
3544 		D2(vswp, "%s(%lld): processing inband desc : "
3545 		    ": datalen 0x%lx", __func__, ldcp->ldc_id, datalen);
3546 
3547 		ncookies = ibnd_desc->ncookies;
3548 
3549 		/*
3550 		 * allocb(9F) returns an aligned data block. We
3551 		 * need to ensure that we ask ldc for an aligned
3552 		 * number of bytes also.
3553 		 */
3554 		nbytes = datalen;
3555 		if (nbytes & 0x7) {
3556 			off = 8 - (nbytes & 0x7);
3557 			nbytes += off;
3558 		}
3559 
3560 		/* alloc extra space for VLAN_TAG */
3561 		mp = allocb(datalen + 8, BPRI_MED);
3562 		if (mp == NULL) {
3563 			DERR(vswp, "%s(%lld): allocb failed",
3564 			    __func__, ldcp->ldc_id);
3565 			ldcp->ldc_stats.rx_allocb_fail++;
3566 			return;
3567 		}
3568 
3569 		/* skip over the extra space for VLAN_TAG */
3570 		mp->b_rptr += 8;
3571 
3572 		rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr,
3573 		    0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies,
3574 		    LDC_COPY_IN);
3575 
3576 		if (rv != 0) {
3577 			DERR(vswp, "%s(%d): unable to copy in data from "
3578 			    "%d cookie(s)", __func__, ldcp->ldc_id, ncookies);
3579 			freemsg(mp);
3580 			ldcp->ldc_stats.ierrors++;
3581 			return;
3582 		}
3583 
3584 		D2(vswp, "%s(%d): copied in %ld bytes using %d cookies",
3585 		    __func__, ldcp->ldc_id, nbytes, ncookies);
3586 
3587 		/* point to the actual end of data */
3588 		mp->b_wptr = mp->b_rptr + datalen;
3589 		ldcp->ldc_stats.ipackets++;
3590 		ldcp->ldc_stats.rbytes += datalen;
3591 
3592 		/*
3593 		 * We ACK back every in-band descriptor message we process
3594 		 */
3595 		ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK;
3596 		ibnd_desc->hdr.tag.vio_sid = ldcp->local_session;
3597 		(void) vsw_send_msg(ldcp, (void *)ibnd_desc,
3598 		    sizeof (vnet_ibnd_desc_t), B_TRUE);
3599 
3600 		/*
3601 		 * there is extra space alloc'd for VLAN_TAG
3602 		 */
3603 		(void) vsw_vlan_frame_pretag(ldcp->ldc_port, VSW_VNETPORT, mp);
3604 
3605 		/* send the packet to be switched */
3606 		vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT,
3607 		    ldcp->ldc_port, NULL);
3608 
3609 		break;
3610 
3611 	case VIO_SUBTYPE_ACK:
3612 		D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3613 
3614 		/* Verify the ACK is valid */
3615 		idx = ibnd_desc->hdr.desc_handle;
3616 
3617 		if (idx >= vsw_num_descriptors) {
3618 			cmn_err(CE_WARN, "!vsw%d: corrupted ACK received "
3619 			    "(idx %ld)", vswp->instance, idx);
3620 			return;
3621 		}
3622 
3623 		if ((dp = ldcp->lane_out.dringp) == NULL) {
3624 			DERR(vswp, "%s: no dring found", __func__);
3625 			return;
3626 		}
3627 
3628 		len = dp->num_descriptors;
3629 		/*
3630 		 * If the descriptor we are being ACK'ed for is not the
3631 		 * one we expected, then pkts were lost somwhere, either
3632 		 * when we tried to send a msg, or a previous ACK msg from
3633 		 * our peer. In either case we now reclaim the descriptors
3634 		 * in the range from the last ACK we received up to the
3635 		 * current ACK.
3636 		 */
3637 		if (idx != dp->last_ack_recv) {
3638 			DWARN(vswp, "%s: dropped pkts detected, (%ld, %ld)",
3639 			    __func__, dp->last_ack_recv, idx);
3640 			num = idx >= dp->last_ack_recv ?
3641 			    idx - dp->last_ack_recv + 1:
3642 			    (len - dp->last_ack_recv + 1) + idx;
3643 		}
3644 
3645 		/*
3646 		 * When we sent the in-band message to our peer we
3647 		 * marked the copy in our private ring as READY. We now
3648 		 * check that the descriptor we are being ACK'ed for is in
3649 		 * fact READY, i.e. it is one we have shared with our peer.
3650 		 *
3651 		 * If its not we flag an error, but still reset the descr
3652 		 * back to FREE.
3653 		 */
3654 		for (i = dp->last_ack_recv; j < num; i = (i + 1) % len, j++) {
3655 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
3656 			mutex_enter(&priv_addr->dstate_lock);
3657 			if (priv_addr->dstate != VIO_DESC_READY) {
3658 				DERR(vswp, "%s: (%ld) desc at index %ld not "
3659 				    "READY (0x%lx)", __func__,
3660 				    ldcp->ldc_id, idx, priv_addr->dstate);
3661 				DERR(vswp, "%s: bound %d: ncookies %ld : "
3662 				    "datalen %ld", __func__,
3663 				    priv_addr->bound, priv_addr->ncookies,
3664 				    priv_addr->datalen);
3665 			}
3666 			D2(vswp, "%s: (%lld) freeing descp at %lld", __func__,
3667 			    ldcp->ldc_id, idx);
3668 			/* release resources associated with sent msg */
3669 			priv_addr->datalen = 0;
3670 			priv_addr->dstate = VIO_DESC_FREE;
3671 			mutex_exit(&priv_addr->dstate_lock);
3672 		}
3673 		/* update to next expected value */
3674 		dp->last_ack_recv = (idx + 1) % dp->num_descriptors;
3675 
3676 		break;
3677 
3678 	case VIO_SUBTYPE_NACK:
3679 		DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3680 
3681 		/*
3682 		 * We should only get a NACK if our peer doesn't like
3683 		 * something about a message we have sent it. If this
3684 		 * happens we just release the resources associated with
3685 		 * the message. (We are relying on higher layers to decide
3686 		 * whether or not to resend.
3687 		 */
3688 
3689 		/* limit check */
3690 		idx = ibnd_desc->hdr.desc_handle;
3691 
3692 		if (idx >= vsw_num_descriptors) {
3693 			DERR(vswp, "%s: corrupted NACK received (idx %lld)",
3694 			    __func__, idx);
3695 			return;
3696 		}
3697 
3698 		if ((dp = ldcp->lane_out.dringp) == NULL) {
3699 			DERR(vswp, "%s: no dring found", __func__);
3700 			return;
3701 		}
3702 
3703 		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
3704 
3705 		/* move to correct location in ring */
3706 		priv_addr += idx;
3707 
3708 		/* release resources associated with sent msg */
3709 		mutex_enter(&priv_addr->dstate_lock);
3710 		priv_addr->datalen = 0;
3711 		priv_addr->dstate = VIO_DESC_FREE;
3712 		mutex_exit(&priv_addr->dstate_lock);
3713 
3714 		break;
3715 
3716 	default:
3717 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
3718 		    ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype);
3719 	}
3720 
3721 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
3722 }
3723 
3724 static void
3725 vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t *tagp)
3726 {
3727 	_NOTE(ARGUNUSED(epkt))
3728 
3729 	vsw_t		*vswp = ldcp->ldc_vswp;
3730 	uint16_t	env = tagp->vio_subtype_env;
3731 
3732 	D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
3733 
3734 	/*
3735 	 * Error vio_subtypes have yet to be defined. So for
3736 	 * the moment we can't do anything.
3737 	 */
3738 	D2(vswp, "%s: (%x) vio_subtype env", __func__, env);
3739 
3740 	D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
3741 }
3742 
3743 /* transmit the packet over the given port */
3744 int
3745 vsw_portsend(vsw_port_t *port, mblk_t *mp)
3746 {
3747 	mblk_t		*mpt;
3748 	int		count;
3749 	vsw_ldc_t 	*ldcp = port->ldcp;
3750 	int		status = 0;
3751 
3752 	count = vsw_vlan_frame_untag(port, VSW_VNETPORT, &mp, &mpt);
3753 	if (count != 0) {
3754 		status = ldcp->tx(ldcp, mp, mpt, count);
3755 	}
3756 	return (status);
3757 }
3758 
3759 /*
3760  * Break up frames into 2 seperate chains: normal and
3761  * priority, based on the frame type. The number of
3762  * priority frames is also counted and returned.
3763  *
3764  * Params:
3765  * 	vswp:	pointer to the instance of vsw
3766  *	np:	head of packet chain to be broken
3767  *	npt:	tail of packet chain to be broken
3768  *
3769  * Returns:
3770  *	np:	head of normal data packets
3771  *	npt:	tail of normal data packets
3772  *	hp:	head of high priority packets
3773  *	hpt:	tail of high priority packets
3774  */
3775 static uint32_t
3776 vsw_get_pri_packets(vsw_t *vswp, mblk_t **np, mblk_t **npt,
3777 	mblk_t **hp, mblk_t **hpt)
3778 {
3779 	mblk_t			*tmp = NULL;
3780 	mblk_t			*smp = NULL;
3781 	mblk_t			*hmp = NULL;	/* high prio pkts head */
3782 	mblk_t			*hmpt = NULL;	/* high prio pkts tail */
3783 	mblk_t			*nmp = NULL;	/* normal pkts head */
3784 	mblk_t			*nmpt = NULL;	/* normal pkts tail */
3785 	uint32_t		count = 0;
3786 	int			i;
3787 	struct ether_header	*ehp;
3788 	uint32_t		num_types;
3789 	uint16_t		*types;
3790 
3791 	tmp = *np;
3792 	while (tmp != NULL) {
3793 
3794 		smp = tmp;
3795 		tmp = tmp->b_next;
3796 		smp->b_next = NULL;
3797 		smp->b_prev = NULL;
3798 
3799 		ehp = (struct ether_header *)smp->b_rptr;
3800 		num_types = vswp->pri_num_types;
3801 		types = vswp->pri_types;
3802 		for (i = 0; i < num_types; i++) {
3803 			if (ehp->ether_type == types[i]) {
3804 				/* high priority frame */
3805 
3806 				if (hmp != NULL) {
3807 					hmpt->b_next = smp;
3808 					hmpt = smp;
3809 				} else {
3810 					hmp = hmpt = smp;
3811 				}
3812 				count++;
3813 				break;
3814 			}
3815 		}
3816 		if (i == num_types) {
3817 			/* normal data frame */
3818 
3819 			if (nmp != NULL) {
3820 				nmpt->b_next = smp;
3821 				nmpt = smp;
3822 			} else {
3823 				nmp = nmpt = smp;
3824 			}
3825 		}
3826 	}
3827 
3828 	*hp = hmp;
3829 	*hpt = hmpt;
3830 	*np = nmp;
3831 	*npt = nmpt;
3832 
3833 	return (count);
3834 }
3835 
3836 /*
3837  * Wrapper function to transmit normal and/or priority frames over the channel.
3838  */
3839 static int
3840 vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count)
3841 {
3842 	vsw_ldc_t 		*ldcp = (vsw_ldc_t *)arg;
3843 	mblk_t			*tmp;
3844 	mblk_t			*smp;
3845 	mblk_t			*hmp;	/* high prio pkts head */
3846 	mblk_t			*hmpt;	/* high prio pkts tail */
3847 	mblk_t			*nmp;	/* normal pkts head */
3848 	mblk_t			*nmpt;	/* normal pkts tail */
3849 	uint32_t		n = 0;
3850 	vsw_t			*vswp = ldcp->ldc_vswp;
3851 
3852 	ASSERT(VSW_PRI_ETH_DEFINED(vswp));
3853 	ASSERT(count != 0);
3854 
3855 	nmp = mp;
3856 	nmpt = mpt;
3857 
3858 	/* gather any priority frames from the chain of packets */
3859 	n = vsw_get_pri_packets(vswp, &nmp, &nmpt, &hmp, &hmpt);
3860 
3861 	/* transmit priority frames */
3862 	tmp = hmp;
3863 	while (tmp != NULL) {
3864 		smp = tmp;
3865 		tmp = tmp->b_next;
3866 		smp->b_next = NULL;
3867 		vsw_ldcsend_pkt(ldcp, smp);
3868 	}
3869 
3870 	count -= n;
3871 
3872 	if (count == 0) {
3873 		/* no normal data frames to process */
3874 		return (0);
3875 	}
3876 
3877 	return (vsw_ldctx(ldcp, nmp, nmpt, count));
3878 }
3879 
3880 /*
3881  * Wrapper function to transmit normal frames over the channel.
3882  */
3883 static int
3884 vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count)
3885 {
3886 	vsw_ldc_t 	*ldcp = (vsw_ldc_t *)arg;
3887 	mblk_t		*tmp = NULL;
3888 
3889 	ASSERT(count != 0);
3890 	/*
3891 	 * If the TX thread is enabled, then queue the
3892 	 * ordinary frames and signal the tx thread.
3893 	 */
3894 	if (ldcp->tx_thread != NULL) {
3895 
3896 		mutex_enter(&ldcp->tx_thr_lock);
3897 
3898 		if ((ldcp->tx_cnt + count) >= vsw_max_tx_qcount) {
3899 			/*
3900 			 * If we reached queue limit,
3901 			 * do not queue new packets,
3902 			 * drop them.
3903 			 */
3904 			ldcp->ldc_stats.tx_qfull += count;
3905 			mutex_exit(&ldcp->tx_thr_lock);
3906 			freemsgchain(mp);
3907 			goto exit;
3908 		}
3909 		if (ldcp->tx_mhead == NULL) {
3910 			ldcp->tx_mhead = mp;
3911 			ldcp->tx_mtail = mpt;
3912 			cv_signal(&ldcp->tx_thr_cv);
3913 		} else {
3914 			ldcp->tx_mtail->b_next = mp;
3915 			ldcp->tx_mtail = mpt;
3916 		}
3917 		ldcp->tx_cnt += count;
3918 		mutex_exit(&ldcp->tx_thr_lock);
3919 	} else {
3920 		while (mp != NULL) {
3921 			tmp = mp->b_next;
3922 			mp->b_next = mp->b_prev = NULL;
3923 			(void) vsw_ldcsend(ldcp, mp, 1);
3924 			mp = tmp;
3925 		}
3926 	}
3927 
3928 exit:
3929 	return (0);
3930 }
3931 
3932 /*
3933  * This function transmits the frame in the payload of a raw data
3934  * (VIO_PKT_DATA) message. Thus, it provides an Out-Of-Band path to
3935  * send special frames with high priorities, without going through
3936  * the normal data path which uses descriptor ring mechanism.
3937  */
3938 static void
3939 vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp)
3940 {
3941 	vio_raw_data_msg_t	*pkt;
3942 	mblk_t			*bp;
3943 	mblk_t			*nmp = NULL;
3944 	vio_mblk_t		*vmp;
3945 	caddr_t			dst;
3946 	uint32_t		mblksz;
3947 	uint32_t		size;
3948 	uint32_t		nbytes;
3949 	int			rv;
3950 	vsw_t			*vswp = ldcp->ldc_vswp;
3951 	vgen_stats_t		*statsp = &ldcp->ldc_stats;
3952 
3953 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
3954 	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
3955 		(void) atomic_inc_32(&statsp->tx_pri_fail);
3956 		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
3957 		    "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
3958 		    ldcp->lane_out.lstate);
3959 		goto send_pkt_exit;
3960 	}
3961 
3962 	size = msgsize(mp);
3963 
3964 	/* frame size bigger than available payload len of raw data msg ? */
3965 	if (size > (size_t)(ldcp->msglen - VIO_PKT_DATA_HDRSIZE)) {
3966 		(void) atomic_inc_32(&statsp->tx_pri_fail);
3967 		DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__,
3968 		    ldcp->ldc_id, size);
3969 		goto send_pkt_exit;
3970 	}
3971 
3972 	if (size < ETHERMIN)
3973 		size = ETHERMIN;
3974 
3975 	/* alloc space for a raw data message */
3976 	vmp = vio_allocb(vswp->pri_tx_vmp);
3977 	if (vmp == NULL) {
3978 		(void) atomic_inc_32(&statsp->tx_pri_fail);
3979 		DWARN(vswp, "vio_allocb failed\n");
3980 		goto send_pkt_exit;
3981 	} else {
3982 		nmp = vmp->mp;
3983 	}
3984 	pkt = (vio_raw_data_msg_t *)nmp->b_rptr;
3985 
3986 	/* copy frame into the payload of raw data message */
3987 	dst = (caddr_t)pkt->data;
3988 	for (bp = mp; bp != NULL; bp = bp->b_cont) {
3989 		mblksz = MBLKL(bp);
3990 		bcopy(bp->b_rptr, dst, mblksz);
3991 		dst += mblksz;
3992 	}
3993 
3994 	vmp->state = VIO_MBLK_HAS_DATA;
3995 
3996 	/* setup the raw data msg */
3997 	pkt->tag.vio_msgtype = VIO_TYPE_DATA;
3998 	pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
3999 	pkt->tag.vio_subtype_env = VIO_PKT_DATA;
4000 	pkt->tag.vio_sid = ldcp->local_session;
4001 	nbytes = VIO_PKT_DATA_HDRSIZE + size;
4002 
4003 	/* send the msg over ldc */
4004 	rv = vsw_send_msg(ldcp, (void *)pkt, nbytes, B_TRUE);
4005 	if (rv != 0) {
4006 		(void) atomic_inc_32(&statsp->tx_pri_fail);
4007 		DWARN(vswp, "%s(%lld) Error sending priority frame\n", __func__,
4008 		    ldcp->ldc_id);
4009 		goto send_pkt_exit;
4010 	}
4011 
4012 	/* update stats */
4013 	(void) atomic_inc_64(&statsp->tx_pri_packets);
4014 	(void) atomic_add_64(&statsp->tx_pri_packets, size);
4015 
4016 send_pkt_exit:
4017 	if (nmp != NULL)
4018 		freemsg(nmp);
4019 	freemsg(mp);
4020 }
4021 
4022 /*
4023  * Transmit the packet over the given LDC channel.
4024  *
4025  * The 'retries' argument indicates how many times a packet
4026  * is retried before it is dropped. Note, the retry is done
4027  * only for a resource related failure, for all other failures
4028  * the packet is dropped immediately.
4029  */
4030 static int
4031 vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries)
4032 {
4033 	int		i;
4034 	int		rc;
4035 	int		status = 0;
4036 	vsw_port_t	*port = ldcp->ldc_port;
4037 	dring_info_t	*dp = NULL;
4038 	lane_t		*lp = &ldcp->lane_out;
4039 
4040 	for (i = 0; i < retries; ) {
4041 		/*
4042 		 * Send the message out using the appropriate
4043 		 * transmit function which will free mblock when it
4044 		 * is finished with it.
4045 		 */
4046 		mutex_enter(&port->tx_lock);
4047 		if (port->transmit != NULL) {
4048 			status = (*port->transmit)(ldcp, mp);
4049 		}
4050 		if (status == LDC_TX_SUCCESS) {
4051 			mutex_exit(&port->tx_lock);
4052 			break;
4053 		}
4054 		i++;	/* increment the counter here */
4055 
4056 		/* If its the last retry, then update the oerror */
4057 		if ((i == retries) && (status == LDC_TX_NORESOURCES)) {
4058 			ldcp->ldc_stats.oerrors++;
4059 		}
4060 		mutex_exit(&port->tx_lock);
4061 
4062 		if (status != LDC_TX_NORESOURCES) {
4063 			/*
4064 			 * No retrying required for errors un-related
4065 			 * to resources.
4066 			 */
4067 			break;
4068 		}
4069 		if (((dp = ldcp->lane_out.dringp) != NULL) &&
4070 		    ((VSW_VER_GTEQ(ldcp, 1, 2) &&
4071 		    (ldcp->lane_out.xfer_mode & VIO_DRING_MODE_V1_2)) ||
4072 		    ((VSW_VER_LT(ldcp, 1, 2) &&
4073 		    (ldcp->lane_out.xfer_mode == VIO_DRING_MODE_V1_0))))) {
4074 
4075 			/* Need to reclaim in TxDring mode. */
4076 			if (lp->dring_mode == VIO_TX_DRING) {
4077 				rc = vsw_reclaim_dring(dp, dp->end_idx);
4078 			}
4079 
4080 		} else {
4081 			/*
4082 			 * If there is no dring or the xfer_mode is
4083 			 * set to DESC_MODE(ie., OBP), then simply break here.
4084 			 */
4085 			break;
4086 		}
4087 
4088 		/*
4089 		 * Delay only if none were reclaimed
4090 		 * and its not the last retry.
4091 		 */
4092 		if ((rc == 0) && (i < retries)) {
4093 			delay(drv_usectohz(vsw_ldc_tx_delay));
4094 		}
4095 	}
4096 	freemsg(mp);
4097 	return (status);
4098 }
4099 
4100 /*
4101  * Send an in-band descriptor message over ldc.
4102  */
4103 static int
4104 vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp)
4105 {
4106 	vsw_t			*vswp = ldcp->ldc_vswp;
4107 	vnet_ibnd_desc_t	ibnd_msg;
4108 	vsw_private_desc_t	*priv_desc = NULL;
4109 	dring_info_t		*dp = NULL;
4110 	size_t			n, size = 0;
4111 	caddr_t			bufp;
4112 	mblk_t			*bp;
4113 	int			idx, i;
4114 	int			status = LDC_TX_SUCCESS;
4115 	static int		warn_msg = 1;
4116 	lane_t			*lp = &ldcp->lane_out;
4117 
4118 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4119 
4120 	ASSERT(mp != NULL);
4121 
4122 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
4123 	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
4124 		DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt",
4125 		    __func__, ldcp->ldc_id, ldcp->ldc_status,
4126 		    ldcp->lane_out.lstate);
4127 		ldcp->ldc_stats.oerrors++;
4128 		return (LDC_TX_FAILURE);
4129 	}
4130 
4131 	/*
4132 	 * The dring here is as an internal buffer,
4133 	 * rather than a transfer channel.
4134 	 */
4135 	if ((dp = ldcp->lane_out.dringp) == NULL) {
4136 		DERR(vswp, "%s(%lld): no dring for outbound lane",
4137 		    __func__, ldcp->ldc_id);
4138 		DERR(vswp, "%s(%lld) status(%d) state (0x%llx)", __func__,
4139 		    ldcp->ldc_id, ldcp->ldc_status, ldcp->lane_out.lstate);
4140 		ldcp->ldc_stats.oerrors++;
4141 		return (LDC_TX_FAILURE);
4142 	}
4143 
4144 	size = msgsize(mp);
4145 	if (size > (size_t)lp->mtu) {
4146 		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
4147 		    ldcp->ldc_id, size);
4148 		ldcp->ldc_stats.oerrors++;
4149 		return (LDC_TX_FAILURE);
4150 	}
4151 
4152 	/*
4153 	 * Find a free descriptor in our buffer ring
4154 	 */
4155 	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
4156 		if (warn_msg) {
4157 			DERR(vswp, "%s(%lld): no descriptor available for ring "
4158 			    "at 0x%llx", __func__, ldcp->ldc_id, dp);
4159 			warn_msg = 0;
4160 		}
4161 
4162 		/* nothing more we can do */
4163 		status = LDC_TX_NORESOURCES;
4164 		goto vsw_descrsend_free_exit;
4165 	} else {
4166 		D2(vswp, "%s(%lld): free private descriptor found at pos "
4167 		    "%ld addr 0x%x\n", __func__, ldcp->ldc_id, idx, priv_desc);
4168 		warn_msg = 1;
4169 	}
4170 
4171 	/* copy data into the descriptor */
4172 	bufp = priv_desc->datap;
4173 	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
4174 		n = MBLKL(bp);
4175 		bcopy(bp->b_rptr, bufp, n);
4176 		bufp += n;
4177 	}
4178 
4179 	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
4180 
4181 	/* create and send the in-band descp msg */
4182 	ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA;
4183 	ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO;
4184 	ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA;
4185 	ibnd_msg.hdr.tag.vio_sid = ldcp->local_session;
4186 
4187 	/*
4188 	 * Copy the mem cookies describing the data from the
4189 	 * private region of the descriptor ring into the inband
4190 	 * descriptor.
4191 	 */
4192 	for (i = 0; i < priv_desc->ncookies; i++) {
4193 		bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i],
4194 		    sizeof (ldc_mem_cookie_t));
4195 	}
4196 
4197 	ibnd_msg.hdr.desc_handle = idx;
4198 	ibnd_msg.ncookies = priv_desc->ncookies;
4199 	ibnd_msg.nbytes = size;
4200 
4201 	ldcp->ldc_stats.opackets++;
4202 	ldcp->ldc_stats.obytes += size;
4203 
4204 	(void) vsw_send_msg(ldcp, (void *)&ibnd_msg,
4205 	    sizeof (vnet_ibnd_desc_t), B_TRUE);
4206 
4207 vsw_descrsend_free_exit:
4208 
4209 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4210 	return (status);
4211 }
4212 
4213 static void
4214 vsw_send_ver(void *arg)
4215 {
4216 	vsw_ldc_t	*ldcp = (vsw_ldc_t *)arg;
4217 	vsw_t		*vswp = ldcp->ldc_vswp;
4218 	lane_t		*lp = &ldcp->lane_out;
4219 	vio_ver_msg_t	ver_msg;
4220 
4221 	D1(vswp, "%s enter", __func__);
4222 
4223 	ver_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4224 	ver_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4225 	ver_msg.tag.vio_subtype_env = VIO_VER_INFO;
4226 	ver_msg.tag.vio_sid = ldcp->local_session;
4227 
4228 	if (vsw_obp_ver_proto_workaround == B_FALSE) {
4229 		ver_msg.ver_major = vsw_versions[0].ver_major;
4230 		ver_msg.ver_minor = vsw_versions[0].ver_minor;
4231 	} else {
4232 		/* use the major,minor that we've ack'd */
4233 		lane_t	*lpi = &ldcp->lane_in;
4234 		ver_msg.ver_major = lpi->ver_major;
4235 		ver_msg.ver_minor = lpi->ver_minor;
4236 	}
4237 	ver_msg.dev_class = VDEV_NETWORK_SWITCH;
4238 
4239 	lp->lstate |= VSW_VER_INFO_SENT;
4240 	lp->ver_major = ver_msg.ver_major;
4241 	lp->ver_minor = ver_msg.ver_minor;
4242 
4243 	DUMP_TAG(ver_msg.tag);
4244 
4245 	(void) vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t), B_TRUE);
4246 
4247 	D1(vswp, "%s (%d): exit", __func__, ldcp->ldc_id);
4248 }
4249 
4250 static void
4251 vsw_send_attr(vsw_ldc_t *ldcp)
4252 {
4253 	vsw_t			*vswp = ldcp->ldc_vswp;
4254 	lane_t			*lp = &ldcp->lane_out;
4255 	vnet_attr_msg_t		attr_msg;
4256 
4257 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
4258 
4259 	/*
4260 	 * Subtype is set to INFO by default
4261 	 */
4262 	attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4263 	attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4264 	attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO;
4265 	attr_msg.tag.vio_sid = ldcp->local_session;
4266 
4267 	/* payload copied from default settings for lane */
4268 	attr_msg.mtu = lp->mtu;
4269 	attr_msg.addr_type = lp->addr_type;
4270 	attr_msg.xfer_mode = lp->xfer_mode;
4271 	attr_msg.ack_freq = lp->xfer_mode;
4272 	attr_msg.options = lp->dring_mode;
4273 
4274 	READ_ENTER(&vswp->if_lockrw);
4275 	attr_msg.addr = vnet_macaddr_strtoul((vswp->if_addr).ether_addr_octet);
4276 	RW_EXIT(&vswp->if_lockrw);
4277 
4278 	ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT;
4279 
4280 	DUMP_TAG(attr_msg.tag);
4281 
4282 	(void) vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t), B_TRUE);
4283 
4284 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
4285 }
4286 
4287 static void
4288 vsw_send_dring_info(vsw_ldc_t *ldcp)
4289 {
4290 	int		msgsize;
4291 	void		*msg;
4292 	vsw_t		*vswp = ldcp->ldc_vswp;
4293 	vsw_port_t	*port = ldcp->ldc_port;
4294 	lane_t		*lp = &ldcp->lane_out;
4295 	vgen_stats_t	*statsp = &ldcp->ldc_stats;
4296 
4297 	D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id);
4298 
4299 	/* dring mode has been negotiated in attr phase; save in stats */
4300 	statsp->dring_mode = lp->dring_mode;
4301 
4302 	if (lp->dring_mode == VIO_RX_DRING_DATA) {
4303 		/*
4304 		 * Change the transmit routine for RxDringData mode.
4305 		 */
4306 		port->transmit = vsw_dringsend_shm;
4307 		msg = (void *) vsw_create_rx_dring_info(ldcp);
4308 		if (msg == NULL) {
4309 			return;
4310 		}
4311 		msgsize =
4312 		    VNET_DRING_REG_EXT_MSG_SIZE(lp->dringp->data_ncookies);
4313 		ldcp->rcv_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
4314 		    vsw_ldc_rcv_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
4315 		ldcp->rx_dringdata = vsw_process_dringdata_shm;
4316 	} else {
4317 		msg = (void *) vsw_create_tx_dring_info(ldcp);
4318 		if (msg == NULL) {
4319 			return;
4320 		}
4321 		msgsize = sizeof (vio_dring_reg_msg_t);
4322 		ldcp->msg_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
4323 		    vsw_ldc_msg_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
4324 		ldcp->rx_dringdata = vsw_process_dringdata;
4325 	}
4326 
4327 	lp->lstate |= VSW_DRING_INFO_SENT;
4328 	DUMP_TAG_PTR((vio_msg_tag_t *)msg);
4329 	(void) vsw_send_msg(ldcp, msg, msgsize, B_TRUE);
4330 	kmem_free(msg, msgsize);
4331 
4332 	D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id);
4333 }
4334 
4335 static void
4336 vsw_send_rdx(vsw_ldc_t *ldcp)
4337 {
4338 	vsw_t		*vswp = ldcp->ldc_vswp;
4339 	vio_rdx_msg_t	rdx_msg;
4340 
4341 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
4342 
4343 	rdx_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4344 	rdx_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4345 	rdx_msg.tag.vio_subtype_env = VIO_RDX;
4346 	rdx_msg.tag.vio_sid = ldcp->local_session;
4347 
4348 	ldcp->lane_in.lstate |= VSW_RDX_INFO_SENT;
4349 
4350 	DUMP_TAG(rdx_msg.tag);
4351 
4352 	(void) vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t), B_TRUE);
4353 
4354 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
4355 }
4356 
4357 /*
4358  * Remove the specified address from the list of address maintained
4359  * in this port node.
4360  */
4361 mcst_addr_t *
4362 vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr)
4363 {
4364 	vsw_t		*vswp = NULL;
4365 	vsw_port_t	*port = NULL;
4366 	mcst_addr_t	*prev_p = NULL;
4367 	mcst_addr_t	*curr_p = NULL;
4368 
4369 	D1(NULL, "%s: enter : devtype %d : addr 0x%llx",
4370 	    __func__, devtype, addr);
4371 
4372 	if (devtype == VSW_VNETPORT) {
4373 		port = (vsw_port_t *)arg;
4374 		mutex_enter(&port->mca_lock);
4375 		prev_p = curr_p = port->mcap;
4376 	} else {
4377 		vswp = (vsw_t *)arg;
4378 		mutex_enter(&vswp->mca_lock);
4379 		prev_p = curr_p = vswp->mcap;
4380 	}
4381 
4382 	while (curr_p != NULL) {
4383 		if (curr_p->addr == addr) {
4384 			D2(NULL, "%s: address found", __func__);
4385 			/* match found */
4386 			if (prev_p == curr_p) {
4387 				/* list head */
4388 				if (devtype == VSW_VNETPORT)
4389 					port->mcap = curr_p->nextp;
4390 				else
4391 					vswp->mcap = curr_p->nextp;
4392 			} else {
4393 				prev_p->nextp = curr_p->nextp;
4394 			}
4395 			break;
4396 		} else {
4397 			prev_p = curr_p;
4398 			curr_p = curr_p->nextp;
4399 		}
4400 	}
4401 
4402 	if (devtype == VSW_VNETPORT)
4403 		mutex_exit(&port->mca_lock);
4404 	else
4405 		mutex_exit(&vswp->mca_lock);
4406 
4407 	D1(NULL, "%s: exit", __func__);
4408 
4409 	return (curr_p);
4410 }
4411 
4412 /*
4413  * Create a ring consisting of just a private portion and link
4414  * it into the list of rings for the outbound lane.
4415  *
4416  * These type of rings are used primarily for temporary data
4417  * storage (i.e. as data buffers).
4418  */
4419 void
4420 vsw_create_privring(vsw_ldc_t *ldcp)
4421 {
4422 	dring_info_t		*dp;
4423 	vsw_t			*vswp = ldcp->ldc_vswp;
4424 
4425 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4426 
4427 	dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
4428 	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
4429 	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
4430 	ldcp->lane_out.dringp = dp;
4431 
4432 	/* no public section */
4433 	dp->pub_addr = NULL;
4434 	dp->priv_addr = kmem_zalloc(
4435 	    (sizeof (vsw_private_desc_t) * vsw_num_descriptors), KM_SLEEP);
4436 	dp->num_descriptors = vsw_num_descriptors;
4437 
4438 	if (vsw_setup_tx_dring(ldcp, dp)) {
4439 		DERR(vswp, "%s: setup of ring failed", __func__);
4440 		vsw_destroy_tx_dring(ldcp);
4441 		return;
4442 	}
4443 
4444 	/* haven't used any descriptors yet */
4445 	dp->end_idx = 0;
4446 	dp->restart_reqd = B_TRUE;
4447 
4448 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4449 }
4450 
4451 /*
4452  * Set the default lane attributes. These are copied into
4453  * the attr msg we send to our peer. If they are not acceptable
4454  * then (currently) the handshake ends.
4455  */
4456 static void
4457 vsw_set_lane_attr(vsw_t *vswp, lane_t *lp)
4458 {
4459 	bzero(lp, sizeof (lane_t));
4460 
4461 	READ_ENTER(&vswp->if_lockrw);
4462 	ether_copy(&(vswp->if_addr), &(lp->addr));
4463 	RW_EXIT(&vswp->if_lockrw);
4464 
4465 	lp->mtu = vswp->max_frame_size;
4466 	lp->addr_type = ADDR_TYPE_MAC;
4467 	lp->xfer_mode = VIO_DRING_MODE_V1_0;
4468 	lp->ack_freq = 0;	/* for shared mode */
4469 	lp->seq_num = VNET_ISS;
4470 }
4471 
4472 /*
4473  * Map the descriptor ring exported by the peer.
4474  */
4475 static dring_info_t *
4476 vsw_map_dring(vsw_ldc_t *ldcp, void *pkt)
4477 {
4478 	dring_info_t	*dp = NULL;
4479 	lane_t		*lp = &ldcp->lane_out;
4480 
4481 	if (lp->dring_mode == VIO_RX_DRING_DATA) {
4482 		/*
4483 		 * In RxDringData mode, dring that we map in
4484 		 * becomes our transmit descriptor ring.
4485 		 */
4486 		dp =  vsw_map_tx_dring(ldcp, pkt);
4487 	} else {
4488 		/*
4489 		 * In TxDring mode, dring that we map in
4490 		 * becomes our receive descriptor ring.
4491 		 */
4492 		dp =  vsw_map_rx_dring(ldcp, pkt);
4493 	}
4494 	return (dp);
4495 }
4496 
4497 /*
4498  * Common dring mapping function used in both TxDring and RxDringData modes.
4499  */
4500 dring_info_t *
4501 vsw_map_dring_cmn(vsw_ldc_t *ldcp, vio_dring_reg_msg_t *dring_pkt)
4502 {
4503 	int		rv;
4504 	dring_info_t	*dp;
4505 	ldc_mem_info_t	minfo;
4506 	vsw_t		*vswp = ldcp->ldc_vswp;
4507 
4508 	/*
4509 	 * If the dring params are unacceptable then we NACK back.
4510 	 */
4511 	if ((dring_pkt->num_descriptors == 0) ||
4512 	    (dring_pkt->descriptor_size == 0) ||
4513 	    (dring_pkt->ncookies != 1)) {
4514 		DERR(vswp, "%s (%lld): invalid dring info",
4515 		    __func__, ldcp->ldc_id);
4516 		return (NULL);
4517 	}
4518 
4519 	dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
4520 
4521 	dp->num_descriptors = dring_pkt->num_descriptors;
4522 	dp->descriptor_size = dring_pkt->descriptor_size;
4523 	dp->options = dring_pkt->options;
4524 	dp->dring_ncookies = dring_pkt->ncookies;
4525 
4526 	/*
4527 	 * Note: should only get one cookie. Enforced in
4528 	 * the ldc layer.
4529 	 */
4530 	bcopy(&dring_pkt->cookie[0], &dp->dring_cookie[0],
4531 	    sizeof (ldc_mem_cookie_t));
4532 
4533 	rv = ldc_mem_dring_map(ldcp->ldc_handle, &dp->dring_cookie[0],
4534 	    dp->dring_ncookies, dp->num_descriptors, dp->descriptor_size,
4535 	    LDC_DIRECT_MAP, &(dp->dring_handle));
4536 	if (rv != 0) {
4537 		goto fail;
4538 	}
4539 
4540 	rv = ldc_mem_dring_info(dp->dring_handle, &minfo);
4541 	if (rv != 0) {
4542 		goto fail;
4543 	}
4544 	/* store the address of the ring */
4545 	dp->pub_addr = minfo.vaddr;
4546 
4547 	/* cache the dring mtype */
4548 	dp->dring_mtype = minfo.mtype;
4549 
4550 	/* no private section as we are importing */
4551 	dp->priv_addr = NULL;
4552 
4553 	/*
4554 	 * Using simple mono increasing int for ident at the moment.
4555 	 */
4556 	dp->ident = ldcp->next_ident;
4557 	ldcp->next_ident++;
4558 
4559 	/*
4560 	 * Acknowledge it; we send back a unique dring identifier that
4561 	 * the sending side will use in future to refer to this
4562 	 * descriptor ring.
4563 	 */
4564 	dring_pkt->dring_ident = dp->ident;
4565 
4566 	return (dp);
4567 fail:
4568 	if (dp->dring_handle != NULL) {
4569 		(void) ldc_mem_dring_unmap(dp->dring_handle);
4570 	}
4571 	kmem_free(dp, sizeof (*dp));
4572 	return (NULL);
4573 }
4574 
4575 /*
4576  * Unmap the descriptor ring exported by the peer.
4577  */
4578 static void
4579 vsw_unmap_dring(vsw_ldc_t *ldcp)
4580 {
4581 	lane_t	*lane_out = &ldcp->lane_out;
4582 
4583 	if (lane_out->dring_mode == VIO_RX_DRING_DATA) {
4584 		vsw_unmap_tx_dring(ldcp);
4585 	} else {
4586 		vsw_unmap_rx_dring(ldcp);
4587 	}
4588 }
4589 
4590 /*
4591  * Map the shared memory data buffer area exported by the peer.
4592  * Used in RxDringData mode only.
4593  */
4594 static int
4595 vsw_map_data(vsw_ldc_t *ldcp, dring_info_t *dp, void *pkt)
4596 {
4597 	int			rv;
4598 	vio_dring_reg_ext_msg_t	*emsg;
4599 	vio_dring_reg_msg_t	*msg = pkt;
4600 	uint8_t			*buf = (uint8_t *)msg->cookie;
4601 	vsw_t			*vswp = ldcp->ldc_vswp;
4602 
4603 	/* skip over dring cookies */
4604 	ASSERT(msg->ncookies == 1);
4605 	buf += (msg->ncookies * sizeof (ldc_mem_cookie_t));
4606 
4607 	emsg = (vio_dring_reg_ext_msg_t *)buf;
4608 	if (emsg->data_ncookies > VNET_DATA_AREA_COOKIES) {
4609 		return (1);
4610 	}
4611 
4612 	/* save # of data area cookies */
4613 	dp->data_ncookies = emsg->data_ncookies;
4614 
4615 	/* save data area size */
4616 	dp->data_sz = emsg->data_area_size;
4617 
4618 	/* allocate ldc mem handle for data area */
4619 	rv = ldc_mem_alloc_handle(ldcp->ldc_handle, &dp->data_handle);
4620 	if (rv != 0) {
4621 		cmn_err(CE_WARN, "ldc_mem_alloc_handle failed\n");
4622 		DWARN(vswp, "%s (%lld) ldc_mem_alloc_handle() failed: %d\n",
4623 		    __func__, ldcp->ldc_id, rv);
4624 		return (1);
4625 	}
4626 
4627 	/* map the data area */
4628 	rv = ldc_mem_map(dp->data_handle, emsg->data_cookie,
4629 	    emsg->data_ncookies, LDC_DIRECT_MAP, LDC_MEM_R,
4630 	    (caddr_t *)&dp->data_addr, NULL);
4631 	if (rv != 0) {
4632 		cmn_err(CE_WARN, "ldc_mem_map failed\n");
4633 		DWARN(vswp, "%s (%lld) ldc_mem_map() failed: %d\n",
4634 		    __func__, ldcp->ldc_id, rv);
4635 		return (1);
4636 	}
4637 
4638 	/* allocate memory for data area cookies */
4639 	dp->data_cookie = kmem_zalloc(emsg->data_ncookies *
4640 	    sizeof (ldc_mem_cookie_t), KM_SLEEP);
4641 
4642 	/* save data area cookies */
4643 	bcopy(emsg->data_cookie, dp->data_cookie,
4644 	    emsg->data_ncookies * sizeof (ldc_mem_cookie_t));
4645 
4646 	return (0);
4647 }
4648 
4649 /*
4650  * Reset and free all the resources associated with the channel.
4651  */
4652 static void
4653 vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir)
4654 {
4655 	lane_t	*lp;
4656 
4657 	D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id);
4658 
4659 	if (dir == INBOUND) {
4660 		D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane"
4661 		    " of channel %lld", __func__, ldcp->ldc_id);
4662 		lp = &ldcp->lane_in;
4663 	} else {
4664 		D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane"
4665 		    " of channel %lld", __func__, ldcp->ldc_id);
4666 		lp = &ldcp->lane_out;
4667 	}
4668 
4669 	lp->lstate = VSW_LANE_INACTIV;
4670 	lp->seq_num = VNET_ISS;
4671 
4672 	if (dir == INBOUND) {
4673 		/* Unmap the remote dring which is imported from the peer */
4674 		vsw_unmap_dring(ldcp);
4675 	} else {
4676 		/* Destroy the local dring which is exported to the peer */
4677 		vsw_destroy_dring(ldcp);
4678 	}
4679 
4680 	D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id);
4681 }
4682 
4683 /*
4684  * Destroy the descriptor ring.
4685  */
4686 static void
4687 vsw_destroy_dring(vsw_ldc_t *ldcp)
4688 {
4689 	lane_t	*lp = &ldcp->lane_out;
4690 
4691 	if (lp->dring_mode == VIO_RX_DRING_DATA) {
4692 		vsw_destroy_rx_dring(ldcp);
4693 	} else {
4694 		vsw_destroy_tx_dring(ldcp);
4695 	}
4696 }
4697 
4698 /*
4699  * vsw_ldc_tx_worker -- A per LDC worker thread to transmit data.
4700  * This thread is woken up by the vsw_portsend to transmit
4701  * packets.
4702  */
4703 static void
4704 vsw_ldc_tx_worker(void *arg)
4705 {
4706 	callb_cpr_t	cprinfo;
4707 	vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
4708 	vsw_t *vswp = ldcp->ldc_vswp;
4709 	mblk_t *mp;
4710 	mblk_t *tmp;
4711 
4712 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
4713 	CALLB_CPR_INIT(&cprinfo, &ldcp->tx_thr_lock, callb_generic_cpr,
4714 	    "vnet_tx_thread");
4715 	mutex_enter(&ldcp->tx_thr_lock);
4716 	while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP)) {
4717 
4718 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
4719 		/*
4720 		 * Wait until the data is received or a stop
4721 		 * request is received.
4722 		 */
4723 		while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP) &&
4724 		    (ldcp->tx_mhead == NULL)) {
4725 			cv_wait(&ldcp->tx_thr_cv, &ldcp->tx_thr_lock);
4726 		}
4727 		CALLB_CPR_SAFE_END(&cprinfo, &ldcp->tx_thr_lock)
4728 
4729 		/*
4730 		 * First process the stop request.
4731 		 */
4732 		if (ldcp->tx_thr_flags & VSW_WTHR_STOP) {
4733 			D2(vswp, "%s(%lld):tx thread stopped\n",
4734 			    __func__, ldcp->ldc_id);
4735 			break;
4736 		}
4737 		mp = ldcp->tx_mhead;
4738 		ldcp->tx_mhead = ldcp->tx_mtail = NULL;
4739 		ldcp->tx_cnt = 0;
4740 		mutex_exit(&ldcp->tx_thr_lock);
4741 		D2(vswp, "%s(%lld):calling vsw_ldcsend\n",
4742 		    __func__, ldcp->ldc_id);
4743 		while (mp != NULL) {
4744 			tmp = mp->b_next;
4745 			mp->b_next = mp->b_prev = NULL;
4746 			(void) vsw_ldcsend(ldcp, mp, vsw_ldc_tx_retries);
4747 			mp = tmp;
4748 		}
4749 		mutex_enter(&ldcp->tx_thr_lock);
4750 	}
4751 
4752 	/*
4753 	 * Update the run status and wakeup the thread that
4754 	 * has sent the stop request.
4755 	 */
4756 	ldcp->tx_thr_flags &= ~VSW_WTHR_STOP;
4757 	ldcp->tx_thread = NULL;
4758 	CALLB_CPR_EXIT(&cprinfo);
4759 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
4760 	thread_exit();
4761 }
4762 
4763 /* vsw_stop_tx_thread -- Co-ordinate with receive thread to stop it */
4764 static void
4765 vsw_stop_tx_thread(vsw_ldc_t *ldcp)
4766 {
4767 	kt_did_t	tid = 0;
4768 	vsw_t		*vswp = ldcp->ldc_vswp;
4769 
4770 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
4771 	/*
4772 	 * Send a stop request by setting the stop flag and
4773 	 * wait until the receive thread stops.
4774 	 */
4775 	mutex_enter(&ldcp->tx_thr_lock);
4776 	if (ldcp->tx_thread != NULL) {
4777 		tid = ldcp->tx_thread->t_did;
4778 		ldcp->tx_thr_flags |= VSW_WTHR_STOP;
4779 		cv_signal(&ldcp->tx_thr_cv);
4780 	}
4781 	mutex_exit(&ldcp->tx_thr_lock);
4782 
4783 	if (tid != 0) {
4784 		thread_join(tid);
4785 	}
4786 
4787 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
4788 }
4789 
4790 /*
4791  * Debugging routines
4792  */
4793 static void
4794 display_state(void)
4795 {
4796 	vsw_t		*vswp;
4797 	vsw_port_list_t	*plist;
4798 	vsw_port_t 	*port;
4799 	vsw_ldc_t 	*ldcp;
4800 	extern vsw_t 	*vsw_head;
4801 
4802 	cmn_err(CE_NOTE, "***** system state *****");
4803 
4804 	for (vswp = vsw_head; vswp; vswp = vswp->next) {
4805 		plist = &vswp->plist;
4806 		READ_ENTER(&plist->lockrw);
4807 		cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n",
4808 		    vswp->instance, plist->num_ports);
4809 
4810 		for (port = plist->head; port != NULL; port = port->p_next) {
4811 			cmn_err(CE_CONT, "port %d : %d ldcs attached\n",
4812 			    port->p_instance, port->num_ldcs);
4813 			ldcp = port->ldcp;
4814 			cmn_err(CE_CONT, "chan %lu : dev %d : "
4815 			    "status %d : phase %u\n",
4816 			    ldcp->ldc_id, ldcp->dev_class,
4817 			    ldcp->ldc_status, ldcp->hphase);
4818 			cmn_err(CE_CONT, "chan %lu : lsession %lu : "
4819 			    "psession %lu\n", ldcp->ldc_id,
4820 			    ldcp->local_session, ldcp->peer_session);
4821 
4822 			cmn_err(CE_CONT, "Inbound lane:\n");
4823 			display_lane(&ldcp->lane_in);
4824 			cmn_err(CE_CONT, "Outbound lane:\n");
4825 			display_lane(&ldcp->lane_out);
4826 		}
4827 		RW_EXIT(&plist->lockrw);
4828 	}
4829 	cmn_err(CE_NOTE, "***** system state *****");
4830 }
4831 
4832 static void
4833 display_lane(lane_t *lp)
4834 {
4835 	dring_info_t	*drp = lp->dringp;
4836 
4837 	cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n",
4838 	    lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu);
4839 	cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n",
4840 	    lp->addr_type, lp->addr, lp->xfer_mode);
4841 	cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp);
4842 
4843 	cmn_err(CE_CONT, "Dring info:\n");
4844 	cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n",
4845 	    drp->num_descriptors, drp->descriptor_size);
4846 	cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->dring_handle);
4847 	cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n",
4848 	    (uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr);
4849 	cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n",
4850 	    drp->ident, drp->end_idx);
4851 	display_ring(drp);
4852 }
4853 
4854 static void
4855 display_ring(dring_info_t *dringp)
4856 {
4857 	uint64_t		i;
4858 	uint64_t		priv_count = 0;
4859 	uint64_t		pub_count = 0;
4860 	vnet_public_desc_t	*pub_addr = NULL;
4861 	vsw_private_desc_t	*priv_addr = NULL;
4862 
4863 	for (i = 0; i < vsw_num_descriptors; i++) {
4864 		if (dringp->pub_addr != NULL) {
4865 			pub_addr = (vnet_public_desc_t *)dringp->pub_addr + i;
4866 
4867 			if (pub_addr->hdr.dstate == VIO_DESC_FREE)
4868 				pub_count++;
4869 		}
4870 
4871 		if (dringp->priv_addr != NULL) {
4872 			priv_addr = (vsw_private_desc_t *)dringp->priv_addr + i;
4873 
4874 			if (priv_addr->dstate == VIO_DESC_FREE)
4875 				priv_count++;
4876 		}
4877 	}
4878 	cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n",
4879 	    i, priv_count, pub_count);
4880 }
4881 
4882 static void
4883 dump_flags(uint64_t state)
4884 {
4885 	int	i;
4886 
4887 	typedef struct flag_name {
4888 		int	flag_val;
4889 		char	*flag_name;
4890 	} flag_name_t;
4891 
4892 	flag_name_t	flags[] = {
4893 		VSW_VER_INFO_SENT, "VSW_VER_INFO_SENT",
4894 		VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV",
4895 		VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV",
4896 		VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT",
4897 		VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV",
4898 		VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT",
4899 		VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT",
4900 		VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV",
4901 		VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT",
4902 		VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV",
4903 		VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT",
4904 		VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV",
4905 		VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT",
4906 		VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV",
4907 		VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT",
4908 		VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV",
4909 		VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT",
4910 		VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV",
4911 		VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT",
4912 		VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV",
4913 		VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT",
4914 		VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV",
4915 		VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT",
4916 		VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV",
4917 		VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT",
4918 		VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV",
4919 		VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT",
4920 		VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV",
4921 		VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT",
4922 		VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV",
4923 		VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE"};
4924 
4925 	DERR(NULL, "DUMP_FLAGS: %llx\n", state);
4926 	for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) {
4927 		if (state & flags[i].flag_val)
4928 			DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name);
4929 	}
4930 }
4931