xref: /illumos-gate/usr/src/uts/sun4v/io/vsw_ldc.c (revision ab5a7454a6d76e82a121d74c74d5589cc3d37a8f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/errno.h>
29 #include <sys/debug.h>
30 #include <sys/time.h>
31 #include <sys/sysmacros.h>
32 #include <sys/systm.h>
33 #include <sys/user.h>
34 #include <sys/stropts.h>
35 #include <sys/stream.h>
36 #include <sys/strlog.h>
37 #include <sys/strsubr.h>
38 #include <sys/cmn_err.h>
39 #include <sys/cpu.h>
40 #include <sys/kmem.h>
41 #include <sys/conf.h>
42 #include <sys/ddi.h>
43 #include <sys/sunddi.h>
44 #include <sys/ksynch.h>
45 #include <sys/stat.h>
46 #include <sys/kstat.h>
47 #include <sys/vtrace.h>
48 #include <sys/strsun.h>
49 #include <sys/dlpi.h>
50 #include <sys/ethernet.h>
51 #include <net/if.h>
52 #include <sys/varargs.h>
53 #include <sys/machsystm.h>
54 #include <sys/modctl.h>
55 #include <sys/modhash.h>
56 #include <sys/mac.h>
57 #include <sys/mac_ether.h>
58 #include <sys/taskq.h>
59 #include <sys/note.h>
60 #include <sys/mach_descrip.h>
61 #include <sys/mdeg.h>
62 #include <sys/ldc.h>
63 #include <sys/vsw_fdb.h>
64 #include <sys/vsw.h>
65 #include <sys/vio_mailbox.h>
66 #include <sys/vnet_mailbox.h>
67 #include <sys/vnet_common.h>
68 #include <sys/vio_util.h>
69 #include <sys/sdt.h>
70 #include <sys/atomic.h>
71 #include <sys/callb.h>
72 #include <sys/vlan.h>
73 
74 /* Port add/deletion/etc routines */
75 static	void vsw_port_delete(vsw_port_t *port);
76 static	int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id);
77 static	void vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id);
78 static	int vsw_init_ldcs(vsw_port_t *port);
79 static	void vsw_uninit_ldcs(vsw_port_t *port);
80 static	int vsw_ldc_init(vsw_ldc_t *ldcp);
81 static	void vsw_ldc_uninit(vsw_ldc_t *ldcp);
82 static	void vsw_drain_ldcs(vsw_port_t *port);
83 static	void vsw_drain_port_taskq(vsw_port_t *port);
84 static	void vsw_marker_task(void *);
85 static	int vsw_plist_del_node(vsw_t *, vsw_port_t *port);
86 void vsw_detach_ports(vsw_t *vswp);
87 int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
88 mcst_addr_t *vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr);
89 int vsw_port_detach(vsw_t *vswp, int p_instance);
90 int vsw_portsend(vsw_port_t *port, mblk_t *mp);
91 int vsw_port_attach(vsw_port_t *portp);
92 vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);
93 void vsw_vlan_unaware_port_reset(vsw_port_t *portp);
94 int vsw_send_msg(vsw_ldc_t *, void *, int, boolean_t);
95 void vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate);
96 void vsw_reset_ports(vsw_t *vswp);
97 void vsw_port_reset(vsw_port_t *portp);
98 void vsw_physlink_update_ports(vsw_t *vswp);
99 static	void vsw_port_physlink_update(vsw_port_t *portp);
100 
101 /* Interrupt routines */
102 static	uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg);
103 
104 /* Handshake routines */
105 static	void vsw_ldc_reinit(vsw_ldc_t *);
106 static	void vsw_process_conn_evt(vsw_ldc_t *, uint16_t);
107 static	void vsw_conn_task(void *);
108 static	int vsw_check_flag(vsw_ldc_t *, int, uint64_t);
109 static	void vsw_next_milestone(vsw_ldc_t *);
110 static	int vsw_supported_version(vio_ver_msg_t *);
111 static	void vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp);
112 static	void vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp);
113 
114 /* Data processing routines */
115 static void vsw_process_pkt(void *);
116 static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t *);
117 static void vsw_process_ctrl_pkt(void *);
118 static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *);
119 static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *);
120 static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *);
121 static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *);
122 static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *);
123 static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *);
124 static void vsw_process_physlink_msg(vsw_ldc_t *, void *);
125 static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *,
126 	uint32_t);
127 static void vsw_process_data_dring_pkt(vsw_ldc_t *, void *);
128 static void vsw_process_pkt_data_nop(void *, void *, uint32_t);
129 static void vsw_process_pkt_data(void *, void *, uint32_t);
130 static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *);
131 static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *);
132 
133 /* Switching/data transmit routines */
134 static	int vsw_dringsend(vsw_ldc_t *, mblk_t *);
135 static	int vsw_descrsend(vsw_ldc_t *, mblk_t *);
136 static void vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp);
137 static int vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries);
138 static int vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count);
139 static int vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count);
140 
141 /* Packet creation routines */
142 static void vsw_send_ver(void *);
143 static void vsw_send_attr(vsw_ldc_t *);
144 static vio_dring_reg_msg_t *vsw_create_dring_info_pkt(vsw_ldc_t *);
145 static void vsw_send_dring_info(vsw_ldc_t *);
146 static void vsw_send_rdx(vsw_ldc_t *);
147 static void vsw_send_physlink_msg(vsw_ldc_t *ldcp, link_state_t plink_state);
148 
149 /* Dring routines */
150 static dring_info_t *vsw_create_dring(vsw_ldc_t *);
151 static void vsw_create_privring(vsw_ldc_t *);
152 static int vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp);
153 static int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **,
154     int *);
155 static dring_info_t *vsw_ident2dring(lane_t *, uint64_t);
156 static int vsw_reclaim_dring(dring_info_t *dp, int start);
157 
158 static void vsw_set_lane_attr(vsw_t *, lane_t *);
159 static int vsw_check_attr(vnet_attr_msg_t *, vsw_ldc_t *);
160 static int vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg);
161 static int vsw_mem_cookie_match(ldc_mem_cookie_t *, ldc_mem_cookie_t *);
162 static int vsw_check_dring_info(vio_dring_reg_msg_t *);
163 
164 /* Rcv/Tx thread routines */
165 static void vsw_stop_tx_thread(vsw_ldc_t *ldcp);
166 static void vsw_ldc_tx_worker(void *arg);
167 static void vsw_stop_rx_thread(vsw_ldc_t *ldcp);
168 static void vsw_ldc_rx_worker(void *arg);
169 
170 /* Misc support routines */
171 static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t);
172 static void vsw_free_ring(dring_info_t *);
173 static void vsw_save_lmacaddr(vsw_t *vswp, uint64_t macaddr);
174 static int vsw_get_same_dest_list(struct ether_header *ehp,
175     mblk_t **rhead, mblk_t **rtail, mblk_t **mpp);
176 static mblk_t *vsw_dupmsgchain(mblk_t *mp);
177 
178 /* Debugging routines */
179 static void dump_flags(uint64_t);
180 static void display_state(void);
181 static void display_lane(lane_t *);
182 static void display_ring(dring_info_t *);
183 
184 /*
185  * Functions imported from other files.
186  */
187 extern int vsw_set_hw(vsw_t *, vsw_port_t *, int);
188 extern void vsw_unset_hw(vsw_t *, vsw_port_t *, int);
189 extern int vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port);
190 extern void vsw_del_mcst_port(vsw_port_t *port);
191 extern int vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
192 extern int vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
193 extern void vsw_fdbe_add(vsw_t *vswp, void *port);
194 extern void vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr);
195 extern void vsw_create_vlans(void *arg, int type);
196 extern void vsw_destroy_vlans(void *arg, int type);
197 extern void vsw_vlan_add_ids(void *arg, int type);
198 extern void vsw_vlan_remove_ids(void *arg, int type);
199 extern boolean_t vsw_frame_lookup_vid(void *arg, int caller,
200 	struct ether_header *ehp, uint16_t *vidp);
201 extern mblk_t *vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp);
202 extern uint32_t vsw_vlan_frame_untag(void *arg, int type, mblk_t **np,
203 	mblk_t **npt);
204 extern boolean_t vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid);
205 extern void vsw_hio_start(vsw_t *vswp, vsw_ldc_t *ldcp);
206 extern void vsw_hio_stop(vsw_t *vswp, vsw_ldc_t *ldcp);
207 extern void vsw_process_dds_msg(vsw_t *vswp, vsw_ldc_t *ldcp, void *msg);
208 extern void vsw_hio_stop_port(vsw_port_t *portp);
209 extern void vsw_publish_macaddr(vsw_t *vswp, vsw_port_t *portp);
210 extern int vsw_mac_client_init(vsw_t *vswp, vsw_port_t *port, int type);
211 extern void vsw_mac_client_cleanup(vsw_t *vswp, vsw_port_t *port, int type);
212 extern void vsw_destroy_rxpools(void *arg);
213 
214 #define	VSW_NUM_VMPOOLS		3	/* number of vio mblk pools */
215 
216 /*
217  * Tunables used in this file.
218  */
219 extern int vsw_num_handshakes;
220 extern int vsw_wretries;
221 extern int vsw_desc_delay;
222 extern int vsw_read_attempts;
223 extern int vsw_ldc_tx_delay;
224 extern int vsw_ldc_tx_retries;
225 extern int vsw_ldc_retries;
226 extern int vsw_ldc_delay;
227 extern boolean_t vsw_ldc_rxthr_enabled;
228 extern boolean_t vsw_ldc_txthr_enabled;
229 extern uint32_t vsw_ntxds;
230 extern uint32_t vsw_max_tx_qcount;
231 extern uint32_t vsw_chain_len;
232 extern uint32_t vsw_mblk_size1;
233 extern uint32_t vsw_mblk_size2;
234 extern uint32_t vsw_mblk_size3;
235 extern uint32_t vsw_mblk_size4;
236 extern uint32_t vsw_num_mblks1;
237 extern uint32_t vsw_num_mblks2;
238 extern uint32_t vsw_num_mblks3;
239 extern uint32_t vsw_num_mblks4;
240 extern boolean_t vsw_obp_ver_proto_workaround;
241 extern uint32_t vsw_publish_macaddr_count;
242 extern boolean_t vsw_jumbo_rxpools;
243 
/*
 * Acquire / release the three per-channel locks as a group.  Acquisition
 * order is callback lock, then rx lock, then tx lock; release is the
 * reverse.
 *
 * Each macro is wrapped in do { } while (0) so that it expands to exactly
 * one statement.  Without the wrapper only the first mutex call would be
 * controlled by an unbraced if/else or loop.  Invoke as a statement with a
 * trailing semicolon: LDC_ENTER_LOCK(ldcp);
 */
#define	LDC_ENTER_LOCK(ldcp)	\
	do {						\
		mutex_enter(&((ldcp)->ldc_cblock));	\
		mutex_enter(&((ldcp)->ldc_rxlock));	\
		mutex_enter(&((ldcp)->ldc_txlock));	\
	} while (/* CONSTCOND */ 0)
#define	LDC_EXIT_LOCK(ldcp)	\
	do {						\
		mutex_exit(&((ldcp)->ldc_txlock));	\
		mutex_exit(&((ldcp)->ldc_rxlock));	\
		mutex_exit(&((ldcp)->ldc_cblock));	\
	} while (/* CONSTCOND */ 0)
252 
253 #define	VSW_VER_EQ(ldcp, major, minor)	\
254 	((ldcp)->lane_out.ver_major == (major) &&	\
255 	    (ldcp)->lane_out.ver_minor == (minor))
256 
257 #define	VSW_VER_LT(ldcp, major, minor)	\
258 	(((ldcp)->lane_out.ver_major < (major)) ||	\
259 	    ((ldcp)->lane_out.ver_major == (major) &&	\
260 	    (ldcp)->lane_out.ver_minor < (minor)))
261 
262 #define	VSW_VER_GTEQ(ldcp, major, minor)	\
263 	(((ldcp)->lane_out.ver_major > (major)) ||	\
264 	    ((ldcp)->lane_out.ver_major == (major) &&	\
265 	    (ldcp)->lane_out.ver_minor >= (minor)))
266 
267 /*
268  * VIO Protocol Version Info:
269  *
270  * The version specified below represents the version of protocol currently
271  * supported in the driver. It means the driver can negotiate with peers with
272  * versions <= this version. Here is a summary of the feature(s) that are
273  * supported at each version of the protocol:
274  *
275  * 1.0			Basic VIO protocol.
276  * 1.1			vDisk protocol update (no virtual network update).
277  * 1.2			Support for priority frames (priority-ether-types).
278  * 1.3			VLAN and HybridIO support.
279  * 1.4			Jumbo Frame support.
280  * 1.5			Link State Notification support with optional support
281  * 			for Physical Link information.
282  */
283 static	ver_sup_t	vsw_versions[] = { {1, 5} };
284 
285 /*
286  * For the moment the state dump routines have their own
287  * private flag.
288  */
289 #define	DUMP_STATE	0
290 
291 #if DUMP_STATE
292 
293 #define	DUMP_TAG(tag) \
294 {			\
295 	D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype); \
296 	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype);	\
297 	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env);	\
298 }
299 
300 #define	DUMP_TAG_PTR(tag) \
301 {			\
302 	D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype); \
303 	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype);	\
304 	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env);	\
305 }
306 
307 #define	DUMP_FLAGS(flags) dump_flags(flags);
308 #define	DISPLAY_STATE()	display_state()
309 
310 #else
311 
312 #define	DUMP_TAG(tag)
313 #define	DUMP_TAG_PTR(tag)
314 #define	DUMP_FLAGS(state)
315 #define	DISPLAY_STATE()
316 
317 #endif	/* DUMP_STATE */
318 
319 /*
320  * Attach the specified port.
321  *
322  * Returns 0 on success, 1 on failure.
323  */
324 int
325 vsw_port_attach(vsw_port_t *port)
326 {
327 	vsw_t			*vswp = port->p_vswp;
328 	vsw_port_list_t		*plist = &vswp->plist;
329 	vsw_port_t		*p, **pp;
330 	int			i;
331 	int			nids = port->num_ldcs;
332 	uint64_t		*ldcids;
333 	int			rv;
334 
335 	D1(vswp, "%s: enter : port %d", __func__, port->p_instance);
336 
337 	/* port already exists? */
338 	READ_ENTER(&plist->lockrw);
339 	for (p = plist->head; p != NULL; p = p->p_next) {
340 		if (p->p_instance == port->p_instance) {
341 			DWARN(vswp, "%s: port instance %d already attached",
342 			    __func__, p->p_instance);
343 			RW_EXIT(&plist->lockrw);
344 			return (1);
345 		}
346 	}
347 	RW_EXIT(&plist->lockrw);
348 
349 	rw_init(&port->p_ldclist.lockrw, NULL, RW_DRIVER, NULL);
350 
351 	mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL);
352 	mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL);
353 	rw_init(&port->maccl_rwlock, NULL, RW_DRIVER, NULL);
354 
355 	mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL);
356 	cv_init(&port->state_cv, NULL, CV_DRIVER, NULL);
357 	port->state = VSW_PORT_INIT;
358 
359 	D2(vswp, "%s: %d nids", __func__, nids);
360 	ldcids = port->ldc_ids;
361 	for (i = 0; i < nids; i++) {
362 		D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[i]);
363 		if (vsw_ldc_attach(port, (uint64_t)ldcids[i]) != 0) {
364 			DERR(vswp, "%s: ldc_attach failed", __func__);
365 			goto exit_error;
366 		}
367 	}
368 
369 	if (vswp->switching_setup_done == B_TRUE) {
370 		/*
371 		 * If the underlying network device has been setup,
372 		 * then open a mac client and porgram the mac address
373 		 * for this port.
374 		 */
375 		rv = vsw_mac_client_init(vswp, port, VSW_VNETPORT);
376 		if (rv != 0) {
377 			goto exit_error;
378 		}
379 	}
380 
381 	/* create the fdb entry for this port/mac address */
382 	vsw_fdbe_add(vswp, port);
383 
384 	vsw_create_vlans(port, VSW_VNETPORT);
385 
386 	WRITE_ENTER(&plist->lockrw);
387 
388 	/* link it into the list of ports for this vsw instance */
389 	pp = (vsw_port_t **)(&plist->head);
390 	port->p_next = *pp;
391 	*pp = port;
392 	plist->num_ports++;
393 
394 	RW_EXIT(&plist->lockrw);
395 
396 	/*
397 	 * Initialise the port and any ldc's under it.
398 	 */
399 	(void) vsw_init_ldcs(port);
400 
401 	/* announce macaddr of vnet to the physical switch */
402 	if (vsw_publish_macaddr_count != 0) {	/* enabled */
403 		vsw_publish_macaddr(vswp, port);
404 	}
405 
406 	D1(vswp, "%s: exit", __func__);
407 	return (0);
408 
409 exit_error:
410 	rw_destroy(&port->p_ldclist.lockrw);
411 
412 	cv_destroy(&port->state_cv);
413 	mutex_destroy(&port->state_lock);
414 
415 	rw_destroy(&port->maccl_rwlock);
416 	mutex_destroy(&port->tx_lock);
417 	mutex_destroy(&port->mca_lock);
418 	kmem_free(port, sizeof (vsw_port_t));
419 	return (1);
420 }
421 
422 /*
423  * Detach the specified port.
424  *
425  * Returns 0 on success, 1 on failure.
426  */
427 int
428 vsw_port_detach(vsw_t *vswp, int p_instance)
429 {
430 	vsw_port_t	*port = NULL;
431 	vsw_port_list_t	*plist = &vswp->plist;
432 
433 	D1(vswp, "%s: enter: port id %d", __func__, p_instance);
434 
435 	WRITE_ENTER(&plist->lockrw);
436 
437 	if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) {
438 		RW_EXIT(&plist->lockrw);
439 		return (1);
440 	}
441 
442 	if (vsw_plist_del_node(vswp, port)) {
443 		RW_EXIT(&plist->lockrw);
444 		return (1);
445 	}
446 
447 	/* cleanup any HybridIO for this port */
448 	vsw_hio_stop_port(port);
449 
450 	/*
451 	 * No longer need to hold writer lock on port list now
452 	 * that we have unlinked the target port from the list.
453 	 */
454 	RW_EXIT(&plist->lockrw);
455 
456 	/* Cleanup and close the mac client */
457 	vsw_mac_client_cleanup(vswp, port, VSW_VNETPORT);
458 
459 	/* Remove the fdb entry for this port/mac address */
460 	vsw_fdbe_del(vswp, &(port->p_macaddr));
461 	vsw_destroy_vlans(port, VSW_VNETPORT);
462 
463 	/* Remove any multicast addresses.. */
464 	vsw_del_mcst_port(port);
465 
466 	vsw_port_delete(port);
467 
468 	D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance);
469 	return (0);
470 }
471 
472 /*
473  * Detach all active ports.
474  */
475 void
476 vsw_detach_ports(vsw_t *vswp)
477 {
478 	vsw_port_list_t 	*plist = &vswp->plist;
479 	vsw_port_t		*port = NULL;
480 
481 	D1(vswp, "%s: enter", __func__);
482 
483 	WRITE_ENTER(&plist->lockrw);
484 
485 	while ((port = plist->head) != NULL) {
486 		(void) vsw_plist_del_node(vswp, port);
487 
488 		/* cleanup any HybridIO for this port */
489 		vsw_hio_stop_port(port);
490 
491 		/* Cleanup and close the mac client */
492 		vsw_mac_client_cleanup(vswp, port, VSW_VNETPORT);
493 
494 		/* Remove the fdb entry for this port/mac address */
495 		vsw_fdbe_del(vswp, &(port->p_macaddr));
496 		vsw_destroy_vlans(port, VSW_VNETPORT);
497 
498 		/* Remove any multicast addresses.. */
499 		vsw_del_mcst_port(port);
500 
501 		/*
502 		 * No longer need to hold the lock on the port list
503 		 * now that we have unlinked the target port from the
504 		 * list.
505 		 */
506 		RW_EXIT(&plist->lockrw);
507 		vsw_port_delete(port);
508 		WRITE_ENTER(&plist->lockrw);
509 	}
510 	RW_EXIT(&plist->lockrw);
511 
512 	D1(vswp, "%s: exit", __func__);
513 }
514 
515 /*
516  * Delete the specified port.
517  */
518 static void
519 vsw_port_delete(vsw_port_t *port)
520 {
521 	vsw_ldc_list_t 		*ldcl;
522 	vsw_t			*vswp = port->p_vswp;
523 	int			num_ldcs;
524 
525 	D1(vswp, "%s: enter : port id %d", __func__, port->p_instance);
526 
527 	vsw_uninit_ldcs(port);
528 
529 	/*
530 	 * Wait for any pending ctrl msg tasks which reference this
531 	 * port to finish.
532 	 */
533 	vsw_drain_port_taskq(port);
534 
535 	/*
536 	 * Wait for any active callbacks to finish
537 	 */
538 	vsw_drain_ldcs(port);
539 
540 	ldcl = &port->p_ldclist;
541 	num_ldcs = port->num_ldcs;
542 	WRITE_ENTER(&ldcl->lockrw);
543 	while (num_ldcs > 0) {
544 		vsw_ldc_detach(port, ldcl->head->ldc_id);
545 		num_ldcs--;
546 	}
547 	RW_EXIT(&ldcl->lockrw);
548 
549 	rw_destroy(&port->p_ldclist.lockrw);
550 
551 	rw_destroy(&port->maccl_rwlock);
552 	mutex_destroy(&port->mca_lock);
553 	mutex_destroy(&port->tx_lock);
554 
555 	cv_destroy(&port->state_cv);
556 	mutex_destroy(&port->state_lock);
557 
558 	if (port->num_ldcs != 0) {
559 		kmem_free(port->ldc_ids, port->num_ldcs * sizeof (uint64_t));
560 		port->num_ldcs = 0;
561 	}
562 
563 	if (port->nvids != 0) {
564 		kmem_free(port->vids, sizeof (vsw_vlanid_t) * port->nvids);
565 	}
566 
567 	kmem_free(port, sizeof (vsw_port_t));
568 
569 	D1(vswp, "%s: exit", __func__);
570 }
571 
572 static int
573 vsw_init_multipools(vsw_ldc_t *ldcp, vsw_t *vswp)
574 {
575 	size_t		data_sz;
576 	int		rv;
577 	uint32_t	sz1 = 0;
578 	uint32_t	sz2 = 0;
579 	uint32_t	sz3 = 0;
580 	uint32_t	sz4 = 0;
581 
582 	/*
583 	 * We round up the mtu specified to be a multiple of 2K to limit the
584 	 * number of rx buffer pools created for a given mtu.
585 	 */
586 	data_sz = vswp->max_frame_size + VNET_IPALIGN + VNET_LDCALIGN;
587 	data_sz = VNET_ROUNDUP_2K(data_sz);
588 
589 	/*
590 	 * If pool sizes are specified, use them. Note that the presence of
591 	 * the first tunable will be used as a hint.
592 	 */
593 	if (vsw_mblk_size1 != 0) {
594 		sz1 = vsw_mblk_size1;
595 		sz2 = vsw_mblk_size2;
596 		sz3 = vsw_mblk_size3;
597 		sz4 = vsw_mblk_size4;
598 
599 		if (sz4 == 0) { /* need 3 pools */
600 
601 			ldcp->max_rxpool_size = sz3;
602 			rv = vio_init_multipools(&ldcp->vmp,
603 			    VSW_NUM_VMPOOLS, sz1, sz2, sz3,
604 			    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3);
605 
606 		} else {
607 
608 			ldcp->max_rxpool_size = sz4;
609 			rv = vio_init_multipools(&ldcp->vmp,
610 			    VSW_NUM_VMPOOLS + 1, sz1, sz2, sz3, sz4,
611 			    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3,
612 			    vsw_num_mblks4);
613 
614 		}
615 
616 		return (rv);
617 	}
618 
619 	/*
620 	 * Pool sizes are not specified. We select the pool sizes based on the
621 	 * mtu if vnet_jumbo_rxpools is enabled.
622 	 */
623 	if (vsw_jumbo_rxpools == B_FALSE || data_sz == VNET_2K) {
624 		/*
625 		 * Receive buffer pool allocation based on mtu is disabled.
626 		 * Use the default mechanism of standard size pool allocation.
627 		 */
628 		sz1 = VSW_MBLK_SZ_128;
629 		sz2 = VSW_MBLK_SZ_256;
630 		sz3 = VSW_MBLK_SZ_2048;
631 		ldcp->max_rxpool_size = sz3;
632 
633 		rv = vio_init_multipools(&ldcp->vmp, VSW_NUM_VMPOOLS,
634 		    sz1, sz2, sz3,
635 		    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3);
636 
637 		return (rv);
638 	}
639 
640 	switch (data_sz) {
641 
642 	case VNET_4K:
643 
644 		sz1 = VSW_MBLK_SZ_128;
645 		sz2 = VSW_MBLK_SZ_256;
646 		sz3 = VSW_MBLK_SZ_2048;
647 		sz4 = sz3 << 1;			/* 4K */
648 		ldcp->max_rxpool_size = sz4;
649 
650 		rv = vio_init_multipools(&ldcp->vmp, VSW_NUM_VMPOOLS + 1,
651 		    sz1, sz2, sz3, sz4,
652 		    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3,
653 		    vsw_num_mblks4);
654 		break;
655 
656 	default:	/* data_sz:  4K+ to 16K */
657 
658 		sz1 = VSW_MBLK_SZ_256;
659 		sz2 = VSW_MBLK_SZ_2048;
660 		sz3 = data_sz >> 1;	/* Jumbo-size/2 */
661 		sz4 = data_sz;	/* Jumbo-size */
662 		ldcp->max_rxpool_size = sz4;
663 
664 		rv = vio_init_multipools(&ldcp->vmp, VSW_NUM_VMPOOLS + 1,
665 		    sz1, sz2, sz3, sz4,
666 		    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3,
667 		    vsw_num_mblks4);
668 		break;
669 	}
670 
671 	return (rv);
672 
673 }
674 
675 /*
676  * Attach a logical domain channel (ldc) under a specified port.
677  *
678  * Returns 0 on success, 1 on failure.
679  */
680 static int
681 vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id)
682 {
683 	vsw_t 		*vswp = port->p_vswp;
684 	vsw_ldc_list_t *ldcl = &port->p_ldclist;
685 	vsw_ldc_t 	*ldcp = NULL;
686 	ldc_attr_t 	attr;
687 	ldc_status_t	istatus;
688 	int 		status = DDI_FAILURE;
689 	char		kname[MAXNAMELEN];
690 	enum		{ PROG_init = 0x0,
691 			    PROG_callback = 0x1, PROG_rx_thread = 0x2,
692 			    PROG_tx_thread = 0x4}
693 			progress;
694 
695 	progress = PROG_init;
696 
697 	D1(vswp, "%s: enter", __func__);
698 
699 	ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP);
700 	if (ldcp == NULL) {
701 		DERR(vswp, "%s: kmem_zalloc failed", __func__);
702 		return (1);
703 	}
704 	ldcp->ldc_id = ldc_id;
705 
706 	mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL);
707 	mutex_init(&ldcp->ldc_rxlock, NULL, MUTEX_DRIVER, NULL);
708 	mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL);
709 	mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL);
710 	cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL);
711 	rw_init(&ldcp->lane_in.dlistrw, NULL, RW_DRIVER, NULL);
712 	rw_init(&ldcp->lane_out.dlistrw, NULL, RW_DRIVER, NULL);
713 
714 	/* required for handshake with peer */
715 	ldcp->local_session = (uint64_t)ddi_get_lbolt();
716 	ldcp->peer_session = 0;
717 	ldcp->session_status = 0;
718 	ldcp->hss_id = 1;	/* Initial handshake session id */
719 
720 	(void) atomic_swap_32(&port->p_hio_capable, B_FALSE);
721 
722 	/* only set for outbound lane, inbound set by peer */
723 	vsw_set_lane_attr(vswp, &ldcp->lane_out);
724 
725 	attr.devclass = LDC_DEV_NT_SVC;
726 	attr.instance = ddi_get_instance(vswp->dip);
727 	attr.mode = LDC_MODE_UNRELIABLE;
728 	attr.mtu = VSW_LDC_MTU;
729 	status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle);
730 	if (status != 0) {
731 		DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)",
732 		    __func__, ldc_id, status);
733 		goto ldc_attach_fail;
734 	}
735 
736 	if (vsw_ldc_rxthr_enabled) {
737 		ldcp->rx_thr_flags = 0;
738 
739 		mutex_init(&ldcp->rx_thr_lock, NULL, MUTEX_DRIVER, NULL);
740 		cv_init(&ldcp->rx_thr_cv, NULL, CV_DRIVER, NULL);
741 		ldcp->rx_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
742 		    vsw_ldc_rx_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
743 
744 		progress |= PROG_rx_thread;
745 		if (ldcp->rx_thread == NULL) {
746 			DWARN(vswp, "%s(%lld): Failed to create worker thread",
747 			    __func__, ldc_id);
748 			goto ldc_attach_fail;
749 		}
750 	}
751 
752 	if (vsw_ldc_txthr_enabled) {
753 		ldcp->tx_thr_flags = 0;
754 		ldcp->tx_mhead = ldcp->tx_mtail = NULL;
755 
756 		mutex_init(&ldcp->tx_thr_lock, NULL, MUTEX_DRIVER, NULL);
757 		cv_init(&ldcp->tx_thr_cv, NULL, CV_DRIVER, NULL);
758 		ldcp->tx_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
759 		    vsw_ldc_tx_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
760 
761 		progress |= PROG_tx_thread;
762 		if (ldcp->tx_thread == NULL) {
763 			DWARN(vswp, "%s(%lld): Failed to create worker thread",
764 			    __func__, ldc_id);
765 			goto ldc_attach_fail;
766 		}
767 	}
768 
769 	status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp);
770 	if (status != 0) {
771 		DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)",
772 		    __func__, ldc_id, status);
773 		(void) ldc_fini(ldcp->ldc_handle);
774 		goto ldc_attach_fail;
775 	}
776 	/*
777 	 * allocate a message for ldc_read()s, big enough to hold ctrl and
778 	 * data msgs, including raw data msgs used to recv priority frames.
779 	 */
780 	ldcp->msglen = VIO_PKT_DATA_HDRSIZE + vswp->max_frame_size;
781 	ldcp->ldcmsg = kmem_alloc(ldcp->msglen, KM_SLEEP);
782 
783 	progress |= PROG_callback;
784 
785 	mutex_init(&ldcp->status_lock, NULL, MUTEX_DRIVER, NULL);
786 
787 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
788 		DERR(vswp, "%s: ldc_status failed", __func__);
789 		mutex_destroy(&ldcp->status_lock);
790 		goto ldc_attach_fail;
791 	}
792 
793 	ldcp->ldc_status = istatus;
794 	ldcp->ldc_port = port;
795 	ldcp->ldc_vswp = vswp;
796 
797 	vsw_reset_vnet_proto_ops(ldcp);
798 
799 	(void) sprintf(kname, "%sldc0x%lx", DRV_NAME, ldcp->ldc_id);
800 	ldcp->ksp = vgen_setup_kstats(DRV_NAME, vswp->instance,
801 	    kname, &ldcp->ldc_stats);
802 	if (ldcp->ksp == NULL) {
803 		DERR(vswp, "%s: kstats setup failed", __func__);
804 		goto ldc_attach_fail;
805 	}
806 
807 	/* link it into the list of channels for this port */
808 	WRITE_ENTER(&ldcl->lockrw);
809 	ldcp->ldc_next = ldcl->head;
810 	ldcl->head = ldcp;
811 	RW_EXIT(&ldcl->lockrw);
812 
813 	D1(vswp, "%s: exit", __func__);
814 	return (0);
815 
816 ldc_attach_fail:
817 
818 	if (progress & PROG_callback) {
819 		(void) ldc_unreg_callback(ldcp->ldc_handle);
820 		kmem_free(ldcp->ldcmsg, ldcp->msglen);
821 	}
822 
823 	if (progress & PROG_rx_thread) {
824 		if (ldcp->rx_thread != NULL) {
825 			vsw_stop_rx_thread(ldcp);
826 		}
827 		mutex_destroy(&ldcp->rx_thr_lock);
828 		cv_destroy(&ldcp->rx_thr_cv);
829 	}
830 
831 	if (progress & PROG_tx_thread) {
832 		if (ldcp->tx_thread != NULL) {
833 			vsw_stop_tx_thread(ldcp);
834 		}
835 		mutex_destroy(&ldcp->tx_thr_lock);
836 		cv_destroy(&ldcp->tx_thr_cv);
837 	}
838 	if (ldcp->ksp != NULL) {
839 		vgen_destroy_kstats(ldcp->ksp);
840 	}
841 	mutex_destroy(&ldcp->ldc_txlock);
842 	mutex_destroy(&ldcp->ldc_rxlock);
843 	mutex_destroy(&ldcp->ldc_cblock);
844 	mutex_destroy(&ldcp->drain_cv_lock);
845 
846 	cv_destroy(&ldcp->drain_cv);
847 
848 	rw_destroy(&ldcp->lane_in.dlistrw);
849 	rw_destroy(&ldcp->lane_out.dlistrw);
850 
851 	kmem_free(ldcp, sizeof (vsw_ldc_t));
852 
853 	return (1);
854 }
855 
856 /*
857  * Detach a logical domain channel (ldc) belonging to a
858  * particular port.
859  */
860 static void
861 vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id)
862 {
863 	vsw_t 		*vswp = port->p_vswp;
864 	vsw_ldc_t 	*ldcp, *prev_ldcp;
865 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
866 	int 		rv;
867 	int		retries = 0;
868 	vio_mblk_pool_t *fvmp = NULL;
869 
870 	prev_ldcp = ldcl->head;
871 	for (; (ldcp = prev_ldcp) != NULL; prev_ldcp = ldcp->ldc_next) {
872 		if (ldcp->ldc_id == ldc_id) {
873 			break;
874 		}
875 	}
876 
877 	/* specified ldc id not found */
878 	ASSERT(ldcp != NULL);
879 
880 	D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id);
881 
882 	/* Stop the receive thread */
883 	if (ldcp->rx_thread != NULL) {
884 		vsw_stop_rx_thread(ldcp);
885 		mutex_destroy(&ldcp->rx_thr_lock);
886 		cv_destroy(&ldcp->rx_thr_cv);
887 	}
888 	kmem_free(ldcp->ldcmsg, ldcp->msglen);
889 
890 	/* Stop the tx thread */
891 	if (ldcp->tx_thread != NULL) {
892 		vsw_stop_tx_thread(ldcp);
893 		mutex_destroy(&ldcp->tx_thr_lock);
894 		cv_destroy(&ldcp->tx_thr_cv);
895 		if (ldcp->tx_mhead != NULL) {
896 			freemsgchain(ldcp->tx_mhead);
897 			ldcp->tx_mhead = ldcp->tx_mtail = NULL;
898 			ldcp->tx_cnt = 0;
899 		}
900 	}
901 
902 	/* Destory kstats */
903 	vgen_destroy_kstats(ldcp->ksp);
904 
905 	/*
906 	 * Before we can close the channel we must release any mapped
907 	 * resources (e.g. drings).
908 	 */
909 	vsw_free_lane_resources(ldcp, INBOUND);
910 	vsw_free_lane_resources(ldcp, OUTBOUND);
911 
912 	/*
913 	 * Close the channel, retry on EAAGIN.
914 	 */
915 	while ((rv = ldc_close(ldcp->ldc_handle)) == EAGAIN) {
916 		if (++retries > vsw_ldc_retries) {
917 			break;
918 		}
919 		drv_usecwait(vsw_ldc_delay);
920 	}
921 	if (rv != 0) {
922 		cmn_err(CE_NOTE,
923 		    "!vsw%d: Error(%d) closing the channel(0x%lx)\n",
924 		    vswp->instance, rv, ldcp->ldc_id);
925 	}
926 
927 	(void) ldc_fini(ldcp->ldc_handle);
928 
929 	ldcp->ldc_status = LDC_INIT;
930 	ldcp->ldc_handle = NULL;
931 	ldcp->ldc_vswp = NULL;
932 
933 
934 	/*
935 	 * If we can't destroy all the rx pools for this channel, dispatch
936 	 * a task to retry and clean up those rx pools. Note that we don't
937 	 * need to wait for the task to complete. If the vsw device itself
938 	 * gets detached (vsw_detach()), it will wait for the task to complete
939 	 * implicitly in ddi_taskq_destroy().
940 	 */
941 	vio_destroy_multipools(&ldcp->vmp, &fvmp);
942 	if (fvmp != NULL) {
943 		(void) ddi_taskq_dispatch(vswp->rxp_taskq,
944 		    vsw_destroy_rxpools, fvmp, DDI_SLEEP);
945 	}
946 
947 	/* unlink it from the list */
948 	prev_ldcp = ldcp->ldc_next;
949 
950 	mutex_destroy(&ldcp->ldc_txlock);
951 	mutex_destroy(&ldcp->ldc_rxlock);
952 	mutex_destroy(&ldcp->ldc_cblock);
953 	cv_destroy(&ldcp->drain_cv);
954 	mutex_destroy(&ldcp->drain_cv_lock);
955 	mutex_destroy(&ldcp->status_lock);
956 	rw_destroy(&ldcp->lane_in.dlistrw);
957 	rw_destroy(&ldcp->lane_out.dlistrw);
958 
959 	kmem_free(ldcp, sizeof (vsw_ldc_t));
960 }
961 
962 /*
963  * Open and attempt to bring up the channel. Note that channel
964  * can only be brought up if peer has also opened channel.
965  *
966  * Returns 0 if can open and bring up channel, otherwise
967  * returns 1.
968  */
969 static int
970 vsw_ldc_init(vsw_ldc_t *ldcp)
971 {
972 	vsw_t 		*vswp = ldcp->ldc_vswp;
973 	ldc_status_t	istatus = 0;
974 	int		rv;
975 
976 	D1(vswp, "%s: enter", __func__);
977 
978 	LDC_ENTER_LOCK(ldcp);
979 
980 	/* don't start at 0 in case clients don't like that */
981 	ldcp->next_ident = 1;
982 
983 	rv = ldc_open(ldcp->ldc_handle);
984 	if (rv != 0) {
985 		DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)",
986 		    __func__, ldcp->ldc_id, rv);
987 		LDC_EXIT_LOCK(ldcp);
988 		return (1);
989 	}
990 
991 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
992 		DERR(vswp, "%s: unable to get status", __func__);
993 		LDC_EXIT_LOCK(ldcp);
994 		return (1);
995 
996 	} else if (istatus != LDC_OPEN && istatus != LDC_READY) {
997 		DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY",
998 		    __func__, ldcp->ldc_id, istatus);
999 		LDC_EXIT_LOCK(ldcp);
1000 		return (1);
1001 	}
1002 
1003 	mutex_enter(&ldcp->status_lock);
1004 	ldcp->ldc_status = istatus;
1005 	mutex_exit(&ldcp->status_lock);
1006 
1007 	rv = ldc_up(ldcp->ldc_handle);
1008 	if (rv != 0) {
1009 		/*
1010 		 * Not a fatal error for ldc_up() to fail, as peer
1011 		 * end point may simply not be ready yet.
1012 		 */
1013 		D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__,
1014 		    ldcp->ldc_id, rv);
1015 		LDC_EXIT_LOCK(ldcp);
1016 		return (1);
1017 	}
1018 
1019 	/*
1020 	 * ldc_up() call is non-blocking so need to explicitly
1021 	 * check channel status to see if in fact the channel
1022 	 * is UP.
1023 	 */
1024 	mutex_enter(&ldcp->status_lock);
1025 	if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) {
1026 		DERR(vswp, "%s: unable to get status", __func__);
1027 		mutex_exit(&ldcp->status_lock);
1028 		LDC_EXIT_LOCK(ldcp);
1029 		return (1);
1030 
1031 	}
1032 
1033 	if (ldcp->ldc_status == LDC_UP) {
1034 		D2(vswp, "%s: channel %ld now UP (%ld)", __func__,
1035 		    ldcp->ldc_id, istatus);
1036 		mutex_exit(&ldcp->status_lock);
1037 		LDC_EXIT_LOCK(ldcp);
1038 
1039 		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
1040 		return (0);
1041 	}
1042 
1043 	mutex_exit(&ldcp->status_lock);
1044 	LDC_EXIT_LOCK(ldcp);
1045 
1046 	D1(vswp, "%s: exit", __func__);
1047 	return (0);
1048 }
1049 
1050 /* disable callbacks on the channel */
1051 static void
1052 vsw_ldc_uninit(vsw_ldc_t *ldcp)
1053 {
1054 	vsw_t	*vswp = ldcp->ldc_vswp;
1055 	int	rv;
1056 
1057 	D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id);
1058 
1059 	LDC_ENTER_LOCK(ldcp);
1060 
1061 	rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE);
1062 	if (rv != 0) {
1063 		cmn_err(CE_NOTE, "!vsw_ldc_uninit(%ld): error disabling "
1064 		    "interrupts (rv = %d)\n", ldcp->ldc_id, rv);
1065 	}
1066 
1067 	mutex_enter(&ldcp->status_lock);
1068 	ldcp->ldc_status = LDC_INIT;
1069 	mutex_exit(&ldcp->status_lock);
1070 
1071 	LDC_EXIT_LOCK(ldcp);
1072 
1073 	D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id);
1074 }
1075 
1076 static int
1077 vsw_init_ldcs(vsw_port_t *port)
1078 {
1079 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
1080 	vsw_ldc_t	*ldcp;
1081 
1082 	READ_ENTER(&ldcl->lockrw);
1083 	ldcp =  ldcl->head;
1084 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
1085 		(void) vsw_ldc_init(ldcp);
1086 	}
1087 	RW_EXIT(&ldcl->lockrw);
1088 
1089 	return (0);
1090 }
1091 
1092 static void
1093 vsw_uninit_ldcs(vsw_port_t *port)
1094 {
1095 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
1096 	vsw_ldc_t	*ldcp;
1097 
1098 	D1(NULL, "vsw_uninit_ldcs: enter\n");
1099 
1100 	READ_ENTER(&ldcl->lockrw);
1101 	ldcp =  ldcl->head;
1102 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
1103 		vsw_ldc_uninit(ldcp);
1104 	}
1105 	RW_EXIT(&ldcl->lockrw);
1106 
1107 	D1(NULL, "vsw_uninit_ldcs: exit\n");
1108 }
1109 
1110 /*
1111  * Wait until the callback(s) associated with the ldcs under the specified
1112  * port have completed.
1113  *
1114  * Prior to this function being invoked each channel under this port
1115  * should have been quiesced via ldc_set_cb_mode(DISABLE).
1116  *
1117  * A short explaination of what we are doing below..
1118  *
1119  * The simplest approach would be to have a reference counter in
1120  * the ldc structure which is increment/decremented by the callbacks as
1121  * they use the channel. The drain function could then simply disable any
1122  * further callbacks and do a cv_wait for the ref to hit zero. Unfortunately
1123  * there is a tiny window here - before the callback is able to get the lock
1124  * on the channel it is interrupted and this function gets to execute. It
1125  * sees that the ref count is zero and believes its free to delete the
1126  * associated data structures.
1127  *
1128  * We get around this by taking advantage of the fact that before the ldc
1129  * framework invokes a callback it sets a flag to indicate that there is a
1130  * callback active (or about to become active). If when we attempt to
1131  * unregister a callback when this active flag is set then the unregister
1132  * will fail with EWOULDBLOCK.
1133  *
1134  * If the unregister fails we do a cv_timedwait. We will either be signaled
1135  * by the callback as it is exiting (note we have to wait a short period to
1136  * allow the callback to return fully to the ldc framework and it to clear
1137  * the active flag), or by the timer expiring. In either case we again attempt
1138  * the unregister. We repeat this until we can succesfully unregister the
1139  * callback.
1140  *
1141  * The reason we use a cv_timedwait rather than a simple cv_wait is to catch
1142  * the case where the callback has finished but the ldc framework has not yet
1143  * cleared the active flag. In this case we would never get a cv_signal.
1144  */
1145 static void
1146 vsw_drain_ldcs(vsw_port_t *port)
1147 {
1148 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
1149 	vsw_ldc_t	*ldcp;
1150 	vsw_t		*vswp = port->p_vswp;
1151 
1152 	D1(vswp, "%s: enter", __func__);
1153 
1154 	READ_ENTER(&ldcl->lockrw);
1155 
1156 	ldcp = ldcl->head;
1157 
1158 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
1159 		/*
1160 		 * If we can unregister the channel callback then we
1161 		 * know that there is no callback either running or
1162 		 * scheduled to run for this channel so move on to next
1163 		 * channel in the list.
1164 		 */
1165 		mutex_enter(&ldcp->drain_cv_lock);
1166 
1167 		/* prompt active callbacks to quit */
1168 		ldcp->drain_state = VSW_LDC_DRAINING;
1169 
1170 		if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) {
1171 			D2(vswp, "%s: unreg callback for chan %ld", __func__,
1172 			    ldcp->ldc_id);
1173 			mutex_exit(&ldcp->drain_cv_lock);
1174 			continue;
1175 		} else {
1176 			/*
1177 			 * If we end up here we know that either 1) a callback
1178 			 * is currently executing, 2) is about to start (i.e.
1179 			 * the ldc framework has set the active flag but
1180 			 * has not actually invoked the callback yet, or 3)
1181 			 * has finished and has returned to the ldc framework
1182 			 * but the ldc framework has not yet cleared the
1183 			 * active bit.
1184 			 *
1185 			 * Wait for it to finish.
1186 			 */
1187 			while (ldc_unreg_callback(ldcp->ldc_handle)
1188 			    == EWOULDBLOCK)
1189 				(void) cv_reltimedwait(&ldcp->drain_cv,
1190 				    &ldcp->drain_cv_lock, hz, TR_CLOCK_TICK);
1191 
1192 			mutex_exit(&ldcp->drain_cv_lock);
1193 			D2(vswp, "%s: unreg callback for chan %ld after "
1194 			    "timeout", __func__, ldcp->ldc_id);
1195 		}
1196 	}
1197 	RW_EXIT(&ldcl->lockrw);
1198 
1199 	D1(vswp, "%s: exit", __func__);
1200 }
1201 
1202 /*
1203  * Wait until all tasks which reference this port have completed.
1204  *
1205  * Prior to this function being invoked each channel under this port
1206  * should have been quiesced via ldc_set_cb_mode(DISABLE).
1207  */
1208 static void
1209 vsw_drain_port_taskq(vsw_port_t *port)
1210 {
1211 	vsw_t		*vswp = port->p_vswp;
1212 
1213 	D1(vswp, "%s: enter", __func__);
1214 
1215 	/*
1216 	 * Mark the port as in the process of being detached, and
1217 	 * dispatch a marker task to the queue so we know when all
1218 	 * relevant tasks have completed.
1219 	 */
1220 	mutex_enter(&port->state_lock);
1221 	port->state = VSW_PORT_DETACHING;
1222 
1223 	if ((vswp->taskq_p == NULL) ||
1224 	    (ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task,
1225 	    port, DDI_NOSLEEP) != DDI_SUCCESS)) {
1226 		cmn_err(CE_NOTE, "!vsw%d: unable to dispatch marker task",
1227 		    vswp->instance);
1228 		mutex_exit(&port->state_lock);
1229 		return;
1230 	}
1231 
1232 	/*
1233 	 * Wait for the marker task to finish.
1234 	 */
1235 	while (port->state != VSW_PORT_DETACHABLE)
1236 		cv_wait(&port->state_cv, &port->state_lock);
1237 
1238 	mutex_exit(&port->state_lock);
1239 
1240 	D1(vswp, "%s: exit", __func__);
1241 }
1242 
1243 static void
1244 vsw_marker_task(void *arg)
1245 {
1246 	vsw_port_t	*port = arg;
1247 	vsw_t		*vswp = port->p_vswp;
1248 
1249 	D1(vswp, "%s: enter", __func__);
1250 
1251 	mutex_enter(&port->state_lock);
1252 
1253 	/*
1254 	 * No further tasks should be dispatched which reference
1255 	 * this port so ok to mark it as safe to detach.
1256 	 */
1257 	port->state = VSW_PORT_DETACHABLE;
1258 
1259 	cv_signal(&port->state_cv);
1260 
1261 	mutex_exit(&port->state_lock);
1262 
1263 	D1(vswp, "%s: exit", __func__);
1264 }
1265 
1266 vsw_port_t *
1267 vsw_lookup_port(vsw_t *vswp, int p_instance)
1268 {
1269 	vsw_port_list_t *plist = &vswp->plist;
1270 	vsw_port_t	*port;
1271 
1272 	for (port = plist->head; port != NULL; port = port->p_next) {
1273 		if (port->p_instance == p_instance) {
1274 			D2(vswp, "vsw_lookup_port: found p_instance\n");
1275 			return (port);
1276 		}
1277 	}
1278 
1279 	return (NULL);
1280 }
1281 
1282 void
1283 vsw_vlan_unaware_port_reset(vsw_port_t *portp)
1284 {
1285 	vsw_ldc_list_t 	*ldclp;
1286 	vsw_ldc_t	*ldcp;
1287 
1288 	ldclp = &portp->p_ldclist;
1289 
1290 	READ_ENTER(&ldclp->lockrw);
1291 
1292 	/*
1293 	 * NOTE: for now, we will assume we have a single channel.
1294 	 */
1295 	if (ldclp->head == NULL) {
1296 		RW_EXIT(&ldclp->lockrw);
1297 		return;
1298 	}
1299 	ldcp = ldclp->head;
1300 
1301 	mutex_enter(&ldcp->ldc_cblock);
1302 
1303 	/*
1304 	 * If the peer is vlan_unaware(ver < 1.3), reset channel and terminate
1305 	 * the connection. See comments in vsw_set_vnet_proto_ops().
1306 	 */
1307 	if (ldcp->hphase == VSW_MILESTONE4 && VSW_VER_LT(ldcp, 1, 3) &&
1308 	    portp->nvids != 0) {
1309 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1310 	}
1311 
1312 	mutex_exit(&ldcp->ldc_cblock);
1313 
1314 	RW_EXIT(&ldclp->lockrw);
1315 }
1316 
1317 void
1318 vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate)
1319 {
1320 	vsw_ldc_list_t	*ldclp;
1321 	vsw_ldc_t	*ldcp;
1322 
1323 	ldclp = &portp->p_ldclist;
1324 
1325 	READ_ENTER(&ldclp->lockrw);
1326 
1327 	/*
1328 	 * NOTE: for now, we will assume we have a single channel.
1329 	 */
1330 	if (ldclp->head == NULL) {
1331 		RW_EXIT(&ldclp->lockrw);
1332 		return;
1333 	}
1334 	ldcp = ldclp->head;
1335 
1336 	mutex_enter(&ldcp->ldc_cblock);
1337 
1338 	/*
1339 	 * If the peer is HybridIO capable (ver >= 1.3), reset channel
1340 	 * to trigger re-negotiation, which inturn trigger HybridIO
1341 	 * setup/cleanup.
1342 	 */
1343 	if ((ldcp->hphase == VSW_MILESTONE4) &&
1344 	    (portp->p_hio_capable == B_TRUE)) {
1345 		if (immediate == B_TRUE) {
1346 			(void) ldc_down(ldcp->ldc_handle);
1347 		} else {
1348 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1349 		}
1350 	}
1351 
1352 	mutex_exit(&ldcp->ldc_cblock);
1353 
1354 	RW_EXIT(&ldclp->lockrw);
1355 }
1356 
1357 void
1358 vsw_port_reset(vsw_port_t *portp)
1359 {
1360 	vsw_ldc_list_t 	*ldclp;
1361 	vsw_ldc_t	*ldcp;
1362 
1363 	ldclp = &portp->p_ldclist;
1364 
1365 	READ_ENTER(&ldclp->lockrw);
1366 
1367 	/*
1368 	 * NOTE: for now, we will assume we have a single channel.
1369 	 */
1370 	if (ldclp->head == NULL) {
1371 		RW_EXIT(&ldclp->lockrw);
1372 		return;
1373 	}
1374 	ldcp = ldclp->head;
1375 
1376 	mutex_enter(&ldcp->ldc_cblock);
1377 
1378 	/*
1379 	 * reset channel and terminate the connection.
1380 	 */
1381 	vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1382 
1383 	mutex_exit(&ldcp->ldc_cblock);
1384 
1385 	RW_EXIT(&ldclp->lockrw);
1386 }
1387 
1388 void
1389 vsw_reset_ports(vsw_t *vswp)
1390 {
1391 	vsw_port_list_t	*plist = &vswp->plist;
1392 	vsw_port_t	*portp;
1393 
1394 	READ_ENTER(&plist->lockrw);
1395 	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
1396 		if ((portp->p_hio_capable) && (portp->p_hio_enabled)) {
1397 			vsw_hio_stop_port(portp);
1398 		}
1399 		vsw_port_reset(portp);
1400 	}
1401 	RW_EXIT(&plist->lockrw);
1402 }
1403 
1404 static void
1405 vsw_send_physlink_msg(vsw_ldc_t *ldcp, link_state_t plink_state)
1406 {
1407 	vnet_physlink_msg_t	msg;
1408 	vnet_physlink_msg_t	*msgp = &msg;
1409 	uint32_t		physlink_info = 0;
1410 
1411 	if (plink_state == LINK_STATE_UP) {
1412 		physlink_info |= VNET_PHYSLINK_STATE_UP;
1413 	} else {
1414 		physlink_info |= VNET_PHYSLINK_STATE_DOWN;
1415 	}
1416 
1417 	msgp->tag.vio_msgtype = VIO_TYPE_CTRL;
1418 	msgp->tag.vio_subtype = VIO_SUBTYPE_INFO;
1419 	msgp->tag.vio_subtype_env = VNET_PHYSLINK_INFO;
1420 	msgp->tag.vio_sid = ldcp->local_session;
1421 	msgp->physlink_info = physlink_info;
1422 
1423 	(void) vsw_send_msg(ldcp, msgp, sizeof (msg), B_TRUE);
1424 }
1425 
1426 static void
1427 vsw_port_physlink_update(vsw_port_t *portp)
1428 {
1429 	vsw_ldc_list_t 	*ldclp;
1430 	vsw_ldc_t	*ldcp;
1431 	vsw_t		*vswp;
1432 
1433 	vswp = portp->p_vswp;
1434 	ldclp = &portp->p_ldclist;
1435 
1436 	READ_ENTER(&ldclp->lockrw);
1437 
1438 	/*
1439 	 * NOTE: for now, we will assume we have a single channel.
1440 	 */
1441 	if (ldclp->head == NULL) {
1442 		RW_EXIT(&ldclp->lockrw);
1443 		return;
1444 	}
1445 	ldcp = ldclp->head;
1446 
1447 	mutex_enter(&ldcp->ldc_cblock);
1448 
1449 	/*
1450 	 * If handshake has completed successfully and if the vnet device
1451 	 * has negotiated to get physical link state updates, send a message
1452 	 * with the current state.
1453 	 */
1454 	if (ldcp->hphase == VSW_MILESTONE4 && ldcp->pls_negotiated == B_TRUE) {
1455 		vsw_send_physlink_msg(ldcp, vswp->phys_link_state);
1456 	}
1457 
1458 	mutex_exit(&ldcp->ldc_cblock);
1459 
1460 	RW_EXIT(&ldclp->lockrw);
1461 }
1462 
1463 void
1464 vsw_physlink_update_ports(vsw_t *vswp)
1465 {
1466 	vsw_port_list_t	*plist = &vswp->plist;
1467 	vsw_port_t	*portp;
1468 
1469 	READ_ENTER(&plist->lockrw);
1470 	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
1471 		vsw_port_physlink_update(portp);
1472 	}
1473 	RW_EXIT(&plist->lockrw);
1474 }
1475 
1476 /*
1477  * Search for and remove the specified port from the port
1478  * list. Returns 0 if able to locate and remove port, otherwise
1479  * returns 1.
1480  */
1481 static int
1482 vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port)
1483 {
1484 	vsw_port_list_t *plist = &vswp->plist;
1485 	vsw_port_t	*curr_p, *prev_p;
1486 
1487 	if (plist->head == NULL)
1488 		return (1);
1489 
1490 	curr_p = prev_p = plist->head;
1491 
1492 	while (curr_p != NULL) {
1493 		if (curr_p == port) {
1494 			if (prev_p == curr_p) {
1495 				plist->head = curr_p->p_next;
1496 			} else {
1497 				prev_p->p_next = curr_p->p_next;
1498 			}
1499 			plist->num_ports--;
1500 			break;
1501 		} else {
1502 			prev_p = curr_p;
1503 			curr_p = curr_p->p_next;
1504 		}
1505 	}
1506 	return (0);
1507 }
1508 
1509 /*
1510  * Interrupt handler for ldc messages.
1511  */
1512 static uint_t
1513 vsw_ldc_cb(uint64_t event, caddr_t arg)
1514 {
1515 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
1516 	vsw_t 		*vswp = ldcp->ldc_vswp;
1517 
1518 	D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
1519 
1520 	mutex_enter(&ldcp->ldc_cblock);
1521 	ldcp->ldc_stats.callbacks++;
1522 
1523 	mutex_enter(&ldcp->status_lock);
1524 	if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) {
1525 		mutex_exit(&ldcp->status_lock);
1526 		mutex_exit(&ldcp->ldc_cblock);
1527 		return (LDC_SUCCESS);
1528 	}
1529 	mutex_exit(&ldcp->status_lock);
1530 
1531 	if (event & LDC_EVT_UP) {
1532 		/*
1533 		 * Channel has come up.
1534 		 */
1535 		D2(vswp, "%s: id(%ld) event(%llx) UP: status(%ld)",
1536 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1537 
1538 		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
1539 
1540 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
1541 	}
1542 
1543 	if (event & LDC_EVT_READ) {
1544 		/*
1545 		 * Data available for reading.
1546 		 */
1547 		D2(vswp, "%s: id(ld) event(%llx) data READ",
1548 		    __func__, ldcp->ldc_id, event);
1549 
1550 		if (ldcp->rx_thread != NULL) {
1551 			/*
1552 			 * If the receive thread is enabled, then
1553 			 * wakeup the receive thread to process the
1554 			 * LDC messages.
1555 			 */
1556 			mutex_exit(&ldcp->ldc_cblock);
1557 			mutex_enter(&ldcp->rx_thr_lock);
1558 			if (!(ldcp->rx_thr_flags & VSW_WTHR_DATARCVD)) {
1559 				ldcp->rx_thr_flags |= VSW_WTHR_DATARCVD;
1560 				cv_signal(&ldcp->rx_thr_cv);
1561 			}
1562 			mutex_exit(&ldcp->rx_thr_lock);
1563 			mutex_enter(&ldcp->ldc_cblock);
1564 		} else {
1565 			vsw_process_pkt(ldcp);
1566 		}
1567 
1568 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
1569 
1570 		goto vsw_cb_exit;
1571 	}
1572 
1573 	if (event & (LDC_EVT_DOWN | LDC_EVT_RESET)) {
1574 		D2(vswp, "%s: id(%ld) event (%lx) DOWN/RESET: status(%ld)",
1575 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1576 
1577 		vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
1578 	}
1579 
1580 	/*
1581 	 * Catch either LDC_EVT_WRITE which we don't support or any
1582 	 * unknown event.
1583 	 */
1584 	if (event &
1585 	    ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ)) {
1586 		DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)",
1587 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1588 	}
1589 
1590 vsw_cb_exit:
1591 	mutex_exit(&ldcp->ldc_cblock);
1592 
1593 	/*
1594 	 * Let the drain function know we are finishing if it
1595 	 * is waiting.
1596 	 */
1597 	mutex_enter(&ldcp->drain_cv_lock);
1598 	if (ldcp->drain_state == VSW_LDC_DRAINING)
1599 		cv_signal(&ldcp->drain_cv);
1600 	mutex_exit(&ldcp->drain_cv_lock);
1601 
1602 	return (LDC_SUCCESS);
1603 }
1604 
1605 /*
1606  * Reinitialise data structures associated with the channel.
1607  */
1608 static void
1609 vsw_ldc_reinit(vsw_ldc_t *ldcp)
1610 {
1611 	vsw_t		*vswp = ldcp->ldc_vswp;
1612 	vsw_port_t	*port;
1613 	vsw_ldc_list_t	*ldcl;
1614 	vio_mblk_pool_t *fvmp = NULL;
1615 
1616 	D1(vswp, "%s: enter", __func__);
1617 
1618 	/*
1619 	 * If we can't destroy all the rx pools for this channel, dispatch
1620 	 * a task to retry and clean up those rx pools. Note that we don't
1621 	 * need to wait for the task to complete. If the vsw device itself
1622 	 * gets detached (vsw_detach()), it will wait for the task to complete
1623 	 * implicitly in ddi_taskq_destroy().
1624 	 */
1625 	vio_destroy_multipools(&ldcp->vmp, &fvmp);
1626 	if (fvmp != NULL) {
1627 		(void) ddi_taskq_dispatch(vswp->rxp_taskq,
1628 		    vsw_destroy_rxpools, fvmp, DDI_SLEEP);
1629 	}
1630 
1631 	port = ldcp->ldc_port;
1632 	ldcl = &port->p_ldclist;
1633 
1634 	READ_ENTER(&ldcl->lockrw);
1635 
1636 	D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__,
1637 	    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
1638 
1639 	vsw_free_lane_resources(ldcp, INBOUND);
1640 	vsw_free_lane_resources(ldcp, OUTBOUND);
1641 	RW_EXIT(&ldcl->lockrw);
1642 
1643 	ldcp->lane_in.lstate = 0;
1644 	ldcp->lane_out.lstate = 0;
1645 
1646 	/* Remove the fdb entry for this port/mac address */
1647 	vsw_fdbe_del(vswp, &(port->p_macaddr));
1648 
1649 	/* remove the port from vlans it has been assigned to */
1650 	vsw_vlan_remove_ids(port, VSW_VNETPORT);
1651 
1652 	/*
1653 	 * Remove parent port from any multicast groups
1654 	 * it may have registered with. Client must resend
1655 	 * multicast add command after handshake completes.
1656 	 */
1657 	vsw_del_mcst_port(port);
1658 
1659 	ldcp->peer_session = 0;
1660 	ldcp->session_status = 0;
1661 	ldcp->hcnt = 0;
1662 	ldcp->hphase = VSW_MILESTONE0;
1663 
1664 	vsw_reset_vnet_proto_ops(ldcp);
1665 
1666 	D1(vswp, "%s: exit", __func__);
1667 }
1668 
1669 /*
1670  * Process a connection event.
1671  *
1672  * Note - care must be taken to ensure that this function is
1673  * not called with the dlistrw lock held.
1674  */
1675 static void
1676 vsw_process_conn_evt(vsw_ldc_t *ldcp, uint16_t evt)
1677 {
1678 	vsw_t		*vswp = ldcp->ldc_vswp;
1679 	vsw_conn_evt_t	*conn = NULL;
1680 
1681 	D1(vswp, "%s: enter", __func__);
1682 
1683 	/*
1684 	 * Check if either a reset or restart event is pending
1685 	 * or in progress. If so just return.
1686 	 *
1687 	 * A VSW_CONN_RESET event originates either with a LDC_RESET_EVT
1688 	 * being received by the callback handler, or a ECONNRESET error
1689 	 * code being returned from a ldc_read() or ldc_write() call.
1690 	 *
1691 	 * A VSW_CONN_RESTART event occurs when some error checking code
1692 	 * decides that there is a problem with data from the channel,
1693 	 * and that the handshake should be restarted.
1694 	 */
1695 	if (((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART)) &&
1696 	    (ldstub((uint8_t *)&ldcp->reset_active)))
1697 		return;
1698 
1699 	/*
1700 	 * If it is an LDC_UP event we first check the recorded
1701 	 * state of the channel. If this is UP then we know that
1702 	 * the channel moving to the UP state has already been dealt
1703 	 * with and don't need to dispatch a  new task.
1704 	 *
1705 	 * The reason for this check is that when we do a ldc_up(),
1706 	 * depending on the state of the peer, we may or may not get
1707 	 * a LDC_UP event. As we can't depend on getting a LDC_UP evt
1708 	 * every time we do ldc_up() we explicitly check the channel
1709 	 * status to see has it come up (ldc_up() is asynch and will
1710 	 * complete at some undefined time), and take the appropriate
1711 	 * action.
1712 	 *
1713 	 * The flip side of this is that we may get a LDC_UP event
1714 	 * when we have already seen that the channel is up and have
1715 	 * dealt with that.
1716 	 */
1717 	mutex_enter(&ldcp->status_lock);
1718 	if (evt == VSW_CONN_UP) {
1719 		if ((ldcp->ldc_status == LDC_UP) || (ldcp->reset_active != 0)) {
1720 			mutex_exit(&ldcp->status_lock);
1721 			return;
1722 		}
1723 	}
1724 	mutex_exit(&ldcp->status_lock);
1725 
1726 	/*
1727 	 * The transaction group id allows us to identify and discard
1728 	 * any tasks which are still pending on the taskq and refer
1729 	 * to the handshake session we are about to restart or reset.
1730 	 * These stale messages no longer have any real meaning.
1731 	 */
1732 	(void) atomic_inc_32(&ldcp->hss_id);
1733 
1734 	ASSERT(vswp->taskq_p != NULL);
1735 
1736 	if ((conn = kmem_zalloc(sizeof (vsw_conn_evt_t), KM_NOSLEEP)) == NULL) {
1737 		cmn_err(CE_WARN, "!vsw%d: unable to allocate memory for"
1738 		    " connection event", vswp->instance);
1739 		goto err_exit;
1740 	}
1741 
1742 	conn->evt = evt;
1743 	conn->ldcp = ldcp;
1744 
1745 	if (ddi_taskq_dispatch(vswp->taskq_p, vsw_conn_task, conn,
1746 	    DDI_NOSLEEP) != DDI_SUCCESS) {
1747 		cmn_err(CE_WARN, "!vsw%d: Can't dispatch connection task",
1748 		    vswp->instance);
1749 
1750 		kmem_free(conn, sizeof (vsw_conn_evt_t));
1751 		goto err_exit;
1752 	}
1753 
1754 	D1(vswp, "%s: exit", __func__);
1755 	return;
1756 
1757 err_exit:
1758 	/*
1759 	 * Have mostly likely failed due to memory shortage. Clear the flag so
1760 	 * that future requests will at least be attempted and will hopefully
1761 	 * succeed.
1762 	 */
1763 	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
1764 		ldcp->reset_active = 0;
1765 }
1766 
1767 /*
1768  * Deal with events relating to a connection. Invoked from a taskq.
1769  */
1770 static void
1771 vsw_conn_task(void *arg)
1772 {
1773 	vsw_conn_evt_t	*conn = (vsw_conn_evt_t *)arg;
1774 	vsw_ldc_t	*ldcp = NULL;
1775 	vsw_port_t	*portp;
1776 	vsw_t		*vswp = NULL;
1777 	uint16_t	evt;
1778 	ldc_status_t	curr_status;
1779 
1780 	ldcp = conn->ldcp;
1781 	evt = conn->evt;
1782 	vswp = ldcp->ldc_vswp;
1783 	portp = ldcp->ldc_port;
1784 
1785 	D1(vswp, "%s: enter", __func__);
1786 
1787 	/* can safely free now have copied out data */
1788 	kmem_free(conn, sizeof (vsw_conn_evt_t));
1789 
1790 	mutex_enter(&ldcp->status_lock);
1791 	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
1792 		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
1793 		    "channel %ld", vswp->instance, ldcp->ldc_id);
1794 		mutex_exit(&ldcp->status_lock);
1795 		return;
1796 	}
1797 
1798 	/*
1799 	 * If we wish to restart the handshake on this channel, then if
1800 	 * the channel is UP we bring it DOWN to flush the underlying
1801 	 * ldc queue.
1802 	 */
1803 	if ((evt == VSW_CONN_RESTART) && (curr_status == LDC_UP))
1804 		(void) ldc_down(ldcp->ldc_handle);
1805 
1806 	if ((portp->p_hio_capable) && (portp->p_hio_enabled)) {
1807 		vsw_hio_stop(vswp, ldcp);
1808 	}
1809 
1810 	/*
1811 	 * re-init all the associated data structures.
1812 	 */
1813 	vsw_ldc_reinit(ldcp);
1814 
1815 	/*
1816 	 * Bring the channel back up (note it does no harm to
1817 	 * do this even if the channel is already UP, Just
1818 	 * becomes effectively a no-op).
1819 	 */
1820 	(void) ldc_up(ldcp->ldc_handle);
1821 
1822 	/*
1823 	 * Check if channel is now UP. This will only happen if
1824 	 * peer has also done a ldc_up().
1825 	 */
1826 	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
1827 		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
1828 		    "channel %ld", vswp->instance, ldcp->ldc_id);
1829 		mutex_exit(&ldcp->status_lock);
1830 		return;
1831 	}
1832 
1833 	ldcp->ldc_status = curr_status;
1834 
1835 	/* channel UP so restart handshake by sending version info */
1836 	if (curr_status == LDC_UP) {
1837 		if (ldcp->hcnt++ > vsw_num_handshakes) {
1838 			cmn_err(CE_WARN, "!vsw%d: exceeded number of permitted"
1839 			    " handshake attempts (%d) on channel %ld",
1840 			    vswp->instance, ldcp->hcnt, ldcp->ldc_id);
1841 			mutex_exit(&ldcp->status_lock);
1842 			return;
1843 		}
1844 
1845 		if (vsw_obp_ver_proto_workaround == B_FALSE &&
1846 		    (ddi_taskq_dispatch(vswp->taskq_p, vsw_send_ver, ldcp,
1847 		    DDI_NOSLEEP) != DDI_SUCCESS)) {
1848 			cmn_err(CE_WARN, "!vsw%d: Can't dispatch version task",
1849 			    vswp->instance);
1850 
1851 			/*
1852 			 * Don't count as valid restart attempt if couldn't
1853 			 * send version msg.
1854 			 */
1855 			if (ldcp->hcnt > 0)
1856 				ldcp->hcnt--;
1857 		}
1858 	}
1859 
1860 	/*
1861 	 * Mark that the process is complete by clearing the flag.
1862 	 *
1863 	 * Note is it possible that the taskq dispatch above may have failed,
1864 	 * most likely due to memory shortage. We still clear the flag so
1865 	 * future attempts will at least be attempted and will hopefully
1866 	 * succeed.
1867 	 */
1868 	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
1869 		ldcp->reset_active = 0;
1870 
1871 	mutex_exit(&ldcp->status_lock);
1872 
1873 	D1(vswp, "%s: exit", __func__);
1874 }
1875 
1876 /*
1877  * returns 0 if legal for event signified by flag to have
1878  * occured at the time it did. Otherwise returns 1.
1879  */
1880 int
1881 vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag)
1882 {
1883 	vsw_t		*vswp = ldcp->ldc_vswp;
1884 	uint64_t	state;
1885 	uint64_t	phase;
1886 
1887 	if (dir == INBOUND)
1888 		state = ldcp->lane_in.lstate;
1889 	else
1890 		state = ldcp->lane_out.lstate;
1891 
1892 	phase = ldcp->hphase;
1893 
1894 	switch (flag) {
1895 	case VSW_VER_INFO_RECV:
1896 		if (phase > VSW_MILESTONE0) {
1897 			DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV"
1898 			    " when in state %d\n", ldcp->ldc_id, phase);
1899 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1900 			return (1);
1901 		}
1902 		break;
1903 
1904 	case VSW_VER_ACK_RECV:
1905 	case VSW_VER_NACK_RECV:
1906 		if (!(state & VSW_VER_INFO_SENT)) {
1907 			DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK or "
1908 			    "VER_NACK when in state %d\n", ldcp->ldc_id, phase);
1909 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1910 			return (1);
1911 		} else
1912 			state &= ~VSW_VER_INFO_SENT;
1913 		break;
1914 
1915 	case VSW_ATTR_INFO_RECV:
1916 		if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) {
1917 			DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV"
1918 			    " when in state %d\n", ldcp->ldc_id, phase);
1919 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1920 			return (1);
1921 		}
1922 		break;
1923 
1924 	case VSW_ATTR_ACK_RECV:
1925 	case VSW_ATTR_NACK_RECV:
1926 		if (!(state & VSW_ATTR_INFO_SENT)) {
1927 			DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK"
1928 			    " or ATTR_NACK when in state %d\n",
1929 			    ldcp->ldc_id, phase);
1930 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1931 			return (1);
1932 		} else
1933 			state &= ~VSW_ATTR_INFO_SENT;
1934 		break;
1935 
1936 	case VSW_DRING_INFO_RECV:
1937 		if (phase < VSW_MILESTONE1) {
1938 			DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV"
1939 			    " when in state %d\n", ldcp->ldc_id, phase);
1940 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1941 			return (1);
1942 		}
1943 		break;
1944 
1945 	case VSW_DRING_ACK_RECV:
1946 	case VSW_DRING_NACK_RECV:
1947 		if (!(state & VSW_DRING_INFO_SENT)) {
1948 			DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK "
1949 			    " or DRING_NACK when in state %d\n",
1950 			    ldcp->ldc_id, phase);
1951 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1952 			return (1);
1953 		} else
1954 			state &= ~VSW_DRING_INFO_SENT;
1955 		break;
1956 
1957 	case VSW_RDX_INFO_RECV:
1958 		if (phase < VSW_MILESTONE3) {
1959 			DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV"
1960 			    " when in state %d\n", ldcp->ldc_id, phase);
1961 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1962 			return (1);
1963 		}
1964 		break;
1965 
1966 	case VSW_RDX_ACK_RECV:
1967 	case VSW_RDX_NACK_RECV:
1968 		if (!(state & VSW_RDX_INFO_SENT)) {
1969 			DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK or "
1970 			    "RDX_NACK when in state %d\n", ldcp->ldc_id, phase);
1971 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1972 			return (1);
1973 		} else
1974 			state &= ~VSW_RDX_INFO_SENT;
1975 		break;
1976 
1977 	case VSW_MCST_INFO_RECV:
1978 		if (phase < VSW_MILESTONE3) {
1979 			DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV"
1980 			    " when in state %d\n", ldcp->ldc_id, phase);
1981 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1982 			return (1);
1983 		}
1984 		break;
1985 
1986 	default:
1987 		DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)",
1988 		    ldcp->ldc_id, flag);
1989 		return (1);
1990 	}
1991 
1992 	if (dir == INBOUND)
1993 		ldcp->lane_in.lstate = state;
1994 	else
1995 		ldcp->lane_out.lstate = state;
1996 
1997 	D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id);
1998 
1999 	return (0);
2000 }
2001 
2002 void
2003 vsw_next_milestone(vsw_ldc_t *ldcp)
2004 {
2005 	vsw_t		*vswp = ldcp->ldc_vswp;
2006 	vsw_port_t	*portp = ldcp->ldc_port;
2007 
2008 	D1(vswp, "%s (chan %lld): enter (phase %ld)", __func__,
2009 	    ldcp->ldc_id, ldcp->hphase);
2010 
2011 	DUMP_FLAGS(ldcp->lane_in.lstate);
2012 	DUMP_FLAGS(ldcp->lane_out.lstate);
2013 
2014 	switch (ldcp->hphase) {
2015 
2016 	case VSW_MILESTONE0:
2017 		/*
2018 		 * If we haven't started to handshake with our peer,
2019 		 * start to do so now.
2020 		 */
2021 		if (ldcp->lane_out.lstate == 0) {
2022 			D2(vswp, "%s: (chan %lld) starting handshake "
2023 			    "with peer", __func__, ldcp->ldc_id);
2024 			vsw_process_conn_evt(ldcp, VSW_CONN_UP);
2025 		}
2026 
2027 		/*
2028 		 * Only way to pass this milestone is to have successfully
2029 		 * negotiated version info.
2030 		 */
2031 		if ((ldcp->lane_in.lstate & VSW_VER_ACK_SENT) &&
2032 		    (ldcp->lane_out.lstate & VSW_VER_ACK_RECV)) {
2033 
2034 			D2(vswp, "%s: (chan %lld) leaving milestone 0",
2035 			    __func__, ldcp->ldc_id);
2036 
2037 			vsw_set_vnet_proto_ops(ldcp);
2038 
2039 			/*
2040 			 * Next milestone is passed when attribute
2041 			 * information has been successfully exchanged.
2042 			 */
2043 			ldcp->hphase = VSW_MILESTONE1;
2044 			vsw_send_attr(ldcp);
2045 
2046 		}
2047 		break;
2048 
2049 	case VSW_MILESTONE1:
2050 		/*
2051 		 * Only way to pass this milestone is to have successfully
2052 		 * negotiated attribute information.
2053 		 */
2054 		if (ldcp->lane_in.lstate & VSW_ATTR_ACK_SENT) {
2055 
2056 			ldcp->hphase = VSW_MILESTONE2;
2057 
2058 			/*
2059 			 * If the peer device has said it wishes to
2060 			 * use descriptor rings then we send it our ring
2061 			 * info, otherwise we just set up a private ring
2062 			 * which we use an internal buffer
2063 			 */
2064 			if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
2065 			    (ldcp->lane_in.xfer_mode & VIO_DRING_MODE_V1_2)) ||
2066 			    (VSW_VER_LT(ldcp, 1, 2) &&
2067 			    (ldcp->lane_in.xfer_mode ==
2068 			    VIO_DRING_MODE_V1_0))) {
2069 				vsw_send_dring_info(ldcp);
2070 			}
2071 		}
2072 		break;
2073 
2074 	case VSW_MILESTONE2:
2075 		/*
2076 		 * If peer has indicated in its attribute message that
2077 		 * it wishes to use descriptor rings then the only way
2078 		 * to pass this milestone is for us to have received
2079 		 * valid dring info.
2080 		 *
2081 		 * If peer is not using descriptor rings then just fall
2082 		 * through.
2083 		 */
2084 		if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
2085 		    (ldcp->lane_in.xfer_mode & VIO_DRING_MODE_V1_2)) ||
2086 		    (VSW_VER_LT(ldcp, 1, 2) &&
2087 		    (ldcp->lane_in.xfer_mode ==
2088 		    VIO_DRING_MODE_V1_0))) {
2089 			if (!(ldcp->lane_in.lstate & VSW_DRING_ACK_SENT))
2090 				break;
2091 		}
2092 
2093 		D2(vswp, "%s: (chan %lld) leaving milestone 2",
2094 		    __func__, ldcp->ldc_id);
2095 
2096 		ldcp->hphase = VSW_MILESTONE3;
2097 		vsw_send_rdx(ldcp);
2098 		break;
2099 
2100 	case VSW_MILESTONE3:
2101 		/*
2102 		 * Pass this milestone when all paramaters have been
2103 		 * successfully exchanged and RDX sent in both directions.
2104 		 *
2105 		 * Mark outbound lane as available to transmit data.
2106 		 */
2107 		if ((ldcp->lane_out.lstate & VSW_RDX_ACK_SENT) &&
2108 		    (ldcp->lane_in.lstate & VSW_RDX_ACK_RECV)) {
2109 
2110 			D2(vswp, "%s: (chan %lld) leaving milestone 3",
2111 			    __func__, ldcp->ldc_id);
2112 			D2(vswp, "%s: ** handshake complete (0x%llx : "
2113 			    "0x%llx) **", __func__, ldcp->lane_in.lstate,
2114 			    ldcp->lane_out.lstate);
2115 			ldcp->lane_out.lstate |= VSW_LANE_ACTIVE;
2116 			ldcp->hphase = VSW_MILESTONE4;
2117 			ldcp->hcnt = 0;
2118 			DISPLAY_STATE();
2119 			/* Start HIO if enabled and capable */
2120 			if ((portp->p_hio_enabled) && (portp->p_hio_capable)) {
2121 				D2(vswp, "%s: start HybridIO setup", __func__);
2122 				vsw_hio_start(vswp, ldcp);
2123 			}
2124 
2125 			if (ldcp->pls_negotiated == B_TRUE) {
2126 				/*
2127 				 * The vnet device has negotiated to get phys
2128 				 * link updates. Now that the handshake with
2129 				 * the vnet device is complete, send an initial
2130 				 * update with the current physical link state.
2131 				 */
2132 				vsw_send_physlink_msg(ldcp,
2133 				    vswp->phys_link_state);
2134 			}
2135 
2136 		} else {
2137 			D2(vswp, "%s: still in milestone 3 (0x%llx : 0x%llx)",
2138 			    __func__, ldcp->lane_in.lstate,
2139 			    ldcp->lane_out.lstate);
2140 		}
2141 		break;
2142 
2143 	case VSW_MILESTONE4:
2144 		D2(vswp, "%s: (chan %lld) in milestone 4", __func__,
2145 		    ldcp->ldc_id);
2146 		break;
2147 
2148 	default:
2149 		DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__,
2150 		    ldcp->ldc_id, ldcp->hphase);
2151 	}
2152 
2153 	D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id,
2154 	    ldcp->hphase);
2155 }
2156 
2157 /*
2158  * Check if major version is supported.
2159  *
2160  * Returns 0 if finds supported major number, and if necessary
2161  * adjusts the minor field.
2162  *
2163  * Returns 1 if can't match major number exactly. Sets mjor/minor
2164  * to next lowest support values, or to zero if no other values possible.
2165  */
2166 static int
2167 vsw_supported_version(vio_ver_msg_t *vp)
2168 {
2169 	int	i;
2170 
2171 	D1(NULL, "vsw_supported_version: enter");
2172 
2173 	for (i = 0; i < VSW_NUM_VER; i++) {
2174 		if (vsw_versions[i].ver_major == vp->ver_major) {
2175 			/*
2176 			 * Matching or lower major version found. Update
2177 			 * minor number if necessary.
2178 			 */
2179 			if (vp->ver_minor > vsw_versions[i].ver_minor) {
2180 				D2(NULL, "%s: adjusting minor value from %d "
2181 				    "to %d", __func__, vp->ver_minor,
2182 				    vsw_versions[i].ver_minor);
2183 				vp->ver_minor = vsw_versions[i].ver_minor;
2184 			}
2185 
2186 			return (0);
2187 		}
2188 
2189 		/*
2190 		 * If the message contains a higher major version number, set
2191 		 * the message's major/minor versions to the current values
2192 		 * and return false, so this message will get resent with
2193 		 * these values.
2194 		 */
2195 		if (vsw_versions[i].ver_major < vp->ver_major) {
2196 			D2(NULL, "%s: adjusting major and minor "
2197 			    "values to %d, %d\n",
2198 			    __func__, vsw_versions[i].ver_major,
2199 			    vsw_versions[i].ver_minor);
2200 			vp->ver_major = vsw_versions[i].ver_major;
2201 			vp->ver_minor = vsw_versions[i].ver_minor;
2202 			return (1);
2203 		}
2204 	}
2205 
2206 	/* No match was possible, zero out fields */
2207 	vp->ver_major = 0;
2208 	vp->ver_minor = 0;
2209 
2210 	D1(NULL, "vsw_supported_version: exit");
2211 
2212 	return (1);
2213 }
2214 
2215 /*
2216  * Set vnet-protocol-version dependent functions based on version.
2217  */
2218 static void
2219 vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp)
2220 {
2221 	vsw_t	*vswp = ldcp->ldc_vswp;
2222 	lane_t	*lp = &ldcp->lane_out;
2223 
2224 	if (VSW_VER_GTEQ(ldcp, 1, 4)) {
2225 		/*
2226 		 * If the version negotiated with peer is >= 1.4(Jumbo Frame
2227 		 * Support), set the mtu in our attributes to max_frame_size.
2228 		 */
2229 		lp->mtu = vswp->max_frame_size;
2230 	} else if (VSW_VER_EQ(ldcp, 1, 3)) {
2231 		/*
2232 		 * If the version negotiated with peer is == 1.3 (Vlan Tag
2233 		 * Support) set the attr.mtu to ETHERMAX + VLAN_TAGSZ.
2234 		 */
2235 		lp->mtu = ETHERMAX + VLAN_TAGSZ;
2236 	} else {
2237 		vsw_port_t	*portp = ldcp->ldc_port;
2238 		/*
2239 		 * Pre-1.3 peers expect max frame size of ETHERMAX.
2240 		 * We can negotiate that size with those peers provided only
2241 		 * pvid is defined for our peer and there are no vids. Then we
2242 		 * can send/recv only untagged frames of max size ETHERMAX.
2243 		 * Note that pvid of the peer can be different, as vsw has to
2244 		 * serve the vnet in that vlan even if itself is not assigned
2245 		 * to that vlan.
2246 		 */
2247 		if (portp->nvids == 0) {
2248 			lp->mtu = ETHERMAX;
2249 		}
2250 	}
2251 
2252 	if (VSW_VER_GTEQ(ldcp, 1, 2)) {
2253 		/* Versions >= 1.2 */
2254 
2255 		if (VSW_PRI_ETH_DEFINED(vswp)) {
2256 			/*
2257 			 * enable priority routines and pkt mode only if
2258 			 * at least one pri-eth-type is specified in MD.
2259 			 */
2260 			ldcp->tx = vsw_ldctx_pri;
2261 			ldcp->rx_pktdata = vsw_process_pkt_data;
2262 
2263 			/* set xfer mode for vsw_send_attr() */
2264 			lp->xfer_mode = VIO_PKT_MODE | VIO_DRING_MODE_V1_2;
2265 		} else {
2266 			/* no priority eth types defined in MD */
2267 
2268 			ldcp->tx = vsw_ldctx;
2269 			ldcp->rx_pktdata = vsw_process_pkt_data_nop;
2270 
2271 			/* set xfer mode for vsw_send_attr() */
2272 			lp->xfer_mode = VIO_DRING_MODE_V1_2;
2273 		}
2274 
2275 	} else {
2276 		/* Versions prior to 1.2  */
2277 
2278 		vsw_reset_vnet_proto_ops(ldcp);
2279 	}
2280 }
2281 
2282 /*
2283  * Reset vnet-protocol-version dependent functions to v1.0.
2284  */
2285 static void
2286 vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp)
2287 {
2288 	lane_t	*lp = &ldcp->lane_out;
2289 
2290 	ldcp->tx = vsw_ldctx;
2291 	ldcp->rx_pktdata = vsw_process_pkt_data_nop;
2292 
2293 	/* set xfer mode for vsw_send_attr() */
2294 	lp->xfer_mode = VIO_DRING_MODE_V1_0;
2295 }
2296 
2297 /*
2298  * Main routine for processing messages received over LDC.
2299  */
2300 static void
2301 vsw_process_pkt(void *arg)
2302 {
2303 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
2304 	vsw_t 		*vswp = ldcp->ldc_vswp;
2305 	size_t		msglen;
2306 	vio_msg_tag_t	*tagp;
2307 	uint64_t	*ldcmsg;
2308 	int 		rv = 0;
2309 
2310 
2311 	D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
2312 
2313 	ASSERT(MUTEX_HELD(&ldcp->ldc_cblock));
2314 
2315 	ldcmsg = ldcp->ldcmsg;
2316 	/*
2317 	 * If channel is up read messages until channel is empty.
2318 	 */
2319 	do {
2320 		msglen = ldcp->msglen;
2321 		rv = ldc_read(ldcp->ldc_handle, (caddr_t)ldcmsg, &msglen);
2322 
2323 		if (rv != 0) {
2324 			DERR(vswp, "%s :ldc_read err id(%lld) rv(%d) len(%d)\n",
2325 			    __func__, ldcp->ldc_id, rv, msglen);
2326 		}
2327 
2328 		/* channel has been reset */
2329 		if (rv == ECONNRESET) {
2330 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
2331 			break;
2332 		}
2333 
2334 		if (msglen == 0) {
2335 			D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__,
2336 			    ldcp->ldc_id);
2337 			break;
2338 		}
2339 
2340 		D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__,
2341 		    ldcp->ldc_id, msglen);
2342 
2343 		/*
2344 		 * Figure out what sort of packet we have gotten by
2345 		 * examining the msg tag, and then switch it appropriately.
2346 		 */
2347 		tagp = (vio_msg_tag_t *)ldcmsg;
2348 
2349 		switch (tagp->vio_msgtype) {
2350 		case VIO_TYPE_CTRL:
2351 			vsw_dispatch_ctrl_task(ldcp, ldcmsg, tagp);
2352 			break;
2353 		case VIO_TYPE_DATA:
2354 			vsw_process_data_pkt(ldcp, ldcmsg, tagp, msglen);
2355 			break;
2356 		case VIO_TYPE_ERR:
2357 			vsw_process_err_pkt(ldcp, ldcmsg, tagp);
2358 			break;
2359 		default:
2360 			DERR(vswp, "%s: Unknown tag(%lx) ", __func__,
2361 			    "id(%lx)\n", tagp->vio_msgtype, ldcp->ldc_id);
2362 			break;
2363 		}
2364 	} while (msglen);
2365 
2366 	D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id);
2367 }
2368 
2369 /*
2370  * Dispatch a task to process a VIO control message.
2371  */
2372 static void
2373 vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t *tagp)
2374 {
2375 	vsw_ctrl_task_t		*ctaskp = NULL;
2376 	vsw_port_t		*port = ldcp->ldc_port;
2377 	vsw_t			*vswp = port->p_vswp;
2378 
2379 	D1(vswp, "%s: enter", __func__);
2380 
2381 	/*
2382 	 * We need to handle RDX ACK messages in-band as once they
2383 	 * are exchanged it is possible that we will get an
2384 	 * immediate (legitimate) data packet.
2385 	 */
2386 	if ((tagp->vio_subtype_env == VIO_RDX) &&
2387 	    (tagp->vio_subtype == VIO_SUBTYPE_ACK)) {
2388 
2389 		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_ACK_RECV))
2390 			return;
2391 
2392 		ldcp->lane_in.lstate |= VSW_RDX_ACK_RECV;
2393 		D2(vswp, "%s (%ld) handling RDX_ACK in place "
2394 		    "(ostate 0x%llx : hphase %d)", __func__,
2395 		    ldcp->ldc_id, ldcp->lane_in.lstate, ldcp->hphase);
2396 		vsw_next_milestone(ldcp);
2397 		return;
2398 	}
2399 
2400 	ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP);
2401 
2402 	if (ctaskp == NULL) {
2403 		DERR(vswp, "%s: unable to alloc space for ctrl msg", __func__);
2404 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2405 		return;
2406 	}
2407 
2408 	ctaskp->ldcp = ldcp;
2409 	bcopy((def_msg_t *)cpkt, &ctaskp->pktp, sizeof (def_msg_t));
2410 	ctaskp->hss_id = ldcp->hss_id;
2411 
2412 	/*
2413 	 * Dispatch task to processing taskq if port is not in
2414 	 * the process of being detached.
2415 	 */
2416 	mutex_enter(&port->state_lock);
2417 	if (port->state == VSW_PORT_INIT) {
2418 		if ((vswp->taskq_p == NULL) ||
2419 		    (ddi_taskq_dispatch(vswp->taskq_p, vsw_process_ctrl_pkt,
2420 		    ctaskp, DDI_NOSLEEP) != DDI_SUCCESS)) {
2421 			mutex_exit(&port->state_lock);
2422 			DERR(vswp, "%s: unable to dispatch task to taskq",
2423 			    __func__);
2424 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2425 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2426 			return;
2427 		}
2428 	} else {
2429 		kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2430 		DWARN(vswp, "%s: port %d detaching, not dispatching "
2431 		    "task", __func__, port->p_instance);
2432 	}
2433 
2434 	mutex_exit(&port->state_lock);
2435 
2436 	D2(vswp, "%s: dispatched task to taskq for chan %d", __func__,
2437 	    ldcp->ldc_id);
2438 	D1(vswp, "%s: exit", __func__);
2439 }
2440 
2441 /*
2442  * Process a VIO ctrl message. Invoked from taskq.
2443  */
2444 static void
2445 vsw_process_ctrl_pkt(void *arg)
2446 {
2447 	vsw_ctrl_task_t	*ctaskp = (vsw_ctrl_task_t *)arg;
2448 	vsw_ldc_t	*ldcp = ctaskp->ldcp;
2449 	vsw_t 		*vswp = ldcp->ldc_vswp;
2450 	vio_msg_tag_t	tag;
2451 	uint16_t	env;
2452 
2453 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2454 
2455 	bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t));
2456 	env = tag.vio_subtype_env;
2457 
2458 	/* stale pkt check */
2459 	if (ctaskp->hss_id < ldcp->hss_id) {
2460 		DWARN(vswp, "%s: discarding stale packet belonging to earlier"
2461 		    " (%ld) handshake session", __func__, ctaskp->hss_id);
2462 		kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2463 		return;
2464 	}
2465 
2466 	/* session id check */
2467 	if (ldcp->session_status & VSW_PEER_SESSION) {
2468 		if (ldcp->peer_session != tag.vio_sid) {
2469 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
2470 			    __func__, ldcp->ldc_id, tag.vio_sid);
2471 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2472 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2473 			return;
2474 		}
2475 	}
2476 
2477 	/*
2478 	 * Switch on vio_subtype envelope, then let lower routines
2479 	 * decide if its an INFO, ACK or NACK packet.
2480 	 */
2481 	switch (env) {
2482 	case VIO_VER_INFO:
2483 		vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp);
2484 		break;
2485 	case VIO_DRING_REG:
2486 		vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp);
2487 		break;
2488 	case VIO_DRING_UNREG:
2489 		vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp);
2490 		break;
2491 	case VIO_ATTR_INFO:
2492 		vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp);
2493 		break;
2494 	case VNET_MCAST_INFO:
2495 		vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp);
2496 		break;
2497 	case VIO_RDX:
2498 		vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp);
2499 		break;
2500 	case VIO_DDS_INFO:
2501 		vsw_process_dds_msg(vswp, ldcp, &ctaskp->pktp);
2502 		break;
2503 
2504 	case VNET_PHYSLINK_INFO:
2505 		vsw_process_physlink_msg(ldcp, &ctaskp->pktp);
2506 		break;
2507 	default:
2508 		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env);
2509 	}
2510 
2511 	kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2512 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
2513 }
2514 
2515 /*
2516  * Version negotiation. We can end up here either because our peer
2517  * has responded to a handshake message we have sent it, or our peer
2518  * has initiated a handshake with us. If its the former then can only
2519  * be ACK or NACK, if its the later can only be INFO.
2520  *
2521  * If its an ACK we move to the next stage of the handshake, namely
2522  * attribute exchange. If its a NACK we see if we can specify another
2523  * version, if we can't we stop.
2524  *
2525  * If it is an INFO we reset all params associated with communication
2526  * in that direction over this channel (remember connection is
2527  * essentially 2 independent simplex channels).
2528  */
2529 void
2530 vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt)
2531 {
2532 	vio_ver_msg_t	*ver_pkt;
2533 	vsw_t 		*vswp = ldcp->ldc_vswp;
2534 
2535 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2536 
2537 	/*
2538 	 * We know this is a ctrl/version packet so
2539 	 * cast it into the correct structure.
2540 	 */
2541 	ver_pkt = (vio_ver_msg_t *)pkt;
2542 
2543 	switch (ver_pkt->tag.vio_subtype) {
2544 	case VIO_SUBTYPE_INFO:
2545 		D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n");
2546 
2547 		/*
2548 		 * Record the session id, which we will use from now
2549 		 * until we see another VER_INFO msg. Even then the
2550 		 * session id in most cases will be unchanged, execpt
2551 		 * if channel was reset.
2552 		 */
2553 		if ((ldcp->session_status & VSW_PEER_SESSION) &&
2554 		    (ldcp->peer_session != ver_pkt->tag.vio_sid)) {
2555 			DERR(vswp, "%s: updating session id for chan %lld "
2556 			    "from %llx to %llx", __func__, ldcp->ldc_id,
2557 			    ldcp->peer_session, ver_pkt->tag.vio_sid);
2558 		}
2559 
2560 		ldcp->peer_session = ver_pkt->tag.vio_sid;
2561 		ldcp->session_status |= VSW_PEER_SESSION;
2562 
2563 		/* Legal message at this time ? */
2564 		if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV))
2565 			return;
2566 
2567 		/*
2568 		 * First check the device class. Currently only expect
2569 		 * to be talking to a network device. In the future may
2570 		 * also talk to another switch.
2571 		 */
2572 		if (ver_pkt->dev_class != VDEV_NETWORK) {
2573 			DERR(vswp, "%s: illegal device class %d", __func__,
2574 			    ver_pkt->dev_class);
2575 
2576 			ver_pkt->tag.vio_sid = ldcp->local_session;
2577 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2578 
2579 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2580 
2581 			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2582 			    sizeof (vio_ver_msg_t), B_TRUE);
2583 
2584 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
2585 			vsw_next_milestone(ldcp);
2586 			return;
2587 		} else {
2588 			ldcp->dev_class = ver_pkt->dev_class;
2589 		}
2590 
2591 		/*
2592 		 * Now check the version.
2593 		 */
2594 		if (vsw_supported_version(ver_pkt) == 0) {
2595 			/*
2596 			 * Support this major version and possibly
2597 			 * adjusted minor version.
2598 			 */
2599 
2600 			D2(vswp, "%s: accepted ver %d:%d", __func__,
2601 			    ver_pkt->ver_major, ver_pkt->ver_minor);
2602 
2603 			/* Store accepted values */
2604 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
2605 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
2606 
2607 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2608 
2609 			ldcp->lane_in.lstate |= VSW_VER_ACK_SENT;
2610 
2611 			if (vsw_obp_ver_proto_workaround == B_TRUE) {
2612 				/*
2613 				 * Send a version info message
2614 				 * using the accepted version that
2615 				 * we are about to ack. Also note that
2616 				 * we send our ver info before we ack.
2617 				 * Otherwise, as soon as receiving the
2618 				 * ack, obp sends attr info msg, which
2619 				 * breaks vsw_check_flag() invoked
2620 				 * from vsw_process_ctrl_attr_pkt();
2621 				 * as we also need VSW_VER_ACK_RECV to
2622 				 * be set in lane_out.lstate, before
2623 				 * we can receive attr info.
2624 				 */
2625 				vsw_send_ver(ldcp);
2626 			}
2627 		} else {
2628 			/*
2629 			 * NACK back with the next lower major/minor
2630 			 * pairing we support (if don't suuport any more
2631 			 * versions then they will be set to zero.
2632 			 */
2633 
2634 			D2(vswp, "%s: replying with ver %d:%d", __func__,
2635 			    ver_pkt->ver_major, ver_pkt->ver_minor);
2636 
2637 			/* Store updated values */
2638 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
2639 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
2640 
2641 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2642 
2643 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
2644 		}
2645 
2646 		DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2647 		ver_pkt->tag.vio_sid = ldcp->local_session;
2648 		(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2649 		    sizeof (vio_ver_msg_t), B_TRUE);
2650 
2651 		vsw_next_milestone(ldcp);
2652 		break;
2653 
2654 	case VIO_SUBTYPE_ACK:
2655 		D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__);
2656 
2657 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV))
2658 			return;
2659 
2660 		/* Store updated values */
2661 		ldcp->lane_out.ver_major = ver_pkt->ver_major;
2662 		ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
2663 
2664 		ldcp->lane_out.lstate |= VSW_VER_ACK_RECV;
2665 		vsw_next_milestone(ldcp);
2666 
2667 		break;
2668 
2669 	case VIO_SUBTYPE_NACK:
2670 		D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__);
2671 
2672 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV))
2673 			return;
2674 
2675 		/*
2676 		 * If our peer sent us a NACK with the ver fields set to
2677 		 * zero then there is nothing more we can do. Otherwise see
2678 		 * if we support either the version suggested, or a lesser
2679 		 * one.
2680 		 */
2681 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
2682 			DERR(vswp, "%s: peer unable to negotiate any "
2683 			    "further.", __func__);
2684 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
2685 			vsw_next_milestone(ldcp);
2686 			return;
2687 		}
2688 
2689 		/*
2690 		 * Check to see if we support this major version or
2691 		 * a lower one. If we don't then maj/min will be set
2692 		 * to zero.
2693 		 */
2694 		(void) vsw_supported_version(ver_pkt);
2695 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
2696 			/* Nothing more we can do */
2697 			DERR(vswp, "%s: version negotiation failed.\n",
2698 			    __func__);
2699 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
2700 			vsw_next_milestone(ldcp);
2701 		} else {
2702 			/* found a supported major version */
2703 			ldcp->lane_out.ver_major = ver_pkt->ver_major;
2704 			ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
2705 
2706 			D2(vswp, "%s: resending with updated values (%x, %x)",
2707 			    __func__, ver_pkt->ver_major, ver_pkt->ver_minor);
2708 
2709 			ldcp->lane_out.lstate |= VSW_VER_INFO_SENT;
2710 			ver_pkt->tag.vio_sid = ldcp->local_session;
2711 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
2712 
2713 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2714 
2715 			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2716 			    sizeof (vio_ver_msg_t), B_TRUE);
2717 
2718 			vsw_next_milestone(ldcp);
2719 
2720 		}
2721 		break;
2722 
2723 	default:
2724 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
2725 		    ver_pkt->tag.vio_subtype);
2726 	}
2727 
2728 	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
2729 }
2730 
2731 /*
2732  * Process an attribute packet. We can end up here either because our peer
2733  * has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or our
2734  * peer has sent us an attribute INFO message
2735  *
2736  * If its an ACK we then move to the next stage of the handshake which
2737  * is to send our descriptor ring info to our peer. If its a NACK then
2738  * there is nothing more we can (currently) do.
2739  *
2740  * If we get a valid/acceptable INFO packet (and we have already negotiated
2741  * a version) we ACK back and set channel state to ATTR_RECV, otherwise we
2742  * NACK back and reset channel state to INACTIV.
2743  *
2744  * FUTURE: in time we will probably negotiate over attributes, but for
2745  * the moment unacceptable attributes are regarded as a fatal error.
2746  *
2747  */
2748 void
2749 vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt)
2750 {
2751 	vnet_attr_msg_t		*attr_pkt;
2752 	vsw_t			*vswp = ldcp->ldc_vswp;
2753 	vsw_port_t		*port = ldcp->ldc_port;
2754 	uint64_t		macaddr = 0;
2755 	lane_t			*lane_out = &ldcp->lane_out;
2756 	lane_t			*lane_in = &ldcp->lane_in;
2757 	uint32_t		mtu;
2758 	boolean_t		ack = B_TRUE;
2759 	int			i;
2760 
2761 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
2762 
2763 	/*
2764 	 * We know this is a ctrl/attr packet so
2765 	 * cast it into the correct structure.
2766 	 */
2767 	attr_pkt = (vnet_attr_msg_t *)pkt;
2768 
2769 	switch (attr_pkt->tag.vio_subtype) {
2770 	case VIO_SUBTYPE_INFO:
2771 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2772 
2773 		if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV))
2774 			return;
2775 
2776 		/*
2777 		 * If the attributes are unacceptable then we NACK back.
2778 		 */
2779 		if (vsw_check_attr(attr_pkt, ldcp)) {
2780 			ack = B_FALSE;
2781 
2782 			DERR(vswp, "%s (chan %d): invalid attributes",
2783 			    __func__, ldcp->ldc_id);
2784 
2785 		} else {
2786 
2787 			if (VSW_VER_GTEQ(ldcp, 1, 4)) {
2788 				/*
2789 				 * Versions >= 1.4:
2790 				 * The mtu is negotiated down to the
2791 				 * minimum of our mtu and peer's mtu.
2792 				 */
2793 				mtu = MIN(attr_pkt->mtu, vswp->max_frame_size);
2794 
2795 				/*
2796 				 * If we have received an ack for the attr info
2797 				 * that we sent, then check if the mtu computed
2798 				 * above matches the mtu that the peer had ack'd
2799 				 * (saved in local hparams). If they don't
2800 				 * match, we fail the handshake.
2801 				 */
2802 				if (lane_out->lstate & VSW_ATTR_ACK_RECV) {
2803 					if (mtu != lane_out->mtu) {
2804 						/* send NACK */
2805 						ack = B_FALSE;
2806 					}
2807 				} else {
2808 					/*
2809 					 * Save the mtu computed above in our
2810 					 * attr parameters, so it gets sent in
2811 					 * the attr info from us to the peer.
2812 					 */
2813 					lane_out->mtu = mtu;
2814 				}
2815 			}
2816 
2817 		}
2818 
2819 		if (ack == B_FALSE) {
2820 
2821 			vsw_free_lane_resources(ldcp, INBOUND);
2822 
2823 			attr_pkt->tag.vio_sid = ldcp->local_session;
2824 			attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2825 
2826 			DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
2827 			ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT;
2828 			(void) vsw_send_msg(ldcp, (void *)attr_pkt,
2829 			    sizeof (vnet_attr_msg_t), B_TRUE);
2830 
2831 			vsw_next_milestone(ldcp);
2832 			return;
2833 		}
2834 
2835 		/*
2836 		 * Otherwise store attributes for this lane and update
2837 		 * lane state.
2838 		 */
2839 		lane_in->mtu = attr_pkt->mtu;
2840 		lane_in->addr = attr_pkt->addr;
2841 		lane_in->addr_type = attr_pkt->addr_type;
2842 		lane_in->xfer_mode = attr_pkt->xfer_mode;
2843 		lane_in->ack_freq = attr_pkt->ack_freq;
2844 		lane_in->physlink_update = attr_pkt->physlink_update;
2845 
2846 		/*
2847 		 * Check if the client has requested physlink state updates.
2848 		 * If there is a physical device bound to this vswitch (L2
2849 		 * mode), set the ack bits to indicate it is supported.
2850 		 * Otherwise, set the nack bits.
2851 		 */
2852 		if (VSW_VER_GTEQ(ldcp, 1, 5)) {	/* Protocol ver >= 1.5 */
2853 
2854 			/* Does the vnet need phys link state updates ? */
2855 			if ((lane_in->physlink_update &
2856 			    PHYSLINK_UPDATE_STATE_MASK) ==
2857 			    PHYSLINK_UPDATE_STATE) {
2858 
2859 				if (vswp->smode & VSW_LAYER2) {
2860 					/* is a net-dev assigned to us ? */
2861 					attr_pkt->physlink_update =
2862 					    PHYSLINK_UPDATE_STATE_ACK;
2863 					ldcp->pls_negotiated = B_TRUE;
2864 				} else {
2865 					/* not in L2 mode */
2866 					attr_pkt->physlink_update =
2867 					    PHYSLINK_UPDATE_STATE_NACK;
2868 					ldcp->pls_negotiated = B_FALSE;
2869 				}
2870 
2871 			} else {
2872 				attr_pkt->physlink_update =
2873 				    PHYSLINK_UPDATE_NONE;
2874 				ldcp->pls_negotiated = B_FALSE;
2875 			}
2876 
2877 		} else {
2878 			/*
2879 			 * physlink_update bits are ignored
2880 			 * if set by clients < v1.5 protocol.
2881 			 */
2882 			attr_pkt->physlink_update = PHYSLINK_UPDATE_NONE;
2883 			ldcp->pls_negotiated = B_FALSE;
2884 		}
2885 
2886 		if (VSW_VER_GTEQ(ldcp, 1, 4)) {
2887 			/* save the MIN mtu in the msg to be replied */
2888 			attr_pkt->mtu = mtu;
2889 		}
2890 
2891 		macaddr = lane_in->addr;
2892 		for (i = ETHERADDRL - 1; i >= 0; i--) {
2893 			port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF;
2894 			macaddr >>= 8;
2895 		}
2896 
2897 		/* create the fdb entry for this port/mac address */
2898 		vsw_fdbe_add(vswp, port);
2899 
2900 		/* add the port to the specified vlans */
2901 		vsw_vlan_add_ids(port, VSW_VNETPORT);
2902 
2903 		/* setup device specifc xmit routines */
2904 		mutex_enter(&port->tx_lock);
2905 		if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
2906 		    (lane_in->xfer_mode & VIO_DRING_MODE_V1_2)) ||
2907 		    (VSW_VER_LT(ldcp, 1, 2) &&
2908 		    (lane_in->xfer_mode == VIO_DRING_MODE_V1_0))) {
2909 			D2(vswp, "%s: mode = VIO_DRING_MODE", __func__);
2910 			port->transmit = vsw_dringsend;
2911 		} else if (lane_in->xfer_mode == VIO_DESC_MODE) {
2912 			D2(vswp, "%s: mode = VIO_DESC_MODE", __func__);
2913 			vsw_create_privring(ldcp);
2914 			port->transmit = vsw_descrsend;
2915 			lane_out->xfer_mode = VIO_DESC_MODE;
2916 		}
2917 
2918 		/*
2919 		 * HybridIO is supported only vnet, not by OBP.
2920 		 * So, set hio_capable to true only when in DRING mode.
2921 		 */
2922 		if (VSW_VER_GTEQ(ldcp, 1, 3) &&
2923 		    (lane_in->xfer_mode != VIO_DESC_MODE)) {
2924 			(void) atomic_swap_32(&port->p_hio_capable, B_TRUE);
2925 		} else {
2926 			(void) atomic_swap_32(&port->p_hio_capable, B_FALSE);
2927 		}
2928 
2929 		mutex_exit(&port->tx_lock);
2930 
2931 		attr_pkt->tag.vio_sid = ldcp->local_session;
2932 		attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2933 
2934 		DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
2935 
2936 		lane_in->lstate |= VSW_ATTR_ACK_SENT;
2937 
2938 		(void) vsw_send_msg(ldcp, (void *)attr_pkt,
2939 		    sizeof (vnet_attr_msg_t), B_TRUE);
2940 
2941 		vsw_next_milestone(ldcp);
2942 		break;
2943 
2944 	case VIO_SUBTYPE_ACK:
2945 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
2946 
2947 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV))
2948 			return;
2949 
2950 		if (VSW_VER_GTEQ(ldcp, 1, 4)) {
2951 			/*
2952 			 * Versions >= 1.4:
2953 			 * The ack msg sent by the peer contains the minimum of
2954 			 * our mtu (that we had sent in our attr info) and the
2955 			 * peer's mtu.
2956 			 *
2957 			 * If we have sent an ack for the attr info msg from
2958 			 * the peer, check if the mtu that was computed then
2959 			 * (saved in lane_out params) matches the mtu that the
2960 			 * peer has ack'd. If they don't match, we fail the
2961 			 * handshake.
2962 			 */
2963 			if (lane_in->lstate & VSW_ATTR_ACK_SENT) {
2964 				if (lane_out->mtu != attr_pkt->mtu) {
2965 					return;
2966 				}
2967 			} else {
2968 				/*
2969 				 * If the mtu ack'd by the peer is > our mtu
2970 				 * fail handshake. Otherwise, save the mtu, so
2971 				 * we can validate it when we receive attr info
2972 				 * from our peer.
2973 				 */
2974 				if (attr_pkt->mtu > lane_out->mtu) {
2975 					return;
2976 				}
2977 				if (attr_pkt->mtu <= lane_out->mtu) {
2978 					lane_out->mtu = attr_pkt->mtu;
2979 				}
2980 			}
2981 		}
2982 
2983 		lane_out->lstate |= VSW_ATTR_ACK_RECV;
2984 		vsw_next_milestone(ldcp);
2985 		break;
2986 
2987 	case VIO_SUBTYPE_NACK:
2988 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
2989 
2990 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV))
2991 			return;
2992 
2993 		lane_out->lstate |= VSW_ATTR_NACK_RECV;
2994 		vsw_next_milestone(ldcp);
2995 		break;
2996 
2997 	default:
2998 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
2999 		    attr_pkt->tag.vio_subtype);
3000 	}
3001 
3002 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
3003 }
3004 
3005 /*
3006  * Process a dring info packet. We can end up here either because our peer
3007  * has ACK/NACK'ed back to an earlier DRING msg we had sent it, or our
3008  * peer has sent us a dring INFO message.
3009  *
3010  * If we get a valid/acceptable INFO packet (and we have already negotiated
3011  * a version) we ACK back and update the lane state, otherwise we NACK back.
3012  *
3013  * FUTURE: nothing to stop client from sending us info on multiple dring's
3014  * but for the moment we will just use the first one we are given.
3015  *
3016  */
3017 void
3018 vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt)
3019 {
3020 	vio_dring_reg_msg_t	*dring_pkt;
3021 	vsw_t			*vswp = ldcp->ldc_vswp;
3022 	ldc_mem_info_t		minfo;
3023 	dring_info_t		*dp, *dbp;
3024 	int			dring_found = 0;
3025 
3026 	/*
3027 	 * We know this is a ctrl/dring packet so
3028 	 * cast it into the correct structure.
3029 	 */
3030 	dring_pkt = (vio_dring_reg_msg_t *)pkt;
3031 
3032 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
3033 
3034 	switch (dring_pkt->tag.vio_subtype) {
3035 	case VIO_SUBTYPE_INFO:
3036 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3037 
3038 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
3039 			return;
3040 
3041 		/*
3042 		 * If the dring params are unacceptable then we NACK back.
3043 		 */
3044 		if (vsw_check_dring_info(dring_pkt)) {
3045 
3046 			DERR(vswp, "%s (%lld): invalid dring info",
3047 			    __func__, ldcp->ldc_id);
3048 
3049 			vsw_free_lane_resources(ldcp, INBOUND);
3050 
3051 			dring_pkt->tag.vio_sid = ldcp->local_session;
3052 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
3053 
3054 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
3055 
3056 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
3057 
3058 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
3059 			    sizeof (vio_dring_reg_msg_t), B_TRUE);
3060 
3061 			vsw_next_milestone(ldcp);
3062 			return;
3063 		}
3064 
3065 		/*
3066 		 * Otherwise, attempt to map in the dring using the
3067 		 * cookie. If that succeeds we send back a unique dring
3068 		 * identifier that the sending side will use in future
3069 		 * to refer to this descriptor ring.
3070 		 */
3071 		dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
3072 
3073 		dp->num_descriptors = dring_pkt->num_descriptors;
3074 		dp->descriptor_size = dring_pkt->descriptor_size;
3075 		dp->options = dring_pkt->options;
3076 		dp->ncookies = dring_pkt->ncookies;
3077 
3078 		/*
3079 		 * Note: should only get one cookie. Enforced in
3080 		 * the ldc layer.
3081 		 */
3082 		bcopy(&dring_pkt->cookie[0], &dp->cookie[0],
3083 		    sizeof (ldc_mem_cookie_t));
3084 
3085 		D2(vswp, "%s: num_desc %ld : desc_size %ld", __func__,
3086 		    dp->num_descriptors, dp->descriptor_size);
3087 		D2(vswp, "%s: options 0x%lx: ncookies %ld", __func__,
3088 		    dp->options, dp->ncookies);
3089 
3090 		if ((ldc_mem_dring_map(ldcp->ldc_handle, &dp->cookie[0],
3091 		    dp->ncookies, dp->num_descriptors, dp->descriptor_size,
3092 		    LDC_DIRECT_MAP, &(dp->handle))) != 0) {
3093 
3094 			DERR(vswp, "%s: dring_map failed\n", __func__);
3095 
3096 			kmem_free(dp, sizeof (dring_info_t));
3097 			vsw_free_lane_resources(ldcp, INBOUND);
3098 
3099 			dring_pkt->tag.vio_sid = ldcp->local_session;
3100 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
3101 
3102 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
3103 
3104 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
3105 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
3106 			    sizeof (vio_dring_reg_msg_t), B_TRUE);
3107 
3108 			vsw_next_milestone(ldcp);
3109 			return;
3110 		}
3111 
3112 		if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
3113 
3114 			DERR(vswp, "%s: dring_addr failed\n", __func__);
3115 
3116 			kmem_free(dp, sizeof (dring_info_t));
3117 			vsw_free_lane_resources(ldcp, INBOUND);
3118 
3119 			dring_pkt->tag.vio_sid = ldcp->local_session;
3120 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
3121 
3122 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
3123 
3124 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
3125 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
3126 			    sizeof (vio_dring_reg_msg_t), B_TRUE);
3127 
3128 			vsw_next_milestone(ldcp);
3129 			return;
3130 		} else {
3131 			/* store the address of the pub part of ring */
3132 			dp->pub_addr = minfo.vaddr;
3133 
3134 			/* cache the dring mtype */
3135 			dp->dring_mtype = minfo.mtype;
3136 		}
3137 
3138 		/* no private section as we are importing */
3139 		dp->priv_addr = NULL;
3140 
3141 		/*
3142 		 * Using simple mono increasing int for ident at
3143 		 * the moment.
3144 		 */
3145 		dp->ident = ldcp->next_ident;
3146 		ldcp->next_ident++;
3147 
3148 		dp->end_idx = 0;
3149 		dp->next = NULL;
3150 
3151 		/*
3152 		 * Link it onto the end of the list of drings
3153 		 * for this lane.
3154 		 */
3155 		if (ldcp->lane_in.dringp == NULL) {
3156 			D2(vswp, "%s: adding first INBOUND dring", __func__);
3157 			ldcp->lane_in.dringp = dp;
3158 		} else {
3159 			dbp = ldcp->lane_in.dringp;
3160 
3161 			while (dbp->next != NULL)
3162 				dbp = dbp->next;
3163 
3164 			dbp->next = dp;
3165 		}
3166 
3167 		/* acknowledge it */
3168 		dring_pkt->tag.vio_sid = ldcp->local_session;
3169 		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3170 		dring_pkt->dring_ident = dp->ident;
3171 
3172 		(void) vsw_send_msg(ldcp, (void *)dring_pkt,
3173 		    sizeof (vio_dring_reg_msg_t), B_TRUE);
3174 
3175 		ldcp->lane_in.lstate |= VSW_DRING_ACK_SENT;
3176 		vsw_next_milestone(ldcp);
3177 		break;
3178 
3179 	case VIO_SUBTYPE_ACK:
3180 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3181 
3182 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV))
3183 			return;
3184 
3185 		/*
3186 		 * Peer is acknowledging our dring info and will have
3187 		 * sent us a dring identifier which we will use to
3188 		 * refer to this ring w.r.t. our peer.
3189 		 */
3190 		dp = ldcp->lane_out.dringp;
3191 		if (dp != NULL) {
3192 			/*
3193 			 * Find the ring this ident should be associated
3194 			 * with.
3195 			 */
3196 			if (vsw_dring_match(dp, dring_pkt)) {
3197 				dring_found = 1;
3198 
3199 			} else while (dp != NULL) {
3200 				if (vsw_dring_match(dp, dring_pkt)) {
3201 					dring_found = 1;
3202 					break;
3203 				}
3204 				dp = dp->next;
3205 			}
3206 
3207 			if (dring_found == 0) {
3208 				DERR(NULL, "%s: unrecognised ring cookie",
3209 				    __func__);
3210 				vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3211 				return;
3212 			}
3213 
3214 		} else {
3215 			DERR(vswp, "%s: DRING ACK received but no drings "
3216 			    "allocated", __func__);
3217 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3218 			return;
3219 		}
3220 
3221 		/* store ident */
3222 		dp->ident = dring_pkt->dring_ident;
3223 		ldcp->lane_out.lstate |= VSW_DRING_ACK_RECV;
3224 		vsw_next_milestone(ldcp);
3225 		break;
3226 
3227 	case VIO_SUBTYPE_NACK:
3228 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3229 
3230 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV))
3231 			return;
3232 
3233 		ldcp->lane_out.lstate |= VSW_DRING_NACK_RECV;
3234 		vsw_next_milestone(ldcp);
3235 		break;
3236 
3237 	default:
3238 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3239 		    dring_pkt->tag.vio_subtype);
3240 	}
3241 
3242 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
3243 }
3244 
3245 /*
3246  * Process a request from peer to unregister a dring.
3247  *
3248  * For the moment we just restart the handshake if our
3249  * peer endpoint attempts to unregister a dring.
3250  */
3251 void
3252 vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt)
3253 {
3254 	vsw_t			*vswp = ldcp->ldc_vswp;
3255 	vio_dring_unreg_msg_t	*dring_pkt;
3256 
3257 	/*
3258 	 * We know this is a ctrl/dring packet so
3259 	 * cast it into the correct structure.
3260 	 */
3261 	dring_pkt = (vio_dring_unreg_msg_t *)pkt;
3262 
3263 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3264 
3265 	switch (dring_pkt->tag.vio_subtype) {
3266 	case VIO_SUBTYPE_INFO:
3267 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3268 
3269 		DWARN(vswp, "%s: restarting handshake..", __func__);
3270 		break;
3271 
3272 	case VIO_SUBTYPE_ACK:
3273 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3274 
3275 		DWARN(vswp, "%s: restarting handshake..", __func__);
3276 		break;
3277 
3278 	case VIO_SUBTYPE_NACK:
3279 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3280 
3281 		DWARN(vswp, "%s: restarting handshake..", __func__);
3282 		break;
3283 
3284 	default:
3285 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3286 		    dring_pkt->tag.vio_subtype);
3287 	}
3288 
3289 	vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3290 
3291 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3292 }
3293 
/*
 * Turn the multicast message 'pkt' into a NACK carrying our local
 * session id and send it back over the channel 'ldcp'.
 *
 * Wrapped in do { } while (0) so that the macro expands to a single
 * statement and is safe in an unbraced if/else; invoke it with a
 * trailing semicolon. Arguments are parenthesized, but each is
 * evaluated more than once, so pass side-effect free expressions.
 */
#define	SND_MCST_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		(void) vsw_send_msg((ldcp), (void *)(pkt), \
		    sizeof (vnet_mcast_msg_t), B_TRUE); \
	} while (0)
3299 
3300 /*
3301  * Process a multicast request from a vnet.
3302  *
3303  * Vnet's specify a multicast address that they are interested in. This
3304  * address is used as a key into the hash table which forms the multicast
3305  * forwarding database (mFDB).
3306  *
3307  * The table keys are the multicast addresses, while the table entries
3308  * are pointers to lists of ports which wish to receive packets for the
3309  * specified multicast address.
3310  *
3311  * When a multicast packet is being switched we use the address as a key
3312  * into the hash table, and then walk the appropriate port list forwarding
3313  * the pkt to each port in turn.
3314  *
3315  * If a vnet is no longer interested in a particular multicast grouping
3316  * we simply find the correct location in the hash table and then delete
3317  * the relevant port from the port list.
3318  *
3319  * To deal with the case whereby a port is being deleted without first
3320  * removing itself from the lists in the hash table, we maintain a list
3321  * of multicast addresses the port has registered an interest in, within
3322  * the port structure itself. We then simply walk that list of addresses
3323  * using them as keys into the hash table and remove the port from the
3324  * appropriate lists.
3325  */
3326 static void
3327 vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt)
3328 {
3329 	vnet_mcast_msg_t	*mcst_pkt;
3330 	vsw_port_t		*port = ldcp->ldc_port;
3331 	vsw_t			*vswp = ldcp->ldc_vswp;
3332 	int			i;
3333 
3334 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3335 
3336 	/*
3337 	 * We know this is a ctrl/mcast packet so
3338 	 * cast it into the correct structure.
3339 	 */
3340 	mcst_pkt = (vnet_mcast_msg_t *)pkt;
3341 
3342 	switch (mcst_pkt->tag.vio_subtype) {
3343 	case VIO_SUBTYPE_INFO:
3344 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3345 
3346 		/*
3347 		 * Check if in correct state to receive a multicast
3348 		 * message (i.e. handshake complete). If not reset
3349 		 * the handshake.
3350 		 */
3351 		if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV))
3352 			return;
3353 
3354 		/*
3355 		 * Before attempting to add or remove address check
3356 		 * that they are valid multicast addresses.
3357 		 * If not, then NACK back.
3358 		 */
3359 		for (i = 0; i < mcst_pkt->count; i++) {
3360 			if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) {
3361 				DERR(vswp, "%s: invalid multicast address",
3362 				    __func__);
3363 				SND_MCST_NACK(ldcp, mcst_pkt);
3364 				return;
3365 			}
3366 		}
3367 
3368 		/*
3369 		 * Now add/remove the addresses. If this fails we
3370 		 * NACK back.
3371 		 */
3372 		if (vsw_add_rem_mcst(mcst_pkt, port) != 0) {
3373 			SND_MCST_NACK(ldcp, mcst_pkt);
3374 			return;
3375 		}
3376 
3377 		mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3378 		mcst_pkt->tag.vio_sid = ldcp->local_session;
3379 
3380 		DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt);
3381 
3382 		(void) vsw_send_msg(ldcp, (void *)mcst_pkt,
3383 		    sizeof (vnet_mcast_msg_t), B_TRUE);
3384 		break;
3385 
3386 	case VIO_SUBTYPE_ACK:
3387 		DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3388 
3389 		/*
3390 		 * We shouldn't ever get a multicast ACK message as
3391 		 * at the moment we never request multicast addresses
3392 		 * to be set on some other device. This may change in
3393 		 * the future if we have cascading switches.
3394 		 */
3395 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV))
3396 			return;
3397 
3398 				/* Do nothing */
3399 		break;
3400 
3401 	case VIO_SUBTYPE_NACK:
3402 		DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3403 
3404 		/*
3405 		 * We shouldn't get a multicast NACK packet for the
3406 		 * same reasons as we shouldn't get a ACK packet.
3407 		 */
3408 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV))
3409 			return;
3410 
3411 				/* Do nothing */
3412 		break;
3413 
3414 	default:
3415 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
3416 		    mcst_pkt->tag.vio_subtype);
3417 	}
3418 
3419 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3420 }
3421 
3422 static void
3423 vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt)
3424 {
3425 	vio_rdx_msg_t	*rdx_pkt;
3426 	vsw_t		*vswp = ldcp->ldc_vswp;
3427 
3428 	/*
3429 	 * We know this is a ctrl/rdx packet so
3430 	 * cast it into the correct structure.
3431 	 */
3432 	rdx_pkt = (vio_rdx_msg_t *)pkt;
3433 
3434 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
3435 
3436 	switch (rdx_pkt->tag.vio_subtype) {
3437 	case VIO_SUBTYPE_INFO:
3438 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3439 
3440 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_INFO_RECV))
3441 			return;
3442 
3443 		rdx_pkt->tag.vio_sid = ldcp->local_session;
3444 		rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3445 
3446 		DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt);
3447 
3448 		ldcp->lane_out.lstate |= VSW_RDX_ACK_SENT;
3449 
3450 		(void) vsw_send_msg(ldcp, (void *)rdx_pkt,
3451 		    sizeof (vio_rdx_msg_t), B_TRUE);
3452 
3453 		vsw_next_milestone(ldcp);
3454 		break;
3455 
3456 	case VIO_SUBTYPE_ACK:
3457 		/*
3458 		 * Should be handled in-band by callback handler.
3459 		 */
3460 		DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__);
3461 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3462 		break;
3463 
3464 	case VIO_SUBTYPE_NACK:
3465 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3466 
3467 		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_NACK_RECV))
3468 			return;
3469 
3470 		ldcp->lane_in.lstate |= VSW_RDX_NACK_RECV;
3471 		vsw_next_milestone(ldcp);
3472 		break;
3473 
3474 	default:
3475 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3476 		    rdx_pkt->tag.vio_subtype);
3477 	}
3478 
3479 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3480 }
3481 
3482 static void
3483 vsw_process_physlink_msg(vsw_ldc_t *ldcp, void *pkt)
3484 {
3485 	vnet_physlink_msg_t	*msgp;
3486 	vsw_t			*vswp = ldcp->ldc_vswp;
3487 
3488 	msgp = (vnet_physlink_msg_t *)pkt;
3489 
3490 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
3491 
3492 	switch (msgp->tag.vio_subtype) {
3493 	case VIO_SUBTYPE_INFO:
3494 
3495 		/* vsw shouldn't recv physlink info */
3496 		DWARN(vswp, "%s: Unexpected VIO_SUBTYPE_INFO", __func__);
3497 		break;
3498 
3499 	case VIO_SUBTYPE_ACK:
3500 
3501 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3502 		break;
3503 
3504 	case VIO_SUBTYPE_NACK:
3505 
3506 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3507 		break;
3508 
3509 	default:
3510 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3511 		    msgp->tag.vio_subtype);
3512 	}
3513 
3514 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3515 }
3516 
3517 static void
3518 vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t *tagp,
3519 	uint32_t msglen)
3520 {
3521 	uint16_t	env = tagp->vio_subtype_env;
3522 	vsw_t		*vswp = ldcp->ldc_vswp;
3523 
3524 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3525 
3526 	/* session id check */
3527 	if (ldcp->session_status & VSW_PEER_SESSION) {
3528 		if (ldcp->peer_session != tagp->vio_sid) {
3529 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
3530 			    __func__, ldcp->ldc_id, tagp->vio_sid);
3531 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3532 			return;
3533 		}
3534 	}
3535 
3536 	/*
3537 	 * It is an error for us to be getting data packets
3538 	 * before the handshake has completed.
3539 	 */
3540 	if (ldcp->hphase != VSW_MILESTONE4) {
3541 		DERR(vswp, "%s: got data packet before handshake complete "
3542 		    "hphase %d (%x: %x)", __func__, ldcp->hphase,
3543 		    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
3544 		DUMP_FLAGS(ldcp->lane_in.lstate);
3545 		DUMP_FLAGS(ldcp->lane_out.lstate);
3546 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3547 		return;
3548 	}
3549 
3550 	/*
3551 	 * To reduce the locking contention, release the
3552 	 * ldc_cblock here and re-acquire it once we are done
3553 	 * receiving packets.
3554 	 */
3555 	mutex_exit(&ldcp->ldc_cblock);
3556 	mutex_enter(&ldcp->ldc_rxlock);
3557 
3558 	/*
3559 	 * Switch on vio_subtype envelope, then let lower routines
3560 	 * decide if its an INFO, ACK or NACK packet.
3561 	 */
3562 	if (env == VIO_DRING_DATA) {
3563 		vsw_process_data_dring_pkt(ldcp, dpkt);
3564 	} else if (env == VIO_PKT_DATA) {
3565 		ldcp->rx_pktdata(ldcp, dpkt, msglen);
3566 	} else if (env == VIO_DESC_DATA) {
3567 		vsw_process_data_ibnd_pkt(ldcp, dpkt);
3568 	} else {
3569 		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env);
3570 	}
3571 
3572 	mutex_exit(&ldcp->ldc_rxlock);
3573 	mutex_enter(&ldcp->ldc_cblock);
3574 
3575 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3576 }
3577 
/*
 * Turn the dring data message 'pkt' into a NACK carrying our local
 * session id and send it back over the channel 'ldcp'.
 *
 * Wrapped in do { } while (0) so that the macro expands to a single
 * statement and is safe in an unbraced if/else; invoke it with a
 * trailing semicolon. Arguments are parenthesized, but each is
 * evaluated more than once, so pass side-effect free expressions.
 */
#define	SND_DRING_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		(void) vsw_send_msg((ldcp), (void *)(pkt), \
		    sizeof (vio_dring_msg_t), B_TRUE); \
	} while (0)
3583 
3584 static void
3585 vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt)
3586 {
3587 	vio_dring_msg_t		*dring_pkt;
3588 	vnet_public_desc_t	desc, *pub_addr = NULL;
3589 	vsw_private_desc_t	*priv_addr = NULL;
3590 	dring_info_t		*dp = NULL;
3591 	vsw_t			*vswp = ldcp->ldc_vswp;
3592 	mblk_t			*mp = NULL;
3593 	mblk_t			*bp = NULL;
3594 	mblk_t			*bpt = NULL;
3595 	size_t			nbytes = 0;
3596 	uint64_t		chain = 0;
3597 	uint64_t		len;
3598 	uint32_t		pos, start;
3599 	uint32_t		range_start, range_end;
3600 	int32_t			end, num, cnt = 0;
3601 	int			i, rv, rng_rv = 0, msg_rv = 0;
3602 	boolean_t		prev_desc_ack = B_FALSE;
3603 	int			read_attempts = 0;
3604 	struct ether_header	*ehp;
3605 	lane_t			*lp = &ldcp->lane_out;
3606 
3607 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3608 
3609 	/*
3610 	 * We know this is a data/dring packet so
3611 	 * cast it into the correct structure.
3612 	 */
3613 	dring_pkt = (vio_dring_msg_t *)dpkt;
3614 
3615 	/*
3616 	 * Switch on the vio_subtype. If its INFO then we need to
3617 	 * process the data. If its an ACK we need to make sure
3618 	 * it makes sense (i.e did we send an earlier data/info),
3619 	 * and if its a NACK then we maybe attempt a retry.
3620 	 */
3621 	switch (dring_pkt->tag.vio_subtype) {
3622 	case VIO_SUBTYPE_INFO:
3623 		D2(vswp, "%s(%lld): VIO_SUBTYPE_INFO", __func__, ldcp->ldc_id);
3624 
3625 		READ_ENTER(&ldcp->lane_in.dlistrw);
3626 		if ((dp = vsw_ident2dring(&ldcp->lane_in,
3627 		    dring_pkt->dring_ident)) == NULL) {
3628 			RW_EXIT(&ldcp->lane_in.dlistrw);
3629 
3630 			DERR(vswp, "%s(%lld): unable to find dring from "
3631 			    "ident 0x%llx", __func__, ldcp->ldc_id,
3632 			    dring_pkt->dring_ident);
3633 
3634 			SND_DRING_NACK(ldcp, dring_pkt);
3635 			return;
3636 		}
3637 
3638 		start = pos = dring_pkt->start_idx;
3639 		end = dring_pkt->end_idx;
3640 		len = dp->num_descriptors;
3641 
3642 		range_start = range_end = pos;
3643 
3644 		D2(vswp, "%s(%lld): start index %ld : end %ld\n",
3645 		    __func__, ldcp->ldc_id, start, end);
3646 
3647 		if (end == -1) {
3648 			num = -1;
3649 		} else if (end >= 0) {
3650 			num = end >= pos ? end - pos + 1: (len - pos + 1) + end;
3651 
3652 			/* basic sanity check */
3653 			if (end > len) {
3654 				RW_EXIT(&ldcp->lane_in.dlistrw);
3655 				DERR(vswp, "%s(%lld): endpoint %lld outside "
3656 				    "ring length %lld", __func__,
3657 				    ldcp->ldc_id, end, len);
3658 
3659 				SND_DRING_NACK(ldcp, dring_pkt);
3660 				return;
3661 			}
3662 		} else {
3663 			RW_EXIT(&ldcp->lane_in.dlistrw);
3664 			DERR(vswp, "%s(%lld): invalid endpoint %lld",
3665 			    __func__, ldcp->ldc_id, end);
3666 			SND_DRING_NACK(ldcp, dring_pkt);
3667 			return;
3668 		}
3669 
3670 		while (cnt != num) {
3671 vsw_recheck_desc:
3672 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + pos;
3673 
3674 			if ((rng_rv = vnet_dring_entry_copy(pub_addr,
3675 			    &desc, dp->dring_mtype, dp->handle,
3676 			    pos, pos)) != 0) {
3677 				DERR(vswp, "%s(%lld): unable to copy "
3678 				    "descriptor at pos %d: err %d",
3679 				    __func__, pos, ldcp->ldc_id, rng_rv);
3680 				ldcp->ldc_stats.ierrors++;
3681 				break;
3682 			}
3683 
3684 			/*
3685 			 * When given a bounded range of descriptors
3686 			 * to process, its an error to hit a descriptor
3687 			 * which is not ready. In the non-bounded case
3688 			 * (end_idx == -1) this simply indicates we have
3689 			 * reached the end of the current active range.
3690 			 */
3691 			if (desc.hdr.dstate != VIO_DESC_READY) {
3692 				/* unbound - no error */
3693 				if (end == -1) {
3694 					if (read_attempts == vsw_read_attempts)
3695 						break;
3696 
3697 					delay(drv_usectohz(vsw_desc_delay));
3698 					read_attempts++;
3699 					goto vsw_recheck_desc;
3700 				}
3701 
3702 				/* bounded - error - so NACK back */
3703 				RW_EXIT(&ldcp->lane_in.dlistrw);
3704 				DERR(vswp, "%s(%lld): descriptor not READY "
3705 				    "(%d)", __func__, ldcp->ldc_id,
3706 				    desc.hdr.dstate);
3707 				SND_DRING_NACK(ldcp, dring_pkt);
3708 				return;
3709 			}
3710 
3711 			DTRACE_PROBE1(read_attempts, int, read_attempts);
3712 
3713 			range_end = pos;
3714 
3715 			/*
3716 			 * If we ACK'd the previous descriptor then now
3717 			 * record the new range start position for later
3718 			 * ACK's.
3719 			 */
3720 			if (prev_desc_ack) {
3721 				range_start = pos;
3722 
3723 				D2(vswp, "%s(%lld): updating range start to be "
3724 				    "%d", __func__, ldcp->ldc_id, range_start);
3725 
3726 				prev_desc_ack = B_FALSE;
3727 			}
3728 
3729 			D2(vswp, "%s(%lld): processing desc %lld at pos"
3730 			    " 0x%llx : dstate 0x%lx : datalen 0x%lx",
3731 			    __func__, ldcp->ldc_id, pos, &desc,
3732 			    desc.hdr.dstate, desc.nbytes);
3733 
3734 			if ((desc.nbytes < ETHERMIN) ||
3735 			    (desc.nbytes > lp->mtu)) {
3736 				/* invalid size; drop the packet */
3737 				ldcp->ldc_stats.ierrors++;
3738 				goto vsw_process_desc_done;
3739 			}
3740 
3741 			/*
3742 			 * Ensure that we ask ldc for an aligned
3743 			 * number of bytes. Data is padded to align on 8
3744 			 * byte boundary, desc.nbytes is actual data length,
3745 			 * i.e. minus that padding.
3746 			 */
3747 			nbytes = (desc.nbytes + VNET_IPALIGN + 7) & ~7;
3748 			if (nbytes > ldcp->max_rxpool_size) {
3749 				mp = allocb(desc.nbytes + VNET_IPALIGN + 8,
3750 				    BPRI_MED);
3751 			} else {
3752 				mp = vio_multipool_allocb(&ldcp->vmp, nbytes);
3753 				if (mp == NULL) {
3754 					ldcp->ldc_stats.rx_vio_allocb_fail++;
3755 					/*
3756 					 * No free receive buffers available,
3757 					 * so fallback onto allocb(9F). Make
3758 					 * sure that we get a data buffer which
3759 					 * is a multiple of 8 as this is
3760 					 * required by ldc_mem_copy.
3761 					 */
3762 					DTRACE_PROBE(allocb);
3763 					mp = allocb(desc.nbytes +
3764 					    VNET_IPALIGN + 8, BPRI_MED);
3765 				}
3766 			}
3767 			if (mp == NULL) {
3768 				DERR(vswp, "%s(%ld): allocb failed",
3769 				    __func__, ldcp->ldc_id);
3770 				rng_rv = vnet_dring_entry_set_dstate(pub_addr,
3771 				    dp->dring_mtype, dp->handle, pos, pos,
3772 				    VIO_DESC_DONE);
3773 				ldcp->ldc_stats.ierrors++;
3774 				ldcp->ldc_stats.rx_allocb_fail++;
3775 				break;
3776 			}
3777 
3778 			rv = ldc_mem_copy(ldcp->ldc_handle,
3779 			    (caddr_t)mp->b_rptr, 0, &nbytes,
3780 			    desc.memcookie, desc.ncookies, LDC_COPY_IN);
3781 			if (rv != 0) {
3782 				DERR(vswp, "%s(%d): unable to copy in data "
3783 				    "from %d cookies in desc %d (rv %d)",
3784 				    __func__, ldcp->ldc_id, desc.ncookies,
3785 				    pos, rv);
3786 				freemsg(mp);
3787 
3788 				rng_rv = vnet_dring_entry_set_dstate(pub_addr,
3789 				    dp->dring_mtype, dp->handle, pos, pos,
3790 				    VIO_DESC_DONE);
3791 				ldcp->ldc_stats.ierrors++;
3792 				break;
3793 			} else {
3794 				D2(vswp, "%s(%d): copied in %ld bytes"
3795 				    " using %d cookies", __func__,
3796 				    ldcp->ldc_id, nbytes, desc.ncookies);
3797 			}
3798 
3799 			/* adjust the read pointer to skip over the padding */
3800 			mp->b_rptr += VNET_IPALIGN;
3801 
3802 			/* point to the actual end of data */
3803 			mp->b_wptr = mp->b_rptr + desc.nbytes;
3804 
3805 			/* update statistics */
3806 			ehp = (struct ether_header *)mp->b_rptr;
3807 			if (IS_BROADCAST(ehp))
3808 				ldcp->ldc_stats.brdcstrcv++;
3809 			else if (IS_MULTICAST(ehp))
3810 				ldcp->ldc_stats.multircv++;
3811 
3812 			ldcp->ldc_stats.ipackets++;
3813 			ldcp->ldc_stats.rbytes += desc.nbytes;
3814 
3815 			/*
3816 			 * IPALIGN space can be used for VLAN_TAG
3817 			 */
3818 			(void) vsw_vlan_frame_pretag(ldcp->ldc_port,
3819 			    VSW_VNETPORT, mp);
3820 
3821 			/* build a chain of received packets */
3822 			if (bp == NULL) {
3823 				/* first pkt */
3824 				bp = mp;
3825 				bp->b_next = bp->b_prev = NULL;
3826 				bpt = bp;
3827 				chain = 1;
3828 			} else {
3829 				mp->b_next = mp->b_prev = NULL;
3830 				bpt->b_next = mp;
3831 				bpt = mp;
3832 				chain++;
3833 			}
3834 
3835 vsw_process_desc_done:
3836 			/* mark we are finished with this descriptor */
3837 			if ((rng_rv = vnet_dring_entry_set_dstate(pub_addr,
3838 			    dp->dring_mtype, dp->handle, pos, pos,
3839 			    VIO_DESC_DONE)) != 0) {
3840 				DERR(vswp, "%s(%lld): unable to update "
3841 				    "dstate at pos %d: err %d",
3842 				    __func__, pos, ldcp->ldc_id, rng_rv);
3843 				ldcp->ldc_stats.ierrors++;
3844 				break;
3845 			}
3846 
3847 			/*
3848 			 * Send an ACK back to peer if requested.
3849 			 */
3850 			if (desc.hdr.ack) {
3851 				dring_pkt->start_idx = range_start;
3852 				dring_pkt->end_idx = range_end;
3853 
3854 				DERR(vswp, "%s(%lld): processed %d %d, ACK"
3855 				    " requested", __func__, ldcp->ldc_id,
3856 				    dring_pkt->start_idx, dring_pkt->end_idx);
3857 
3858 				dring_pkt->dring_process_state = VIO_DP_ACTIVE;
3859 				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3860 				dring_pkt->tag.vio_sid = ldcp->local_session;
3861 
3862 				msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
3863 				    sizeof (vio_dring_msg_t), B_FALSE);
3864 
3865 				/*
3866 				 * Check if ACK was successfully sent. If not
3867 				 * we break and deal with that below.
3868 				 */
3869 				if (msg_rv != 0)
3870 					break;
3871 
3872 				prev_desc_ack = B_TRUE;
3873 				range_start = pos;
3874 			}
3875 
3876 			/* next descriptor */
3877 			pos = (pos + 1) % len;
3878 			cnt++;
3879 
3880 			/*
3881 			 * Break out of loop here and stop processing to
3882 			 * allow some other network device (or disk) to
3883 			 * get access to the cpu.
3884 			 */
3885 			if (chain > vsw_chain_len) {
3886 				D3(vswp, "%s(%lld): switching chain of %d "
3887 				    "msgs", __func__, ldcp->ldc_id, chain);
3888 				break;
3889 			}
3890 		}
3891 		RW_EXIT(&ldcp->lane_in.dlistrw);
3892 
3893 		/* send the chain of packets to be switched */
3894 		if (bp != NULL) {
3895 			DTRACE_PROBE1(vsw_rcv_msgs, int, chain);
3896 			D3(vswp, "%s(%lld): switching chain of %d msgs",
3897 			    __func__, ldcp->ldc_id, chain);
3898 			vswp->vsw_switch_frame(vswp, bp, VSW_VNETPORT,
3899 			    ldcp->ldc_port, NULL);
3900 		}
3901 
3902 		/*
3903 		 * If when we encountered an error when attempting to
3904 		 * access an imported dring, initiate a connection reset.
3905 		 */
3906 		if (rng_rv != 0) {
3907 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3908 			break;
3909 		}
3910 
3911 		/*
3912 		 * If when we attempted to send the ACK we found that the
3913 		 * channel had been reset then now handle this. We deal with
3914 		 * it here as we cannot reset the channel while holding the
3915 		 * dlistrw lock, and we don't want to acquire/release it
3916 		 * continuously in the above loop, as a channel reset should
3917 		 * be a rare event.
3918 		 */
3919 		if (msg_rv == ECONNRESET) {
3920 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
3921 			break;
3922 		}
3923 
3924 		DTRACE_PROBE1(msg_cnt, int, cnt);
3925 
3926 		/*
3927 		 * We are now finished so ACK back with the state
3928 		 * set to STOPPING so our peer knows we are finished
3929 		 */
3930 		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3931 		dring_pkt->tag.vio_sid = ldcp->local_session;
3932 
3933 		dring_pkt->dring_process_state = VIO_DP_STOPPED;
3934 
3935 		DTRACE_PROBE(stop_process_sent);
3936 
3937 		/*
3938 		 * We have not processed any more descriptors beyond
3939 		 * the last one we ACK'd.
3940 		 */
3941 		if (prev_desc_ack)
3942 			range_start = range_end;
3943 
3944 		dring_pkt->start_idx = range_start;
3945 		dring_pkt->end_idx = range_end;
3946 
3947 		D2(vswp, "%s(%lld) processed : %d : %d, now stopping",
3948 		    __func__, ldcp->ldc_id, dring_pkt->start_idx,
3949 		    dring_pkt->end_idx);
3950 
3951 		(void) vsw_send_msg(ldcp, (void *)dring_pkt,
3952 		    sizeof (vio_dring_msg_t), B_TRUE);
3953 		break;
3954 
3955 	case VIO_SUBTYPE_ACK:
3956 		D2(vswp, "%s(%lld): VIO_SUBTYPE_ACK", __func__, ldcp->ldc_id);
3957 		/*
3958 		 * Verify that the relevant descriptors are all
3959 		 * marked as DONE
3960 		 */
3961 		READ_ENTER(&ldcp->lane_out.dlistrw);
3962 		if ((dp = vsw_ident2dring(&ldcp->lane_out,
3963 		    dring_pkt->dring_ident)) == NULL) {
3964 			RW_EXIT(&ldcp->lane_out.dlistrw);
3965 			DERR(vswp, "%s: unknown ident in ACK", __func__);
3966 			return;
3967 		}
3968 
3969 		start = end = 0;
3970 		start = dring_pkt->start_idx;
3971 		end = dring_pkt->end_idx;
3972 		len = dp->num_descriptors;
3973 
3974 
3975 		mutex_enter(&dp->dlock);
3976 		dp->last_ack_recv = end;
3977 		ldcp->ldc_stats.dring_data_acks++;
3978 		mutex_exit(&dp->dlock);
3979 
3980 		(void) vsw_reclaim_dring(dp, start);
3981 
3982 		/*
3983 		 * If our peer is stopping processing descriptors then
3984 		 * we check to make sure it has processed all the descriptors
3985 		 * we have updated. If not then we send it a new message
3986 		 * to prompt it to restart.
3987 		 */
3988 		if (dring_pkt->dring_process_state == VIO_DP_STOPPED) {
3989 			DTRACE_PROBE(stop_process_recv);
3990 			D2(vswp, "%s(%lld): got stopping msg : %d : %d",
3991 			    __func__, ldcp->ldc_id, dring_pkt->start_idx,
3992 			    dring_pkt->end_idx);
3993 
3994 			/*
3995 			 * Check next descriptor in public section of ring.
3996 			 * If its marked as READY then we need to prompt our
3997 			 * peer to start processing the ring again.
3998 			 */
3999 			i = (end + 1) % len;
4000 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
4001 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
4002 
4003 			/*
4004 			 * Hold the restart lock across all of this to
4005 			 * make sure that its not possible for us to
4006 			 * decide that a msg needs to be sent in the future
4007 			 * but the sending code having already checked is
4008 			 * about to exit.
4009 			 */
4010 			mutex_enter(&dp->restart_lock);
4011 			ldcp->ldc_stats.dring_stopped_acks++;
4012 			mutex_enter(&priv_addr->dstate_lock);
4013 			if (pub_addr->hdr.dstate == VIO_DESC_READY) {
4014 
4015 				mutex_exit(&priv_addr->dstate_lock);
4016 
4017 				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
4018 				dring_pkt->tag.vio_sid = ldcp->local_session;
4019 
4020 				dring_pkt->start_idx = (end + 1) % len;
4021 				dring_pkt->end_idx = -1;
4022 
4023 				D2(vswp, "%s(%lld) : sending restart msg:"
4024 				    " %d : %d", __func__, ldcp->ldc_id,
4025 				    dring_pkt->start_idx, dring_pkt->end_idx);
4026 
4027 				msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
4028 				    sizeof (vio_dring_msg_t), B_FALSE);
4029 				ldcp->ldc_stats.dring_data_msgs++;
4030 
4031 			} else {
4032 				mutex_exit(&priv_addr->dstate_lock);
4033 				dp->restart_reqd = B_TRUE;
4034 			}
4035 			mutex_exit(&dp->restart_lock);
4036 		}
4037 		RW_EXIT(&ldcp->lane_out.dlistrw);
4038 
4039 		/* only do channel reset after dropping dlistrw lock */
4040 		if (msg_rv == ECONNRESET)
4041 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
4042 
4043 		break;
4044 
4045 	case VIO_SUBTYPE_NACK:
4046 		DWARN(vswp, "%s(%lld): VIO_SUBTYPE_NACK",
4047 		    __func__, ldcp->ldc_id);
4048 		/*
4049 		 * Something is badly wrong if we are getting NACK's
4050 		 * for our data pkts. So reset the channel.
4051 		 */
4052 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4053 
4054 		break;
4055 
4056 	default:
4057 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
4058 		    ldcp->ldc_id, dring_pkt->tag.vio_subtype);
4059 	}
4060 
4061 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
4062 }
4063 
4064 /*
4065  * dummy pkt data handler function for vnet protocol version 1.0
4066  */
4067 static void
4068 vsw_process_pkt_data_nop(void *arg1, void *arg2, uint32_t msglen)
4069 {
4070 	_NOTE(ARGUNUSED(arg1, arg2, msglen))
4071 }
4072 
4073 /*
4074  * This function handles raw pkt data messages received over the channel.
4075  * Currently, only priority-eth-type frames are received through this mechanism.
4076  * In this case, the frame(data) is present within the message itself which
4077  * is copied into an mblk before switching it.
4078  */
4079 static void
4080 vsw_process_pkt_data(void *arg1, void *arg2, uint32_t msglen)
4081 {
4082 	vsw_ldc_t		*ldcp = (vsw_ldc_t *)arg1;
4083 	vio_raw_data_msg_t	*dpkt = (vio_raw_data_msg_t *)arg2;
4084 	uint32_t		size;
4085 	mblk_t			*mp;
4086 	vsw_t			*vswp = ldcp->ldc_vswp;
4087 	vgen_stats_t		*statsp = &ldcp->ldc_stats;
4088 	lane_t			*lp = &ldcp->lane_out;
4089 
4090 	size = msglen - VIO_PKT_DATA_HDRSIZE;
4091 	if (size < ETHERMIN || size > lp->mtu) {
4092 		(void) atomic_inc_32(&statsp->rx_pri_fail);
4093 		DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__,
4094 		    ldcp->ldc_id, size);
4095 		return;
4096 	}
4097 
4098 	mp = vio_multipool_allocb(&ldcp->vmp, size + VLAN_TAGSZ);
4099 	if (mp == NULL) {
4100 		mp = allocb(size + VLAN_TAGSZ, BPRI_MED);
4101 		if (mp == NULL) {
4102 			(void) atomic_inc_32(&statsp->rx_pri_fail);
4103 			DWARN(vswp, "%s(%lld) allocb failure, "
4104 			    "unable to process priority frame\n", __func__,
4105 			    ldcp->ldc_id);
4106 			return;
4107 		}
4108 	}
4109 
4110 	/* skip over the extra space for vlan tag */
4111 	mp->b_rptr += VLAN_TAGSZ;
4112 
4113 	/* copy the frame from the payload of raw data msg into the mblk */
4114 	bcopy(dpkt->data, mp->b_rptr, size);
4115 	mp->b_wptr = mp->b_rptr + size;
4116 
4117 	/* update stats */
4118 	(void) atomic_inc_64(&statsp->rx_pri_packets);
4119 	(void) atomic_add_64(&statsp->rx_pri_bytes, size);
4120 
4121 	/*
4122 	 * VLAN_TAGSZ of extra space has been pre-alloc'd if tag is needed.
4123 	 */
4124 	(void) vsw_vlan_frame_pretag(ldcp->ldc_port, VSW_VNETPORT, mp);
4125 
4126 	/* switch the frame to destination */
4127 	vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT, ldcp->ldc_port, NULL);
4128 }
4129 
4130 /*
4131  * Process an in-band descriptor message (most likely from
4132  * OBP).
4133  */
4134 static void
4135 vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt)
4136 {
4137 	vnet_ibnd_desc_t	*ibnd_desc;
4138 	dring_info_t		*dp = NULL;
4139 	vsw_private_desc_t	*priv_addr = NULL;
4140 	vsw_t			*vswp = ldcp->ldc_vswp;
4141 	mblk_t			*mp = NULL;
4142 	size_t			nbytes = 0;
4143 	size_t			off = 0;
4144 	uint64_t		idx = 0;
4145 	uint32_t		num = 1, len, datalen = 0;
4146 	uint64_t		ncookies = 0;
4147 	int			i, rv;
4148 	int			j = 0;
4149 
4150 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4151 
4152 	ibnd_desc = (vnet_ibnd_desc_t *)pkt;
4153 
4154 	switch (ibnd_desc->hdr.tag.vio_subtype) {
4155 	case VIO_SUBTYPE_INFO:
4156 		D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
4157 
4158 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
4159 			return;
4160 
4161 		/*
4162 		 * Data is padded to align on a 8 byte boundary,
4163 		 * nbytes is actual data length, i.e. minus that
4164 		 * padding.
4165 		 */
4166 		datalen = ibnd_desc->nbytes;
4167 
4168 		D2(vswp, "%s(%lld): processing inband desc : "
4169 		    ": datalen 0x%lx", __func__, ldcp->ldc_id, datalen);
4170 
4171 		ncookies = ibnd_desc->ncookies;
4172 
4173 		/*
4174 		 * allocb(9F) returns an aligned data block. We
4175 		 * need to ensure that we ask ldc for an aligned
4176 		 * number of bytes also.
4177 		 */
4178 		nbytes = datalen;
4179 		if (nbytes & 0x7) {
4180 			off = 8 - (nbytes & 0x7);
4181 			nbytes += off;
4182 		}
4183 
4184 		/* alloc extra space for VLAN_TAG */
4185 		mp = allocb(datalen + 8, BPRI_MED);
4186 		if (mp == NULL) {
4187 			DERR(vswp, "%s(%lld): allocb failed",
4188 			    __func__, ldcp->ldc_id);
4189 			ldcp->ldc_stats.rx_allocb_fail++;
4190 			return;
4191 		}
4192 
4193 		/* skip over the extra space for VLAN_TAG */
4194 		mp->b_rptr += 8;
4195 
4196 		rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr,
4197 		    0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies,
4198 		    LDC_COPY_IN);
4199 
4200 		if (rv != 0) {
4201 			DERR(vswp, "%s(%d): unable to copy in data from "
4202 			    "%d cookie(s)", __func__, ldcp->ldc_id, ncookies);
4203 			freemsg(mp);
4204 			ldcp->ldc_stats.ierrors++;
4205 			return;
4206 		}
4207 
4208 		D2(vswp, "%s(%d): copied in %ld bytes using %d cookies",
4209 		    __func__, ldcp->ldc_id, nbytes, ncookies);
4210 
4211 		/* point to the actual end of data */
4212 		mp->b_wptr = mp->b_rptr + datalen;
4213 		ldcp->ldc_stats.ipackets++;
4214 		ldcp->ldc_stats.rbytes += datalen;
4215 
4216 		/*
4217 		 * We ACK back every in-band descriptor message we process
4218 		 */
4219 		ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK;
4220 		ibnd_desc->hdr.tag.vio_sid = ldcp->local_session;
4221 		(void) vsw_send_msg(ldcp, (void *)ibnd_desc,
4222 		    sizeof (vnet_ibnd_desc_t), B_TRUE);
4223 
4224 		/*
4225 		 * there is extra space alloc'd for VLAN_TAG
4226 		 */
4227 		(void) vsw_vlan_frame_pretag(ldcp->ldc_port, VSW_VNETPORT, mp);
4228 
4229 		/* send the packet to be switched */
4230 		vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT,
4231 		    ldcp->ldc_port, NULL);
4232 
4233 		break;
4234 
4235 	case VIO_SUBTYPE_ACK:
4236 		D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
4237 
4238 		/* Verify the ACK is valid */
4239 		idx = ibnd_desc->hdr.desc_handle;
4240 
4241 		if (idx >= vsw_ntxds) {
4242 			cmn_err(CE_WARN, "!vsw%d: corrupted ACK received "
4243 			    "(idx %ld)", vswp->instance, idx);
4244 			return;
4245 		}
4246 
4247 		if ((dp = ldcp->lane_out.dringp) == NULL) {
4248 			DERR(vswp, "%s: no dring found", __func__);
4249 			return;
4250 		}
4251 
4252 		len = dp->num_descriptors;
4253 		/*
4254 		 * If the descriptor we are being ACK'ed for is not the
4255 		 * one we expected, then pkts were lost somwhere, either
4256 		 * when we tried to send a msg, or a previous ACK msg from
4257 		 * our peer. In either case we now reclaim the descriptors
4258 		 * in the range from the last ACK we received up to the
4259 		 * current ACK.
4260 		 */
4261 		if (idx != dp->last_ack_recv) {
4262 			DWARN(vswp, "%s: dropped pkts detected, (%ld, %ld)",
4263 			    __func__, dp->last_ack_recv, idx);
4264 			num = idx >= dp->last_ack_recv ?
4265 			    idx - dp->last_ack_recv + 1:
4266 			    (len - dp->last_ack_recv + 1) + idx;
4267 		}
4268 
4269 		/*
4270 		 * When we sent the in-band message to our peer we
4271 		 * marked the copy in our private ring as READY. We now
4272 		 * check that the descriptor we are being ACK'ed for is in
4273 		 * fact READY, i.e. it is one we have shared with our peer.
4274 		 *
4275 		 * If its not we flag an error, but still reset the descr
4276 		 * back to FREE.
4277 		 */
4278 		for (i = dp->last_ack_recv; j < num; i = (i + 1) % len, j++) {
4279 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
4280 			mutex_enter(&priv_addr->dstate_lock);
4281 			if (priv_addr->dstate != VIO_DESC_READY) {
4282 				DERR(vswp, "%s: (%ld) desc at index %ld not "
4283 				    "READY (0x%lx)", __func__,
4284 				    ldcp->ldc_id, idx, priv_addr->dstate);
4285 				DERR(vswp, "%s: bound %d: ncookies %ld : "
4286 				    "datalen %ld", __func__,
4287 				    priv_addr->bound, priv_addr->ncookies,
4288 				    priv_addr->datalen);
4289 			}
4290 			D2(vswp, "%s: (%lld) freeing descp at %lld", __func__,
4291 			    ldcp->ldc_id, idx);
4292 			/* release resources associated with sent msg */
4293 			priv_addr->datalen = 0;
4294 			priv_addr->dstate = VIO_DESC_FREE;
4295 			mutex_exit(&priv_addr->dstate_lock);
4296 		}
4297 		/* update to next expected value */
4298 		dp->last_ack_recv = (idx + 1) % dp->num_descriptors;
4299 
4300 		break;
4301 
4302 	case VIO_SUBTYPE_NACK:
4303 		DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
4304 
4305 		/*
4306 		 * We should only get a NACK if our peer doesn't like
4307 		 * something about a message we have sent it. If this
4308 		 * happens we just release the resources associated with
4309 		 * the message. (We are relying on higher layers to decide
4310 		 * whether or not to resend.
4311 		 */
4312 
4313 		/* limit check */
4314 		idx = ibnd_desc->hdr.desc_handle;
4315 
4316 		if (idx >= vsw_ntxds) {
4317 			DERR(vswp, "%s: corrupted NACK received (idx %lld)",
4318 			    __func__, idx);
4319 			return;
4320 		}
4321 
4322 		if ((dp = ldcp->lane_out.dringp) == NULL) {
4323 			DERR(vswp, "%s: no dring found", __func__);
4324 			return;
4325 		}
4326 
4327 		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
4328 
4329 		/* move to correct location in ring */
4330 		priv_addr += idx;
4331 
4332 		/* release resources associated with sent msg */
4333 		mutex_enter(&priv_addr->dstate_lock);
4334 		priv_addr->datalen = 0;
4335 		priv_addr->dstate = VIO_DESC_FREE;
4336 		mutex_exit(&priv_addr->dstate_lock);
4337 
4338 		break;
4339 
4340 	default:
4341 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
4342 		    ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype);
4343 	}
4344 
4345 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
4346 }
4347 
4348 static void
4349 vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t *tagp)
4350 {
4351 	_NOTE(ARGUNUSED(epkt))
4352 
4353 	vsw_t		*vswp = ldcp->ldc_vswp;
4354 	uint16_t	env = tagp->vio_subtype_env;
4355 
4356 	D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
4357 
4358 	/*
4359 	 * Error vio_subtypes have yet to be defined. So for
4360 	 * the moment we can't do anything.
4361 	 */
4362 	D2(vswp, "%s: (%x) vio_subtype env", __func__, env);
4363 
4364 	D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
4365 }
4366 
4367 /* transmit the packet over the given port */
4368 int
4369 vsw_portsend(vsw_port_t *port, mblk_t *mp)
4370 {
4371 	vsw_ldc_list_t 	*ldcl = &port->p_ldclist;
4372 	vsw_ldc_t 	*ldcp;
4373 	mblk_t		*mpt;
4374 	int		count;
4375 	int		status = 0;
4376 
4377 	READ_ENTER(&ldcl->lockrw);
4378 	/*
4379 	 * Note for now, we have a single channel.
4380 	 */
4381 	ldcp = ldcl->head;
4382 	if (ldcp == NULL) {
4383 		DERR(port->p_vswp, "vsw_portsend: no ldc: dropping packet\n");
4384 		freemsgchain(mp);
4385 		RW_EXIT(&ldcl->lockrw);
4386 		return (1);
4387 	}
4388 
4389 	count = vsw_vlan_frame_untag(port, VSW_VNETPORT, &mp, &mpt);
4390 
4391 	if (count != 0) {
4392 		status = ldcp->tx(ldcp, mp, mpt, count);
4393 	}
4394 
4395 	RW_EXIT(&ldcl->lockrw);
4396 	return (status);
4397 }
4398 
4399 /*
4400  * Break up frames into 2 seperate chains: normal and
4401  * priority, based on the frame type. The number of
4402  * priority frames is also counted and returned.
4403  *
4404  * Params:
4405  * 	vswp:	pointer to the instance of vsw
4406  *	np:	head of packet chain to be broken
4407  *	npt:	tail of packet chain to be broken
4408  *
4409  * Returns:
4410  *	np:	head of normal data packets
4411  *	npt:	tail of normal data packets
4412  *	hp:	head of high priority packets
4413  *	hpt:	tail of high priority packets
4414  */
4415 static uint32_t
4416 vsw_get_pri_packets(vsw_t *vswp, mblk_t **np, mblk_t **npt,
4417 	mblk_t **hp, mblk_t **hpt)
4418 {
4419 	mblk_t			*tmp = NULL;
4420 	mblk_t			*smp = NULL;
4421 	mblk_t			*hmp = NULL;	/* high prio pkts head */
4422 	mblk_t			*hmpt = NULL;	/* high prio pkts tail */
4423 	mblk_t			*nmp = NULL;	/* normal pkts head */
4424 	mblk_t			*nmpt = NULL;	/* normal pkts tail */
4425 	uint32_t		count = 0;
4426 	int			i;
4427 	struct ether_header	*ehp;
4428 	uint32_t		num_types;
4429 	uint16_t		*types;
4430 
4431 	tmp = *np;
4432 	while (tmp != NULL) {
4433 
4434 		smp = tmp;
4435 		tmp = tmp->b_next;
4436 		smp->b_next = NULL;
4437 		smp->b_prev = NULL;
4438 
4439 		ehp = (struct ether_header *)smp->b_rptr;
4440 		num_types = vswp->pri_num_types;
4441 		types = vswp->pri_types;
4442 		for (i = 0; i < num_types; i++) {
4443 			if (ehp->ether_type == types[i]) {
4444 				/* high priority frame */
4445 
4446 				if (hmp != NULL) {
4447 					hmpt->b_next = smp;
4448 					hmpt = smp;
4449 				} else {
4450 					hmp = hmpt = smp;
4451 				}
4452 				count++;
4453 				break;
4454 			}
4455 		}
4456 		if (i == num_types) {
4457 			/* normal data frame */
4458 
4459 			if (nmp != NULL) {
4460 				nmpt->b_next = smp;
4461 				nmpt = smp;
4462 			} else {
4463 				nmp = nmpt = smp;
4464 			}
4465 		}
4466 	}
4467 
4468 	*hp = hmp;
4469 	*hpt = hmpt;
4470 	*np = nmp;
4471 	*npt = nmpt;
4472 
4473 	return (count);
4474 }
4475 
4476 /*
4477  * Wrapper function to transmit normal and/or priority frames over the channel.
4478  */
4479 static int
4480 vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count)
4481 {
4482 	vsw_ldc_t 		*ldcp = (vsw_ldc_t *)arg;
4483 	mblk_t			*tmp;
4484 	mblk_t			*smp;
4485 	mblk_t			*hmp;	/* high prio pkts head */
4486 	mblk_t			*hmpt;	/* high prio pkts tail */
4487 	mblk_t			*nmp;	/* normal pkts head */
4488 	mblk_t			*nmpt;	/* normal pkts tail */
4489 	uint32_t		n = 0;
4490 	vsw_t			*vswp = ldcp->ldc_vswp;
4491 
4492 	ASSERT(VSW_PRI_ETH_DEFINED(vswp));
4493 	ASSERT(count != 0);
4494 
4495 	nmp = mp;
4496 	nmpt = mpt;
4497 
4498 	/* gather any priority frames from the chain of packets */
4499 	n = vsw_get_pri_packets(vswp, &nmp, &nmpt, &hmp, &hmpt);
4500 
4501 	/* transmit priority frames */
4502 	tmp = hmp;
4503 	while (tmp != NULL) {
4504 		smp = tmp;
4505 		tmp = tmp->b_next;
4506 		smp->b_next = NULL;
4507 		vsw_ldcsend_pkt(ldcp, smp);
4508 	}
4509 
4510 	count -= n;
4511 
4512 	if (count == 0) {
4513 		/* no normal data frames to process */
4514 		return (0);
4515 	}
4516 
4517 	return (vsw_ldctx(ldcp, nmp, nmpt, count));
4518 }
4519 
4520 /*
4521  * Wrapper function to transmit normal frames over the channel.
4522  */
4523 static int
4524 vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count)
4525 {
4526 	vsw_ldc_t 	*ldcp = (vsw_ldc_t *)arg;
4527 	mblk_t		*tmp = NULL;
4528 
4529 	ASSERT(count != 0);
4530 	/*
4531 	 * If the TX thread is enabled, then queue the
4532 	 * ordinary frames and signal the tx thread.
4533 	 */
4534 	if (ldcp->tx_thread != NULL) {
4535 
4536 		mutex_enter(&ldcp->tx_thr_lock);
4537 
4538 		if ((ldcp->tx_cnt + count) >= vsw_max_tx_qcount) {
4539 			/*
4540 			 * If we reached queue limit,
4541 			 * do not queue new packets,
4542 			 * drop them.
4543 			 */
4544 			ldcp->ldc_stats.tx_qfull += count;
4545 			mutex_exit(&ldcp->tx_thr_lock);
4546 			freemsgchain(mp);
4547 			goto exit;
4548 		}
4549 		if (ldcp->tx_mhead == NULL) {
4550 			ldcp->tx_mhead = mp;
4551 			ldcp->tx_mtail = mpt;
4552 			cv_signal(&ldcp->tx_thr_cv);
4553 		} else {
4554 			ldcp->tx_mtail->b_next = mp;
4555 			ldcp->tx_mtail = mpt;
4556 		}
4557 		ldcp->tx_cnt += count;
4558 		mutex_exit(&ldcp->tx_thr_lock);
4559 	} else {
4560 		while (mp != NULL) {
4561 			tmp = mp->b_next;
4562 			mp->b_next = mp->b_prev = NULL;
4563 			(void) vsw_ldcsend(ldcp, mp, 1);
4564 			mp = tmp;
4565 		}
4566 	}
4567 
4568 exit:
4569 	return (0);
4570 }
4571 
4572 /*
4573  * This function transmits the frame in the payload of a raw data
4574  * (VIO_PKT_DATA) message. Thus, it provides an Out-Of-Band path to
4575  * send special frames with high priorities, without going through
4576  * the normal data path which uses descriptor ring mechanism.
4577  */
4578 static void
4579 vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp)
4580 {
4581 	vio_raw_data_msg_t	*pkt;
4582 	mblk_t			*bp;
4583 	mblk_t			*nmp = NULL;
4584 	caddr_t			dst;
4585 	uint32_t		mblksz;
4586 	uint32_t		size;
4587 	uint32_t		nbytes;
4588 	int			rv;
4589 	vsw_t			*vswp = ldcp->ldc_vswp;
4590 	vgen_stats_t		*statsp = &ldcp->ldc_stats;
4591 
4592 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
4593 	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
4594 		(void) atomic_inc_32(&statsp->tx_pri_fail);
4595 		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
4596 		    "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
4597 		    ldcp->lane_out.lstate);
4598 		goto send_pkt_exit;
4599 	}
4600 
4601 	size = msgsize(mp);
4602 
4603 	/* frame size bigger than available payload len of raw data msg ? */
4604 	if (size > (size_t)(ldcp->msglen - VIO_PKT_DATA_HDRSIZE)) {
4605 		(void) atomic_inc_32(&statsp->tx_pri_fail);
4606 		DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__,
4607 		    ldcp->ldc_id, size);
4608 		goto send_pkt_exit;
4609 	}
4610 
4611 	if (size < ETHERMIN)
4612 		size = ETHERMIN;
4613 
4614 	/* alloc space for a raw data message */
4615 	nmp = vio_allocb(vswp->pri_tx_vmp);
4616 	if (nmp == NULL) {
4617 		(void) atomic_inc_32(&statsp->tx_pri_fail);
4618 		DWARN(vswp, "vio_allocb failed\n");
4619 		goto send_pkt_exit;
4620 	}
4621 	pkt = (vio_raw_data_msg_t *)nmp->b_rptr;
4622 
4623 	/* copy frame into the payload of raw data message */
4624 	dst = (caddr_t)pkt->data;
4625 	for (bp = mp; bp != NULL; bp = bp->b_cont) {
4626 		mblksz = MBLKL(bp);
4627 		bcopy(bp->b_rptr, dst, mblksz);
4628 		dst += mblksz;
4629 	}
4630 
4631 	/* setup the raw data msg */
4632 	pkt->tag.vio_msgtype = VIO_TYPE_DATA;
4633 	pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
4634 	pkt->tag.vio_subtype_env = VIO_PKT_DATA;
4635 	pkt->tag.vio_sid = ldcp->local_session;
4636 	nbytes = VIO_PKT_DATA_HDRSIZE + size;
4637 
4638 	/* send the msg over ldc */
4639 	rv = vsw_send_msg(ldcp, (void *)pkt, nbytes, B_TRUE);
4640 	if (rv != 0) {
4641 		(void) atomic_inc_32(&statsp->tx_pri_fail);
4642 		DWARN(vswp, "%s(%lld) Error sending priority frame\n", __func__,
4643 		    ldcp->ldc_id);
4644 		goto send_pkt_exit;
4645 	}
4646 
4647 	/* update stats */
4648 	(void) atomic_inc_64(&statsp->tx_pri_packets);
4649 	(void) atomic_add_64(&statsp->tx_pri_packets, size);
4650 
4651 send_pkt_exit:
4652 	if (nmp != NULL)
4653 		freemsg(nmp);
4654 	freemsg(mp);
4655 }
4656 
4657 /*
4658  * Transmit the packet over the given LDC channel.
4659  *
4660  * The 'retries' argument indicates how many times a packet
4661  * is retried before it is dropped. Note, the retry is done
4662  * only for a resource related failure, for all other failures
4663  * the packet is dropped immediately.
4664  */
4665 static int
4666 vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries)
4667 {
4668 	int i;
4669 	int rc;
4670 	int status = 0;
4671 	vsw_port_t *port = ldcp->ldc_port;
4672 	dring_info_t *dp = NULL;
4673 
4674 
4675 	for (i = 0; i < retries; ) {
4676 		/*
4677 		 * Send the message out using the appropriate
4678 		 * transmit function which will free mblock when it
4679 		 * is finished with it.
4680 		 */
4681 		mutex_enter(&port->tx_lock);
4682 		if (port->transmit != NULL) {
4683 			status = (*port->transmit)(ldcp, mp);
4684 		}
4685 		if (status == LDC_TX_SUCCESS) {
4686 			mutex_exit(&port->tx_lock);
4687 			break;
4688 		}
4689 		i++;	/* increment the counter here */
4690 
4691 		/* If its the last retry, then update the oerror */
4692 		if ((i == retries) && (status == LDC_TX_NORESOURCES)) {
4693 			ldcp->ldc_stats.oerrors++;
4694 		}
4695 		mutex_exit(&port->tx_lock);
4696 
4697 		if (status != LDC_TX_NORESOURCES) {
4698 			/*
4699 			 * No retrying required for errors un-related
4700 			 * to resources.
4701 			 */
4702 			break;
4703 		}
4704 		READ_ENTER(&ldcp->lane_out.dlistrw);
4705 		if (((dp = ldcp->lane_out.dringp) != NULL) &&
4706 		    ((VSW_VER_GTEQ(ldcp, 1, 2) &&
4707 		    (ldcp->lane_out.xfer_mode & VIO_DRING_MODE_V1_2)) ||
4708 		    ((VSW_VER_LT(ldcp, 1, 2) &&
4709 		    (ldcp->lane_out.xfer_mode == VIO_DRING_MODE_V1_0))))) {
4710 			rc = vsw_reclaim_dring(dp, dp->end_idx);
4711 		} else {
4712 			/*
4713 			 * If there is no dring or the xfer_mode is
4714 			 * set to DESC_MODE(ie., OBP), then simply break here.
4715 			 */
4716 			RW_EXIT(&ldcp->lane_out.dlistrw);
4717 			break;
4718 		}
4719 		RW_EXIT(&ldcp->lane_out.dlistrw);
4720 
4721 		/*
4722 		 * Delay only if none were reclaimed
4723 		 * and its not the last retry.
4724 		 */
4725 		if ((rc == 0) && (i < retries)) {
4726 			delay(drv_usectohz(vsw_ldc_tx_delay));
4727 		}
4728 	}
4729 	freemsg(mp);
4730 	return (status);
4731 }
4732 
4733 /*
4734  * Send packet out via descriptor ring to a logical device.
4735  */
4736 static int
4737 vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp)
4738 {
4739 	vio_dring_msg_t		dring_pkt;
4740 	dring_info_t		*dp = NULL;
4741 	vsw_private_desc_t	*priv_desc = NULL;
4742 	vnet_public_desc_t	*pub = NULL;
4743 	vsw_t			*vswp = ldcp->ldc_vswp;
4744 	mblk_t			*bp;
4745 	size_t			n, size;
4746 	caddr_t			bufp;
4747 	int			idx;
4748 	int			status = LDC_TX_SUCCESS;
4749 	struct ether_header	*ehp = (struct ether_header *)mp->b_rptr;
4750 	lane_t			*lp = &ldcp->lane_out;
4751 
4752 	D1(vswp, "%s(%lld): enter\n", __func__, ldcp->ldc_id);
4753 
4754 	/* TODO: make test a macro */
4755 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
4756 	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
4757 		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
4758 		    "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
4759 		    ldcp->lane_out.lstate);
4760 		ldcp->ldc_stats.oerrors++;
4761 		return (LDC_TX_FAILURE);
4762 	}
4763 
4764 	/*
4765 	 * Note - using first ring only, this may change
4766 	 * in the future.
4767 	 */
4768 	READ_ENTER(&ldcp->lane_out.dlistrw);
4769 	if ((dp = ldcp->lane_out.dringp) == NULL) {
4770 		RW_EXIT(&ldcp->lane_out.dlistrw);
4771 		DERR(vswp, "%s(%lld): no dring for outbound lane on"
4772 		    " channel %d", __func__, ldcp->ldc_id, ldcp->ldc_id);
4773 		ldcp->ldc_stats.oerrors++;
4774 		return (LDC_TX_FAILURE);
4775 	}
4776 
4777 	size = msgsize(mp);
4778 	if (size > (size_t)lp->mtu) {
4779 		RW_EXIT(&ldcp->lane_out.dlistrw);
4780 		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
4781 		    ldcp->ldc_id, size);
4782 		ldcp->ldc_stats.oerrors++;
4783 		return (LDC_TX_FAILURE);
4784 	}
4785 
4786 	/*
4787 	 * Find a free descriptor
4788 	 *
4789 	 * Note: for the moment we are assuming that we will only
4790 	 * have one dring going from the switch to each of its
4791 	 * peers. This may change in the future.
4792 	 */
4793 	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
4794 		D2(vswp, "%s(%lld): no descriptor available for ring "
4795 		    "at 0x%llx", __func__, ldcp->ldc_id, dp);
4796 
4797 		/* nothing more we can do */
4798 		status = LDC_TX_NORESOURCES;
4799 		ldcp->ldc_stats.tx_no_desc++;
4800 		goto vsw_dringsend_free_exit;
4801 	} else {
4802 		D2(vswp, "%s(%lld): free private descriptor found at pos %ld "
4803 		    "addr 0x%llx\n", __func__, ldcp->ldc_id, idx, priv_desc);
4804 	}
4805 
4806 	/* copy data into the descriptor */
4807 	bufp = priv_desc->datap;
4808 	bufp += VNET_IPALIGN;
4809 	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
4810 		n = MBLKL(bp);
4811 		bcopy(bp->b_rptr, bufp, n);
4812 		bufp += n;
4813 	}
4814 
4815 	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
4816 
4817 	pub = priv_desc->descp;
4818 	pub->nbytes = priv_desc->datalen;
4819 
4820 	/* update statistics */
4821 	if (IS_BROADCAST(ehp))
4822 		ldcp->ldc_stats.brdcstxmt++;
4823 	else if (IS_MULTICAST(ehp))
4824 		ldcp->ldc_stats.multixmt++;
4825 	ldcp->ldc_stats.opackets++;
4826 	ldcp->ldc_stats.obytes += priv_desc->datalen;
4827 
4828 	mutex_enter(&priv_desc->dstate_lock);
4829 	pub->hdr.dstate = VIO_DESC_READY;
4830 	mutex_exit(&priv_desc->dstate_lock);
4831 
4832 	/*
4833 	 * Determine whether or not we need to send a message to our
4834 	 * peer prompting them to read our newly updated descriptor(s).
4835 	 */
4836 	mutex_enter(&dp->restart_lock);
4837 	if (dp->restart_reqd) {
4838 		dp->restart_reqd = B_FALSE;
4839 		ldcp->ldc_stats.dring_data_msgs++;
4840 		mutex_exit(&dp->restart_lock);
4841 
4842 		/*
4843 		 * Send a vio_dring_msg to peer to prompt them to read
4844 		 * the updated descriptor ring.
4845 		 */
4846 		dring_pkt.tag.vio_msgtype = VIO_TYPE_DATA;
4847 		dring_pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
4848 		dring_pkt.tag.vio_subtype_env = VIO_DRING_DATA;
4849 		dring_pkt.tag.vio_sid = ldcp->local_session;
4850 
4851 		/* Note - for now using first ring */
4852 		dring_pkt.dring_ident = dp->ident;
4853 
4854 		/*
4855 		 * If last_ack_recv is -1 then we know we've not
4856 		 * received any ack's yet, so this must be the first
4857 		 * msg sent, so set the start to the begining of the ring.
4858 		 */
4859 		mutex_enter(&dp->dlock);
4860 		if (dp->last_ack_recv == -1) {
4861 			dring_pkt.start_idx = 0;
4862 		} else {
4863 			dring_pkt.start_idx =
4864 			    (dp->last_ack_recv + 1) % dp->num_descriptors;
4865 		}
4866 		dring_pkt.end_idx = -1;
4867 		mutex_exit(&dp->dlock);
4868 
4869 		D3(vswp, "%s(%lld): dring 0x%llx : ident 0x%llx\n", __func__,
4870 		    ldcp->ldc_id, dp, dring_pkt.dring_ident);
4871 		D3(vswp, "%s(%lld): start %lld : end %lld :\n",
4872 		    __func__, ldcp->ldc_id, dring_pkt.start_idx,
4873 		    dring_pkt.end_idx);
4874 
4875 		RW_EXIT(&ldcp->lane_out.dlistrw);
4876 
4877 		(void) vsw_send_msg(ldcp, (void *)&dring_pkt,
4878 		    sizeof (vio_dring_msg_t), B_TRUE);
4879 
4880 		return (status);
4881 
4882 	} else {
4883 		mutex_exit(&dp->restart_lock);
4884 		D2(vswp, "%s(%lld): updating descp %d", __func__,
4885 		    ldcp->ldc_id, idx);
4886 	}
4887 
4888 vsw_dringsend_free_exit:
4889 
4890 	RW_EXIT(&ldcp->lane_out.dlistrw);
4891 
4892 	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
4893 	return (status);
4894 }
4895 
4896 /*
4897  * Send an in-band descriptor message over ldc.
4898  */
4899 static int
4900 vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp)
4901 {
4902 	vsw_t			*vswp = ldcp->ldc_vswp;
4903 	vnet_ibnd_desc_t	ibnd_msg;
4904 	vsw_private_desc_t	*priv_desc = NULL;
4905 	dring_info_t		*dp = NULL;
4906 	size_t			n, size = 0;
4907 	caddr_t			bufp;
4908 	mblk_t			*bp;
4909 	int			idx, i;
4910 	int			status = LDC_TX_SUCCESS;
4911 	static int		warn_msg = 1;
4912 	lane_t			*lp = &ldcp->lane_out;
4913 
4914 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4915 
4916 	ASSERT(mp != NULL);
4917 
4918 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
4919 	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
4920 		DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt",
4921 		    __func__, ldcp->ldc_id, ldcp->ldc_status,
4922 		    ldcp->lane_out.lstate);
4923 		ldcp->ldc_stats.oerrors++;
4924 		return (LDC_TX_FAILURE);
4925 	}
4926 
4927 	/*
4928 	 * only expect single dring to exist, which we use
4929 	 * as an internal buffer, rather than a transfer channel.
4930 	 */
4931 	READ_ENTER(&ldcp->lane_out.dlistrw);
4932 	if ((dp = ldcp->lane_out.dringp) == NULL) {
4933 		DERR(vswp, "%s(%lld): no dring for outbound lane",
4934 		    __func__, ldcp->ldc_id);
4935 		DERR(vswp, "%s(%lld) status(%d) state (0x%llx)", __func__,
4936 		    ldcp->ldc_id, ldcp->ldc_status, ldcp->lane_out.lstate);
4937 		RW_EXIT(&ldcp->lane_out.dlistrw);
4938 		ldcp->ldc_stats.oerrors++;
4939 		return (LDC_TX_FAILURE);
4940 	}
4941 
4942 	size = msgsize(mp);
4943 	if (size > (size_t)lp->mtu) {
4944 		RW_EXIT(&ldcp->lane_out.dlistrw);
4945 		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
4946 		    ldcp->ldc_id, size);
4947 		ldcp->ldc_stats.oerrors++;
4948 		return (LDC_TX_FAILURE);
4949 	}
4950 
4951 	/*
4952 	 * Find a free descriptor in our buffer ring
4953 	 */
4954 	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
4955 		RW_EXIT(&ldcp->lane_out.dlistrw);
4956 		if (warn_msg) {
4957 			DERR(vswp, "%s(%lld): no descriptor available for ring "
4958 			    "at 0x%llx", __func__, ldcp->ldc_id, dp);
4959 			warn_msg = 0;
4960 		}
4961 
4962 		/* nothing more we can do */
4963 		status = LDC_TX_NORESOURCES;
4964 		goto vsw_descrsend_free_exit;
4965 	} else {
4966 		D2(vswp, "%s(%lld): free private descriptor found at pos "
4967 		    "%ld addr 0x%x\n", __func__, ldcp->ldc_id, idx, priv_desc);
4968 		warn_msg = 1;
4969 	}
4970 
4971 	/* copy data into the descriptor */
4972 	bufp = priv_desc->datap;
4973 	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
4974 		n = MBLKL(bp);
4975 		bcopy(bp->b_rptr, bufp, n);
4976 		bufp += n;
4977 	}
4978 
4979 	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
4980 
4981 	/* create and send the in-band descp msg */
4982 	ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA;
4983 	ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO;
4984 	ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA;
4985 	ibnd_msg.hdr.tag.vio_sid = ldcp->local_session;
4986 
4987 	/*
4988 	 * Copy the mem cookies describing the data from the
4989 	 * private region of the descriptor ring into the inband
4990 	 * descriptor.
4991 	 */
4992 	for (i = 0; i < priv_desc->ncookies; i++) {
4993 		bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i],
4994 		    sizeof (ldc_mem_cookie_t));
4995 	}
4996 
4997 	ibnd_msg.hdr.desc_handle = idx;
4998 	ibnd_msg.ncookies = priv_desc->ncookies;
4999 	ibnd_msg.nbytes = size;
5000 
5001 	ldcp->ldc_stats.opackets++;
5002 	ldcp->ldc_stats.obytes += size;
5003 
5004 	RW_EXIT(&ldcp->lane_out.dlistrw);
5005 
5006 	(void) vsw_send_msg(ldcp, (void *)&ibnd_msg,
5007 	    sizeof (vnet_ibnd_desc_t), B_TRUE);
5008 
5009 vsw_descrsend_free_exit:
5010 
5011 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
5012 	return (status);
5013 }
5014 
5015 static void
5016 vsw_send_ver(void *arg)
5017 {
5018 	vsw_ldc_t	*ldcp = (vsw_ldc_t *)arg;
5019 	vsw_t		*vswp = ldcp->ldc_vswp;
5020 	lane_t		*lp = &ldcp->lane_out;
5021 	vio_ver_msg_t	ver_msg;
5022 
5023 	D1(vswp, "%s enter", __func__);
5024 
5025 	ver_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
5026 	ver_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
5027 	ver_msg.tag.vio_subtype_env = VIO_VER_INFO;
5028 	ver_msg.tag.vio_sid = ldcp->local_session;
5029 
5030 	if (vsw_obp_ver_proto_workaround == B_FALSE) {
5031 		ver_msg.ver_major = vsw_versions[0].ver_major;
5032 		ver_msg.ver_minor = vsw_versions[0].ver_minor;
5033 	} else {
5034 		/* use the major,minor that we've ack'd */
5035 		lane_t	*lpi = &ldcp->lane_in;
5036 		ver_msg.ver_major = lpi->ver_major;
5037 		ver_msg.ver_minor = lpi->ver_minor;
5038 	}
5039 	ver_msg.dev_class = VDEV_NETWORK_SWITCH;
5040 
5041 	lp->lstate |= VSW_VER_INFO_SENT;
5042 	lp->ver_major = ver_msg.ver_major;
5043 	lp->ver_minor = ver_msg.ver_minor;
5044 
5045 	DUMP_TAG(ver_msg.tag);
5046 
5047 	(void) vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t), B_TRUE);
5048 
5049 	D1(vswp, "%s (%d): exit", __func__, ldcp->ldc_id);
5050 }
5051 
5052 static void
5053 vsw_send_attr(vsw_ldc_t *ldcp)
5054 {
5055 	vsw_t			*vswp = ldcp->ldc_vswp;
5056 	lane_t			*lp = &ldcp->lane_out;
5057 	vnet_attr_msg_t		attr_msg;
5058 
5059 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
5060 
5061 	/*
5062 	 * Subtype is set to INFO by default
5063 	 */
5064 	attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
5065 	attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
5066 	attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO;
5067 	attr_msg.tag.vio_sid = ldcp->local_session;
5068 
5069 	/* payload copied from default settings for lane */
5070 	attr_msg.mtu = lp->mtu;
5071 	attr_msg.addr_type = lp->addr_type;
5072 	attr_msg.xfer_mode = lp->xfer_mode;
5073 	attr_msg.ack_freq = lp->xfer_mode;
5074 
5075 	READ_ENTER(&vswp->if_lockrw);
5076 	attr_msg.addr = vnet_macaddr_strtoul((vswp->if_addr).ether_addr_octet);
5077 	RW_EXIT(&vswp->if_lockrw);
5078 
5079 	ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT;
5080 
5081 	DUMP_TAG(attr_msg.tag);
5082 
5083 	(void) vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t), B_TRUE);
5084 
5085 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
5086 }
5087 
5088 /*
5089  * Create dring info msg (which also results in the creation of
5090  * a dring).
5091  */
5092 static vio_dring_reg_msg_t *
5093 vsw_create_dring_info_pkt(vsw_ldc_t *ldcp)
5094 {
5095 	vio_dring_reg_msg_t	*mp;
5096 	dring_info_t		*dp;
5097 	vsw_t			*vswp = ldcp->ldc_vswp;
5098 	int			rv;
5099 
5100 	D1(vswp, "vsw_create_dring_info_pkt enter\n");
5101 
5102 	/*
5103 	 * If we can't create a dring, obviously no point sending
5104 	 * a message.
5105 	 */
5106 	if ((dp = vsw_create_dring(ldcp)) == NULL)
5107 		return (NULL);
5108 
5109 	/* Allocate pools of receive mblks */
5110 	rv = vsw_init_multipools(ldcp, vswp);
5111 	if (rv) {
5112 		/*
5113 		 * We do not return failure if receive mblk pools can't be
5114 		 * allocated, instead allocb(9F) will be used to dynamically
5115 		 * allocate buffers during receive.
5116 		 */
5117 		DWARN(vswp, "%s: unable to create free mblk pools for"
5118 		    " channel %ld (rv %d)", __func__, ldcp->ldc_id, rv);
5119 	}
5120 
5121 	mp = kmem_zalloc(sizeof (vio_dring_reg_msg_t), KM_SLEEP);
5122 
5123 	mp->tag.vio_msgtype = VIO_TYPE_CTRL;
5124 	mp->tag.vio_subtype = VIO_SUBTYPE_INFO;
5125 	mp->tag.vio_subtype_env = VIO_DRING_REG;
5126 	mp->tag.vio_sid = ldcp->local_session;
5127 
5128 	/* payload */
5129 	mp->num_descriptors = dp->num_descriptors;
5130 	mp->descriptor_size = dp->descriptor_size;
5131 	mp->options = dp->options;
5132 	mp->ncookies = dp->ncookies;
5133 	bcopy(&dp->cookie[0], &mp->cookie[0], sizeof (ldc_mem_cookie_t));
5134 
5135 	mp->dring_ident = 0;
5136 
5137 	D1(vswp, "vsw_create_dring_info_pkt exit\n");
5138 
5139 	return (mp);
5140 }
5141 
5142 static void
5143 vsw_send_dring_info(vsw_ldc_t *ldcp)
5144 {
5145 	vio_dring_reg_msg_t	*dring_msg;
5146 	vsw_t			*vswp = ldcp->ldc_vswp;
5147 
5148 	D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id);
5149 
5150 	dring_msg = vsw_create_dring_info_pkt(ldcp);
5151 	if (dring_msg == NULL) {
5152 		cmn_err(CE_WARN, "!vsw%d: %s: error creating msg",
5153 		    vswp->instance, __func__);
5154 		return;
5155 	}
5156 
5157 	ldcp->lane_out.lstate |= VSW_DRING_INFO_SENT;
5158 
5159 	DUMP_TAG_PTR((vio_msg_tag_t *)dring_msg);
5160 
5161 	(void) vsw_send_msg(ldcp, dring_msg,
5162 	    sizeof (vio_dring_reg_msg_t), B_TRUE);
5163 
5164 	kmem_free(dring_msg, sizeof (vio_dring_reg_msg_t));
5165 
5166 	D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id);
5167 }
5168 
5169 static void
5170 vsw_send_rdx(vsw_ldc_t *ldcp)
5171 {
5172 	vsw_t		*vswp = ldcp->ldc_vswp;
5173 	vio_rdx_msg_t	rdx_msg;
5174 
5175 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
5176 
5177 	rdx_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
5178 	rdx_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
5179 	rdx_msg.tag.vio_subtype_env = VIO_RDX;
5180 	rdx_msg.tag.vio_sid = ldcp->local_session;
5181 
5182 	ldcp->lane_in.lstate |= VSW_RDX_INFO_SENT;
5183 
5184 	DUMP_TAG(rdx_msg.tag);
5185 
5186 	(void) vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t), B_TRUE);
5187 
5188 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
5189 }
5190 
5191 /*
5192  * Generic routine to send message out over ldc channel.
5193  *
5194  * It is possible that when we attempt to write over the ldc channel
5195  * that we get notified that it has been reset. Depending on the value
5196  * of the handle_reset flag we either handle that event here or simply
5197  * notify the caller that the channel was reset.
5198  */
5199 int
5200 vsw_send_msg(vsw_ldc_t *ldcp, void *msgp, int size, boolean_t handle_reset)
5201 {
5202 	int			rv;
5203 	size_t			msglen = size;
5204 	vio_msg_tag_t		*tag = (vio_msg_tag_t *)msgp;
5205 	vsw_t			*vswp = ldcp->ldc_vswp;
5206 	vio_dring_msg_t		*dmsg;
5207 	vio_raw_data_msg_t	*rmsg;
5208 	vnet_ibnd_desc_t	*imsg;
5209 	boolean_t		data_msg = B_FALSE;
5210 
5211 	D1(vswp, "vsw_send_msg (%lld) enter : sending %d bytes",
5212 	    ldcp->ldc_id, size);
5213 
5214 	D2(vswp, "send_msg: type 0x%llx", tag->vio_msgtype);
5215 	D2(vswp, "send_msg: stype 0x%llx", tag->vio_subtype);
5216 	D2(vswp, "send_msg: senv 0x%llx", tag->vio_subtype_env);
5217 
5218 	mutex_enter(&ldcp->ldc_txlock);
5219 
5220 	if (tag->vio_subtype == VIO_SUBTYPE_INFO) {
5221 		if (tag->vio_subtype_env == VIO_DRING_DATA) {
5222 			dmsg = (vio_dring_msg_t *)tag;
5223 			dmsg->seq_num = ldcp->lane_out.seq_num;
5224 			data_msg = B_TRUE;
5225 		} else if (tag->vio_subtype_env == VIO_PKT_DATA) {
5226 			rmsg = (vio_raw_data_msg_t *)tag;
5227 			rmsg->seq_num = ldcp->lane_out.seq_num;
5228 			data_msg = B_TRUE;
5229 		} else if (tag->vio_subtype_env == VIO_DESC_DATA) {
5230 			imsg = (vnet_ibnd_desc_t *)tag;
5231 			imsg->hdr.seq_num = ldcp->lane_out.seq_num;
5232 			data_msg = B_TRUE;
5233 		}
5234 	}
5235 
5236 	do {
5237 		msglen = size;
5238 		rv = ldc_write(ldcp->ldc_handle, (caddr_t)msgp, &msglen);
5239 	} while (rv == EWOULDBLOCK && --vsw_wretries > 0);
5240 
5241 	if (rv == 0 && data_msg == B_TRUE) {
5242 		ldcp->lane_out.seq_num++;
5243 	}
5244 
5245 	if ((rv != 0) || (msglen != size)) {
5246 		DERR(vswp, "vsw_send_msg:ldc_write failed: chan(%lld) rv(%d) "
5247 		    "size (%d) msglen(%d)\n", ldcp->ldc_id, rv, size, msglen);
5248 		ldcp->ldc_stats.oerrors++;
5249 	}
5250 
5251 	mutex_exit(&ldcp->ldc_txlock);
5252 
5253 	/*
5254 	 * If channel has been reset we either handle it here or
5255 	 * simply report back that it has been reset and let caller
5256 	 * decide what to do.
5257 	 */
5258 	if (rv == ECONNRESET) {
5259 		DWARN(vswp, "%s (%lld) channel reset", __func__, ldcp->ldc_id);
5260 
5261 		/*
5262 		 * N.B - must never be holding the dlistrw lock when
5263 		 * we do a reset of the channel.
5264 		 */
5265 		if (handle_reset) {
5266 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
5267 		}
5268 	}
5269 
5270 	return (rv);
5271 }
5272 
5273 /*
5274  * Remove the specified address from the list of address maintained
5275  * in this port node.
5276  */
5277 mcst_addr_t *
5278 vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr)
5279 {
5280 	vsw_t		*vswp = NULL;
5281 	vsw_port_t	*port = NULL;
5282 	mcst_addr_t	*prev_p = NULL;
5283 	mcst_addr_t	*curr_p = NULL;
5284 
5285 	D1(NULL, "%s: enter : devtype %d : addr 0x%llx",
5286 	    __func__, devtype, addr);
5287 
5288 	if (devtype == VSW_VNETPORT) {
5289 		port = (vsw_port_t *)arg;
5290 		mutex_enter(&port->mca_lock);
5291 		prev_p = curr_p = port->mcap;
5292 	} else {
5293 		vswp = (vsw_t *)arg;
5294 		mutex_enter(&vswp->mca_lock);
5295 		prev_p = curr_p = vswp->mcap;
5296 	}
5297 
5298 	while (curr_p != NULL) {
5299 		if (curr_p->addr == addr) {
5300 			D2(NULL, "%s: address found", __func__);
5301 			/* match found */
5302 			if (prev_p == curr_p) {
5303 				/* list head */
5304 				if (devtype == VSW_VNETPORT)
5305 					port->mcap = curr_p->nextp;
5306 				else
5307 					vswp->mcap = curr_p->nextp;
5308 			} else {
5309 				prev_p->nextp = curr_p->nextp;
5310 			}
5311 			break;
5312 		} else {
5313 			prev_p = curr_p;
5314 			curr_p = curr_p->nextp;
5315 		}
5316 	}
5317 
5318 	if (devtype == VSW_VNETPORT)
5319 		mutex_exit(&port->mca_lock);
5320 	else
5321 		mutex_exit(&vswp->mca_lock);
5322 
5323 	D1(NULL, "%s: exit", __func__);
5324 
5325 	return (curr_p);
5326 }
5327 
5328 /*
5329  * Creates a descriptor ring (dring) and links it into the
5330  * link of outbound drings for this channel.
5331  *
5332  * Returns NULL if creation failed.
5333  */
5334 static dring_info_t *
5335 vsw_create_dring(vsw_ldc_t *ldcp)
5336 {
5337 	vsw_private_desc_t	*priv_addr = NULL;
5338 	vsw_t			*vswp = ldcp->ldc_vswp;
5339 	ldc_mem_info_t		minfo;
5340 	dring_info_t		*dp, *tp;
5341 	int			i;
5342 
5343 	dp = (dring_info_t *)kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
5344 
5345 	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
5346 
5347 	/* create public section of ring */
5348 	if ((ldc_mem_dring_create(vsw_ntxds,
5349 	    VSW_PUB_SIZE, &dp->handle)) != 0) {
5350 
5351 		DERR(vswp, "vsw_create_dring(%lld): ldc dring create "
5352 		    "failed", ldcp->ldc_id);
5353 		goto create_fail_exit;
5354 	}
5355 
5356 	ASSERT(dp->handle != NULL);
5357 
5358 	/*
5359 	 * Get the base address of the public section of the ring.
5360 	 */
5361 	if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
5362 		DERR(vswp, "vsw_create_dring(%lld): dring info failed\n",
5363 		    ldcp->ldc_id);
5364 		goto dring_fail_exit;
5365 	} else {
5366 		ASSERT(minfo.vaddr != 0);
5367 		dp->pub_addr = minfo.vaddr;
5368 	}
5369 
5370 	dp->num_descriptors = vsw_ntxds;
5371 	dp->descriptor_size = VSW_PUB_SIZE;
5372 	dp->options = VIO_TX_DRING;
5373 	dp->ncookies = 1;	/* guaranteed by ldc */
5374 
5375 	/*
5376 	 * create private portion of ring
5377 	 */
5378 	dp->priv_addr = (vsw_private_desc_t *)kmem_zalloc(
5379 	    (sizeof (vsw_private_desc_t) * vsw_ntxds), KM_SLEEP);
5380 
5381 	if (vsw_setup_ring(ldcp, dp)) {
5382 		DERR(vswp, "%s: unable to setup ring", __func__);
5383 		goto dring_fail_exit;
5384 	}
5385 
5386 	/* haven't used any descriptors yet */
5387 	dp->end_idx = 0;
5388 	dp->last_ack_recv = -1;
5389 
5390 	/* bind dring to the channel */
5391 	if ((ldc_mem_dring_bind(ldcp->ldc_handle, dp->handle,
5392 	    LDC_DIRECT_MAP | LDC_SHADOW_MAP, LDC_MEM_RW,
5393 	    &dp->cookie[0], &dp->ncookies)) != 0) {
5394 		DERR(vswp, "vsw_create_dring: unable to bind to channel "
5395 		    "%lld", ldcp->ldc_id);
5396 		goto dring_fail_exit;
5397 	}
5398 
5399 	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
5400 	dp->restart_reqd = B_TRUE;
5401 
5402 	/*
5403 	 * Only ever create rings for outgoing lane. Link it onto
5404 	 * end of list.
5405 	 */
5406 	WRITE_ENTER(&ldcp->lane_out.dlistrw);
5407 	if (ldcp->lane_out.dringp == NULL) {
5408 		D2(vswp, "vsw_create_dring: adding first outbound ring");
5409 		ldcp->lane_out.dringp = dp;
5410 	} else {
5411 		tp = ldcp->lane_out.dringp;
5412 		while (tp->next != NULL)
5413 			tp = tp->next;
5414 
5415 		tp->next = dp;
5416 	}
5417 	RW_EXIT(&ldcp->lane_out.dlistrw);
5418 
5419 	return (dp);
5420 
5421 dring_fail_exit:
5422 	(void) ldc_mem_dring_destroy(dp->handle);
5423 
5424 create_fail_exit:
5425 	if (dp->priv_addr != NULL) {
5426 		priv_addr = dp->priv_addr;
5427 		for (i = 0; i < vsw_ntxds; i++) {
5428 			if (priv_addr->memhandle != NULL)
5429 				(void) ldc_mem_free_handle(
5430 				    priv_addr->memhandle);
5431 			priv_addr++;
5432 		}
5433 		kmem_free(dp->priv_addr,
5434 		    (sizeof (vsw_private_desc_t) * vsw_ntxds));
5435 	}
5436 	mutex_destroy(&dp->dlock);
5437 
5438 	kmem_free(dp, sizeof (dring_info_t));
5439 	return (NULL);
5440 }
5441 
5442 /*
5443  * Create a ring consisting of just a private portion and link
5444  * it into the list of rings for the outbound lane.
5445  *
5446  * These type of rings are used primarily for temporary data
5447  * storage (i.e. as data buffers).
5448  */
5449 void
5450 vsw_create_privring(vsw_ldc_t *ldcp)
5451 {
5452 	dring_info_t		*dp, *tp;
5453 	vsw_t			*vswp = ldcp->ldc_vswp;
5454 
5455 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
5456 
5457 	dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
5458 
5459 	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
5460 
5461 	/* no public section */
5462 	dp->pub_addr = NULL;
5463 
5464 	dp->priv_addr = kmem_zalloc(
5465 	    (sizeof (vsw_private_desc_t) * vsw_ntxds), KM_SLEEP);
5466 
5467 	dp->num_descriptors = vsw_ntxds;
5468 
5469 	if (vsw_setup_ring(ldcp, dp)) {
5470 		DERR(vswp, "%s: setup of ring failed", __func__);
5471 		kmem_free(dp->priv_addr,
5472 		    (sizeof (vsw_private_desc_t) * vsw_ntxds));
5473 		mutex_destroy(&dp->dlock);
5474 		kmem_free(dp, sizeof (dring_info_t));
5475 		return;
5476 	}
5477 
5478 	/* haven't used any descriptors yet */
5479 	dp->end_idx = 0;
5480 
5481 	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
5482 	dp->restart_reqd = B_TRUE;
5483 
5484 	/*
5485 	 * Only ever create rings for outgoing lane. Link it onto
5486 	 * end of list.
5487 	 */
5488 	WRITE_ENTER(&ldcp->lane_out.dlistrw);
5489 	if (ldcp->lane_out.dringp == NULL) {
5490 		D2(vswp, "%s: adding first outbound privring", __func__);
5491 		ldcp->lane_out.dringp = dp;
5492 	} else {
5493 		tp = ldcp->lane_out.dringp;
5494 		while (tp->next != NULL)
5495 			tp = tp->next;
5496 
5497 		tp->next = dp;
5498 	}
5499 	RW_EXIT(&ldcp->lane_out.dlistrw);
5500 
5501 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
5502 }
5503 
5504 /*
5505  * Setup the descriptors in the dring. Returns 0 on success, 1 on
5506  * failure.
5507  */
5508 int
5509 vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp)
5510 {
5511 	vnet_public_desc_t	*pub_addr = NULL;
5512 	vsw_private_desc_t	*priv_addr = NULL;
5513 	vsw_t			*vswp = ldcp->ldc_vswp;
5514 	uint64_t		*tmpp;
5515 	uint64_t		offset = 0;
5516 	uint32_t		ncookies = 0;
5517 	static char		*name = "vsw_setup_ring";
5518 	int			i, j, nc, rv;
5519 	size_t			data_sz;
5520 	void			*data_addr;
5521 
5522 	priv_addr = dp->priv_addr;
5523 	pub_addr = dp->pub_addr;
5524 
5525 	/* public section may be null but private should never be */
5526 	ASSERT(priv_addr != NULL);
5527 
5528 	/*
5529 	 * Allocate the region of memory which will be used to hold
5530 	 * the data the descriptors will refer to.
5531 	 */
5532 	data_sz = vswp->max_frame_size + VNET_IPALIGN + VNET_LDCALIGN;
5533 
5534 	/*
5535 	 * In order to ensure that the number of ldc cookies per descriptor is
5536 	 * limited to be within the default MAX_COOKIES (2), we take the steps
5537 	 * outlined below:
5538 	 *
5539 	 * Align the entire data buffer area to 8K and carve out per descriptor
5540 	 * data buffers starting from this 8K aligned base address.
5541 	 *
5542 	 * We round up the mtu specified to be a multiple of 2K or 4K.
5543 	 * For sizes up to 12K we round up the size to the next 2K.
5544 	 * For sizes > 12K we round up to the next 4K (otherwise sizes such as
5545 	 * 14K could end up needing 3 cookies, with the buffer spread across
5546 	 * 3 8K pages:  8K+6K, 2K+8K+2K, 6K+8K, ...).
5547 	 */
5548 	if (data_sz <= VNET_12K) {
5549 		data_sz = VNET_ROUNDUP_2K(data_sz);
5550 	} else {
5551 		data_sz = VNET_ROUNDUP_4K(data_sz);
5552 	}
5553 
5554 	dp->desc_data_sz = data_sz;
5555 
5556 	/* allocate extra 8K bytes for alignment */
5557 	dp->data_sz = (vsw_ntxds * data_sz) + VNET_8K;
5558 	data_addr = kmem_alloc(dp->data_sz, KM_SLEEP);
5559 	dp->data_addr = data_addr;
5560 
5561 	D2(vswp, "%s: allocated %lld bytes at 0x%llx\n", name,
5562 	    dp->data_sz, dp->data_addr);
5563 
5564 	/* align the starting address of the data area to 8K */
5565 	data_addr = (void *)VNET_ROUNDUP_8K((uintptr_t)data_addr);
5566 
5567 	tmpp = (uint64_t *)data_addr;
5568 	offset = dp->desc_data_sz/sizeof (tmpp);
5569 
5570 	/*
5571 	 * Initialise some of the private and public (if they exist)
5572 	 * descriptor fields.
5573 	 */
5574 	for (i = 0; i < vsw_ntxds; i++) {
5575 		mutex_init(&priv_addr->dstate_lock, NULL, MUTEX_DRIVER, NULL);
5576 
5577 		if ((ldc_mem_alloc_handle(ldcp->ldc_handle,
5578 		    &priv_addr->memhandle)) != 0) {
5579 			DERR(vswp, "%s: alloc mem handle failed", name);
5580 			goto setup_ring_cleanup;
5581 		}
5582 
5583 		priv_addr->datap = (void *)tmpp;
5584 
5585 		rv = ldc_mem_bind_handle(priv_addr->memhandle,
5586 		    (caddr_t)priv_addr->datap, dp->desc_data_sz,
5587 		    LDC_SHADOW_MAP, LDC_MEM_R|LDC_MEM_W,
5588 		    &(priv_addr->memcookie[0]), &ncookies);
5589 		if (rv != 0) {
5590 			DERR(vswp, "%s(%lld): ldc_mem_bind_handle failed "
5591 			    "(rv %d)", name, ldcp->ldc_id, rv);
5592 			goto setup_ring_cleanup;
5593 		}
5594 		priv_addr->bound = 1;
5595 
5596 		D2(vswp, "%s: %d: memcookie 0 : addr 0x%llx : size 0x%llx",
5597 		    name, i, priv_addr->memcookie[0].addr,
5598 		    priv_addr->memcookie[0].size);
5599 
5600 		if (ncookies >= (uint32_t)(VSW_MAX_COOKIES + 1)) {
5601 			DERR(vswp, "%s(%lld) ldc_mem_bind_handle returned "
5602 			    "invalid num of cookies (%d) for size 0x%llx",
5603 			    name, ldcp->ldc_id, ncookies, VSW_RING_EL_DATA_SZ);
5604 
5605 			goto setup_ring_cleanup;
5606 		} else {
5607 			for (j = 1; j < ncookies; j++) {
5608 				rv = ldc_mem_nextcookie(priv_addr->memhandle,
5609 				    &(priv_addr->memcookie[j]));
5610 				if (rv != 0) {
5611 					DERR(vswp, "%s: ldc_mem_nextcookie "
5612 					    "failed rv (%d)", name, rv);
5613 					goto setup_ring_cleanup;
5614 				}
5615 				D3(vswp, "%s: memcookie %d : addr 0x%llx : "
5616 				    "size 0x%llx", name, j,
5617 				    priv_addr->memcookie[j].addr,
5618 				    priv_addr->memcookie[j].size);
5619 			}
5620 
5621 		}
5622 		priv_addr->ncookies = ncookies;
5623 		priv_addr->dstate = VIO_DESC_FREE;
5624 
5625 		if (pub_addr != NULL) {
5626 
5627 			/* link pub and private sides */
5628 			priv_addr->descp = pub_addr;
5629 
5630 			pub_addr->ncookies = priv_addr->ncookies;
5631 
5632 			for (nc = 0; nc < pub_addr->ncookies; nc++) {
5633 				bcopy(&priv_addr->memcookie[nc],
5634 				    &pub_addr->memcookie[nc],
5635 				    sizeof (ldc_mem_cookie_t));
5636 			}
5637 
5638 			pub_addr->hdr.dstate = VIO_DESC_FREE;
5639 			pub_addr++;
5640 		}
5641 
5642 		/*
5643 		 * move to next element in the dring and the next
5644 		 * position in the data buffer.
5645 		 */
5646 		priv_addr++;
5647 		tmpp += offset;
5648 	}
5649 
5650 	return (0);
5651 
5652 setup_ring_cleanup:
5653 	priv_addr = dp->priv_addr;
5654 
5655 	for (j = 0; j < i; j++) {
5656 		(void) ldc_mem_unbind_handle(priv_addr->memhandle);
5657 		(void) ldc_mem_free_handle(priv_addr->memhandle);
5658 
5659 		mutex_destroy(&priv_addr->dstate_lock);
5660 
5661 		priv_addr++;
5662 	}
5663 	kmem_free(dp->data_addr, dp->data_sz);
5664 
5665 	return (1);
5666 }
5667 
5668 /*
5669  * Searches the private section of a ring for a free descriptor,
5670  * starting at the location of the last free descriptor found
5671  * previously.
5672  *
5673  * Returns 0 if free descriptor is available, and updates state
5674  * of private descriptor to VIO_DESC_READY,  otherwise returns 1.
5675  *
5676  * FUTURE: might need to return contiguous range of descriptors
5677  * as dring info msg assumes all will be contiguous.
5678  */
5679 static int
5680 vsw_dring_find_free_desc(dring_info_t *dringp,
5681 		vsw_private_desc_t **priv_p, int *idx)
5682 {
5683 	vsw_private_desc_t	*addr = NULL;
5684 	int			num = vsw_ntxds;
5685 	int			ret = 1;
5686 
5687 	D1(NULL, "%s enter\n", __func__);
5688 
5689 	ASSERT(dringp->priv_addr != NULL);
5690 
5691 	D2(NULL, "%s: searching ring, dringp 0x%llx : start pos %lld",
5692 	    __func__, dringp, dringp->end_idx);
5693 
5694 	addr = (vsw_private_desc_t *)dringp->priv_addr + dringp->end_idx;
5695 
5696 	mutex_enter(&addr->dstate_lock);
5697 	if (addr->dstate == VIO_DESC_FREE) {
5698 		addr->dstate = VIO_DESC_READY;
5699 		*priv_p = addr;
5700 		*idx = dringp->end_idx;
5701 		dringp->end_idx = (dringp->end_idx + 1) % num;
5702 		ret = 0;
5703 
5704 	}
5705 	mutex_exit(&addr->dstate_lock);
5706 
5707 	/* ring full */
5708 	if (ret == 1) {
5709 		D2(NULL, "%s: no desp free: started at %d", __func__,
5710 		    dringp->end_idx);
5711 	}
5712 
5713 	D1(NULL, "%s: exit\n", __func__);
5714 
5715 	return (ret);
5716 }
5717 
5718 /*
5719  * Map from a dring identifier to the ring itself. Returns
5720  * pointer to ring or NULL if no match found.
5721  *
5722  * Should be called with dlistrw rwlock held as reader.
5723  */
5724 static dring_info_t *
5725 vsw_ident2dring(lane_t *lane, uint64_t ident)
5726 {
5727 	dring_info_t	*dp = NULL;
5728 
5729 	if ((dp = lane->dringp) == NULL) {
5730 		return (NULL);
5731 	} else {
5732 		if (dp->ident == ident)
5733 			return (dp);
5734 
5735 		while (dp != NULL) {
5736 			if (dp->ident == ident)
5737 				break;
5738 			dp = dp->next;
5739 		}
5740 	}
5741 
5742 	return (dp);
5743 }
5744 
5745 /*
5746  * Set the default lane attributes. These are copied into
5747  * the attr msg we send to our peer. If they are not acceptable
5748  * then (currently) the handshake ends.
5749  */
5750 static void
5751 vsw_set_lane_attr(vsw_t *vswp, lane_t *lp)
5752 {
5753 	bzero(lp, sizeof (lane_t));
5754 
5755 	READ_ENTER(&vswp->if_lockrw);
5756 	ether_copy(&(vswp->if_addr), &(lp->addr));
5757 	RW_EXIT(&vswp->if_lockrw);
5758 
5759 	lp->mtu = vswp->max_frame_size;
5760 	lp->addr_type = ADDR_TYPE_MAC;
5761 	lp->xfer_mode = VIO_DRING_MODE_V1_0;
5762 	lp->ack_freq = 0;	/* for shared mode */
5763 	lp->seq_num = VNET_ISS;
5764 }
5765 
5766 /*
5767  * Verify that the attributes are acceptable.
5768  *
5769  * FUTURE: If some attributes are not acceptable, change them
5770  * our desired values.
5771  */
5772 static int
5773 vsw_check_attr(vnet_attr_msg_t *pkt, vsw_ldc_t *ldcp)
5774 {
5775 	int			ret = 0;
5776 	struct ether_addr	ea;
5777 	vsw_port_t		*port = ldcp->ldc_port;
5778 	lane_t			*lp = &ldcp->lane_out;
5779 
5780 	D1(NULL, "vsw_check_attr enter\n");
5781 
5782 	if ((pkt->xfer_mode != VIO_DESC_MODE) &&
5783 	    (pkt->xfer_mode != lp->xfer_mode)) {
5784 		D2(NULL, "vsw_check_attr: unknown mode %x\n", pkt->xfer_mode);
5785 		ret = 1;
5786 	}
5787 
5788 	/* Only support MAC addresses at moment. */
5789 	if ((pkt->addr_type != ADDR_TYPE_MAC) || (pkt->addr == 0)) {
5790 		D2(NULL, "vsw_check_attr: invalid addr_type %x, "
5791 		    "or address 0x%llx\n", pkt->addr_type, pkt->addr);
5792 		ret = 1;
5793 	}
5794 
5795 	/*
5796 	 * MAC address supplied by device should match that stored
5797 	 * in the vsw-port OBP node. Need to decide what to do if they
5798 	 * don't match, for the moment just warn but don't fail.
5799 	 */
5800 	vnet_macaddr_ultostr(pkt->addr, ea.ether_addr_octet);
5801 	if (ether_cmp(&ea, &port->p_macaddr) != 0) {
5802 		DERR(NULL, "vsw_check_attr: device supplied address "
5803 		    "0x%llx doesn't match node address 0x%llx\n",
5804 		    pkt->addr, port->p_macaddr);
5805 	}
5806 
5807 	/*
5808 	 * Ack freq only makes sense in pkt mode, in shared
5809 	 * mode the ring descriptors say whether or not to
5810 	 * send back an ACK.
5811 	 */
5812 	if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
5813 	    (pkt->xfer_mode & VIO_DRING_MODE_V1_2)) ||
5814 	    (VSW_VER_LT(ldcp, 1, 2) &&
5815 	    (pkt->xfer_mode == VIO_DRING_MODE_V1_0))) {
5816 		if (pkt->ack_freq > 0) {
5817 			D2(NULL, "vsw_check_attr: non zero ack freq "
5818 			    " in SHM mode\n");
5819 			ret = 1;
5820 		}
5821 	}
5822 
5823 	if (VSW_VER_LT(ldcp, 1, 4)) {
5824 		/* versions < 1.4, mtu must match */
5825 		if (pkt->mtu != lp->mtu) {
5826 			D2(NULL, "vsw_check_attr: invalid MTU (0x%llx)\n",
5827 			    pkt->mtu);
5828 			ret = 1;
5829 		}
5830 	} else {
5831 		/* Ver >= 1.4, validate mtu of the peer is at least ETHERMAX */
5832 		if (pkt->mtu < ETHERMAX) {
5833 			ret = 1;
5834 		}
5835 	}
5836 
5837 	D1(NULL, "vsw_check_attr exit\n");
5838 
5839 	return (ret);
5840 }
5841 
5842 /*
5843  * Returns 1 if there is a problem, 0 otherwise.
5844  */
5845 static int
5846 vsw_check_dring_info(vio_dring_reg_msg_t *pkt)
5847 {
5848 	_NOTE(ARGUNUSED(pkt))
5849 
5850 	int	ret = 0;
5851 
5852 	D1(NULL, "vsw_check_dring_info enter\n");
5853 
5854 	if ((pkt->num_descriptors == 0) ||
5855 	    (pkt->descriptor_size == 0) ||
5856 	    (pkt->ncookies != 1)) {
5857 		DERR(NULL, "vsw_check_dring_info: invalid dring msg");
5858 		ret = 1;
5859 	}
5860 
5861 	D1(NULL, "vsw_check_dring_info exit\n");
5862 
5863 	return (ret);
5864 }
5865 
5866 /*
5867  * Returns 1 if two memory cookies match. Otherwise returns 0.
5868  */
5869 static int
5870 vsw_mem_cookie_match(ldc_mem_cookie_t *m1, ldc_mem_cookie_t *m2)
5871 {
5872 	if ((m1->addr != m2->addr) ||
5873 	    (m2->size != m2->size)) {
5874 		return (0);
5875 	} else {
5876 		return (1);
5877 	}
5878 }
5879 
5880 /*
5881  * Returns 1 if ring described in reg message matches that
5882  * described by dring_info structure. Otherwise returns 0.
5883  */
5884 static int
5885 vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg)
5886 {
5887 	if ((msg->descriptor_size != dp->descriptor_size) ||
5888 	    (msg->num_descriptors != dp->num_descriptors) ||
5889 	    (msg->ncookies != dp->ncookies) ||
5890 	    !(vsw_mem_cookie_match(&msg->cookie[0], &dp->cookie[0]))) {
5891 		return (0);
5892 	} else {
5893 		return (1);
5894 	}
5895 
5896 }
5897 
5898 /*
5899  * Reset and free all the resources associated with
5900  * the channel.
5901  */
5902 static void
5903 vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir)
5904 {
5905 	dring_info_t		*dp, *dpp;
5906 	lane_t			*lp = NULL;
5907 
5908 	ASSERT(ldcp != NULL);
5909 
5910 	D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id);
5911 
5912 	if (dir == INBOUND) {
5913 		D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane"
5914 		    " of channel %lld", __func__, ldcp->ldc_id);
5915 		lp = &ldcp->lane_in;
5916 	} else {
5917 		D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane"
5918 		    " of channel %lld", __func__, ldcp->ldc_id);
5919 		lp = &ldcp->lane_out;
5920 	}
5921 
5922 	lp->lstate = VSW_LANE_INACTIV;
5923 	lp->seq_num = VNET_ISS;
5924 
5925 	if (lp->dringp) {
5926 		if (dir == INBOUND) {
5927 			WRITE_ENTER(&lp->dlistrw);
5928 			dp = lp->dringp;
5929 			while (dp != NULL) {
5930 				dpp = dp->next;
5931 				if (dp->handle != NULL)
5932 					(void) ldc_mem_dring_unmap(dp->handle);
5933 				kmem_free(dp, sizeof (dring_info_t));
5934 				dp = dpp;
5935 			}
5936 			RW_EXIT(&lp->dlistrw);
5937 		} else {
5938 			/*
5939 			 * unbind, destroy exported dring, free dring struct
5940 			 */
5941 			WRITE_ENTER(&lp->dlistrw);
5942 			dp = lp->dringp;
5943 			vsw_free_ring(dp);
5944 			RW_EXIT(&lp->dlistrw);
5945 		}
5946 		lp->dringp = NULL;
5947 	}
5948 
5949 	D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id);
5950 }
5951 
5952 /*
5953  * Free ring and all associated resources.
5954  *
5955  * Should be called with dlistrw rwlock held as writer.
5956  */
5957 static void
5958 vsw_free_ring(dring_info_t *dp)
5959 {
5960 	vsw_private_desc_t	*paddr = NULL;
5961 	dring_info_t		*dpp;
5962 	int			i;
5963 
5964 	while (dp != NULL) {
5965 		mutex_enter(&dp->dlock);
5966 		dpp = dp->next;
5967 		if (dp->priv_addr != NULL) {
5968 			/*
5969 			 * First unbind and free the memory handles
5970 			 * stored in each descriptor within the ring.
5971 			 */
5972 			for (i = 0; i < vsw_ntxds; i++) {
5973 				paddr = (vsw_private_desc_t *)
5974 				    dp->priv_addr + i;
5975 				if (paddr->memhandle != NULL) {
5976 					if (paddr->bound == 1) {
5977 						if (ldc_mem_unbind_handle(
5978 						    paddr->memhandle) != 0) {
5979 							DERR(NULL, "error "
5980 							"unbinding handle for "
5981 							"ring 0x%llx at pos %d",
5982 							    dp, i);
5983 							continue;
5984 						}
5985 						paddr->bound = 0;
5986 					}
5987 
5988 					if (ldc_mem_free_handle(
5989 					    paddr->memhandle) != 0) {
5990 						DERR(NULL, "error freeing "
5991 						    "handle for ring 0x%llx "
5992 						    "at pos %d", dp, i);
5993 						continue;
5994 					}
5995 					paddr->memhandle = NULL;
5996 				}
5997 				mutex_destroy(&paddr->dstate_lock);
5998 			}
5999 			kmem_free(dp->priv_addr,
6000 			    (sizeof (vsw_private_desc_t) * vsw_ntxds));
6001 		}
6002 
6003 		/*
6004 		 * Now unbind and destroy the ring itself.
6005 		 */
6006 		if (dp->handle != NULL) {
6007 			(void) ldc_mem_dring_unbind(dp->handle);
6008 			(void) ldc_mem_dring_destroy(dp->handle);
6009 		}
6010 
6011 		if (dp->data_addr != NULL) {
6012 			kmem_free(dp->data_addr, dp->data_sz);
6013 		}
6014 
6015 		mutex_exit(&dp->dlock);
6016 		mutex_destroy(&dp->dlock);
6017 		mutex_destroy(&dp->restart_lock);
6018 		kmem_free(dp, sizeof (dring_info_t));
6019 
6020 		dp = dpp;
6021 	}
6022 }
6023 
6024 /*
6025  * vsw_ldc_rx_worker -- A per LDC worker thread to receive data.
6026  * This thread is woken up by the LDC interrupt handler to process
6027  * LDC packets and receive data.
6028  */
6029 static void
6030 vsw_ldc_rx_worker(void *arg)
6031 {
6032 	callb_cpr_t	cprinfo;
6033 	vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
6034 	vsw_t *vswp = ldcp->ldc_vswp;
6035 
6036 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
6037 	CALLB_CPR_INIT(&cprinfo, &ldcp->rx_thr_lock, callb_generic_cpr,
6038 	    "vsw_rx_thread");
6039 	mutex_enter(&ldcp->rx_thr_lock);
6040 	while (!(ldcp->rx_thr_flags & VSW_WTHR_STOP)) {
6041 
6042 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
6043 		/*
6044 		 * Wait until the data is received or a stop
6045 		 * request is received.
6046 		 */
6047 		while (!(ldcp->rx_thr_flags &
6048 		    (VSW_WTHR_DATARCVD | VSW_WTHR_STOP))) {
6049 			cv_wait(&ldcp->rx_thr_cv, &ldcp->rx_thr_lock);
6050 		}
6051 		CALLB_CPR_SAFE_END(&cprinfo, &ldcp->rx_thr_lock)
6052 
6053 		/*
6054 		 * First process the stop request.
6055 		 */
6056 		if (ldcp->rx_thr_flags & VSW_WTHR_STOP) {
6057 			D2(vswp, "%s(%lld):Rx thread stopped\n",
6058 			    __func__, ldcp->ldc_id);
6059 			break;
6060 		}
6061 		ldcp->rx_thr_flags &= ~VSW_WTHR_DATARCVD;
6062 		mutex_exit(&ldcp->rx_thr_lock);
6063 		D1(vswp, "%s(%lld):calling vsw_process_pkt\n",
6064 		    __func__, ldcp->ldc_id);
6065 		mutex_enter(&ldcp->ldc_cblock);
6066 		vsw_process_pkt(ldcp);
6067 		mutex_exit(&ldcp->ldc_cblock);
6068 		mutex_enter(&ldcp->rx_thr_lock);
6069 	}
6070 
6071 	/*
6072 	 * Update the run status and wakeup the thread that
6073 	 * has sent the stop request.
6074 	 */
6075 	ldcp->rx_thr_flags &= ~VSW_WTHR_STOP;
6076 	ldcp->rx_thread = NULL;
6077 	CALLB_CPR_EXIT(&cprinfo);
6078 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
6079 	thread_exit();
6080 }
6081 
6082 /* vsw_stop_rx_thread -- Co-ordinate with receive thread to stop it */
6083 static void
6084 vsw_stop_rx_thread(vsw_ldc_t *ldcp)
6085 {
6086 	kt_did_t	tid = 0;
6087 	vsw_t		*vswp = ldcp->ldc_vswp;
6088 
6089 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
6090 	/*
6091 	 * Send a stop request by setting the stop flag and
6092 	 * wait until the receive thread stops.
6093 	 */
6094 	mutex_enter(&ldcp->rx_thr_lock);
6095 	if (ldcp->rx_thread != NULL) {
6096 		tid = ldcp->rx_thread->t_did;
6097 		ldcp->rx_thr_flags |= VSW_WTHR_STOP;
6098 		cv_signal(&ldcp->rx_thr_cv);
6099 	}
6100 	mutex_exit(&ldcp->rx_thr_lock);
6101 
6102 	if (tid != 0) {
6103 		thread_join(tid);
6104 	}
6105 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
6106 }
6107 
6108 /*
6109  * vsw_ldc_tx_worker -- A per LDC worker thread to transmit data.
6110  * This thread is woken up by the vsw_portsend to transmit
6111  * packets.
6112  */
6113 static void
6114 vsw_ldc_tx_worker(void *arg)
6115 {
6116 	callb_cpr_t	cprinfo;
6117 	vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
6118 	vsw_t *vswp = ldcp->ldc_vswp;
6119 	mblk_t *mp;
6120 	mblk_t *tmp;
6121 
6122 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
6123 	CALLB_CPR_INIT(&cprinfo, &ldcp->tx_thr_lock, callb_generic_cpr,
6124 	    "vnet_tx_thread");
6125 	mutex_enter(&ldcp->tx_thr_lock);
6126 	while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP)) {
6127 
6128 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
6129 		/*
6130 		 * Wait until the data is received or a stop
6131 		 * request is received.
6132 		 */
6133 		while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP) &&
6134 		    (ldcp->tx_mhead == NULL)) {
6135 			cv_wait(&ldcp->tx_thr_cv, &ldcp->tx_thr_lock);
6136 		}
6137 		CALLB_CPR_SAFE_END(&cprinfo, &ldcp->tx_thr_lock)
6138 
6139 		/*
6140 		 * First process the stop request.
6141 		 */
6142 		if (ldcp->tx_thr_flags & VSW_WTHR_STOP) {
6143 			D2(vswp, "%s(%lld):tx thread stopped\n",
6144 			    __func__, ldcp->ldc_id);
6145 			break;
6146 		}
6147 		mp = ldcp->tx_mhead;
6148 		ldcp->tx_mhead = ldcp->tx_mtail = NULL;
6149 		ldcp->tx_cnt = 0;
6150 		mutex_exit(&ldcp->tx_thr_lock);
6151 		D2(vswp, "%s(%lld):calling vsw_ldcsend\n",
6152 		    __func__, ldcp->ldc_id);
6153 		while (mp != NULL) {
6154 			tmp = mp->b_next;
6155 			mp->b_next = mp->b_prev = NULL;
6156 			(void) vsw_ldcsend(ldcp, mp, vsw_ldc_tx_retries);
6157 			mp = tmp;
6158 		}
6159 		mutex_enter(&ldcp->tx_thr_lock);
6160 	}
6161 
6162 	/*
6163 	 * Update the run status and wakeup the thread that
6164 	 * has sent the stop request.
6165 	 */
6166 	ldcp->tx_thr_flags &= ~VSW_WTHR_STOP;
6167 	ldcp->tx_thread = NULL;
6168 	CALLB_CPR_EXIT(&cprinfo);
6169 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
6170 	thread_exit();
6171 }
6172 
6173 /* vsw_stop_tx_thread -- Co-ordinate with receive thread to stop it */
6174 static void
6175 vsw_stop_tx_thread(vsw_ldc_t *ldcp)
6176 {
6177 	kt_did_t	tid = 0;
6178 	vsw_t		*vswp = ldcp->ldc_vswp;
6179 
6180 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
6181 	/*
6182 	 * Send a stop request by setting the stop flag and
6183 	 * wait until the receive thread stops.
6184 	 */
6185 	mutex_enter(&ldcp->tx_thr_lock);
6186 	if (ldcp->tx_thread != NULL) {
6187 		tid = ldcp->tx_thread->t_did;
6188 		ldcp->tx_thr_flags |= VSW_WTHR_STOP;
6189 		cv_signal(&ldcp->tx_thr_cv);
6190 	}
6191 	mutex_exit(&ldcp->tx_thr_lock);
6192 
6193 	if (tid != 0) {
6194 		thread_join(tid);
6195 	}
6196 
6197 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
6198 }
6199 
6200 /* vsw_reclaim_dring -- reclaim descriptors */
6201 static int
6202 vsw_reclaim_dring(dring_info_t *dp, int start)
6203 {
6204 	int i, j, len;
6205 	vsw_private_desc_t *priv_addr;
6206 	vnet_public_desc_t *pub_addr;
6207 
6208 	pub_addr = (vnet_public_desc_t *)dp->pub_addr;
6209 	priv_addr = (vsw_private_desc_t *)dp->priv_addr;
6210 	len = dp->num_descriptors;
6211 
6212 	D2(NULL, "%s: start index %ld\n", __func__, start);
6213 
6214 	j = 0;
6215 	for (i = start; j < len; i = (i + 1) % len, j++) {
6216 		pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
6217 		priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
6218 
6219 		mutex_enter(&priv_addr->dstate_lock);
6220 		if (pub_addr->hdr.dstate != VIO_DESC_DONE) {
6221 			mutex_exit(&priv_addr->dstate_lock);
6222 			break;
6223 		}
6224 		pub_addr->hdr.dstate = VIO_DESC_FREE;
6225 		priv_addr->dstate = VIO_DESC_FREE;
6226 		/* clear all the fields */
6227 		priv_addr->datalen = 0;
6228 		pub_addr->hdr.ack = 0;
6229 		mutex_exit(&priv_addr->dstate_lock);
6230 
6231 		D3(NULL, "claiming descp:%d pub state:0x%llx priv state 0x%llx",
6232 		    i, pub_addr->hdr.dstate, priv_addr->dstate);
6233 	}
6234 	return (j);
6235 }
6236 
6237 /*
6238  * Debugging routines
6239  */
6240 static void
6241 display_state(void)
6242 {
6243 	vsw_t		*vswp;
6244 	vsw_port_list_t	*plist;
6245 	vsw_port_t 	*port;
6246 	vsw_ldc_list_t	*ldcl;
6247 	vsw_ldc_t 	*ldcp;
6248 	extern vsw_t 	*vsw_head;
6249 
6250 	cmn_err(CE_NOTE, "***** system state *****");
6251 
6252 	for (vswp = vsw_head; vswp; vswp = vswp->next) {
6253 		plist = &vswp->plist;
6254 		READ_ENTER(&plist->lockrw);
6255 		cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n",
6256 		    vswp->instance, plist->num_ports);
6257 
6258 		for (port = plist->head; port != NULL; port = port->p_next) {
6259 			ldcl = &port->p_ldclist;
6260 			cmn_err(CE_CONT, "port %d : %d ldcs attached\n",
6261 			    port->p_instance, port->num_ldcs);
6262 			READ_ENTER(&ldcl->lockrw);
6263 			ldcp = ldcl->head;
6264 			for (; ldcp != NULL; ldcp = ldcp->ldc_next) {
6265 				cmn_err(CE_CONT, "chan %lu : dev %d : "
6266 				    "status %d : phase %u\n",
6267 				    ldcp->ldc_id, ldcp->dev_class,
6268 				    ldcp->ldc_status, ldcp->hphase);
6269 				cmn_err(CE_CONT, "chan %lu : lsession %lu : "
6270 				    "psession %lu\n", ldcp->ldc_id,
6271 				    ldcp->local_session, ldcp->peer_session);
6272 
6273 				cmn_err(CE_CONT, "Inbound lane:\n");
6274 				display_lane(&ldcp->lane_in);
6275 				cmn_err(CE_CONT, "Outbound lane:\n");
6276 				display_lane(&ldcp->lane_out);
6277 			}
6278 			RW_EXIT(&ldcl->lockrw);
6279 		}
6280 		RW_EXIT(&plist->lockrw);
6281 	}
6282 	cmn_err(CE_NOTE, "***** system state *****");
6283 }
6284 
6285 static void
6286 display_lane(lane_t *lp)
6287 {
6288 	dring_info_t	*drp;
6289 
6290 	cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n",
6291 	    lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu);
6292 	cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n",
6293 	    lp->addr_type, lp->addr, lp->xfer_mode);
6294 	cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp);
6295 
6296 	cmn_err(CE_CONT, "Dring info:\n");
6297 	for (drp = lp->dringp; drp != NULL; drp = drp->next) {
6298 		cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n",
6299 		    drp->num_descriptors, drp->descriptor_size);
6300 		cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->handle);
6301 		cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n",
6302 		    (uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr);
6303 		cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n",
6304 		    drp->ident, drp->end_idx);
6305 		display_ring(drp);
6306 	}
6307 }
6308 
6309 static void
6310 display_ring(dring_info_t *dringp)
6311 {
6312 	uint64_t		i;
6313 	uint64_t		priv_count = 0;
6314 	uint64_t		pub_count = 0;
6315 	vnet_public_desc_t	*pub_addr = NULL;
6316 	vsw_private_desc_t	*priv_addr = NULL;
6317 
6318 	for (i = 0; i < vsw_ntxds; i++) {
6319 		if (dringp->pub_addr != NULL) {
6320 			pub_addr = (vnet_public_desc_t *)dringp->pub_addr + i;
6321 
6322 			if (pub_addr->hdr.dstate == VIO_DESC_FREE)
6323 				pub_count++;
6324 		}
6325 
6326 		if (dringp->priv_addr != NULL) {
6327 			priv_addr = (vsw_private_desc_t *)dringp->priv_addr + i;
6328 
6329 			if (priv_addr->dstate == VIO_DESC_FREE)
6330 				priv_count++;
6331 		}
6332 	}
6333 	cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n",
6334 	    i, priv_count, pub_count);
6335 }
6336 
6337 static void
6338 dump_flags(uint64_t state)
6339 {
6340 	int	i;
6341 
6342 	typedef struct flag_name {
6343 		int	flag_val;
6344 		char	*flag_name;
6345 	} flag_name_t;
6346 
6347 	flag_name_t	flags[] = {
6348 		VSW_VER_INFO_SENT, "VSW_VER_INFO_SENT",
6349 		VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV",
6350 		VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV",
6351 		VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT",
6352 		VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV",
6353 		VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT",
6354 		VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT",
6355 		VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV",
6356 		VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT",
6357 		VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV",
6358 		VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT",
6359 		VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV",
6360 		VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT",
6361 		VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV",
6362 		VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT",
6363 		VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV",
6364 		VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT",
6365 		VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV",
6366 		VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT",
6367 		VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV",
6368 		VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT",
6369 		VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV",
6370 		VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT",
6371 		VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV",
6372 		VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT",
6373 		VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV",
6374 		VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT",
6375 		VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV",
6376 		VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT",
6377 		VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV",
6378 		VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE"};
6379 
6380 	DERR(NULL, "DUMP_FLAGS: %llx\n", state);
6381 	for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) {
6382 		if (state & flags[i].flag_val)
6383 			DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name);
6384 	}
6385 }
6386