xref: /titanic_41/usr/src/uts/sun4v/io/vsw_ldc.c (revision 60c45ed01d4f99571d468c42f609d11a099fab1e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/errno.h>
31 #include <sys/debug.h>
32 #include <sys/time.h>
33 #include <sys/sysmacros.h>
34 #include <sys/systm.h>
35 #include <sys/user.h>
36 #include <sys/stropts.h>
37 #include <sys/stream.h>
38 #include <sys/strlog.h>
39 #include <sys/strsubr.h>
40 #include <sys/cmn_err.h>
41 #include <sys/cpu.h>
42 #include <sys/kmem.h>
43 #include <sys/conf.h>
44 #include <sys/ddi.h>
45 #include <sys/sunddi.h>
46 #include <sys/ksynch.h>
47 #include <sys/stat.h>
48 #include <sys/kstat.h>
49 #include <sys/vtrace.h>
50 #include <sys/strsun.h>
51 #include <sys/dlpi.h>
52 #include <sys/ethernet.h>
53 #include <net/if.h>
54 #include <sys/varargs.h>
55 #include <sys/machsystm.h>
56 #include <sys/modctl.h>
57 #include <sys/modhash.h>
58 #include <sys/mac.h>
59 #include <sys/mac_ether.h>
60 #include <sys/taskq.h>
61 #include <sys/note.h>
62 #include <sys/mach_descrip.h>
63 #include <sys/mac.h>
64 #include <sys/mdeg.h>
65 #include <sys/ldc.h>
66 #include <sys/vsw_fdb.h>
67 #include <sys/vsw.h>
68 #include <sys/vio_mailbox.h>
69 #include <sys/vnet_mailbox.h>
70 #include <sys/vnet_common.h>
71 #include <sys/vio_util.h>
72 #include <sys/sdt.h>
73 #include <sys/atomic.h>
74 #include <sys/callb.h>
75 #include <sys/vlan.h>
76 
77 /* Port add/deletion/etc routines */
78 static	int vsw_port_delete(vsw_port_t *port);
79 static	int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id);
80 static	int vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id);
81 static	int vsw_init_ldcs(vsw_port_t *port);
82 static	int vsw_uninit_ldcs(vsw_port_t *port);
83 static	int vsw_ldc_init(vsw_ldc_t *ldcp);
84 static	int vsw_ldc_uninit(vsw_ldc_t *ldcp);
85 static	int vsw_drain_ldcs(vsw_port_t *port);
86 static	int vsw_drain_port_taskq(vsw_port_t *port);
87 static	void vsw_marker_task(void *);
88 static	int vsw_plist_del_node(vsw_t *, vsw_port_t *port);
89 int vsw_detach_ports(vsw_t *vswp);
90 int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
91 mcst_addr_t *vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr);
92 int vsw_port_detach(vsw_t *vswp, int p_instance);
93 int vsw_portsend(vsw_port_t *port, mblk_t *mp, mblk_t *mpt, uint32_t count);
94 int vsw_port_attach(vsw_port_t *portp);
95 vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);
96 void vsw_vlan_unaware_port_reset(vsw_port_t *portp);
97 
98 /* Interrupt routines */
99 static	uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg);
100 
101 /* Handshake routines */
102 static	void vsw_ldc_reinit(vsw_ldc_t *);
103 static	void vsw_process_conn_evt(vsw_ldc_t *, uint16_t);
104 static	void vsw_conn_task(void *);
105 static	int vsw_check_flag(vsw_ldc_t *, int, uint64_t);
106 static	void vsw_next_milestone(vsw_ldc_t *);
107 static	int vsw_supported_version(vio_ver_msg_t *);
108 static	void vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp);
109 static	void vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp);
110 
111 /* Data processing routines */
112 static void vsw_process_pkt(void *);
113 static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t *);
114 static void vsw_process_ctrl_pkt(void *);
115 static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *);
116 static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *);
117 static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *);
118 static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *);
119 static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *);
120 static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *);
121 static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *,
122 	uint32_t);
123 static void vsw_process_data_dring_pkt(vsw_ldc_t *, void *);
124 static void vsw_process_pkt_data_nop(void *, void *, uint32_t);
125 static void vsw_process_pkt_data(void *, void *, uint32_t);
126 static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *);
127 static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *);
128 
129 /* Switching/data transmit routines */
130 static	int vsw_dringsend(vsw_ldc_t *, mblk_t *);
131 static	int vsw_descrsend(vsw_ldc_t *, mblk_t *);
132 static void vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp);
133 static int vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries);
134 static int vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count);
135 static int vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count);
136 
137 /* Packet creation routines */
138 static void vsw_send_ver(void *);
139 static void vsw_send_attr(vsw_ldc_t *);
140 static vio_dring_reg_msg_t *vsw_create_dring_info_pkt(vsw_ldc_t *);
141 static void vsw_send_dring_info(vsw_ldc_t *);
142 static void vsw_send_rdx(vsw_ldc_t *);
143 static int vsw_send_msg(vsw_ldc_t *, void *, int, boolean_t);
144 
145 /* Dring routines */
146 static dring_info_t *vsw_create_dring(vsw_ldc_t *);
147 static void vsw_create_privring(vsw_ldc_t *);
148 static int vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp);
149 static int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **,
150     int *);
151 static dring_info_t *vsw_ident2dring(lane_t *, uint64_t);
152 static int vsw_reclaim_dring(dring_info_t *dp, int start);
153 
154 static void vsw_set_lane_attr(vsw_t *, lane_t *);
155 static int vsw_check_attr(vnet_attr_msg_t *, vsw_ldc_t *);
156 static int vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg);
157 static int vsw_mem_cookie_match(ldc_mem_cookie_t *, ldc_mem_cookie_t *);
158 static int vsw_check_dring_info(vio_dring_reg_msg_t *);
159 
160 /* Rcv/Tx thread routines */
161 static void vsw_stop_tx_thread(vsw_ldc_t *ldcp);
162 static void vsw_ldc_tx_worker(void *arg);
163 static void vsw_stop_rx_thread(vsw_ldc_t *ldcp);
164 static void vsw_ldc_rx_worker(void *arg);
165 
166 /* Misc support routines */
167 static	caddr_t vsw_print_ethaddr(uint8_t *addr, char *ebuf);
168 static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t);
169 static int vsw_free_ring(dring_info_t *);
170 static void vsw_save_lmacaddr(vsw_t *vswp, uint64_t macaddr);
171 static int vsw_get_same_dest_list(struct ether_header *ehp,
172     mblk_t **rhead, mblk_t **rtail, mblk_t **mpp);
173 static mblk_t *vsw_dupmsgchain(mblk_t *mp);
174 
175 /* Debugging routines */
176 static void dump_flags(uint64_t);
177 static void display_state(void);
178 static void display_lane(lane_t *);
179 static void display_ring(dring_info_t *);
180 
181 /*
182  * Functions imported from other files.
183  */
184 extern int vsw_set_hw(vsw_t *, vsw_port_t *, int);
185 extern int vsw_unset_hw(vsw_t *, vsw_port_t *, int);
186 extern void vsw_reconfig_hw(vsw_t *);
187 extern int vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port);
188 extern void vsw_del_mcst_port(vsw_port_t *port);
189 extern int vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
190 extern int vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
191 extern void vsw_fdbe_add(vsw_t *vswp, void *port);
192 extern void vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr);
193 extern void vsw_create_vlans(void *arg, int type);
194 extern void vsw_destroy_vlans(void *arg, int type);
195 extern void vsw_vlan_add_ids(void *arg, int type);
196 extern void vsw_vlan_remove_ids(void *arg, int type);
197 extern boolean_t vsw_frame_lookup_vid(void *arg, int caller,
198 	struct ether_header *ehp, uint16_t *vidp);
199 extern mblk_t *vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp);
200 extern uint32_t vsw_vlan_frame_untag(void *arg, int type, mblk_t **np,
201 	mblk_t **npt);
202 extern boolean_t vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid);
203 
204 #define	VSW_NUM_VMPOOLS		3	/* number of vio mblk pools */
205 
206 /*
207  * Tunables used in this file.
208  */
209 extern int vsw_num_handshakes;
210 extern int vsw_wretries;
211 extern int vsw_desc_delay;
212 extern int vsw_read_attempts;
213 extern int vsw_ldc_tx_delay;
214 extern int vsw_ldc_tx_retries;
215 extern boolean_t vsw_ldc_rxthr_enabled;
216 extern boolean_t vsw_ldc_txthr_enabled;
217 extern uint32_t vsw_ntxds;
218 extern uint32_t vsw_max_tx_qcount;
219 extern uint32_t vsw_chain_len;
220 extern uint32_t vsw_mblk_size1;
221 extern uint32_t vsw_mblk_size2;
222 extern uint32_t vsw_mblk_size3;
223 extern uint32_t vsw_num_mblks1;
224 extern uint32_t vsw_num_mblks2;
225 extern uint32_t vsw_num_mblks3;
226 extern boolean_t vsw_obp_ver_proto_workaround;
227 
/*
 * Acquire (LDC_ENTER_LOCK) or release (LDC_EXIT_LOCK) all three per-channel
 * locks. They are always taken in the order callback lock, receive lock,
 * transmit lock and dropped in the reverse order.
 *
 * Each macro is wrapped in do { } while (0) so that it expands to a single
 * statement and therefore remains correct when used as the body of an
 * unbraced if/else. Invoke them with a trailing semicolon.
 */
#define	LDC_ENTER_LOCK(ldcp)	do {				\
	mutex_enter(&((ldcp)->ldc_cblock));			\
	mutex_enter(&((ldcp)->ldc_rxlock));			\
	mutex_enter(&((ldcp)->ldc_txlock));			\
	_NOTE(CONSTCOND)					\
} while (0)

#define	LDC_EXIT_LOCK(ldcp)	do {				\
	mutex_exit(&((ldcp)->ldc_txlock));			\
	mutex_exit(&((ldcp)->ldc_rxlock));			\
	mutex_exit(&((ldcp)->ldc_cblock));			\
	_NOTE(CONSTCOND)					\
} while (0)
236 
/*
 * Version comparison helpers. Each one compares the ver_major/ver_minor
 * pair recorded in the channel's outbound lane (lane_out) against the
 * supplied major/minor pair and evaluates to non-zero when the relation
 * holds:
 *
 *	VSW_VER_EQ	lane version == major.minor
 *	VSW_VER_LT	lane version <  major.minor
 *	VSW_VER_GTEQ	lane version >= major.minor
 *
 * The minor number only decides the result when the major numbers match.
 */
#define	VSW_VER_EQ(ldcp, major, minor)				\
	(((ldcp)->lane_out.ver_major == (major)) &&		\
	((ldcp)->lane_out.ver_minor == (minor)))

#define	VSW_VER_LT(ldcp, major, minor)				\
	(((ldcp)->lane_out.ver_major == (major)) ?		\
	((ldcp)->lane_out.ver_minor < (minor)) :		\
	((ldcp)->lane_out.ver_major < (major)))

#define	VSW_VER_GTEQ(ldcp, major, minor)			\
	(((ldcp)->lane_out.ver_major == (major)) ?		\
	((ldcp)->lane_out.ver_minor >= (minor)) :		\
	((ldcp)->lane_out.ver_major > (major)))
250 
251 /* supported versions */
252 static	ver_sup_t	vsw_versions[] = { {1, 3} };
253 
/*
 * For the moment the state dump routines have their own
 * private flag. Set DUMP_STATE to a non-zero value to compile the
 * dump macros in; with the default of 0 they all expand to nothing.
 *
 * When enabled, every macro expands to exactly one statement (the
 * multi-statement ones are wrapped in do { } while (0) and none carries
 * its own trailing semicolon), so each must be invoked with a trailing
 * semicolon and is safe inside an unbraced if/else.
 */
#define	DUMP_STATE	0

#if DUMP_STATE

/* Dump the type, subtype and subtype_env fields of a tag (by value). */
#define	DUMP_TAG(tag) do {						\
	D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype);		\
	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype);		\
	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env);	\
	_NOTE(CONSTCOND)						\
} while (0)

/* As DUMP_TAG(), but takes a pointer to the tag. */
#define	DUMP_TAG_PTR(tag) do {						\
	D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype);		\
	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype);	\
	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env);	\
	_NOTE(CONSTCOND)						\
} while (0)

#define	DUMP_FLAGS(flags) dump_flags(flags)
#define	DISPLAY_STATE()	display_state()

#else

#define	DUMP_TAG(tag)
#define	DUMP_TAG_PTR(tag)
#define	DUMP_FLAGS(state)
#define	DISPLAY_STATE()

#endif	/* DUMP_STATE */
287 
288 /*
289  * Attach the specified port.
290  *
291  * Returns 0 on success, 1 on failure.
292  */
293 int
294 vsw_port_attach(vsw_port_t *port)
295 {
296 	vsw_t			*vswp = port->p_vswp;
297 	vsw_port_list_t		*plist = &vswp->plist;
298 	vsw_port_t		*p, **pp;
299 	int			i;
300 	int			nids = port->num_ldcs;
301 	uint64_t		*ldcids;
302 
303 	D1(vswp, "%s: enter : port %d", __func__, port->p_instance);
304 
305 	/* port already exists? */
306 	READ_ENTER(&plist->lockrw);
307 	for (p = plist->head; p != NULL; p = p->p_next) {
308 		if (p->p_instance == port->p_instance) {
309 			DWARN(vswp, "%s: port instance %d already attached",
310 			    __func__, p->p_instance);
311 			RW_EXIT(&plist->lockrw);
312 			return (1);
313 		}
314 	}
315 	RW_EXIT(&plist->lockrw);
316 
317 	rw_init(&port->p_ldclist.lockrw, NULL, RW_DRIVER, NULL);
318 
319 	mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL);
320 	mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL);
321 
322 	mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL);
323 	cv_init(&port->state_cv, NULL, CV_DRIVER, NULL);
324 	port->state = VSW_PORT_INIT;
325 
326 	D2(vswp, "%s: %d nids", __func__, nids);
327 	ldcids = port->ldc_ids;
328 	for (i = 0; i < nids; i++) {
329 		D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[i]);
330 		if (vsw_ldc_attach(port, (uint64_t)ldcids[i]) != 0) {
331 			DERR(vswp, "%s: ldc_attach failed", __func__);
332 
333 			rw_destroy(&port->p_ldclist.lockrw);
334 
335 			cv_destroy(&port->state_cv);
336 			mutex_destroy(&port->state_lock);
337 
338 			mutex_destroy(&port->tx_lock);
339 			mutex_destroy(&port->mca_lock);
340 			kmem_free(port, sizeof (vsw_port_t));
341 			return (1);
342 		}
343 	}
344 
345 	if (vswp->switching_setup_done == B_TRUE) {
346 		/*
347 		 * If the underlying physical device has been setup,
348 		 * program the mac address of this port in it.
349 		 * Otherwise, port macaddr will be set after the physical
350 		 * device is successfully setup by the timeout handler.
351 		 */
352 		mutex_enter(&vswp->hw_lock);
353 		(void) vsw_set_hw(vswp, port, VSW_VNETPORT);
354 		mutex_exit(&vswp->hw_lock);
355 	}
356 
357 	/* create the fdb entry for this port/mac address */
358 	vsw_fdbe_add(vswp, port);
359 
360 	vsw_create_vlans(port, VSW_VNETPORT);
361 
362 	WRITE_ENTER(&plist->lockrw);
363 
364 	/* link it into the list of ports for this vsw instance */
365 	pp = (vsw_port_t **)(&plist->head);
366 	port->p_next = *pp;
367 	*pp = port;
368 	plist->num_ports++;
369 
370 	RW_EXIT(&plist->lockrw);
371 
372 	/*
373 	 * Initialise the port and any ldc's under it.
374 	 */
375 	(void) vsw_init_ldcs(port);
376 
377 	D1(vswp, "%s: exit", __func__);
378 	return (0);
379 }
380 
381 /*
382  * Detach the specified port.
383  *
384  * Returns 0 on success, 1 on failure.
385  */
386 int
387 vsw_port_detach(vsw_t *vswp, int p_instance)
388 {
389 	vsw_port_t	*port = NULL;
390 	vsw_port_list_t	*plist = &vswp->plist;
391 
392 	D1(vswp, "%s: enter: port id %d", __func__, p_instance);
393 
394 	WRITE_ENTER(&plist->lockrw);
395 
396 	if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) {
397 		RW_EXIT(&plist->lockrw);
398 		return (1);
399 	}
400 
401 	if (vsw_plist_del_node(vswp, port)) {
402 		RW_EXIT(&plist->lockrw);
403 		return (1);
404 	}
405 
406 	/*
407 	 * No longer need to hold writer lock on port list now
408 	 * that we have unlinked the target port from the list.
409 	 */
410 	RW_EXIT(&plist->lockrw);
411 
412 	/* Remove the fdb entry for this port/mac address */
413 	vsw_fdbe_del(vswp, &(port->p_macaddr));
414 	vsw_destroy_vlans(port, VSW_VNETPORT);
415 
416 	/* Remove any multicast addresses.. */
417 	vsw_del_mcst_port(port);
418 
419 	/* Remove address if was programmed into HW. */
420 	mutex_enter(&vswp->hw_lock);
421 
422 	/*
423 	 * Port's address may not have been set in hardware. This could
424 	 * happen if the underlying physical device is not yet available and
425 	 * vsw_setup_switching_timeout() may be in progress.
426 	 * We remove its addr from hardware only if it has been set before.
427 	 */
428 	if (port->addr_set != VSW_ADDR_UNSET)
429 		(void) vsw_unset_hw(vswp, port, VSW_VNETPORT);
430 
431 	if (vswp->recfg_reqd)
432 		vsw_reconfig_hw(vswp);
433 
434 	mutex_exit(&vswp->hw_lock);
435 
436 	if (vsw_port_delete(port)) {
437 		return (1);
438 	}
439 
440 	D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance);
441 	return (0);
442 }
443 
444 /*
445  * Detach all active ports.
446  *
447  * Returns 0 on success, 1 on failure.
448  */
449 int
450 vsw_detach_ports(vsw_t *vswp)
451 {
452 	vsw_port_list_t 	*plist = &vswp->plist;
453 	vsw_port_t		*port = NULL;
454 
455 	D1(vswp, "%s: enter", __func__);
456 
457 	WRITE_ENTER(&plist->lockrw);
458 
459 	while ((port = plist->head) != NULL) {
460 		if (vsw_plist_del_node(vswp, port)) {
461 			DERR(vswp, "%s: Error deleting port %d"
462 			    " from port list", __func__, port->p_instance);
463 			RW_EXIT(&plist->lockrw);
464 			return (1);
465 		}
466 
467 		/* Remove address if was programmed into HW. */
468 		mutex_enter(&vswp->hw_lock);
469 		(void) vsw_unset_hw(vswp, port, VSW_VNETPORT);
470 		mutex_exit(&vswp->hw_lock);
471 
472 		/* Remove the fdb entry for this port/mac address */
473 		vsw_fdbe_del(vswp, &(port->p_macaddr));
474 		vsw_destroy_vlans(port, VSW_VNETPORT);
475 
476 		/* Remove any multicast addresses.. */
477 		vsw_del_mcst_port(port);
478 
479 		/*
480 		 * No longer need to hold the lock on the port list
481 		 * now that we have unlinked the target port from the
482 		 * list.
483 		 */
484 		RW_EXIT(&plist->lockrw);
485 		if (vsw_port_delete(port)) {
486 			DERR(vswp, "%s: Error deleting port %d",
487 			    __func__, port->p_instance);
488 			return (1);
489 		}
490 		WRITE_ENTER(&plist->lockrw);
491 	}
492 	RW_EXIT(&plist->lockrw);
493 
494 	D1(vswp, "%s: exit", __func__);
495 
496 	return (0);
497 }
498 
499 /*
500  * Delete the specified port.
501  *
502  * Returns 0 on success, 1 on failure.
503  */
504 static int
505 vsw_port_delete(vsw_port_t *port)
506 {
507 	vsw_ldc_list_t 		*ldcl;
508 	vsw_t			*vswp = port->p_vswp;
509 	int			num_ldcs;
510 
511 	D1(vswp, "%s: enter : port id %d", __func__, port->p_instance);
512 
513 	(void) vsw_uninit_ldcs(port);
514 
515 	/*
516 	 * Wait for any pending ctrl msg tasks which reference this
517 	 * port to finish.
518 	 */
519 	if (vsw_drain_port_taskq(port))
520 		return (1);
521 
522 	/*
523 	 * Wait for any active callbacks to finish
524 	 */
525 	if (vsw_drain_ldcs(port))
526 		return (1);
527 
528 	ldcl = &port->p_ldclist;
529 	num_ldcs = port->num_ldcs;
530 	WRITE_ENTER(&ldcl->lockrw);
531 	while (num_ldcs > 0) {
532 		if (vsw_ldc_detach(port, ldcl->head->ldc_id) != 0) {
533 			cmn_err(CE_WARN, "!vsw%d: unable to detach ldc %ld",
534 			    vswp->instance, ldcl->head->ldc_id);
535 			RW_EXIT(&ldcl->lockrw);
536 			port->num_ldcs = num_ldcs;
537 			return (1);
538 		}
539 		num_ldcs--;
540 	}
541 	RW_EXIT(&ldcl->lockrw);
542 
543 	rw_destroy(&port->p_ldclist.lockrw);
544 
545 	mutex_destroy(&port->mca_lock);
546 	mutex_destroy(&port->tx_lock);
547 
548 	cv_destroy(&port->state_cv);
549 	mutex_destroy(&port->state_lock);
550 
551 	if (port->num_ldcs != 0) {
552 		kmem_free(port->ldc_ids, port->num_ldcs * sizeof (uint64_t));
553 		port->num_ldcs = 0;
554 	}
555 	kmem_free(port, sizeof (vsw_port_t));
556 
557 	D1(vswp, "%s: exit", __func__);
558 
559 	return (0);
560 }
561 
562 /*
563  * Attach a logical domain channel (ldc) under a specified port.
564  *
565  * Returns 0 on success, 1 on failure.
566  */
567 static int
568 vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id)
569 {
570 	vsw_t 		*vswp = port->p_vswp;
571 	vsw_ldc_list_t *ldcl = &port->p_ldclist;
572 	vsw_ldc_t 	*ldcp = NULL;
573 	ldc_attr_t 	attr;
574 	ldc_status_t	istatus;
575 	int 		status = DDI_FAILURE;
576 	int		rv;
577 	char		kname[MAXNAMELEN];
578 	enum		{ PROG_init = 0x0, PROG_mblks = 0x1,
579 			    PROG_callback = 0x2, PROG_rx_thread = 0x4,
580 			    PROG_tx_thread = 0x8}
581 			progress;
582 
583 	progress = PROG_init;
584 
585 	D1(vswp, "%s: enter", __func__);
586 
587 	ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP);
588 	if (ldcp == NULL) {
589 		DERR(vswp, "%s: kmem_zalloc failed", __func__);
590 		return (1);
591 	}
592 	ldcp->ldc_id = ldc_id;
593 
594 	/* Allocate pools of receive mblks */
595 	rv = vio_init_multipools(&ldcp->vmp, VSW_NUM_VMPOOLS,
596 	    vsw_mblk_size1, vsw_mblk_size2, vsw_mblk_size3,
597 	    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3);
598 	if (rv) {
599 		DWARN(vswp, "%s: unable to create free mblk pools for"
600 		    " channel %ld (rv %d)", __func__, ldc_id, rv);
601 		kmem_free(ldcp, sizeof (vsw_ldc_t));
602 		return (1);
603 	}
604 
605 	progress |= PROG_mblks;
606 
607 	mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL);
608 	mutex_init(&ldcp->ldc_rxlock, NULL, MUTEX_DRIVER, NULL);
609 	mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL);
610 	mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL);
611 	cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL);
612 	rw_init(&ldcp->lane_in.dlistrw, NULL, RW_DRIVER, NULL);
613 	rw_init(&ldcp->lane_out.dlistrw, NULL, RW_DRIVER, NULL);
614 
615 	/* required for handshake with peer */
616 	ldcp->local_session = (uint64_t)ddi_get_lbolt();
617 	ldcp->peer_session = 0;
618 	ldcp->session_status = 0;
619 	ldcp->hss_id = 1;	/* Initial handshake session id */
620 
621 	/* only set for outbound lane, inbound set by peer */
622 	vsw_set_lane_attr(vswp, &ldcp->lane_out);
623 
624 	attr.devclass = LDC_DEV_NT_SVC;
625 	attr.instance = ddi_get_instance(vswp->dip);
626 	attr.mode = LDC_MODE_UNRELIABLE;
627 	attr.mtu = VSW_LDC_MTU;
628 	status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle);
629 	if (status != 0) {
630 		DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)",
631 		    __func__, ldc_id, status);
632 		goto ldc_attach_fail;
633 	}
634 
635 	if (vsw_ldc_rxthr_enabled) {
636 		ldcp->rx_thr_flags = 0;
637 
638 		mutex_init(&ldcp->rx_thr_lock, NULL, MUTEX_DRIVER, NULL);
639 		cv_init(&ldcp->rx_thr_cv, NULL, CV_DRIVER, NULL);
640 		ldcp->rx_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
641 		    vsw_ldc_rx_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
642 
643 		progress |= PROG_rx_thread;
644 		if (ldcp->rx_thread == NULL) {
645 			DWARN(vswp, "%s(%lld): Failed to create worker thread",
646 			    __func__, ldc_id);
647 			goto ldc_attach_fail;
648 		}
649 	}
650 
651 	if (vsw_ldc_txthr_enabled) {
652 		ldcp->tx_thr_flags = 0;
653 		ldcp->tx_mhead = ldcp->tx_mtail = NULL;
654 
655 		mutex_init(&ldcp->tx_thr_lock, NULL, MUTEX_DRIVER, NULL);
656 		cv_init(&ldcp->tx_thr_cv, NULL, CV_DRIVER, NULL);
657 		ldcp->tx_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
658 		    vsw_ldc_tx_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
659 
660 		progress |= PROG_tx_thread;
661 		if (ldcp->tx_thread == NULL) {
662 			DWARN(vswp, "%s(%lld): Failed to create worker thread",
663 			    __func__, ldc_id);
664 			goto ldc_attach_fail;
665 		}
666 	}
667 
668 	status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp);
669 	if (status != 0) {
670 		DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)",
671 		    __func__, ldc_id, status);
672 		(void) ldc_fini(ldcp->ldc_handle);
673 		goto ldc_attach_fail;
674 	}
675 	/*
676 	 * allocate a message for ldc_read()s, big enough to hold ctrl and
677 	 * data msgs, including raw data msgs used to recv priority frames.
678 	 */
679 	ldcp->msglen = VIO_PKT_DATA_HDRSIZE + vswp->max_frame_size;
680 	ldcp->ldcmsg = kmem_alloc(ldcp->msglen, KM_SLEEP);
681 
682 	progress |= PROG_callback;
683 
684 	mutex_init(&ldcp->status_lock, NULL, MUTEX_DRIVER, NULL);
685 
686 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
687 		DERR(vswp, "%s: ldc_status failed", __func__);
688 		mutex_destroy(&ldcp->status_lock);
689 		goto ldc_attach_fail;
690 	}
691 
692 	ldcp->ldc_status = istatus;
693 	ldcp->ldc_port = port;
694 	ldcp->ldc_vswp = vswp;
695 
696 	vsw_reset_vnet_proto_ops(ldcp);
697 
698 	(void) sprintf(kname, "%sldc0x%lx", DRV_NAME, ldcp->ldc_id);
699 	ldcp->ksp = vgen_setup_kstats(DRV_NAME, vswp->instance,
700 	    kname, &ldcp->ldc_stats);
701 	if (ldcp->ksp == NULL) {
702 		DERR(vswp, "%s: kstats setup failed", __func__);
703 		goto ldc_attach_fail;
704 	}
705 
706 	/* link it into the list of channels for this port */
707 	WRITE_ENTER(&ldcl->lockrw);
708 	ldcp->ldc_next = ldcl->head;
709 	ldcl->head = ldcp;
710 	RW_EXIT(&ldcl->lockrw);
711 
712 	D1(vswp, "%s: exit", __func__);
713 	return (0);
714 
715 ldc_attach_fail:
716 
717 	if (progress & PROG_callback) {
718 		(void) ldc_unreg_callback(ldcp->ldc_handle);
719 		kmem_free(ldcp->ldcmsg, ldcp->msglen);
720 	}
721 
722 	if (progress & PROG_rx_thread) {
723 		if (ldcp->rx_thread != NULL) {
724 			vsw_stop_rx_thread(ldcp);
725 		}
726 		mutex_destroy(&ldcp->rx_thr_lock);
727 		cv_destroy(&ldcp->rx_thr_cv);
728 	}
729 
730 	if (progress & PROG_tx_thread) {
731 		if (ldcp->tx_thread != NULL) {
732 			vsw_stop_tx_thread(ldcp);
733 		}
734 		mutex_destroy(&ldcp->tx_thr_lock);
735 		cv_destroy(&ldcp->tx_thr_cv);
736 	}
737 	if (ldcp->ksp != NULL) {
738 		vgen_destroy_kstats(ldcp->ksp);
739 	}
740 	mutex_destroy(&ldcp->ldc_txlock);
741 	mutex_destroy(&ldcp->ldc_rxlock);
742 	mutex_destroy(&ldcp->ldc_cblock);
743 	mutex_destroy(&ldcp->drain_cv_lock);
744 
745 	cv_destroy(&ldcp->drain_cv);
746 
747 	rw_destroy(&ldcp->lane_in.dlistrw);
748 	rw_destroy(&ldcp->lane_out.dlistrw);
749 
750 	if (progress & PROG_mblks) {
751 		vio_destroy_multipools(&ldcp->vmp, &vswp->rxh);
752 	}
753 	kmem_free(ldcp, sizeof (vsw_ldc_t));
754 
755 	return (1);
756 }
757 
758 /*
759  * Detach a logical domain channel (ldc) belonging to a
760  * particular port.
761  *
762  * Returns 0 on success, 1 on failure.
763  */
764 static int
765 vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id)
766 {
767 	vsw_t 		*vswp = port->p_vswp;
768 	vsw_ldc_t 	*ldcp, *prev_ldcp;
769 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
770 	int 		rv;
771 
772 	prev_ldcp = ldcl->head;
773 	for (; (ldcp = prev_ldcp) != NULL; prev_ldcp = ldcp->ldc_next) {
774 		if (ldcp->ldc_id == ldc_id) {
775 			break;
776 		}
777 	}
778 
779 	/* specified ldc id not found */
780 	if (ldcp == NULL) {
781 		DERR(vswp, "%s: ldcp = NULL", __func__);
782 		return (1);
783 	}
784 
785 	D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id);
786 
787 	/* Stop the receive thread */
788 	if (ldcp->rx_thread != NULL) {
789 		vsw_stop_rx_thread(ldcp);
790 		mutex_destroy(&ldcp->rx_thr_lock);
791 		cv_destroy(&ldcp->rx_thr_cv);
792 	}
793 	kmem_free(ldcp->ldcmsg, ldcp->msglen);
794 
795 	/* Stop the tx thread */
796 	if (ldcp->tx_thread != NULL) {
797 		vsw_stop_tx_thread(ldcp);
798 		mutex_destroy(&ldcp->tx_thr_lock);
799 		cv_destroy(&ldcp->tx_thr_cv);
800 		if (ldcp->tx_mhead != NULL) {
801 			freemsgchain(ldcp->tx_mhead);
802 			ldcp->tx_mhead = ldcp->tx_mtail = NULL;
803 			ldcp->tx_cnt = 0;
804 		}
805 	}
806 
807 	/* Destory kstats */
808 	vgen_destroy_kstats(ldcp->ksp);
809 
810 	/*
811 	 * Before we can close the channel we must release any mapped
812 	 * resources (e.g. drings).
813 	 */
814 	vsw_free_lane_resources(ldcp, INBOUND);
815 	vsw_free_lane_resources(ldcp, OUTBOUND);
816 
817 	/*
818 	 * If the close fails we are in serious trouble, as won't
819 	 * be able to delete the parent port.
820 	 */
821 	if ((rv = ldc_close(ldcp->ldc_handle)) != 0) {
822 		DERR(vswp, "%s: error %d closing channel %lld",
823 		    __func__, rv, ldcp->ldc_id);
824 		return (1);
825 	}
826 
827 	(void) ldc_fini(ldcp->ldc_handle);
828 
829 	ldcp->ldc_status = LDC_INIT;
830 	ldcp->ldc_handle = NULL;
831 	ldcp->ldc_vswp = NULL;
832 
833 
834 	/*
835 	 * Most likely some mblks are still in use and
836 	 * have not been returned to the pool. These mblks are
837 	 * added to the pool that is maintained in the device instance.
838 	 * Another attempt will be made to destroy the pool
839 	 * when the device detaches.
840 	 */
841 	vio_destroy_multipools(&ldcp->vmp, &vswp->rxh);
842 
843 	/* unlink it from the list */
844 	prev_ldcp = ldcp->ldc_next;
845 
846 	mutex_destroy(&ldcp->ldc_txlock);
847 	mutex_destroy(&ldcp->ldc_rxlock);
848 	mutex_destroy(&ldcp->ldc_cblock);
849 	cv_destroy(&ldcp->drain_cv);
850 	mutex_destroy(&ldcp->drain_cv_lock);
851 	mutex_destroy(&ldcp->status_lock);
852 	rw_destroy(&ldcp->lane_in.dlistrw);
853 	rw_destroy(&ldcp->lane_out.dlistrw);
854 
855 	kmem_free(ldcp, sizeof (vsw_ldc_t));
856 
857 	return (0);
858 }
859 
860 /*
861  * Open and attempt to bring up the channel. Note that channel
862  * can only be brought up if peer has also opened channel.
863  *
864  * Returns 0 if can open and bring up channel, otherwise
865  * returns 1.
866  */
867 static int
868 vsw_ldc_init(vsw_ldc_t *ldcp)
869 {
870 	vsw_t 		*vswp = ldcp->ldc_vswp;
871 	ldc_status_t	istatus = 0;
872 	int		rv;
873 
874 	D1(vswp, "%s: enter", __func__);
875 
876 	LDC_ENTER_LOCK(ldcp);
877 
878 	/* don't start at 0 in case clients don't like that */
879 	ldcp->next_ident = 1;
880 
881 	rv = ldc_open(ldcp->ldc_handle);
882 	if (rv != 0) {
883 		DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)",
884 		    __func__, ldcp->ldc_id, rv);
885 		LDC_EXIT_LOCK(ldcp);
886 		return (1);
887 	}
888 
889 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
890 		DERR(vswp, "%s: unable to get status", __func__);
891 		LDC_EXIT_LOCK(ldcp);
892 		return (1);
893 
894 	} else if (istatus != LDC_OPEN && istatus != LDC_READY) {
895 		DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY",
896 		    __func__, ldcp->ldc_id, istatus);
897 		LDC_EXIT_LOCK(ldcp);
898 		return (1);
899 	}
900 
901 	mutex_enter(&ldcp->status_lock);
902 	ldcp->ldc_status = istatus;
903 	mutex_exit(&ldcp->status_lock);
904 
905 	rv = ldc_up(ldcp->ldc_handle);
906 	if (rv != 0) {
907 		/*
908 		 * Not a fatal error for ldc_up() to fail, as peer
909 		 * end point may simply not be ready yet.
910 		 */
911 		D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__,
912 		    ldcp->ldc_id, rv);
913 		LDC_EXIT_LOCK(ldcp);
914 		return (1);
915 	}
916 
917 	/*
918 	 * ldc_up() call is non-blocking so need to explicitly
919 	 * check channel status to see if in fact the channel
920 	 * is UP.
921 	 */
922 	mutex_enter(&ldcp->status_lock);
923 	if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) {
924 		DERR(vswp, "%s: unable to get status", __func__);
925 		mutex_exit(&ldcp->status_lock);
926 		LDC_EXIT_LOCK(ldcp);
927 		return (1);
928 
929 	}
930 
931 	if (ldcp->ldc_status == LDC_UP) {
932 		D2(vswp, "%s: channel %ld now UP (%ld)", __func__,
933 		    ldcp->ldc_id, istatus);
934 		mutex_exit(&ldcp->status_lock);
935 		LDC_EXIT_LOCK(ldcp);
936 
937 		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
938 		return (0);
939 	}
940 
941 	mutex_exit(&ldcp->status_lock);
942 	LDC_EXIT_LOCK(ldcp);
943 
944 	D1(vswp, "%s: exit", __func__);
945 	return (0);
946 }
947 
948 /* disable callbacks on the channel */
949 static int
950 vsw_ldc_uninit(vsw_ldc_t *ldcp)
951 {
952 	vsw_t	*vswp = ldcp->ldc_vswp;
953 	int	rv;
954 
955 	D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id);
956 
957 	LDC_ENTER_LOCK(ldcp);
958 
959 	rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE);
960 	if (rv != 0) {
961 		DERR(vswp, "vsw_ldc_uninit(%lld): error disabling "
962 		    "interrupts (rv = %d)\n", ldcp->ldc_id, rv);
963 		LDC_EXIT_LOCK(ldcp);
964 		return (1);
965 	}
966 
967 	mutex_enter(&ldcp->status_lock);
968 	ldcp->ldc_status = LDC_INIT;
969 	mutex_exit(&ldcp->status_lock);
970 
971 	LDC_EXIT_LOCK(ldcp);
972 
973 	D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id);
974 
975 	return (0);
976 }
977 
978 static int
979 vsw_init_ldcs(vsw_port_t *port)
980 {
981 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
982 	vsw_ldc_t	*ldcp;
983 
984 	READ_ENTER(&ldcl->lockrw);
985 	ldcp =  ldcl->head;
986 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
987 		(void) vsw_ldc_init(ldcp);
988 	}
989 	RW_EXIT(&ldcl->lockrw);
990 
991 	return (0);
992 }
993 
994 static int
995 vsw_uninit_ldcs(vsw_port_t *port)
996 {
997 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
998 	vsw_ldc_t	*ldcp;
999 
1000 	D1(NULL, "vsw_uninit_ldcs: enter\n");
1001 
1002 	READ_ENTER(&ldcl->lockrw);
1003 	ldcp =  ldcl->head;
1004 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
1005 		(void) vsw_ldc_uninit(ldcp);
1006 	}
1007 	RW_EXIT(&ldcl->lockrw);
1008 
1009 	D1(NULL, "vsw_uninit_ldcs: exit\n");
1010 
1011 	return (0);
1012 }
1013 
1014 /*
1015  * Wait until the callback(s) associated with the ldcs under the specified
1016  * port have completed.
1017  *
1018  * Prior to this function being invoked each channel under this port
1019  * should have been quiesced via ldc_set_cb_mode(DISABLE).
1020  *
1021  * A short explaination of what we are doing below..
1022  *
1023  * The simplest approach would be to have a reference counter in
1024  * the ldc structure which is increment/decremented by the callbacks as
1025  * they use the channel. The drain function could then simply disable any
1026  * further callbacks and do a cv_wait for the ref to hit zero. Unfortunately
1027  * there is a tiny window here - before the callback is able to get the lock
1028  * on the channel it is interrupted and this function gets to execute. It
1029  * sees that the ref count is zero and believes its free to delete the
1030  * associated data structures.
1031  *
1032  * We get around this by taking advantage of the fact that before the ldc
1033  * framework invokes a callback it sets a flag to indicate that there is a
1034  * callback active (or about to become active). If when we attempt to
1035  * unregister a callback when this active flag is set then the unregister
1036  * will fail with EWOULDBLOCK.
1037  *
1038  * If the unregister fails we do a cv_timedwait. We will either be signaled
1039  * by the callback as it is exiting (note we have to wait a short period to
1040  * allow the callback to return fully to the ldc framework and it to clear
1041  * the active flag), or by the timer expiring. In either case we again attempt
1042  * the unregister. We repeat this until we can succesfully unregister the
1043  * callback.
1044  *
1045  * The reason we use a cv_timedwait rather than a simple cv_wait is to catch
1046  * the case where the callback has finished but the ldc framework has not yet
1047  * cleared the active flag. In this case we would never get a cv_signal.
1048  */
1049 static int
1050 vsw_drain_ldcs(vsw_port_t *port)
1051 {
1052 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
1053 	vsw_ldc_t	*ldcp;
1054 	vsw_t		*vswp = port->p_vswp;
1055 
1056 	D1(vswp, "%s: enter", __func__);
1057 
1058 	READ_ENTER(&ldcl->lockrw);
1059 
1060 	ldcp = ldcl->head;
1061 
1062 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
1063 		/*
1064 		 * If we can unregister the channel callback then we
1065 		 * know that there is no callback either running or
1066 		 * scheduled to run for this channel so move on to next
1067 		 * channel in the list.
1068 		 */
1069 		mutex_enter(&ldcp->drain_cv_lock);
1070 
1071 		/* prompt active callbacks to quit */
1072 		ldcp->drain_state = VSW_LDC_DRAINING;
1073 
1074 		if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) {
1075 			D2(vswp, "%s: unreg callback for chan %ld", __func__,
1076 			    ldcp->ldc_id);
1077 			mutex_exit(&ldcp->drain_cv_lock);
1078 			continue;
1079 		} else {
1080 			/*
1081 			 * If we end up here we know that either 1) a callback
1082 			 * is currently executing, 2) is about to start (i.e.
1083 			 * the ldc framework has set the active flag but
1084 			 * has not actually invoked the callback yet, or 3)
1085 			 * has finished and has returned to the ldc framework
1086 			 * but the ldc framework has not yet cleared the
1087 			 * active bit.
1088 			 *
1089 			 * Wait for it to finish.
1090 			 */
1091 			while (ldc_unreg_callback(ldcp->ldc_handle)
1092 			    == EWOULDBLOCK)
1093 				(void) cv_timedwait(&ldcp->drain_cv,
1094 				    &ldcp->drain_cv_lock, lbolt + hz);
1095 
1096 			mutex_exit(&ldcp->drain_cv_lock);
1097 			D2(vswp, "%s: unreg callback for chan %ld after "
1098 			    "timeout", __func__, ldcp->ldc_id);
1099 		}
1100 	}
1101 	RW_EXIT(&ldcl->lockrw);
1102 
1103 	D1(vswp, "%s: exit", __func__);
1104 	return (0);
1105 }
1106 
1107 /*
1108  * Wait until all tasks which reference this port have completed.
1109  *
1110  * Prior to this function being invoked each channel under this port
1111  * should have been quiesced via ldc_set_cb_mode(DISABLE).
1112  */
1113 static int
1114 vsw_drain_port_taskq(vsw_port_t *port)
1115 {
1116 	vsw_t		*vswp = port->p_vswp;
1117 
1118 	D1(vswp, "%s: enter", __func__);
1119 
1120 	/*
1121 	 * Mark the port as in the process of being detached, and
1122 	 * dispatch a marker task to the queue so we know when all
1123 	 * relevant tasks have completed.
1124 	 */
1125 	mutex_enter(&port->state_lock);
1126 	port->state = VSW_PORT_DETACHING;
1127 
1128 	if ((vswp->taskq_p == NULL) ||
1129 	    (ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task,
1130 	    port, DDI_NOSLEEP) != DDI_SUCCESS)) {
1131 		DERR(vswp, "%s: unable to dispatch marker task",
1132 		    __func__);
1133 		mutex_exit(&port->state_lock);
1134 		return (1);
1135 	}
1136 
1137 	/*
1138 	 * Wait for the marker task to finish.
1139 	 */
1140 	while (port->state != VSW_PORT_DETACHABLE)
1141 		cv_wait(&port->state_cv, &port->state_lock);
1142 
1143 	mutex_exit(&port->state_lock);
1144 
1145 	D1(vswp, "%s: exit", __func__);
1146 
1147 	return (0);
1148 }
1149 
1150 static void
1151 vsw_marker_task(void *arg)
1152 {
1153 	vsw_port_t	*port = arg;
1154 	vsw_t		*vswp = port->p_vswp;
1155 
1156 	D1(vswp, "%s: enter", __func__);
1157 
1158 	mutex_enter(&port->state_lock);
1159 
1160 	/*
1161 	 * No further tasks should be dispatched which reference
1162 	 * this port so ok to mark it as safe to detach.
1163 	 */
1164 	port->state = VSW_PORT_DETACHABLE;
1165 
1166 	cv_signal(&port->state_cv);
1167 
1168 	mutex_exit(&port->state_lock);
1169 
1170 	D1(vswp, "%s: exit", __func__);
1171 }
1172 
1173 vsw_port_t *
1174 vsw_lookup_port(vsw_t *vswp, int p_instance)
1175 {
1176 	vsw_port_list_t *plist = &vswp->plist;
1177 	vsw_port_t	*port;
1178 
1179 	for (port = plist->head; port != NULL; port = port->p_next) {
1180 		if (port->p_instance == p_instance) {
1181 			D2(vswp, "vsw_lookup_port: found p_instance\n");
1182 			return (port);
1183 		}
1184 	}
1185 
1186 	return (NULL);
1187 }
1188 
1189 void
1190 vsw_vlan_unaware_port_reset(vsw_port_t *portp)
1191 {
1192 	vsw_ldc_list_t 	*ldclp;
1193 	vsw_ldc_t	*ldcp;
1194 
1195 	ldclp = &portp->p_ldclist;
1196 
1197 	READ_ENTER(&ldclp->lockrw);
1198 
1199 	/*
1200 	 * NOTE: for now, we will assume we have a single channel.
1201 	 */
1202 	if (ldclp->head == NULL) {
1203 		RW_EXIT(&ldclp->lockrw);
1204 		return;
1205 	}
1206 	ldcp = ldclp->head;
1207 
1208 	mutex_enter(&ldcp->ldc_cblock);
1209 
1210 	/*
1211 	 * If the peer is vlan_unaware(ver < 1.3), reset channel and terminate
1212 	 * the connection. See comments in vsw_set_vnet_proto_ops().
1213 	 */
1214 	if (ldcp->hphase == VSW_MILESTONE4 && VSW_VER_LT(ldcp, 1, 3) &&
1215 	    portp->nvids != 0) {
1216 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1217 	}
1218 
1219 	mutex_exit(&ldcp->ldc_cblock);
1220 
1221 	RW_EXIT(&ldclp->lockrw);
1222 }
1223 
1224 /*
1225  * Search for and remove the specified port from the port
1226  * list. Returns 0 if able to locate and remove port, otherwise
1227  * returns 1.
1228  */
1229 static int
1230 vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port)
1231 {
1232 	vsw_port_list_t *plist = &vswp->plist;
1233 	vsw_port_t	*curr_p, *prev_p;
1234 
1235 	if (plist->head == NULL)
1236 		return (1);
1237 
1238 	curr_p = prev_p = plist->head;
1239 
1240 	while (curr_p != NULL) {
1241 		if (curr_p == port) {
1242 			if (prev_p == curr_p) {
1243 				plist->head = curr_p->p_next;
1244 			} else {
1245 				prev_p->p_next = curr_p->p_next;
1246 			}
1247 			plist->num_ports--;
1248 			break;
1249 		} else {
1250 			prev_p = curr_p;
1251 			curr_p = curr_p->p_next;
1252 		}
1253 	}
1254 	return (0);
1255 }
1256 
1257 /*
1258  * Interrupt handler for ldc messages.
1259  */
1260 static uint_t
1261 vsw_ldc_cb(uint64_t event, caddr_t arg)
1262 {
1263 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
1264 	vsw_t 		*vswp = ldcp->ldc_vswp;
1265 
1266 	D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
1267 
1268 	mutex_enter(&ldcp->ldc_cblock);
1269 	ldcp->ldc_stats.callbacks++;
1270 
1271 	mutex_enter(&ldcp->status_lock);
1272 	if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) {
1273 		mutex_exit(&ldcp->status_lock);
1274 		mutex_exit(&ldcp->ldc_cblock);
1275 		return (LDC_SUCCESS);
1276 	}
1277 	mutex_exit(&ldcp->status_lock);
1278 
1279 	if (event & LDC_EVT_UP) {
1280 		/*
1281 		 * Channel has come up.
1282 		 */
1283 		D2(vswp, "%s: id(%ld) event(%llx) UP: status(%ld)",
1284 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1285 
1286 		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
1287 
1288 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
1289 	}
1290 
1291 	if (event & LDC_EVT_READ) {
1292 		/*
1293 		 * Data available for reading.
1294 		 */
1295 		D2(vswp, "%s: id(ld) event(%llx) data READ",
1296 		    __func__, ldcp->ldc_id, event);
1297 
1298 		if (ldcp->rx_thread != NULL) {
1299 			/*
1300 			 * If the receive thread is enabled, then
1301 			 * wakeup the receive thread to process the
1302 			 * LDC messages.
1303 			 */
1304 			mutex_exit(&ldcp->ldc_cblock);
1305 			mutex_enter(&ldcp->rx_thr_lock);
1306 			if (!(ldcp->rx_thr_flags & VSW_WTHR_DATARCVD)) {
1307 				ldcp->rx_thr_flags |= VSW_WTHR_DATARCVD;
1308 				cv_signal(&ldcp->rx_thr_cv);
1309 			}
1310 			mutex_exit(&ldcp->rx_thr_lock);
1311 			mutex_enter(&ldcp->ldc_cblock);
1312 		} else {
1313 			vsw_process_pkt(ldcp);
1314 		}
1315 
1316 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
1317 
1318 		goto vsw_cb_exit;
1319 	}
1320 
1321 	if (event & (LDC_EVT_DOWN | LDC_EVT_RESET)) {
1322 		D2(vswp, "%s: id(%ld) event (%lx) DOWN/RESET: status(%ld)",
1323 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1324 
1325 		vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
1326 	}
1327 
1328 	/*
1329 	 * Catch either LDC_EVT_WRITE which we don't support or any
1330 	 * unknown event.
1331 	 */
1332 	if (event &
1333 	    ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ)) {
1334 		DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)",
1335 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1336 	}
1337 
1338 vsw_cb_exit:
1339 	mutex_exit(&ldcp->ldc_cblock);
1340 
1341 	/*
1342 	 * Let the drain function know we are finishing if it
1343 	 * is waiting.
1344 	 */
1345 	mutex_enter(&ldcp->drain_cv_lock);
1346 	if (ldcp->drain_state == VSW_LDC_DRAINING)
1347 		cv_signal(&ldcp->drain_cv);
1348 	mutex_exit(&ldcp->drain_cv_lock);
1349 
1350 	return (LDC_SUCCESS);
1351 }
1352 
1353 /*
1354  * Reinitialise data structures associated with the channel.
1355  */
1356 static void
1357 vsw_ldc_reinit(vsw_ldc_t *ldcp)
1358 {
1359 	vsw_t		*vswp = ldcp->ldc_vswp;
1360 	vsw_port_t	*port;
1361 	vsw_ldc_list_t	*ldcl;
1362 
1363 	D1(vswp, "%s: enter", __func__);
1364 
1365 	port = ldcp->ldc_port;
1366 	ldcl = &port->p_ldclist;
1367 
1368 	READ_ENTER(&ldcl->lockrw);
1369 
1370 	D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__,
1371 	    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
1372 
1373 	vsw_free_lane_resources(ldcp, INBOUND);
1374 	vsw_free_lane_resources(ldcp, OUTBOUND);
1375 	RW_EXIT(&ldcl->lockrw);
1376 
1377 	ldcp->lane_in.lstate = 0;
1378 	ldcp->lane_out.lstate = 0;
1379 
1380 	/* Remove the fdb entry for this port/mac address */
1381 	vsw_fdbe_del(vswp, &(port->p_macaddr));
1382 
1383 	/* remove the port from vlans it has been assigned to */
1384 	vsw_vlan_remove_ids(port, VSW_VNETPORT);
1385 
1386 	/*
1387 	 * Remove parent port from any multicast groups
1388 	 * it may have registered with. Client must resend
1389 	 * multicast add command after handshake completes.
1390 	 */
1391 	vsw_del_mcst_port(port);
1392 
1393 	ldcp->peer_session = 0;
1394 	ldcp->session_status = 0;
1395 	ldcp->hcnt = 0;
1396 	ldcp->hphase = VSW_MILESTONE0;
1397 
1398 	vsw_reset_vnet_proto_ops(ldcp);
1399 
1400 	D1(vswp, "%s: exit", __func__);
1401 }
1402 
1403 /*
1404  * Process a connection event.
1405  *
1406  * Note - care must be taken to ensure that this function is
1407  * not called with the dlistrw lock held.
1408  */
1409 static void
1410 vsw_process_conn_evt(vsw_ldc_t *ldcp, uint16_t evt)
1411 {
1412 	vsw_t		*vswp = ldcp->ldc_vswp;
1413 	vsw_conn_evt_t	*conn = NULL;
1414 
1415 	D1(vswp, "%s: enter", __func__);
1416 
1417 	/*
1418 	 * Check if either a reset or restart event is pending
1419 	 * or in progress. If so just return.
1420 	 *
1421 	 * A VSW_CONN_RESET event originates either with a LDC_RESET_EVT
1422 	 * being received by the callback handler, or a ECONNRESET error
1423 	 * code being returned from a ldc_read() or ldc_write() call.
1424 	 *
1425 	 * A VSW_CONN_RESTART event occurs when some error checking code
1426 	 * decides that there is a problem with data from the channel,
1427 	 * and that the handshake should be restarted.
1428 	 */
1429 	if (((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART)) &&
1430 	    (ldstub((uint8_t *)&ldcp->reset_active)))
1431 		return;
1432 
1433 	/*
1434 	 * If it is an LDC_UP event we first check the recorded
1435 	 * state of the channel. If this is UP then we know that
1436 	 * the channel moving to the UP state has already been dealt
1437 	 * with and don't need to dispatch a  new task.
1438 	 *
1439 	 * The reason for this check is that when we do a ldc_up(),
1440 	 * depending on the state of the peer, we may or may not get
1441 	 * a LDC_UP event. As we can't depend on getting a LDC_UP evt
1442 	 * every time we do ldc_up() we explicitly check the channel
1443 	 * status to see has it come up (ldc_up() is asynch and will
1444 	 * complete at some undefined time), and take the appropriate
1445 	 * action.
1446 	 *
1447 	 * The flip side of this is that we may get a LDC_UP event
1448 	 * when we have already seen that the channel is up and have
1449 	 * dealt with that.
1450 	 */
1451 	mutex_enter(&ldcp->status_lock);
1452 	if (evt == VSW_CONN_UP) {
1453 		if ((ldcp->ldc_status == LDC_UP) || (ldcp->reset_active != 0)) {
1454 			mutex_exit(&ldcp->status_lock);
1455 			return;
1456 		}
1457 	}
1458 	mutex_exit(&ldcp->status_lock);
1459 
1460 	/*
1461 	 * The transaction group id allows us to identify and discard
1462 	 * any tasks which are still pending on the taskq and refer
1463 	 * to the handshake session we are about to restart or reset.
1464 	 * These stale messages no longer have any real meaning.
1465 	 */
1466 	(void) atomic_inc_32(&ldcp->hss_id);
1467 
1468 	ASSERT(vswp->taskq_p != NULL);
1469 
1470 	if ((conn = kmem_zalloc(sizeof (vsw_conn_evt_t), KM_NOSLEEP)) == NULL) {
1471 		cmn_err(CE_WARN, "!vsw%d: unable to allocate memory for"
1472 		    " connection event", vswp->instance);
1473 		goto err_exit;
1474 	}
1475 
1476 	conn->evt = evt;
1477 	conn->ldcp = ldcp;
1478 
1479 	if (ddi_taskq_dispatch(vswp->taskq_p, vsw_conn_task, conn,
1480 	    DDI_NOSLEEP) != DDI_SUCCESS) {
1481 		cmn_err(CE_WARN, "!vsw%d: Can't dispatch connection task",
1482 		    vswp->instance);
1483 
1484 		kmem_free(conn, sizeof (vsw_conn_evt_t));
1485 		goto err_exit;
1486 	}
1487 
1488 	D1(vswp, "%s: exit", __func__);
1489 	return;
1490 
1491 err_exit:
1492 	/*
1493 	 * Have mostly likely failed due to memory shortage. Clear the flag so
1494 	 * that future requests will at least be attempted and will hopefully
1495 	 * succeed.
1496 	 */
1497 	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
1498 		ldcp->reset_active = 0;
1499 }
1500 
1501 /*
1502  * Deal with events relating to a connection. Invoked from a taskq.
1503  */
1504 static void
1505 vsw_conn_task(void *arg)
1506 {
1507 	vsw_conn_evt_t	*conn = (vsw_conn_evt_t *)arg;
1508 	vsw_ldc_t	*ldcp = NULL;
1509 	vsw_t		*vswp = NULL;
1510 	uint16_t	evt;
1511 	ldc_status_t	curr_status;
1512 
1513 	ldcp = conn->ldcp;
1514 	evt = conn->evt;
1515 	vswp = ldcp->ldc_vswp;
1516 
1517 	D1(vswp, "%s: enter", __func__);
1518 
1519 	/* can safely free now have copied out data */
1520 	kmem_free(conn, sizeof (vsw_conn_evt_t));
1521 
1522 	mutex_enter(&ldcp->status_lock);
1523 	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
1524 		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
1525 		    "channel %ld", vswp->instance, ldcp->ldc_id);
1526 		mutex_exit(&ldcp->status_lock);
1527 		return;
1528 	}
1529 
1530 	/*
1531 	 * If we wish to restart the handshake on this channel, then if
1532 	 * the channel is UP we bring it DOWN to flush the underlying
1533 	 * ldc queue.
1534 	 */
1535 	if ((evt == VSW_CONN_RESTART) && (curr_status == LDC_UP))
1536 		(void) ldc_down(ldcp->ldc_handle);
1537 
1538 	/*
1539 	 * re-init all the associated data structures.
1540 	 */
1541 	vsw_ldc_reinit(ldcp);
1542 
1543 	/*
1544 	 * Bring the channel back up (note it does no harm to
1545 	 * do this even if the channel is already UP, Just
1546 	 * becomes effectively a no-op).
1547 	 */
1548 	(void) ldc_up(ldcp->ldc_handle);
1549 
1550 	/*
1551 	 * Check if channel is now UP. This will only happen if
1552 	 * peer has also done a ldc_up().
1553 	 */
1554 	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
1555 		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
1556 		    "channel %ld", vswp->instance, ldcp->ldc_id);
1557 		mutex_exit(&ldcp->status_lock);
1558 		return;
1559 	}
1560 
1561 	ldcp->ldc_status = curr_status;
1562 
1563 	/* channel UP so restart handshake by sending version info */
1564 	if (curr_status == LDC_UP) {
1565 		if (ldcp->hcnt++ > vsw_num_handshakes) {
1566 			cmn_err(CE_WARN, "!vsw%d: exceeded number of permitted"
1567 			    " handshake attempts (%d) on channel %ld",
1568 			    vswp->instance, ldcp->hcnt, ldcp->ldc_id);
1569 			mutex_exit(&ldcp->status_lock);
1570 			return;
1571 		}
1572 
1573 		if (vsw_obp_ver_proto_workaround == B_FALSE &&
1574 		    (ddi_taskq_dispatch(vswp->taskq_p, vsw_send_ver, ldcp,
1575 		    DDI_NOSLEEP) != DDI_SUCCESS)) {
1576 			cmn_err(CE_WARN, "!vsw%d: Can't dispatch version task",
1577 			    vswp->instance);
1578 
1579 			/*
1580 			 * Don't count as valid restart attempt if couldn't
1581 			 * send version msg.
1582 			 */
1583 			if (ldcp->hcnt > 0)
1584 				ldcp->hcnt--;
1585 		}
1586 	}
1587 
1588 	/*
1589 	 * Mark that the process is complete by clearing the flag.
1590 	 *
1591 	 * Note is it possible that the taskq dispatch above may have failed,
1592 	 * most likely due to memory shortage. We still clear the flag so
1593 	 * future attempts will at least be attempted and will hopefully
1594 	 * succeed.
1595 	 */
1596 	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
1597 		ldcp->reset_active = 0;
1598 
1599 	mutex_exit(&ldcp->status_lock);
1600 
1601 	D1(vswp, "%s: exit", __func__);
1602 }
1603 
1604 /*
1605  * returns 0 if legal for event signified by flag to have
1606  * occured at the time it did. Otherwise returns 1.
1607  */
1608 int
1609 vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag)
1610 {
1611 	vsw_t		*vswp = ldcp->ldc_vswp;
1612 	uint64_t	state;
1613 	uint64_t	phase;
1614 
1615 	if (dir == INBOUND)
1616 		state = ldcp->lane_in.lstate;
1617 	else
1618 		state = ldcp->lane_out.lstate;
1619 
1620 	phase = ldcp->hphase;
1621 
1622 	switch (flag) {
1623 	case VSW_VER_INFO_RECV:
1624 		if (phase > VSW_MILESTONE0) {
1625 			DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV"
1626 			    " when in state %d\n", ldcp->ldc_id, phase);
1627 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1628 			return (1);
1629 		}
1630 		break;
1631 
1632 	case VSW_VER_ACK_RECV:
1633 	case VSW_VER_NACK_RECV:
1634 		if (!(state & VSW_VER_INFO_SENT)) {
1635 			DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK or "
1636 			    "VER_NACK when in state %d\n", ldcp->ldc_id, phase);
1637 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1638 			return (1);
1639 		} else
1640 			state &= ~VSW_VER_INFO_SENT;
1641 		break;
1642 
1643 	case VSW_ATTR_INFO_RECV:
1644 		if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) {
1645 			DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV"
1646 			    " when in state %d\n", ldcp->ldc_id, phase);
1647 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1648 			return (1);
1649 		}
1650 		break;
1651 
1652 	case VSW_ATTR_ACK_RECV:
1653 	case VSW_ATTR_NACK_RECV:
1654 		if (!(state & VSW_ATTR_INFO_SENT)) {
1655 			DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK"
1656 			    " or ATTR_NACK when in state %d\n",
1657 			    ldcp->ldc_id, phase);
1658 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1659 			return (1);
1660 		} else
1661 			state &= ~VSW_ATTR_INFO_SENT;
1662 		break;
1663 
1664 	case VSW_DRING_INFO_RECV:
1665 		if (phase < VSW_MILESTONE1) {
1666 			DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV"
1667 			    " when in state %d\n", ldcp->ldc_id, phase);
1668 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1669 			return (1);
1670 		}
1671 		break;
1672 
1673 	case VSW_DRING_ACK_RECV:
1674 	case VSW_DRING_NACK_RECV:
1675 		if (!(state & VSW_DRING_INFO_SENT)) {
1676 			DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK "
1677 			    " or DRING_NACK when in state %d\n",
1678 			    ldcp->ldc_id, phase);
1679 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1680 			return (1);
1681 		} else
1682 			state &= ~VSW_DRING_INFO_SENT;
1683 		break;
1684 
1685 	case VSW_RDX_INFO_RECV:
1686 		if (phase < VSW_MILESTONE3) {
1687 			DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV"
1688 			    " when in state %d\n", ldcp->ldc_id, phase);
1689 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1690 			return (1);
1691 		}
1692 		break;
1693 
1694 	case VSW_RDX_ACK_RECV:
1695 	case VSW_RDX_NACK_RECV:
1696 		if (!(state & VSW_RDX_INFO_SENT)) {
1697 			DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK or "
1698 			    "RDX_NACK when in state %d\n", ldcp->ldc_id, phase);
1699 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1700 			return (1);
1701 		} else
1702 			state &= ~VSW_RDX_INFO_SENT;
1703 		break;
1704 
1705 	case VSW_MCST_INFO_RECV:
1706 		if (phase < VSW_MILESTONE3) {
1707 			DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV"
1708 			    " when in state %d\n", ldcp->ldc_id, phase);
1709 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1710 			return (1);
1711 		}
1712 		break;
1713 
1714 	default:
1715 		DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)",
1716 		    ldcp->ldc_id, flag);
1717 		return (1);
1718 	}
1719 
1720 	if (dir == INBOUND)
1721 		ldcp->lane_in.lstate = state;
1722 	else
1723 		ldcp->lane_out.lstate = state;
1724 
1725 	D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id);
1726 
1727 	return (0);
1728 }
1729 
1730 void
1731 vsw_next_milestone(vsw_ldc_t *ldcp)
1732 {
1733 	vsw_t		*vswp = ldcp->ldc_vswp;
1734 
1735 	D1(vswp, "%s (chan %lld): enter (phase %ld)", __func__,
1736 	    ldcp->ldc_id, ldcp->hphase);
1737 
1738 	DUMP_FLAGS(ldcp->lane_in.lstate);
1739 	DUMP_FLAGS(ldcp->lane_out.lstate);
1740 
1741 	switch (ldcp->hphase) {
1742 
1743 	case VSW_MILESTONE0:
1744 		/*
1745 		 * If we haven't started to handshake with our peer,
1746 		 * start to do so now.
1747 		 */
1748 		if (ldcp->lane_out.lstate == 0) {
1749 			D2(vswp, "%s: (chan %lld) starting handshake "
1750 			    "with peer", __func__, ldcp->ldc_id);
1751 			vsw_process_conn_evt(ldcp, VSW_CONN_UP);
1752 		}
1753 
1754 		/*
1755 		 * Only way to pass this milestone is to have successfully
1756 		 * negotiated version info.
1757 		 */
1758 		if ((ldcp->lane_in.lstate & VSW_VER_ACK_SENT) &&
1759 		    (ldcp->lane_out.lstate & VSW_VER_ACK_RECV)) {
1760 
1761 			D2(vswp, "%s: (chan %lld) leaving milestone 0",
1762 			    __func__, ldcp->ldc_id);
1763 
1764 			vsw_set_vnet_proto_ops(ldcp);
1765 
1766 			/*
1767 			 * Next milestone is passed when attribute
1768 			 * information has been successfully exchanged.
1769 			 */
1770 			ldcp->hphase = VSW_MILESTONE1;
1771 			vsw_send_attr(ldcp);
1772 
1773 		}
1774 		break;
1775 
1776 	case VSW_MILESTONE1:
1777 		/*
1778 		 * Only way to pass this milestone is to have successfully
1779 		 * negotiated attribute information.
1780 		 */
1781 		if (ldcp->lane_in.lstate & VSW_ATTR_ACK_SENT) {
1782 
1783 			ldcp->hphase = VSW_MILESTONE2;
1784 
1785 			/*
1786 			 * If the peer device has said it wishes to
1787 			 * use descriptor rings then we send it our ring
1788 			 * info, otherwise we just set up a private ring
1789 			 * which we use an internal buffer
1790 			 */
1791 			if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
1792 			    (ldcp->lane_in.xfer_mode & VIO_DRING_MODE_V1_2)) ||
1793 			    (VSW_VER_LT(ldcp, 1, 2) &&
1794 			    (ldcp->lane_in.xfer_mode ==
1795 			    VIO_DRING_MODE_V1_0))) {
1796 				vsw_send_dring_info(ldcp);
1797 			}
1798 		}
1799 		break;
1800 
1801 	case VSW_MILESTONE2:
1802 		/*
1803 		 * If peer has indicated in its attribute message that
1804 		 * it wishes to use descriptor rings then the only way
1805 		 * to pass this milestone is for us to have received
1806 		 * valid dring info.
1807 		 *
1808 		 * If peer is not using descriptor rings then just fall
1809 		 * through.
1810 		 */
1811 		if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
1812 		    (ldcp->lane_in.xfer_mode & VIO_DRING_MODE_V1_2)) ||
1813 		    (VSW_VER_LT(ldcp, 1, 2) &&
1814 		    (ldcp->lane_in.xfer_mode ==
1815 		    VIO_DRING_MODE_V1_0))) {
1816 			if (!(ldcp->lane_in.lstate & VSW_DRING_ACK_SENT))
1817 				break;
1818 		}
1819 
1820 		D2(vswp, "%s: (chan %lld) leaving milestone 2",
1821 		    __func__, ldcp->ldc_id);
1822 
1823 		ldcp->hphase = VSW_MILESTONE3;
1824 		vsw_send_rdx(ldcp);
1825 		break;
1826 
1827 	case VSW_MILESTONE3:
1828 		/*
1829 		 * Pass this milestone when all paramaters have been
1830 		 * successfully exchanged and RDX sent in both directions.
1831 		 *
1832 		 * Mark outbound lane as available to transmit data.
1833 		 */
1834 		if ((ldcp->lane_out.lstate & VSW_RDX_ACK_SENT) &&
1835 		    (ldcp->lane_in.lstate & VSW_RDX_ACK_RECV)) {
1836 
1837 			D2(vswp, "%s: (chan %lld) leaving milestone 3",
1838 			    __func__, ldcp->ldc_id);
1839 			D2(vswp, "%s: ** handshake complete (0x%llx : "
1840 			    "0x%llx) **", __func__, ldcp->lane_in.lstate,
1841 			    ldcp->lane_out.lstate);
1842 			ldcp->lane_out.lstate |= VSW_LANE_ACTIVE;
1843 			ldcp->hphase = VSW_MILESTONE4;
1844 			ldcp->hcnt = 0;
1845 			DISPLAY_STATE();
1846 		} else {
1847 			D2(vswp, "%s: still in milestone 3 (0x%llx : 0x%llx)",
1848 			    __func__, ldcp->lane_in.lstate,
1849 			    ldcp->lane_out.lstate);
1850 		}
1851 		break;
1852 
1853 	case VSW_MILESTONE4:
1854 		D2(vswp, "%s: (chan %lld) in milestone 4", __func__,
1855 		    ldcp->ldc_id);
1856 		break;
1857 
1858 	default:
1859 		DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__,
1860 		    ldcp->ldc_id, ldcp->hphase);
1861 	}
1862 
1863 	D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id,
1864 	    ldcp->hphase);
1865 }
1866 
1867 /*
1868  * Check if major version is supported.
1869  *
1870  * Returns 0 if finds supported major number, and if necessary
1871  * adjusts the minor field.
1872  *
1873  * Returns 1 if can't match major number exactly. Sets mjor/minor
1874  * to next lowest support values, or to zero if no other values possible.
1875  */
1876 static int
1877 vsw_supported_version(vio_ver_msg_t *vp)
1878 {
1879 	int	i;
1880 
1881 	D1(NULL, "vsw_supported_version: enter");
1882 
1883 	for (i = 0; i < VSW_NUM_VER; i++) {
1884 		if (vsw_versions[i].ver_major == vp->ver_major) {
1885 			/*
1886 			 * Matching or lower major version found. Update
1887 			 * minor number if necessary.
1888 			 */
1889 			if (vp->ver_minor > vsw_versions[i].ver_minor) {
1890 				D2(NULL, "%s: adjusting minor value from %d "
1891 				    "to %d", __func__, vp->ver_minor,
1892 				    vsw_versions[i].ver_minor);
1893 				vp->ver_minor = vsw_versions[i].ver_minor;
1894 			}
1895 
1896 			return (0);
1897 		}
1898 
1899 		/*
1900 		 * If the message contains a higher major version number, set
1901 		 * the message's major/minor versions to the current values
1902 		 * and return false, so this message will get resent with
1903 		 * these values.
1904 		 */
1905 		if (vsw_versions[i].ver_major < vp->ver_major) {
1906 			D2(NULL, "%s: adjusting major and minor "
1907 			    "values to %d, %d\n",
1908 			    __func__, vsw_versions[i].ver_major,
1909 			    vsw_versions[i].ver_minor);
1910 			vp->ver_major = vsw_versions[i].ver_major;
1911 			vp->ver_minor = vsw_versions[i].ver_minor;
1912 			return (1);
1913 		}
1914 	}
1915 
1916 	/* No match was possible, zero out fields */
1917 	vp->ver_major = 0;
1918 	vp->ver_minor = 0;
1919 
1920 	D1(NULL, "vsw_supported_version: exit");
1921 
1922 	return (1);
1923 }
1924 
1925 /*
1926  * Set vnet-protocol-version dependent functions based on version.
1927  */
1928 static void
1929 vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp)
1930 {
1931 	vsw_t	*vswp = ldcp->ldc_vswp;
1932 	lane_t	*lp = &ldcp->lane_out;
1933 
1934 	if (VSW_VER_GTEQ(ldcp, 1, 3)) {
1935 		/*
1936 		 * If the version negotiated with peer is >= 1.3,
1937 		 * set the mtu in our attributes to max_frame_size.
1938 		 */
1939 		lp->mtu = vswp->max_frame_size;
1940 	} else {
1941 		vsw_port_t	*portp = ldcp->ldc_port;
1942 		/*
1943 		 * Pre-1.3 peers expect max frame size of ETHERMAX.
1944 		 * We can negotiate that size with those peers provided the
1945 		 * following conditions are true:
1946 		 * - Our max_frame_size is greater only by VLAN_TAGSZ (4).
1947 		 * - Only pvid is defined for our peer and there are no vids.
1948 		 * If the above conditions are true, then we can send/recv only
1949 		 * untagged frames of max size ETHERMAX. Note that pvid of the
1950 		 * peer can be different, as vsw has to serve the vnet in that
1951 		 * vlan even if itself is not assigned to that vlan.
1952 		 */
1953 		if ((vswp->max_frame_size == ETHERMAX + VLAN_TAGSZ) &&
1954 		    portp->nvids == 0) {
1955 			lp->mtu = ETHERMAX;
1956 		}
1957 	}
1958 
1959 	if (VSW_VER_GTEQ(ldcp, 1, 2)) {
1960 		/* Versions >= 1.2 */
1961 
1962 		if (VSW_PRI_ETH_DEFINED(vswp)) {
1963 			/*
1964 			 * enable priority routines and pkt mode only if
1965 			 * at least one pri-eth-type is specified in MD.
1966 			 */
1967 			ldcp->tx = vsw_ldctx_pri;
1968 			ldcp->rx_pktdata = vsw_process_pkt_data;
1969 
1970 			/* set xfer mode for vsw_send_attr() */
1971 			lp->xfer_mode = VIO_PKT_MODE | VIO_DRING_MODE_V1_2;
1972 		} else {
1973 			/* no priority eth types defined in MD */
1974 
1975 			ldcp->tx = vsw_ldctx;
1976 			ldcp->rx_pktdata = vsw_process_pkt_data_nop;
1977 
1978 			/* set xfer mode for vsw_send_attr() */
1979 			lp->xfer_mode = VIO_DRING_MODE_V1_2;
1980 		}
1981 
1982 	} else {
1983 		/* Versions prior to 1.2  */
1984 
1985 		vsw_reset_vnet_proto_ops(ldcp);
1986 	}
1987 }
1988 
1989 /*
1990  * Reset vnet-protocol-version dependent functions to v1.0.
1991  */
1992 static void
1993 vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp)
1994 {
1995 	lane_t	*lp = &ldcp->lane_out;
1996 
1997 	ldcp->tx = vsw_ldctx;
1998 	ldcp->rx_pktdata = vsw_process_pkt_data_nop;
1999 
2000 	/* set xfer mode for vsw_send_attr() */
2001 	lp->xfer_mode = VIO_DRING_MODE_V1_0;
2002 }
2003 
2004 /*
2005  * Main routine for processing messages received over LDC.
2006  */
2007 static void
2008 vsw_process_pkt(void *arg)
2009 {
2010 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
2011 	vsw_t 		*vswp = ldcp->ldc_vswp;
2012 	size_t		msglen;
2013 	vio_msg_tag_t	*tagp;
2014 	uint64_t	*ldcmsg;
2015 	int 		rv = 0;
2016 
2017 
2018 	D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
2019 
2020 	ASSERT(MUTEX_HELD(&ldcp->ldc_cblock));
2021 
2022 	ldcmsg = ldcp->ldcmsg;
2023 	/*
2024 	 * If channel is up read messages until channel is empty.
2025 	 */
2026 	do {
2027 		msglen = ldcp->msglen;
2028 		rv = ldc_read(ldcp->ldc_handle, (caddr_t)ldcmsg, &msglen);
2029 
2030 		if (rv != 0) {
2031 			DERR(vswp, "%s :ldc_read err id(%lld) rv(%d) len(%d)\n",
2032 			    __func__, ldcp->ldc_id, rv, msglen);
2033 		}
2034 
2035 		/* channel has been reset */
2036 		if (rv == ECONNRESET) {
2037 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
2038 			break;
2039 		}
2040 
2041 		if (msglen == 0) {
2042 			D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__,
2043 			    ldcp->ldc_id);
2044 			break;
2045 		}
2046 
2047 		D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__,
2048 		    ldcp->ldc_id, msglen);
2049 
2050 		/*
2051 		 * Figure out what sort of packet we have gotten by
2052 		 * examining the msg tag, and then switch it appropriately.
2053 		 */
2054 		tagp = (vio_msg_tag_t *)ldcmsg;
2055 
2056 		switch (tagp->vio_msgtype) {
2057 		case VIO_TYPE_CTRL:
2058 			vsw_dispatch_ctrl_task(ldcp, ldcmsg, tagp);
2059 			break;
2060 		case VIO_TYPE_DATA:
2061 			vsw_process_data_pkt(ldcp, ldcmsg, tagp, msglen);
2062 			break;
2063 		case VIO_TYPE_ERR:
2064 			vsw_process_err_pkt(ldcp, ldcmsg, tagp);
2065 			break;
2066 		default:
2067 			DERR(vswp, "%s: Unknown tag(%lx) ", __func__,
2068 			    "id(%lx)\n", tagp->vio_msgtype, ldcp->ldc_id);
2069 			break;
2070 		}
2071 	} while (msglen);
2072 
2073 	D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id);
2074 }
2075 
2076 /*
2077  * Dispatch a task to process a VIO control message.
2078  */
2079 static void
2080 vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t *tagp)
2081 {
2082 	vsw_ctrl_task_t		*ctaskp = NULL;
2083 	vsw_port_t		*port = ldcp->ldc_port;
2084 	vsw_t			*vswp = port->p_vswp;
2085 
2086 	D1(vswp, "%s: enter", __func__);
2087 
2088 	/*
2089 	 * We need to handle RDX ACK messages in-band as once they
2090 	 * are exchanged it is possible that we will get an
2091 	 * immediate (legitimate) data packet.
2092 	 */
2093 	if ((tagp->vio_subtype_env == VIO_RDX) &&
2094 	    (tagp->vio_subtype == VIO_SUBTYPE_ACK)) {
2095 
2096 		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_ACK_RECV))
2097 			return;
2098 
2099 		ldcp->lane_in.lstate |= VSW_RDX_ACK_RECV;
2100 		D2(vswp, "%s (%ld) handling RDX_ACK in place "
2101 		    "(ostate 0x%llx : hphase %d)", __func__,
2102 		    ldcp->ldc_id, ldcp->lane_in.lstate, ldcp->hphase);
2103 		vsw_next_milestone(ldcp);
2104 		return;
2105 	}
2106 
2107 	ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP);
2108 
2109 	if (ctaskp == NULL) {
2110 		DERR(vswp, "%s: unable to alloc space for ctrl msg", __func__);
2111 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2112 		return;
2113 	}
2114 
2115 	ctaskp->ldcp = ldcp;
2116 	bcopy((def_msg_t *)cpkt, &ctaskp->pktp, sizeof (def_msg_t));
2117 	ctaskp->hss_id = ldcp->hss_id;
2118 
2119 	/*
2120 	 * Dispatch task to processing taskq if port is not in
2121 	 * the process of being detached.
2122 	 */
2123 	mutex_enter(&port->state_lock);
2124 	if (port->state == VSW_PORT_INIT) {
2125 		if ((vswp->taskq_p == NULL) ||
2126 		    (ddi_taskq_dispatch(vswp->taskq_p, vsw_process_ctrl_pkt,
2127 		    ctaskp, DDI_NOSLEEP) != DDI_SUCCESS)) {
2128 			DERR(vswp, "%s: unable to dispatch task to taskq",
2129 			    __func__);
2130 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2131 			mutex_exit(&port->state_lock);
2132 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2133 			return;
2134 		}
2135 	} else {
2136 		DWARN(vswp, "%s: port %d detaching, not dispatching "
2137 		    "task", __func__, port->p_instance);
2138 	}
2139 
2140 	mutex_exit(&port->state_lock);
2141 
2142 	D2(vswp, "%s: dispatched task to taskq for chan %d", __func__,
2143 	    ldcp->ldc_id);
2144 	D1(vswp, "%s: exit", __func__);
2145 }
2146 
2147 /*
2148  * Process a VIO ctrl message. Invoked from taskq.
2149  */
2150 static void
2151 vsw_process_ctrl_pkt(void *arg)
2152 {
2153 	vsw_ctrl_task_t	*ctaskp = (vsw_ctrl_task_t *)arg;
2154 	vsw_ldc_t	*ldcp = ctaskp->ldcp;
2155 	vsw_t 		*vswp = ldcp->ldc_vswp;
2156 	vio_msg_tag_t	tag;
2157 	uint16_t	env;
2158 
2159 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2160 
2161 	bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t));
2162 	env = tag.vio_subtype_env;
2163 
2164 	/* stale pkt check */
2165 	if (ctaskp->hss_id < ldcp->hss_id) {
2166 		DWARN(vswp, "%s: discarding stale packet belonging to earlier"
2167 		    " (%ld) handshake session", __func__, ctaskp->hss_id);
2168 		return;
2169 	}
2170 
2171 	/* session id check */
2172 	if (ldcp->session_status & VSW_PEER_SESSION) {
2173 		if (ldcp->peer_session != tag.vio_sid) {
2174 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
2175 			    __func__, ldcp->ldc_id, tag.vio_sid);
2176 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2177 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2178 			return;
2179 		}
2180 	}
2181 
2182 	/*
2183 	 * Switch on vio_subtype envelope, then let lower routines
2184 	 * decide if its an INFO, ACK or NACK packet.
2185 	 */
2186 	switch (env) {
2187 	case VIO_VER_INFO:
2188 		vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp);
2189 		break;
2190 	case VIO_DRING_REG:
2191 		vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp);
2192 		break;
2193 	case VIO_DRING_UNREG:
2194 		vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp);
2195 		break;
2196 	case VIO_ATTR_INFO:
2197 		vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp);
2198 		break;
2199 	case VNET_MCAST_INFO:
2200 		vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp);
2201 		break;
2202 	case VIO_RDX:
2203 		vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp);
2204 		break;
2205 	default:
2206 		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env);
2207 	}
2208 
2209 	kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2210 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
2211 }
2212 
/*
 * Version negotiation. We can end up here either because our peer
 * has responded to a handshake message we have sent it, or our peer
 * has initiated a handshake with us. If it's the former then it can only
 * be an ACK or NACK, if it's the latter it can only be an INFO.
 *
 * If it's an ACK we move to the next stage of the handshake, namely
 * attribute exchange. If it's a NACK we see if we can specify another
 * version, if we can't we stop.
 *
 * If it is an INFO we reset all params associated with communication
 * in that direction over this channel (remember the connection is
 * essentially 2 independent simplex channels).
 *
 * ldcp - channel the message arrived on
 * pkt  - the received message; it is modified in place and reused as
 *        the reply that is sent back to the peer.
 *
 * Note that the paths which return early (a failed vsw_check_flag(), an
 * illegal device class, a NACK with zeroed version fields) skip the
 * exit trace at the bottom of the function.
 */
void
vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt)
{
	vio_ver_msg_t	*ver_pkt;
	vsw_t 		*vswp = ldcp->ldc_vswp;

	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

	/*
	 * We know this is a ctrl/version packet so
	 * cast it into the correct structure.
	 */
	ver_pkt = (vio_ver_msg_t *)pkt;

	switch (ver_pkt->tag.vio_subtype) {
	case VIO_SUBTYPE_INFO:
		D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n");

		/*
		 * Record the session id, which we will use from now
		 * until we see another VER_INFO msg. Even then the
		 * session id in most cases will be unchanged, except
		 * if the channel was reset.
		 */
		if ((ldcp->session_status & VSW_PEER_SESSION) &&
		    (ldcp->peer_session != ver_pkt->tag.vio_sid)) {
			DERR(vswp, "%s: updating session id for chan %lld "
			    "from %llx to %llx", __func__, ldcp->ldc_id,
			    ldcp->peer_session, ver_pkt->tag.vio_sid);
		}

		ldcp->peer_session = ver_pkt->tag.vio_sid;
		ldcp->session_status |= VSW_PEER_SESSION;

		/* Legal message at this time ? */
		if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV))
			return;

		/*
		 * First check the device class. Currently only expect
		 * to be talking to a network device. In the future may
		 * also talk to another switch.
		 */
		if (ver_pkt->dev_class != VDEV_NETWORK) {
			DERR(vswp, "%s: illegal device class %d", __func__,
			    ver_pkt->dev_class);

			/* turn the peer's message into our NACK reply */
			ver_pkt->tag.vio_sid = ldcp->local_session;
			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;

			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);

			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
			    sizeof (vio_ver_msg_t), B_TRUE);

			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
			vsw_next_milestone(ldcp);
			return;
		} else {
			ldcp->dev_class = ver_pkt->dev_class;
		}

		/*
		 * Now check the version.
		 */
		if (vsw_supported_version(ver_pkt) == 0) {
			/*
			 * Support this major version and possibly
			 * adjusted minor version.
			 */

			D2(vswp, "%s: accepted ver %d:%d", __func__,
			    ver_pkt->ver_major, ver_pkt->ver_minor);

			/* Store accepted values */
			ldcp->lane_in.ver_major = ver_pkt->ver_major;
			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;

			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;

			ldcp->lane_in.lstate |= VSW_VER_ACK_SENT;

			if (vsw_obp_ver_proto_workaround == B_TRUE) {
				/*
				 * Send a version info message
				 * using the accepted version that
				 * we are about to ack. Also note that
				 * we send our ver info before we ack.
				 * Otherwise, as soon as receiving the
				 * ack, obp sends attr info msg, which
				 * breaks vsw_check_flag() invoked
				 * from vsw_process_ctrl_attr_pkt();
				 * as we also need VSW_VER_ACK_RECV to
				 * be set in lane_out.lstate, before
				 * we can receive attr info.
				 */
				vsw_send_ver(ldcp);
			}
		} else {
			/*
			 * NACK back with the next lower major/minor
			 * pairing we support (if we don't support any more
			 * versions then they will be set to zero).
			 */

			D2(vswp, "%s: replying with ver %d:%d", __func__,
			    ver_pkt->ver_major, ver_pkt->ver_minor);

			/* Store updated values */
			ldcp->lane_in.ver_major = ver_pkt->ver_major;
			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;

			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;

			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
		}

		/* common tail: stamp our session id and send the ACK/NACK */
		DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
		ver_pkt->tag.vio_sid = ldcp->local_session;
		(void) vsw_send_msg(ldcp, (void *)ver_pkt,
		    sizeof (vio_ver_msg_t), B_TRUE);

		vsw_next_milestone(ldcp);
		break;

	case VIO_SUBTYPE_ACK:
		D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__);

		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV))
			return;

		/* Store updated values */
		ldcp->lane_out.ver_major = ver_pkt->ver_major;
		ldcp->lane_out.ver_minor = ver_pkt->ver_minor;

		ldcp->lane_out.lstate |= VSW_VER_ACK_RECV;
		vsw_next_milestone(ldcp);

		break;

	case VIO_SUBTYPE_NACK:
		D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__);

		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV))
			return;

		/*
		 * If our peer sent us a NACK with the ver fields set to
		 * zero then there is nothing more we can do. Otherwise see
		 * if we support either the version suggested, or a lesser
		 * one.
		 */
		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
			DERR(vswp, "%s: peer unable to negotiate any "
			    "further.", __func__);
			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
			vsw_next_milestone(ldcp);
			return;
		}

		/*
		 * Check to see if we support this major version or
		 * a lower one. If we don't then maj/min will be set
		 * to zero.
		 */
		(void) vsw_supported_version(ver_pkt);
		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
			/* Nothing more we can do */
			DERR(vswp, "%s: version negotiation failed.\n",
			    __func__);
			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
			vsw_next_milestone(ldcp);
		} else {
			/* found a supported major version */
			ldcp->lane_out.ver_major = ver_pkt->ver_major;
			ldcp->lane_out.ver_minor = ver_pkt->ver_minor;

			D2(vswp, "%s: resending with updated values (%x, %x)",
			    __func__, ver_pkt->ver_major, ver_pkt->ver_minor);

			/* reuse the peer's NACK as our new INFO proposal */
			ldcp->lane_out.lstate |= VSW_VER_INFO_SENT;
			ver_pkt->tag.vio_sid = ldcp->local_session;
			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;

			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);

			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
			    sizeof (vio_ver_msg_t), B_TRUE);

			vsw_next_milestone(ldcp);

		}
		break;

	default:
		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
		    ver_pkt->tag.vio_subtype);
	}

	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
}
2428 
2429 /*
2430  * Process an attribute packet. We can end up here either because our peer
2431  * has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or our
2432  * peer has sent us an attribute INFO message
2433  *
2434  * If its an ACK we then move to the next stage of the handshake which
2435  * is to send our descriptor ring info to our peer. If its a NACK then
2436  * there is nothing more we can (currently) do.
2437  *
2438  * If we get a valid/acceptable INFO packet (and we have already negotiated
2439  * a version) we ACK back and set channel state to ATTR_RECV, otherwise we
2440  * NACK back and reset channel state to INACTIV.
2441  *
2442  * FUTURE: in time we will probably negotiate over attributes, but for
2443  * the moment unacceptable attributes are regarded as a fatal error.
2444  *
2445  */
2446 void
2447 vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt)
2448 {
2449 	vnet_attr_msg_t		*attr_pkt;
2450 	vsw_t			*vswp = ldcp->ldc_vswp;
2451 	vsw_port_t		*port = ldcp->ldc_port;
2452 	uint64_t		macaddr = 0;
2453 	int			i;
2454 
2455 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
2456 
2457 	/*
2458 	 * We know this is a ctrl/attr packet so
2459 	 * cast it into the correct structure.
2460 	 */
2461 	attr_pkt = (vnet_attr_msg_t *)pkt;
2462 
2463 	switch (attr_pkt->tag.vio_subtype) {
2464 	case VIO_SUBTYPE_INFO:
2465 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2466 
2467 		if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV))
2468 			return;
2469 
2470 		/*
2471 		 * If the attributes are unacceptable then we NACK back.
2472 		 */
2473 		if (vsw_check_attr(attr_pkt, ldcp)) {
2474 
2475 			DERR(vswp, "%s (chan %d): invalid attributes",
2476 			    __func__, ldcp->ldc_id);
2477 
2478 			vsw_free_lane_resources(ldcp, INBOUND);
2479 
2480 			attr_pkt->tag.vio_sid = ldcp->local_session;
2481 			attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2482 
2483 			DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
2484 			ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT;
2485 			(void) vsw_send_msg(ldcp, (void *)attr_pkt,
2486 			    sizeof (vnet_attr_msg_t), B_TRUE);
2487 
2488 			vsw_next_milestone(ldcp);
2489 			return;
2490 		}
2491 
2492 		/*
2493 		 * Otherwise store attributes for this lane and update
2494 		 * lane state.
2495 		 */
2496 		ldcp->lane_in.mtu = attr_pkt->mtu;
2497 		ldcp->lane_in.addr = attr_pkt->addr;
2498 		ldcp->lane_in.addr_type = attr_pkt->addr_type;
2499 		ldcp->lane_in.xfer_mode = attr_pkt->xfer_mode;
2500 		ldcp->lane_in.ack_freq = attr_pkt->ack_freq;
2501 
2502 		macaddr = ldcp->lane_in.addr;
2503 		for (i = ETHERADDRL - 1; i >= 0; i--) {
2504 			port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF;
2505 			macaddr >>= 8;
2506 		}
2507 
2508 		/* create the fdb entry for this port/mac address */
2509 		vsw_fdbe_add(vswp, port);
2510 
2511 		/* add the port to the specified vlans */
2512 		vsw_vlan_add_ids(port, VSW_VNETPORT);
2513 
2514 		/* setup device specifc xmit routines */
2515 		mutex_enter(&port->tx_lock);
2516 		if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
2517 		    (ldcp->lane_in.xfer_mode & VIO_DRING_MODE_V1_2)) ||
2518 		    (VSW_VER_LT(ldcp, 1, 2) &&
2519 		    (ldcp->lane_in.xfer_mode == VIO_DRING_MODE_V1_0))) {
2520 			D2(vswp, "%s: mode = VIO_DRING_MODE", __func__);
2521 			port->transmit = vsw_dringsend;
2522 		} else if (ldcp->lane_in.xfer_mode == VIO_DESC_MODE) {
2523 			D2(vswp, "%s: mode = VIO_DESC_MODE", __func__);
2524 			vsw_create_privring(ldcp);
2525 			port->transmit = vsw_descrsend;
2526 			ldcp->lane_out.xfer_mode = VIO_DESC_MODE;
2527 		}
2528 		mutex_exit(&port->tx_lock);
2529 
2530 		attr_pkt->tag.vio_sid = ldcp->local_session;
2531 		attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2532 
2533 		DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
2534 
2535 		ldcp->lane_in.lstate |= VSW_ATTR_ACK_SENT;
2536 
2537 		(void) vsw_send_msg(ldcp, (void *)attr_pkt,
2538 		    sizeof (vnet_attr_msg_t), B_TRUE);
2539 
2540 		vsw_next_milestone(ldcp);
2541 		break;
2542 
2543 	case VIO_SUBTYPE_ACK:
2544 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
2545 
2546 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV))
2547 			return;
2548 
2549 		ldcp->lane_out.lstate |= VSW_ATTR_ACK_RECV;
2550 		vsw_next_milestone(ldcp);
2551 		break;
2552 
2553 	case VIO_SUBTYPE_NACK:
2554 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
2555 
2556 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV))
2557 			return;
2558 
2559 		ldcp->lane_out.lstate |= VSW_ATTR_NACK_RECV;
2560 		vsw_next_milestone(ldcp);
2561 		break;
2562 
2563 	default:
2564 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
2565 		    attr_pkt->tag.vio_subtype);
2566 	}
2567 
2568 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
2569 }
2570 
2571 /*
2572  * Process a dring info packet. We can end up here either because our peer
2573  * has ACK/NACK'ed back to an earlier DRING msg we had sent it, or our
2574  * peer has sent us a dring INFO message.
2575  *
2576  * If we get a valid/acceptable INFO packet (and we have already negotiated
2577  * a version) we ACK back and update the lane state, otherwise we NACK back.
2578  *
2579  * FUTURE: nothing to stop client from sending us info on multiple dring's
2580  * but for the moment we will just use the first one we are given.
2581  *
2582  */
2583 void
2584 vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt)
2585 {
2586 	vio_dring_reg_msg_t	*dring_pkt;
2587 	vsw_t			*vswp = ldcp->ldc_vswp;
2588 	ldc_mem_info_t		minfo;
2589 	dring_info_t		*dp, *dbp;
2590 	int			dring_found = 0;
2591 
2592 	/*
2593 	 * We know this is a ctrl/dring packet so
2594 	 * cast it into the correct structure.
2595 	 */
2596 	dring_pkt = (vio_dring_reg_msg_t *)pkt;
2597 
2598 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
2599 
2600 	switch (dring_pkt->tag.vio_subtype) {
2601 	case VIO_SUBTYPE_INFO:
2602 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2603 
2604 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
2605 			return;
2606 
2607 		/*
2608 		 * If the dring params are unacceptable then we NACK back.
2609 		 */
2610 		if (vsw_check_dring_info(dring_pkt)) {
2611 
2612 			DERR(vswp, "%s (%lld): invalid dring info",
2613 			    __func__, ldcp->ldc_id);
2614 
2615 			vsw_free_lane_resources(ldcp, INBOUND);
2616 
2617 			dring_pkt->tag.vio_sid = ldcp->local_session;
2618 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2619 
2620 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
2621 
2622 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
2623 
2624 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
2625 			    sizeof (vio_dring_reg_msg_t), B_TRUE);
2626 
2627 			vsw_next_milestone(ldcp);
2628 			return;
2629 		}
2630 
2631 		/*
2632 		 * Otherwise, attempt to map in the dring using the
2633 		 * cookie. If that succeeds we send back a unique dring
2634 		 * identifier that the sending side will use in future
2635 		 * to refer to this descriptor ring.
2636 		 */
2637 		dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
2638 
2639 		dp->num_descriptors = dring_pkt->num_descriptors;
2640 		dp->descriptor_size = dring_pkt->descriptor_size;
2641 		dp->options = dring_pkt->options;
2642 		dp->ncookies = dring_pkt->ncookies;
2643 
2644 		/*
2645 		 * Note: should only get one cookie. Enforced in
2646 		 * the ldc layer.
2647 		 */
2648 		bcopy(&dring_pkt->cookie[0], &dp->cookie[0],
2649 		    sizeof (ldc_mem_cookie_t));
2650 
2651 		D2(vswp, "%s: num_desc %ld : desc_size %ld", __func__,
2652 		    dp->num_descriptors, dp->descriptor_size);
2653 		D2(vswp, "%s: options 0x%lx: ncookies %ld", __func__,
2654 		    dp->options, dp->ncookies);
2655 
2656 		if ((ldc_mem_dring_map(ldcp->ldc_handle, &dp->cookie[0],
2657 		    dp->ncookies, dp->num_descriptors, dp->descriptor_size,
2658 		    LDC_SHADOW_MAP, &(dp->handle))) != 0) {
2659 
2660 			DERR(vswp, "%s: dring_map failed\n", __func__);
2661 
2662 			kmem_free(dp, sizeof (dring_info_t));
2663 			vsw_free_lane_resources(ldcp, INBOUND);
2664 
2665 			dring_pkt->tag.vio_sid = ldcp->local_session;
2666 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2667 
2668 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
2669 
2670 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
2671 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
2672 			    sizeof (vio_dring_reg_msg_t), B_TRUE);
2673 
2674 			vsw_next_milestone(ldcp);
2675 			return;
2676 		}
2677 
2678 		if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
2679 
2680 			DERR(vswp, "%s: dring_addr failed\n", __func__);
2681 
2682 			kmem_free(dp, sizeof (dring_info_t));
2683 			vsw_free_lane_resources(ldcp, INBOUND);
2684 
2685 			dring_pkt->tag.vio_sid = ldcp->local_session;
2686 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2687 
2688 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
2689 
2690 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
2691 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
2692 			    sizeof (vio_dring_reg_msg_t), B_TRUE);
2693 
2694 			vsw_next_milestone(ldcp);
2695 			return;
2696 		} else {
2697 			/* store the address of the pub part of ring */
2698 			dp->pub_addr = minfo.vaddr;
2699 		}
2700 
2701 		/* no private section as we are importing */
2702 		dp->priv_addr = NULL;
2703 
2704 		/*
2705 		 * Using simple mono increasing int for ident at
2706 		 * the moment.
2707 		 */
2708 		dp->ident = ldcp->next_ident;
2709 		ldcp->next_ident++;
2710 
2711 		dp->end_idx = 0;
2712 		dp->next = NULL;
2713 
2714 		/*
2715 		 * Link it onto the end of the list of drings
2716 		 * for this lane.
2717 		 */
2718 		if (ldcp->lane_in.dringp == NULL) {
2719 			D2(vswp, "%s: adding first INBOUND dring", __func__);
2720 			ldcp->lane_in.dringp = dp;
2721 		} else {
2722 			dbp = ldcp->lane_in.dringp;
2723 
2724 			while (dbp->next != NULL)
2725 				dbp = dbp->next;
2726 
2727 			dbp->next = dp;
2728 		}
2729 
2730 		/* acknowledge it */
2731 		dring_pkt->tag.vio_sid = ldcp->local_session;
2732 		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2733 		dring_pkt->dring_ident = dp->ident;
2734 
2735 		(void) vsw_send_msg(ldcp, (void *)dring_pkt,
2736 		    sizeof (vio_dring_reg_msg_t), B_TRUE);
2737 
2738 		ldcp->lane_in.lstate |= VSW_DRING_ACK_SENT;
2739 		vsw_next_milestone(ldcp);
2740 		break;
2741 
2742 	case VIO_SUBTYPE_ACK:
2743 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
2744 
2745 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV))
2746 			return;
2747 
2748 		/*
2749 		 * Peer is acknowledging our dring info and will have
2750 		 * sent us a dring identifier which we will use to
2751 		 * refer to this ring w.r.t. our peer.
2752 		 */
2753 		dp = ldcp->lane_out.dringp;
2754 		if (dp != NULL) {
2755 			/*
2756 			 * Find the ring this ident should be associated
2757 			 * with.
2758 			 */
2759 			if (vsw_dring_match(dp, dring_pkt)) {
2760 				dring_found = 1;
2761 
2762 			} else while (dp != NULL) {
2763 				if (vsw_dring_match(dp, dring_pkt)) {
2764 					dring_found = 1;
2765 					break;
2766 				}
2767 				dp = dp->next;
2768 			}
2769 
2770 			if (dring_found == 0) {
2771 				DERR(NULL, "%s: unrecognised ring cookie",
2772 				    __func__);
2773 				vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2774 				return;
2775 			}
2776 
2777 		} else {
2778 			DERR(vswp, "%s: DRING ACK received but no drings "
2779 			    "allocated", __func__);
2780 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2781 			return;
2782 		}
2783 
2784 		/* store ident */
2785 		dp->ident = dring_pkt->dring_ident;
2786 		ldcp->lane_out.lstate |= VSW_DRING_ACK_RECV;
2787 		vsw_next_milestone(ldcp);
2788 		break;
2789 
2790 	case VIO_SUBTYPE_NACK:
2791 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
2792 
2793 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV))
2794 			return;
2795 
2796 		ldcp->lane_out.lstate |= VSW_DRING_NACK_RECV;
2797 		vsw_next_milestone(ldcp);
2798 		break;
2799 
2800 	default:
2801 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
2802 		    dring_pkt->tag.vio_subtype);
2803 	}
2804 
2805 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
2806 }
2807 
2808 /*
2809  * Process a request from peer to unregister a dring.
2810  *
2811  * For the moment we just restart the handshake if our
2812  * peer endpoint attempts to unregister a dring.
2813  */
2814 void
2815 vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt)
2816 {
2817 	vsw_t			*vswp = ldcp->ldc_vswp;
2818 	vio_dring_unreg_msg_t	*dring_pkt;
2819 
2820 	/*
2821 	 * We know this is a ctrl/dring packet so
2822 	 * cast it into the correct structure.
2823 	 */
2824 	dring_pkt = (vio_dring_unreg_msg_t *)pkt;
2825 
2826 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2827 
2828 	switch (dring_pkt->tag.vio_subtype) {
2829 	case VIO_SUBTYPE_INFO:
2830 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2831 
2832 		DWARN(vswp, "%s: restarting handshake..", __func__);
2833 		break;
2834 
2835 	case VIO_SUBTYPE_ACK:
2836 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
2837 
2838 		DWARN(vswp, "%s: restarting handshake..", __func__);
2839 		break;
2840 
2841 	case VIO_SUBTYPE_NACK:
2842 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
2843 
2844 		DWARN(vswp, "%s: restarting handshake..", __func__);
2845 		break;
2846 
2847 	default:
2848 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
2849 		    dring_pkt->tag.vio_subtype);
2850 	}
2851 
2852 	vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2853 
2854 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
2855 }
2856 
2857 #define	SND_MCST_NACK(ldcp, pkt) \
2858 	pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \
2859 	pkt->tag.vio_sid = ldcp->local_session; \
2860 	(void) vsw_send_msg(ldcp, (void *)pkt, \
2861 			sizeof (vnet_mcast_msg_t), B_TRUE);
2862 
2863 /*
2864  * Process a multicast request from a vnet.
2865  *
2866  * Vnet's specify a multicast address that they are interested in. This
2867  * address is used as a key into the hash table which forms the multicast
2868  * forwarding database (mFDB).
2869  *
2870  * The table keys are the multicast addresses, while the table entries
2871  * are pointers to lists of ports which wish to receive packets for the
2872  * specified multicast address.
2873  *
2874  * When a multicast packet is being switched we use the address as a key
2875  * into the hash table, and then walk the appropriate port list forwarding
2876  * the pkt to each port in turn.
2877  *
2878  * If a vnet is no longer interested in a particular multicast grouping
2879  * we simply find the correct location in the hash table and then delete
2880  * the relevant port from the port list.
2881  *
2882  * To deal with the case whereby a port is being deleted without first
2883  * removing itself from the lists in the hash table, we maintain a list
2884  * of multicast addresses the port has registered an interest in, within
2885  * the port structure itself. We then simply walk that list of addresses
2886  * using them as keys into the hash table and remove the port from the
2887  * appropriate lists.
2888  */
2889 static void
2890 vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt)
2891 {
2892 	vnet_mcast_msg_t	*mcst_pkt;
2893 	vsw_port_t		*port = ldcp->ldc_port;
2894 	vsw_t			*vswp = ldcp->ldc_vswp;
2895 	int			i;
2896 
2897 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2898 
2899 	/*
2900 	 * We know this is a ctrl/mcast packet so
2901 	 * cast it into the correct structure.
2902 	 */
2903 	mcst_pkt = (vnet_mcast_msg_t *)pkt;
2904 
2905 	switch (mcst_pkt->tag.vio_subtype) {
2906 	case VIO_SUBTYPE_INFO:
2907 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2908 
2909 		/*
2910 		 * Check if in correct state to receive a multicast
2911 		 * message (i.e. handshake complete). If not reset
2912 		 * the handshake.
2913 		 */
2914 		if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV))
2915 			return;
2916 
2917 		/*
2918 		 * Before attempting to add or remove address check
2919 		 * that they are valid multicast addresses.
2920 		 * If not, then NACK back.
2921 		 */
2922 		for (i = 0; i < mcst_pkt->count; i++) {
2923 			if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) {
2924 				DERR(vswp, "%s: invalid multicast address",
2925 				    __func__);
2926 				SND_MCST_NACK(ldcp, mcst_pkt);
2927 				return;
2928 			}
2929 		}
2930 
2931 		/*
2932 		 * Now add/remove the addresses. If this fails we
2933 		 * NACK back.
2934 		 */
2935 		if (vsw_add_rem_mcst(mcst_pkt, port) != 0) {
2936 			SND_MCST_NACK(ldcp, mcst_pkt);
2937 			return;
2938 		}
2939 
2940 		mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2941 		mcst_pkt->tag.vio_sid = ldcp->local_session;
2942 
2943 		DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt);
2944 
2945 		(void) vsw_send_msg(ldcp, (void *)mcst_pkt,
2946 		    sizeof (vnet_mcast_msg_t), B_TRUE);
2947 		break;
2948 
2949 	case VIO_SUBTYPE_ACK:
2950 		DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
2951 
2952 		/*
2953 		 * We shouldn't ever get a multicast ACK message as
2954 		 * at the moment we never request multicast addresses
2955 		 * to be set on some other device. This may change in
2956 		 * the future if we have cascading switches.
2957 		 */
2958 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV))
2959 			return;
2960 
2961 				/* Do nothing */
2962 		break;
2963 
2964 	case VIO_SUBTYPE_NACK:
2965 		DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
2966 
2967 		/*
2968 		 * We shouldn't get a multicast NACK packet for the
2969 		 * same reasons as we shouldn't get a ACK packet.
2970 		 */
2971 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV))
2972 			return;
2973 
2974 				/* Do nothing */
2975 		break;
2976 
2977 	default:
2978 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
2979 		    mcst_pkt->tag.vio_subtype);
2980 	}
2981 
2982 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
2983 }
2984 
2985 static void
2986 vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt)
2987 {
2988 	vio_rdx_msg_t	*rdx_pkt;
2989 	vsw_t		*vswp = ldcp->ldc_vswp;
2990 
2991 	/*
2992 	 * We know this is a ctrl/rdx packet so
2993 	 * cast it into the correct structure.
2994 	 */
2995 	rdx_pkt = (vio_rdx_msg_t *)pkt;
2996 
2997 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
2998 
2999 	switch (rdx_pkt->tag.vio_subtype) {
3000 	case VIO_SUBTYPE_INFO:
3001 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3002 
3003 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_INFO_RECV))
3004 			return;
3005 
3006 		rdx_pkt->tag.vio_sid = ldcp->local_session;
3007 		rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3008 
3009 		DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt);
3010 
3011 		ldcp->lane_out.lstate |= VSW_RDX_ACK_SENT;
3012 
3013 		(void) vsw_send_msg(ldcp, (void *)rdx_pkt,
3014 		    sizeof (vio_rdx_msg_t), B_TRUE);
3015 
3016 		vsw_next_milestone(ldcp);
3017 		break;
3018 
3019 	case VIO_SUBTYPE_ACK:
3020 		/*
3021 		 * Should be handled in-band by callback handler.
3022 		 */
3023 		DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__);
3024 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3025 		break;
3026 
3027 	case VIO_SUBTYPE_NACK:
3028 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3029 
3030 		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_NACK_RECV))
3031 			return;
3032 
3033 		ldcp->lane_in.lstate |= VSW_RDX_NACK_RECV;
3034 		vsw_next_milestone(ldcp);
3035 		break;
3036 
3037 	default:
3038 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3039 		    rdx_pkt->tag.vio_subtype);
3040 	}
3041 
3042 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3043 }
3044 
3045 static void
3046 vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t *tagp,
3047 	uint32_t msglen)
3048 {
3049 	uint16_t	env = tagp->vio_subtype_env;
3050 	vsw_t		*vswp = ldcp->ldc_vswp;
3051 
3052 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3053 
3054 	/* session id check */
3055 	if (ldcp->session_status & VSW_PEER_SESSION) {
3056 		if (ldcp->peer_session != tagp->vio_sid) {
3057 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
3058 			    __func__, ldcp->ldc_id, tagp->vio_sid);
3059 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3060 			return;
3061 		}
3062 	}
3063 
3064 	/*
3065 	 * It is an error for us to be getting data packets
3066 	 * before the handshake has completed.
3067 	 */
3068 	if (ldcp->hphase != VSW_MILESTONE4) {
3069 		DERR(vswp, "%s: got data packet before handshake complete "
3070 		    "hphase %d (%x: %x)", __func__, ldcp->hphase,
3071 		    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
3072 		DUMP_FLAGS(ldcp->lane_in.lstate);
3073 		DUMP_FLAGS(ldcp->lane_out.lstate);
3074 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3075 		return;
3076 	}
3077 
3078 	/*
3079 	 * To reduce the locking contention, release the
3080 	 * ldc_cblock here and re-acquire it once we are done
3081 	 * receiving packets.
3082 	 */
3083 	mutex_exit(&ldcp->ldc_cblock);
3084 	mutex_enter(&ldcp->ldc_rxlock);
3085 
3086 	/*
3087 	 * Switch on vio_subtype envelope, then let lower routines
3088 	 * decide if its an INFO, ACK or NACK packet.
3089 	 */
3090 	if (env == VIO_DRING_DATA) {
3091 		vsw_process_data_dring_pkt(ldcp, dpkt);
3092 	} else if (env == VIO_PKT_DATA) {
3093 		ldcp->rx_pktdata(ldcp, dpkt, msglen);
3094 	} else if (env == VIO_DESC_DATA) {
3095 		vsw_process_data_ibnd_pkt(ldcp, dpkt);
3096 	} else {
3097 		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env);
3098 	}
3099 
3100 	mutex_exit(&ldcp->ldc_rxlock);
3101 	mutex_enter(&ldcp->ldc_cblock);
3102 
3103 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3104 }
3105 
/*
 * Rewrite the given dring message in place as a NACK carrying our
 * local session id and send it back over the channel.
 *
 * The body is wrapped in do { } while (0) and supplies no trailing
 * semicolon, so that "SND_DRING_NACK(ldcp, pkt);" behaves as a single
 * statement (e.g. under an unbraced if/else). Both arguments are
 * evaluated more than once, so they must be free of side effects.
 */
#define	SND_DRING_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		(void) vsw_send_msg((ldcp), (void *)(pkt), \
		    sizeof (vio_dring_msg_t), B_TRUE); \
	_NOTE(CONSTCOND) } while (0)
3111 
3112 static void
3113 vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt)
3114 {
3115 	vio_dring_msg_t		*dring_pkt;
3116 	vnet_public_desc_t	*pub_addr = NULL;
3117 	vsw_private_desc_t	*priv_addr = NULL;
3118 	dring_info_t		*dp = NULL;
3119 	vsw_t			*vswp = ldcp->ldc_vswp;
3120 	mblk_t			*mp = NULL;
3121 	mblk_t			*bp = NULL;
3122 	mblk_t			*bpt = NULL;
3123 	size_t			nbytes = 0;
3124 	uint64_t		ncookies = 0;
3125 	uint64_t		chain = 0;
3126 	uint64_t		len;
3127 	uint32_t		pos, start, datalen;
3128 	uint32_t		range_start, range_end;
3129 	int32_t			end, num, cnt = 0;
3130 	int			i, rv, msg_rv = 0;
3131 	boolean_t		ack_needed = B_FALSE;
3132 	boolean_t		prev_desc_ack = B_FALSE;
3133 	int			read_attempts = 0;
3134 	struct ether_header	*ehp;
3135 
3136 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3137 
3138 	/*
3139 	 * We know this is a data/dring packet so
3140 	 * cast it into the correct structure.
3141 	 */
3142 	dring_pkt = (vio_dring_msg_t *)dpkt;
3143 
3144 	/*
3145 	 * Switch on the vio_subtype. If its INFO then we need to
3146 	 * process the data. If its an ACK we need to make sure
3147 	 * it makes sense (i.e did we send an earlier data/info),
3148 	 * and if its a NACK then we maybe attempt a retry.
3149 	 */
3150 	switch (dring_pkt->tag.vio_subtype) {
3151 	case VIO_SUBTYPE_INFO:
3152 		D2(vswp, "%s(%lld): VIO_SUBTYPE_INFO", __func__, ldcp->ldc_id);
3153 
3154 		READ_ENTER(&ldcp->lane_in.dlistrw);
3155 		if ((dp = vsw_ident2dring(&ldcp->lane_in,
3156 		    dring_pkt->dring_ident)) == NULL) {
3157 			RW_EXIT(&ldcp->lane_in.dlistrw);
3158 
3159 			DERR(vswp, "%s(%lld): unable to find dring from "
3160 			    "ident 0x%llx", __func__, ldcp->ldc_id,
3161 			    dring_pkt->dring_ident);
3162 
3163 			SND_DRING_NACK(ldcp, dring_pkt);
3164 			return;
3165 		}
3166 
3167 		start = pos = dring_pkt->start_idx;
3168 		end = dring_pkt->end_idx;
3169 		len = dp->num_descriptors;
3170 
3171 		range_start = range_end = pos;
3172 
3173 		D2(vswp, "%s(%lld): start index %ld : end %ld\n",
3174 		    __func__, ldcp->ldc_id, start, end);
3175 
3176 		if (end == -1) {
3177 			num = -1;
3178 		} else if (end >= 0) {
3179 			num = end >= pos ? end - pos + 1: (len - pos + 1) + end;
3180 
3181 			/* basic sanity check */
3182 			if (end > len) {
3183 				RW_EXIT(&ldcp->lane_in.dlistrw);
3184 				DERR(vswp, "%s(%lld): endpoint %lld outside "
3185 				    "ring length %lld", __func__,
3186 				    ldcp->ldc_id, end, len);
3187 
3188 				SND_DRING_NACK(ldcp, dring_pkt);
3189 				return;
3190 			}
3191 		} else {
3192 			RW_EXIT(&ldcp->lane_in.dlistrw);
3193 			DERR(vswp, "%s(%lld): invalid endpoint %lld",
3194 			    __func__, ldcp->ldc_id, end);
3195 			SND_DRING_NACK(ldcp, dring_pkt);
3196 			return;
3197 		}
3198 
3199 		while (cnt != num) {
3200 vsw_recheck_desc:
3201 			if ((rv = ldc_mem_dring_acquire(dp->handle,
3202 			    pos, pos)) != 0) {
3203 				RW_EXIT(&ldcp->lane_in.dlistrw);
3204 				DERR(vswp, "%s(%lld): unable to acquire "
3205 				    "descriptor at pos %d: err %d",
3206 				    __func__, pos, ldcp->ldc_id, rv);
3207 				SND_DRING_NACK(ldcp, dring_pkt);
3208 				ldcp->ldc_stats.ierrors++;
3209 				return;
3210 			}
3211 
3212 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + pos;
3213 
3214 			/*
3215 			 * When given a bounded range of descriptors
3216 			 * to process, its an error to hit a descriptor
3217 			 * which is not ready. In the non-bounded case
3218 			 * (end_idx == -1) this simply indicates we have
3219 			 * reached the end of the current active range.
3220 			 */
3221 			if (pub_addr->hdr.dstate != VIO_DESC_READY) {
3222 				/* unbound - no error */
3223 				if (end == -1) {
3224 					if (read_attempts == vsw_read_attempts)
3225 						break;
3226 
3227 					delay(drv_usectohz(vsw_desc_delay));
3228 					read_attempts++;
3229 					goto vsw_recheck_desc;
3230 				}
3231 
3232 				/* bounded - error - so NACK back */
3233 				RW_EXIT(&ldcp->lane_in.dlistrw);
3234 				DERR(vswp, "%s(%lld): descriptor not READY "
3235 				    "(%d)", __func__, ldcp->ldc_id,
3236 				    pub_addr->hdr.dstate);
3237 				SND_DRING_NACK(ldcp, dring_pkt);
3238 				return;
3239 			}
3240 
3241 			DTRACE_PROBE1(read_attempts, int, read_attempts);
3242 
3243 			range_end = pos;
3244 
3245 			/*
3246 			 * If we ACK'd the previous descriptor then now
3247 			 * record the new range start position for later
3248 			 * ACK's.
3249 			 */
3250 			if (prev_desc_ack) {
3251 				range_start = pos;
3252 
3253 				D2(vswp, "%s(%lld): updating range start to be "
3254 				    "%d", __func__, ldcp->ldc_id, range_start);
3255 
3256 				prev_desc_ack = B_FALSE;
3257 			}
3258 
3259 			/*
3260 			 * Data is padded to align on 8 byte boundary,
3261 			 * datalen is actual data length, i.e. minus that
3262 			 * padding.
3263 			 */
3264 			datalen = pub_addr->nbytes;
3265 
3266 			/*
3267 			 * Does peer wish us to ACK when we have finished
3268 			 * with this descriptor ?
3269 			 */
3270 			if (pub_addr->hdr.ack)
3271 				ack_needed = B_TRUE;
3272 
3273 			D2(vswp, "%s(%lld): processing desc %lld at pos"
3274 			    " 0x%llx : dstate 0x%lx : datalen 0x%lx",
3275 			    __func__, ldcp->ldc_id, pos, pub_addr,
3276 			    pub_addr->hdr.dstate, datalen);
3277 
3278 			/*
3279 			 * Mark that we are starting to process descriptor.
3280 			 */
3281 			pub_addr->hdr.dstate = VIO_DESC_ACCEPTED;
3282 
3283 			/*
3284 			 * Ensure that we ask ldc for an aligned
3285 			 * number of bytes.
3286 			 */
3287 			nbytes = (datalen + VNET_IPALIGN + 7) & ~7;
3288 
3289 			mp = vio_multipool_allocb(&ldcp->vmp, nbytes);
3290 			if (mp == NULL) {
3291 				ldcp->ldc_stats.rx_vio_allocb_fail++;
3292 				/*
3293 				 * No free receive buffers available, so
3294 				 * fallback onto allocb(9F). Make sure that
3295 				 * we get a data buffer which is a multiple
3296 				 * of 8 as this is required by ldc_mem_copy.
3297 				 */
3298 				DTRACE_PROBE(allocb);
3299 				if ((mp = allocb(datalen + VNET_IPALIGN + 8,
3300 				    BPRI_MED)) == NULL) {
3301 					DERR(vswp, "%s(%ld): allocb failed",
3302 					    __func__, ldcp->ldc_id);
3303 					pub_addr->hdr.dstate = VIO_DESC_DONE;
3304 					(void) ldc_mem_dring_release(dp->handle,
3305 					    pos, pos);
3306 					ldcp->ldc_stats.ierrors++;
3307 					ldcp->ldc_stats.rx_allocb_fail++;
3308 					break;
3309 				}
3310 			}
3311 
3312 			ncookies = pub_addr->ncookies;
3313 			rv = ldc_mem_copy(ldcp->ldc_handle,
3314 			    (caddr_t)mp->b_rptr, 0, &nbytes,
3315 			    pub_addr->memcookie, ncookies, LDC_COPY_IN);
3316 
3317 			if (rv != 0) {
3318 				DERR(vswp, "%s(%d): unable to copy in data "
3319 				    "from %d cookies in desc %d (rv %d)",
3320 				    __func__, ldcp->ldc_id, ncookies, pos, rv);
3321 				freemsg(mp);
3322 
3323 				pub_addr->hdr.dstate = VIO_DESC_DONE;
3324 				(void) ldc_mem_dring_release(dp->handle,
3325 				    pos, pos);
3326 				ldcp->ldc_stats.ierrors++;
3327 				break;
3328 			} else {
3329 				D2(vswp, "%s(%d): copied in %ld bytes"
3330 				    " using %d cookies", __func__,
3331 				    ldcp->ldc_id, nbytes, ncookies);
3332 			}
3333 
3334 			/* adjust the read pointer to skip over the padding */
3335 			mp->b_rptr += VNET_IPALIGN;
3336 
3337 			/* point to the actual end of data */
3338 			mp->b_wptr = mp->b_rptr + datalen;
3339 
3340 			/* update statistics */
3341 			ehp = (struct ether_header *)mp->b_rptr;
3342 			if (IS_BROADCAST(ehp))
3343 				ldcp->ldc_stats.brdcstrcv++;
3344 			else if (IS_MULTICAST(ehp))
3345 				ldcp->ldc_stats.multircv++;
3346 
3347 			ldcp->ldc_stats.ipackets++;
3348 			ldcp->ldc_stats.rbytes += datalen;
3349 
3350 			/*
3351 			 * IPALIGN space can be used for VLAN_TAG
3352 			 */
3353 			(void) vsw_vlan_frame_pretag(ldcp->ldc_port,
3354 			    VSW_VNETPORT, mp);
3355 
3356 			/* build a chain of received packets */
3357 			if (bp == NULL) {
3358 				/* first pkt */
3359 				bp = mp;
3360 				bp->b_next = bp->b_prev = NULL;
3361 				bpt = bp;
3362 				chain = 1;
3363 			} else {
3364 				mp->b_next = mp->b_prev = NULL;
3365 				bpt->b_next = mp;
3366 				bpt = mp;
3367 				chain++;
3368 			}
3369 
3370 			/* mark we are finished with this descriptor */
3371 			pub_addr->hdr.dstate = VIO_DESC_DONE;
3372 
3373 			(void) ldc_mem_dring_release(dp->handle, pos, pos);
3374 
3375 			/*
3376 			 * Send an ACK back to peer if requested.
3377 			 */
3378 			if (ack_needed) {
3379 				ack_needed = B_FALSE;
3380 
3381 				dring_pkt->start_idx = range_start;
3382 				dring_pkt->end_idx = range_end;
3383 
3384 				DERR(vswp, "%s(%lld): processed %d %d, ACK"
3385 				    " requested", __func__, ldcp->ldc_id,
3386 				    dring_pkt->start_idx, dring_pkt->end_idx);
3387 
3388 				dring_pkt->dring_process_state = VIO_DP_ACTIVE;
3389 				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3390 				dring_pkt->tag.vio_sid = ldcp->local_session;
3391 
3392 				msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
3393 				    sizeof (vio_dring_msg_t), B_FALSE);
3394 
3395 				/*
3396 				 * Check if ACK was successfully sent. If not
3397 				 * we break and deal with that below.
3398 				 */
3399 				if (msg_rv != 0)
3400 					break;
3401 
3402 				prev_desc_ack = B_TRUE;
3403 				range_start = pos;
3404 			}
3405 
3406 			/* next descriptor */
3407 			pos = (pos + 1) % len;
3408 			cnt++;
3409 
3410 			/*
3411 			 * Break out of loop here and stop processing to
3412 			 * allow some other network device (or disk) to
3413 			 * get access to the cpu.
3414 			 */
3415 			if (chain > vsw_chain_len) {
3416 				D3(vswp, "%s(%lld): switching chain of %d "
3417 				    "msgs", __func__, ldcp->ldc_id, chain);
3418 				break;
3419 			}
3420 		}
3421 		RW_EXIT(&ldcp->lane_in.dlistrw);
3422 
3423 		/*
3424 		 * If when we attempted to send the ACK we found that the
3425 		 * channel had been reset then now handle this. We deal with
3426 		 * it here as we cannot reset the channel while holding the
3427 		 * dlistrw lock, and we don't want to acquire/release it
3428 		 * continuously in the above loop, as a channel reset should
3429 		 * be a rare event.
3430 		 */
3431 		if (msg_rv == ECONNRESET) {
3432 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
3433 			break;
3434 		}
3435 
3436 		/* send the chain of packets to be switched */
3437 		if (bp != NULL) {
3438 			DTRACE_PROBE1(vsw_rcv_msgs, int, chain);
3439 			D3(vswp, "%s(%lld): switching chain of %d msgs",
3440 			    __func__, ldcp->ldc_id, chain);
3441 			vswp->vsw_switch_frame(vswp, bp, VSW_VNETPORT,
3442 			    ldcp->ldc_port, NULL);
3443 		}
3444 
3445 		DTRACE_PROBE1(msg_cnt, int, cnt);
3446 
3447 		/*
3448 		 * We are now finished so ACK back with the state
3449 		 * set to STOPPING so our peer knows we are finished
3450 		 */
3451 		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3452 		dring_pkt->tag.vio_sid = ldcp->local_session;
3453 
3454 		dring_pkt->dring_process_state = VIO_DP_STOPPED;
3455 
3456 		DTRACE_PROBE(stop_process_sent);
3457 
3458 		/*
3459 		 * We have not processed any more descriptors beyond
3460 		 * the last one we ACK'd.
3461 		 */
3462 		if (prev_desc_ack)
3463 			range_start = range_end;
3464 
3465 		dring_pkt->start_idx = range_start;
3466 		dring_pkt->end_idx = range_end;
3467 
3468 		D2(vswp, "%s(%lld) processed : %d : %d, now stopping",
3469 		    __func__, ldcp->ldc_id, dring_pkt->start_idx,
3470 		    dring_pkt->end_idx);
3471 
3472 		(void) vsw_send_msg(ldcp, (void *)dring_pkt,
3473 		    sizeof (vio_dring_msg_t), B_TRUE);
3474 		break;
3475 
3476 	case VIO_SUBTYPE_ACK:
3477 		D2(vswp, "%s(%lld): VIO_SUBTYPE_ACK", __func__, ldcp->ldc_id);
3478 		/*
3479 		 * Verify that the relevant descriptors are all
3480 		 * marked as DONE
3481 		 */
3482 		READ_ENTER(&ldcp->lane_out.dlistrw);
3483 		if ((dp = vsw_ident2dring(&ldcp->lane_out,
3484 		    dring_pkt->dring_ident)) == NULL) {
3485 			RW_EXIT(&ldcp->lane_out.dlistrw);
3486 			DERR(vswp, "%s: unknown ident in ACK", __func__);
3487 			return;
3488 		}
3489 
3490 		start = end = 0;
3491 		start = dring_pkt->start_idx;
3492 		end = dring_pkt->end_idx;
3493 		len = dp->num_descriptors;
3494 
3495 
3496 		mutex_enter(&dp->dlock);
3497 		dp->last_ack_recv = end;
3498 		ldcp->ldc_stats.dring_data_acks++;
3499 		mutex_exit(&dp->dlock);
3500 
3501 		(void) vsw_reclaim_dring(dp, start);
3502 
3503 		/*
3504 		 * If our peer is stopping processing descriptors then
3505 		 * we check to make sure it has processed all the descriptors
3506 		 * we have updated. If not then we send it a new message
3507 		 * to prompt it to restart.
3508 		 */
3509 		if (dring_pkt->dring_process_state == VIO_DP_STOPPED) {
3510 			DTRACE_PROBE(stop_process_recv);
3511 			D2(vswp, "%s(%lld): got stopping msg : %d : %d",
3512 			    __func__, ldcp->ldc_id, dring_pkt->start_idx,
3513 			    dring_pkt->end_idx);
3514 
3515 			/*
3516 			 * Check next descriptor in public section of ring.
3517 			 * If its marked as READY then we need to prompt our
3518 			 * peer to start processing the ring again.
3519 			 */
3520 			i = (end + 1) % len;
3521 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
3522 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
3523 
3524 			/*
3525 			 * Hold the restart lock across all of this to
3526 			 * make sure that its not possible for us to
3527 			 * decide that a msg needs to be sent in the future
3528 			 * but the sending code having already checked is
3529 			 * about to exit.
3530 			 */
3531 			mutex_enter(&dp->restart_lock);
3532 			ldcp->ldc_stats.dring_stopped_acks++;
3533 			mutex_enter(&priv_addr->dstate_lock);
3534 			if (pub_addr->hdr.dstate == VIO_DESC_READY) {
3535 
3536 				mutex_exit(&priv_addr->dstate_lock);
3537 
3538 				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
3539 				dring_pkt->tag.vio_sid = ldcp->local_session;
3540 
3541 				dring_pkt->start_idx = (end + 1) % len;
3542 				dring_pkt->end_idx = -1;
3543 
3544 				D2(vswp, "%s(%lld) : sending restart msg:"
3545 				    " %d : %d", __func__, ldcp->ldc_id,
3546 				    dring_pkt->start_idx, dring_pkt->end_idx);
3547 
3548 				msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
3549 				    sizeof (vio_dring_msg_t), B_FALSE);
3550 				ldcp->ldc_stats.dring_data_msgs++;
3551 
3552 			} else {
3553 				mutex_exit(&priv_addr->dstate_lock);
3554 				dp->restart_reqd = B_TRUE;
3555 			}
3556 			mutex_exit(&dp->restart_lock);
3557 		}
3558 		RW_EXIT(&ldcp->lane_out.dlistrw);
3559 
3560 		/* only do channel reset after dropping dlistrw lock */
3561 		if (msg_rv == ECONNRESET)
3562 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
3563 
3564 		break;
3565 
3566 	case VIO_SUBTYPE_NACK:
3567 		DWARN(vswp, "%s(%lld): VIO_SUBTYPE_NACK",
3568 		    __func__, ldcp->ldc_id);
3569 		/*
3570 		 * Something is badly wrong if we are getting NACK's
3571 		 * for our data pkts. So reset the channel.
3572 		 */
3573 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3574 
3575 		break;
3576 
3577 	default:
3578 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
3579 		    ldcp->ldc_id, dring_pkt->tag.vio_subtype);
3580 	}
3581 
3582 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
3583 }
3584 
3585 /*
3586  * dummy pkt data handler function for vnet protocol version 1.0
3587  */
3588 static void
3589 vsw_process_pkt_data_nop(void *arg1, void *arg2, uint32_t msglen)
3590 {
3591 	_NOTE(ARGUNUSED(arg1, arg2, msglen))
3592 }
3593 
3594 /*
3595  * This function handles raw pkt data messages received over the channel.
3596  * Currently, only priority-eth-type frames are received through this mechanism.
3597  * In this case, the frame(data) is present within the message itself which
3598  * is copied into an mblk before switching it.
3599  */
3600 static void
3601 vsw_process_pkt_data(void *arg1, void *arg2, uint32_t msglen)
3602 {
3603 	vsw_ldc_t		*ldcp = (vsw_ldc_t *)arg1;
3604 	vio_raw_data_msg_t	*dpkt = (vio_raw_data_msg_t *)arg2;
3605 	uint32_t		size;
3606 	mblk_t			*mp;
3607 	vsw_t			*vswp = ldcp->ldc_vswp;
3608 	vgen_stats_t		*statsp = &ldcp->ldc_stats;
3609 	lane_t			*lp = &ldcp->lane_out;
3610 
3611 	size = msglen - VIO_PKT_DATA_HDRSIZE;
3612 	if (size < ETHERMIN || size > lp->mtu) {
3613 		(void) atomic_inc_32(&statsp->rx_pri_fail);
3614 		DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__,
3615 		    ldcp->ldc_id, size);
3616 		return;
3617 	}
3618 
3619 	mp = vio_multipool_allocb(&ldcp->vmp, size + VLAN_TAGSZ);
3620 	if (mp == NULL) {
3621 		mp = allocb(size + VLAN_TAGSZ, BPRI_MED);
3622 		if (mp == NULL) {
3623 			(void) atomic_inc_32(&statsp->rx_pri_fail);
3624 			DWARN(vswp, "%s(%lld) allocb failure, "
3625 			    "unable to process priority frame\n", __func__,
3626 			    ldcp->ldc_id);
3627 			return;
3628 		}
3629 	}
3630 
3631 	/* skip over the extra space for vlan tag */
3632 	mp->b_rptr += VLAN_TAGSZ;
3633 
3634 	/* copy the frame from the payload of raw data msg into the mblk */
3635 	bcopy(dpkt->data, mp->b_rptr, size);
3636 	mp->b_wptr = mp->b_rptr + size;
3637 
3638 	/* update stats */
3639 	(void) atomic_inc_64(&statsp->rx_pri_packets);
3640 	(void) atomic_add_64(&statsp->rx_pri_bytes, size);
3641 
3642 	/*
3643 	 * VLAN_TAGSZ of extra space has been pre-alloc'd if tag is needed.
3644 	 */
3645 	(void) vsw_vlan_frame_pretag(ldcp->ldc_port, VSW_VNETPORT, mp);
3646 
3647 	/* switch the frame to destination */
3648 	vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT, ldcp->ldc_port, NULL);
3649 }
3650 
3651 /*
3652  * Process an in-band descriptor message (most likely from
3653  * OBP).
3654  */
3655 static void
3656 vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt)
3657 {
3658 	vnet_ibnd_desc_t	*ibnd_desc;
3659 	dring_info_t		*dp = NULL;
3660 	vsw_private_desc_t	*priv_addr = NULL;
3661 	vsw_t			*vswp = ldcp->ldc_vswp;
3662 	mblk_t			*mp = NULL;
3663 	size_t			nbytes = 0;
3664 	size_t			off = 0;
3665 	uint64_t		idx = 0;
3666 	uint32_t		num = 1, len, datalen = 0;
3667 	uint64_t		ncookies = 0;
3668 	int			i, rv;
3669 	int			j = 0;
3670 
3671 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3672 
3673 	ibnd_desc = (vnet_ibnd_desc_t *)pkt;
3674 
3675 	switch (ibnd_desc->hdr.tag.vio_subtype) {
3676 	case VIO_SUBTYPE_INFO:
3677 		D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3678 
3679 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
3680 			return;
3681 
3682 		/*
3683 		 * Data is padded to align on a 8 byte boundary,
3684 		 * nbytes is actual data length, i.e. minus that
3685 		 * padding.
3686 		 */
3687 		datalen = ibnd_desc->nbytes;
3688 
3689 		D2(vswp, "%s(%lld): processing inband desc : "
3690 		    ": datalen 0x%lx", __func__, ldcp->ldc_id, datalen);
3691 
3692 		ncookies = ibnd_desc->ncookies;
3693 
3694 		/*
3695 		 * allocb(9F) returns an aligned data block. We
3696 		 * need to ensure that we ask ldc for an aligned
3697 		 * number of bytes also.
3698 		 */
3699 		nbytes = datalen;
3700 		if (nbytes & 0x7) {
3701 			off = 8 - (nbytes & 0x7);
3702 			nbytes += off;
3703 		}
3704 
3705 		/* alloc extra space for VLAN_TAG */
3706 		mp = allocb(datalen + 8, BPRI_MED);
3707 		if (mp == NULL) {
3708 			DERR(vswp, "%s(%lld): allocb failed",
3709 			    __func__, ldcp->ldc_id);
3710 			ldcp->ldc_stats.rx_allocb_fail++;
3711 			return;
3712 		}
3713 
3714 		/* skip over the extra space for VLAN_TAG */
3715 		mp->b_rptr += 8;
3716 
3717 		rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr,
3718 		    0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies,
3719 		    LDC_COPY_IN);
3720 
3721 		if (rv != 0) {
3722 			DERR(vswp, "%s(%d): unable to copy in data from "
3723 			    "%d cookie(s)", __func__, ldcp->ldc_id, ncookies);
3724 			freemsg(mp);
3725 			ldcp->ldc_stats.ierrors++;
3726 			return;
3727 		}
3728 
3729 		D2(vswp, "%s(%d): copied in %ld bytes using %d cookies",
3730 		    __func__, ldcp->ldc_id, nbytes, ncookies);
3731 
3732 		/* point to the actual end of data */
3733 		mp->b_wptr = mp->b_rptr + datalen;
3734 		ldcp->ldc_stats.ipackets++;
3735 		ldcp->ldc_stats.rbytes += datalen;
3736 
3737 		/*
3738 		 * We ACK back every in-band descriptor message we process
3739 		 */
3740 		ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK;
3741 		ibnd_desc->hdr.tag.vio_sid = ldcp->local_session;
3742 		(void) vsw_send_msg(ldcp, (void *)ibnd_desc,
3743 		    sizeof (vnet_ibnd_desc_t), B_TRUE);
3744 
3745 		/*
3746 		 * there is extra space alloc'd for VLAN_TAG
3747 		 */
3748 		(void) vsw_vlan_frame_pretag(ldcp->ldc_port, VSW_VNETPORT, mp);
3749 
3750 		/* send the packet to be switched */
3751 		vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT,
3752 		    ldcp->ldc_port, NULL);
3753 
3754 		break;
3755 
3756 	case VIO_SUBTYPE_ACK:
3757 		D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3758 
3759 		/* Verify the ACK is valid */
3760 		idx = ibnd_desc->hdr.desc_handle;
3761 
3762 		if (idx >= vsw_ntxds) {
3763 			cmn_err(CE_WARN, "!vsw%d: corrupted ACK received "
3764 			    "(idx %ld)", vswp->instance, idx);
3765 			return;
3766 		}
3767 
3768 		if ((dp = ldcp->lane_out.dringp) == NULL) {
3769 			DERR(vswp, "%s: no dring found", __func__);
3770 			return;
3771 		}
3772 
3773 		len = dp->num_descriptors;
3774 		/*
3775 		 * If the descriptor we are being ACK'ed for is not the
3776 		 * one we expected, then pkts were lost somwhere, either
3777 		 * when we tried to send a msg, or a previous ACK msg from
3778 		 * our peer. In either case we now reclaim the descriptors
3779 		 * in the range from the last ACK we received up to the
3780 		 * current ACK.
3781 		 */
3782 		if (idx != dp->last_ack_recv) {
3783 			DWARN(vswp, "%s: dropped pkts detected, (%ld, %ld)",
3784 			    __func__, dp->last_ack_recv, idx);
3785 			num = idx >= dp->last_ack_recv ?
3786 			    idx - dp->last_ack_recv + 1:
3787 			    (len - dp->last_ack_recv + 1) + idx;
3788 		}
3789 
3790 		/*
3791 		 * When we sent the in-band message to our peer we
3792 		 * marked the copy in our private ring as READY. We now
3793 		 * check that the descriptor we are being ACK'ed for is in
3794 		 * fact READY, i.e. it is one we have shared with our peer.
3795 		 *
3796 		 * If its not we flag an error, but still reset the descr
3797 		 * back to FREE.
3798 		 */
3799 		for (i = dp->last_ack_recv; j < num; i = (i + 1) % len, j++) {
3800 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
3801 			mutex_enter(&priv_addr->dstate_lock);
3802 			if (priv_addr->dstate != VIO_DESC_READY) {
3803 				DERR(vswp, "%s: (%ld) desc at index %ld not "
3804 				    "READY (0x%lx)", __func__,
3805 				    ldcp->ldc_id, idx, priv_addr->dstate);
3806 				DERR(vswp, "%s: bound %d: ncookies %ld : "
3807 				    "datalen %ld", __func__,
3808 				    priv_addr->bound, priv_addr->ncookies,
3809 				    priv_addr->datalen);
3810 			}
3811 			D2(vswp, "%s: (%lld) freeing descp at %lld", __func__,
3812 			    ldcp->ldc_id, idx);
3813 			/* release resources associated with sent msg */
3814 			priv_addr->datalen = 0;
3815 			priv_addr->dstate = VIO_DESC_FREE;
3816 			mutex_exit(&priv_addr->dstate_lock);
3817 		}
3818 		/* update to next expected value */
3819 		dp->last_ack_recv = (idx + 1) % dp->num_descriptors;
3820 
3821 		break;
3822 
3823 	case VIO_SUBTYPE_NACK:
3824 		DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3825 
3826 		/*
3827 		 * We should only get a NACK if our peer doesn't like
3828 		 * something about a message we have sent it. If this
3829 		 * happens we just release the resources associated with
3830 		 * the message. (We are relying on higher layers to decide
3831 		 * whether or not to resend.
3832 		 */
3833 
3834 		/* limit check */
3835 		idx = ibnd_desc->hdr.desc_handle;
3836 
3837 		if (idx >= vsw_ntxds) {
3838 			DERR(vswp, "%s: corrupted NACK received (idx %lld)",
3839 			    __func__, idx);
3840 			return;
3841 		}
3842 
3843 		if ((dp = ldcp->lane_out.dringp) == NULL) {
3844 			DERR(vswp, "%s: no dring found", __func__);
3845 			return;
3846 		}
3847 
3848 		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
3849 
3850 		/* move to correct location in ring */
3851 		priv_addr += idx;
3852 
3853 		/* release resources associated with sent msg */
3854 		mutex_enter(&priv_addr->dstate_lock);
3855 		priv_addr->datalen = 0;
3856 		priv_addr->dstate = VIO_DESC_FREE;
3857 		mutex_exit(&priv_addr->dstate_lock);
3858 
3859 		break;
3860 
3861 	default:
3862 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
3863 		    ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype);
3864 	}
3865 
3866 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
3867 }
3868 
3869 static void
3870 vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t *tagp)
3871 {
3872 	_NOTE(ARGUNUSED(epkt))
3873 
3874 	vsw_t		*vswp = ldcp->ldc_vswp;
3875 	uint16_t	env = tagp->vio_subtype_env;
3876 
3877 	D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
3878 
3879 	/*
3880 	 * Error vio_subtypes have yet to be defined. So for
3881 	 * the moment we can't do anything.
3882 	 */
3883 	D2(vswp, "%s: (%x) vio_subtype env", __func__, env);
3884 
3885 	D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
3886 }
3887 
3888 /* transmit the packet over the given port */
3889 int
3890 vsw_portsend(vsw_port_t *port, mblk_t *mp, mblk_t *mpt, uint32_t count)
3891 {
3892 	vsw_ldc_list_t 	*ldcl = &port->p_ldclist;
3893 	vsw_ldc_t 	*ldcp;
3894 	int		status = 0;
3895 	uint32_t	n;
3896 
3897 	READ_ENTER(&ldcl->lockrw);
3898 	/*
3899 	 * Note for now, we have a single channel.
3900 	 */
3901 	ldcp = ldcl->head;
3902 	if (ldcp == NULL) {
3903 		DERR(port->p_vswp, "vsw_portsend: no ldc: dropping packet\n");
3904 		freemsgchain(mp);
3905 		RW_EXIT(&ldcl->lockrw);
3906 		return (1);
3907 	}
3908 
3909 	n = vsw_vlan_frame_untag(port, VSW_VNETPORT, &mp, &mpt);
3910 
3911 	count -= n;
3912 	if (count == 0) {
3913 		goto vsw_portsend_exit;
3914 	}
3915 
3916 	status = ldcp->tx(ldcp, mp, mpt, count);
3917 
3918 vsw_portsend_exit:
3919 	RW_EXIT(&ldcl->lockrw);
3920 
3921 	return (status);
3922 }
3923 
3924 /*
3925  * Break up frames into 2 seperate chains: normal and
3926  * priority, based on the frame type. The number of
3927  * priority frames is also counted and returned.
3928  *
3929  * Params:
3930  * 	vswp:	pointer to the instance of vsw
3931  *	np:	head of packet chain to be broken
3932  *	npt:	tail of packet chain to be broken
3933  *
3934  * Returns:
3935  *	np:	head of normal data packets
3936  *	npt:	tail of normal data packets
3937  *	hp:	head of high priority packets
3938  *	hpt:	tail of high priority packets
3939  */
3940 static uint32_t
3941 vsw_get_pri_packets(vsw_t *vswp, mblk_t **np, mblk_t **npt,
3942 	mblk_t **hp, mblk_t **hpt)
3943 {
3944 	mblk_t			*tmp = NULL;
3945 	mblk_t			*smp = NULL;
3946 	mblk_t			*hmp = NULL;	/* high prio pkts head */
3947 	mblk_t			*hmpt = NULL;	/* high prio pkts tail */
3948 	mblk_t			*nmp = NULL;	/* normal pkts head */
3949 	mblk_t			*nmpt = NULL;	/* normal pkts tail */
3950 	uint32_t		count = 0;
3951 	int			i;
3952 	struct ether_header	*ehp;
3953 	uint32_t		num_types;
3954 	uint16_t		*types;
3955 
3956 	tmp = *np;
3957 	while (tmp != NULL) {
3958 
3959 		smp = tmp;
3960 		tmp = tmp->b_next;
3961 		smp->b_next = NULL;
3962 		smp->b_prev = NULL;
3963 
3964 		ehp = (struct ether_header *)smp->b_rptr;
3965 		num_types = vswp->pri_num_types;
3966 		types = vswp->pri_types;
3967 		for (i = 0; i < num_types; i++) {
3968 			if (ehp->ether_type == types[i]) {
3969 				/* high priority frame */
3970 
3971 				if (hmp != NULL) {
3972 					hmpt->b_next = smp;
3973 					hmpt = smp;
3974 				} else {
3975 					hmp = hmpt = smp;
3976 				}
3977 				count++;
3978 				break;
3979 			}
3980 		}
3981 		if (i == num_types) {
3982 			/* normal data frame */
3983 
3984 			if (nmp != NULL) {
3985 				nmpt->b_next = smp;
3986 				nmpt = smp;
3987 			} else {
3988 				nmp = nmpt = smp;
3989 			}
3990 		}
3991 	}
3992 
3993 	*hp = hmp;
3994 	*hpt = hmpt;
3995 	*np = nmp;
3996 	*npt = nmpt;
3997 
3998 	return (count);
3999 }
4000 
4001 /*
4002  * Wrapper function to transmit normal and/or priority frames over the channel.
4003  */
4004 static int
4005 vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count)
4006 {
4007 	vsw_ldc_t 		*ldcp = (vsw_ldc_t *)arg;
4008 	mblk_t			*tmp;
4009 	mblk_t			*smp;
4010 	mblk_t			*hmp;	/* high prio pkts head */
4011 	mblk_t			*hmpt;	/* high prio pkts tail */
4012 	mblk_t			*nmp;	/* normal pkts head */
4013 	mblk_t			*nmpt;	/* normal pkts tail */
4014 	uint32_t		n = 0;
4015 	vsw_t			*vswp = ldcp->ldc_vswp;
4016 
4017 	ASSERT(VSW_PRI_ETH_DEFINED(vswp));
4018 	ASSERT(count != 0);
4019 
4020 	nmp = mp;
4021 	nmpt = mpt;
4022 
4023 	/* gather any priority frames from the chain of packets */
4024 	n = vsw_get_pri_packets(vswp, &nmp, &nmpt, &hmp, &hmpt);
4025 
4026 	/* transmit priority frames */
4027 	tmp = hmp;
4028 	while (tmp != NULL) {
4029 		smp = tmp;
4030 		tmp = tmp->b_next;
4031 		smp->b_next = NULL;
4032 		vsw_ldcsend_pkt(ldcp, smp);
4033 	}
4034 
4035 	count -= n;
4036 
4037 	if (count == 0) {
4038 		/* no normal data frames to process */
4039 		return (0);
4040 	}
4041 
4042 	return (vsw_ldctx(ldcp, nmp, nmpt, count));
4043 }
4044 
4045 /*
4046  * Wrapper function to transmit normal frames over the channel.
4047  */
4048 static int
4049 vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count)
4050 {
4051 	vsw_ldc_t 	*ldcp = (vsw_ldc_t *)arg;
4052 	mblk_t		*tmp = NULL;
4053 
4054 	ASSERT(count != 0);
4055 	/*
4056 	 * If the TX thread is enabled, then queue the
4057 	 * ordinary frames and signal the tx thread.
4058 	 */
4059 	if (ldcp->tx_thread != NULL) {
4060 
4061 		mutex_enter(&ldcp->tx_thr_lock);
4062 
4063 		if ((ldcp->tx_cnt + count) >= vsw_max_tx_qcount) {
4064 			/*
4065 			 * If we reached queue limit,
4066 			 * do not queue new packets,
4067 			 * drop them.
4068 			 */
4069 			ldcp->ldc_stats.tx_qfull += count;
4070 			mutex_exit(&ldcp->tx_thr_lock);
4071 			freemsgchain(mp);
4072 			goto exit;
4073 		}
4074 		if (ldcp->tx_mhead == NULL) {
4075 			ldcp->tx_mhead = mp;
4076 			ldcp->tx_mtail = mpt;
4077 			cv_signal(&ldcp->tx_thr_cv);
4078 		} else {
4079 			ldcp->tx_mtail->b_next = mp;
4080 			ldcp->tx_mtail = mpt;
4081 		}
4082 		ldcp->tx_cnt += count;
4083 		mutex_exit(&ldcp->tx_thr_lock);
4084 	} else {
4085 		while (mp != NULL) {
4086 			tmp = mp->b_next;
4087 			mp->b_next = mp->b_prev = NULL;
4088 			(void) vsw_ldcsend(ldcp, mp, 1);
4089 			mp = tmp;
4090 		}
4091 	}
4092 
4093 exit:
4094 	return (0);
4095 }
4096 
4097 /*
4098  * This function transmits the frame in the payload of a raw data
4099  * (VIO_PKT_DATA) message. Thus, it provides an Out-Of-Band path to
4100  * send special frames with high priorities, without going through
4101  * the normal data path which uses descriptor ring mechanism.
4102  */
4103 static void
4104 vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp)
4105 {
4106 	vio_raw_data_msg_t	*pkt;
4107 	mblk_t			*bp;
4108 	mblk_t			*nmp = NULL;
4109 	caddr_t			dst;
4110 	uint32_t		mblksz;
4111 	uint32_t		size;
4112 	uint32_t		nbytes;
4113 	int			rv;
4114 	vsw_t			*vswp = ldcp->ldc_vswp;
4115 	vgen_stats_t		*statsp = &ldcp->ldc_stats;
4116 
4117 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
4118 	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
4119 		(void) atomic_inc_32(&statsp->tx_pri_fail);
4120 		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
4121 		    "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
4122 		    ldcp->lane_out.lstate);
4123 		goto send_pkt_exit;
4124 	}
4125 
4126 	size = msgsize(mp);
4127 
4128 	/* frame size bigger than available payload len of raw data msg ? */
4129 	if (size > (size_t)(ldcp->msglen - VIO_PKT_DATA_HDRSIZE)) {
4130 		(void) atomic_inc_32(&statsp->tx_pri_fail);
4131 		DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__,
4132 		    ldcp->ldc_id, size);
4133 		goto send_pkt_exit;
4134 	}
4135 
4136 	if (size < ETHERMIN)
4137 		size = ETHERMIN;
4138 
4139 	/* alloc space for a raw data message */
4140 	nmp = vio_allocb(vswp->pri_tx_vmp);
4141 	if (nmp == NULL) {
4142 		(void) atomic_inc_32(&statsp->tx_pri_fail);
4143 		DWARN(vswp, "vio_allocb failed\n");
4144 		goto send_pkt_exit;
4145 	}
4146 	pkt = (vio_raw_data_msg_t *)nmp->b_rptr;
4147 
4148 	/* copy frame into the payload of raw data message */
4149 	dst = (caddr_t)pkt->data;
4150 	for (bp = mp; bp != NULL; bp = bp->b_cont) {
4151 		mblksz = MBLKL(bp);
4152 		bcopy(bp->b_rptr, dst, mblksz);
4153 		dst += mblksz;
4154 	}
4155 
4156 	/* setup the raw data msg */
4157 	pkt->tag.vio_msgtype = VIO_TYPE_DATA;
4158 	pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
4159 	pkt->tag.vio_subtype_env = VIO_PKT_DATA;
4160 	pkt->tag.vio_sid = ldcp->local_session;
4161 	nbytes = VIO_PKT_DATA_HDRSIZE + size;
4162 
4163 	/* send the msg over ldc */
4164 	rv = vsw_send_msg(ldcp, (void *)pkt, nbytes, B_TRUE);
4165 	if (rv != 0) {
4166 		(void) atomic_inc_32(&statsp->tx_pri_fail);
4167 		DWARN(vswp, "%s(%lld) Error sending priority frame\n", __func__,
4168 		    ldcp->ldc_id);
4169 		goto send_pkt_exit;
4170 	}
4171 
4172 	/* update stats */
4173 	(void) atomic_inc_64(&statsp->tx_pri_packets);
4174 	(void) atomic_add_64(&statsp->tx_pri_packets, size);
4175 
4176 send_pkt_exit:
4177 	if (nmp != NULL)
4178 		freemsg(nmp);
4179 	freemsg(mp);
4180 }
4181 
4182 /*
4183  * Transmit the packet over the given LDC channel.
4184  *
4185  * The 'retries' argument indicates how many times a packet
4186  * is retried before it is dropped. Note, the retry is done
4187  * only for a resource related failure, for all other failures
4188  * the packet is dropped immediately.
4189  */
4190 static int
4191 vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries)
4192 {
4193 	int i;
4194 	int rc;
4195 	int status = 0;
4196 	vsw_port_t *port = ldcp->ldc_port;
4197 	dring_info_t *dp = NULL;
4198 
4199 
4200 	for (i = 0; i < retries; ) {
4201 		/*
4202 		 * Send the message out using the appropriate
4203 		 * transmit function which will free mblock when it
4204 		 * is finished with it.
4205 		 */
4206 		mutex_enter(&port->tx_lock);
4207 		if (port->transmit != NULL) {
4208 			status = (*port->transmit)(ldcp, mp);
4209 		}
4210 		if (status == LDC_TX_SUCCESS) {
4211 			mutex_exit(&port->tx_lock);
4212 			break;
4213 		}
4214 		i++;	/* increment the counter here */
4215 
4216 		/* If its the last retry, then update the oerror */
4217 		if ((i == retries) && (status == LDC_TX_NORESOURCES)) {
4218 			ldcp->ldc_stats.oerrors++;
4219 		}
4220 		mutex_exit(&port->tx_lock);
4221 
4222 		if (status != LDC_TX_NORESOURCES) {
4223 			/*
4224 			 * No retrying required for errors un-related
4225 			 * to resources.
4226 			 */
4227 			break;
4228 		}
4229 		READ_ENTER(&ldcp->lane_out.dlistrw);
4230 		if (((dp = ldcp->lane_out.dringp) != NULL) &&
4231 		    ((VSW_VER_GTEQ(ldcp, 1, 2) &&
4232 		    (ldcp->lane_out.xfer_mode & VIO_DRING_MODE_V1_2)) ||
4233 		    ((VSW_VER_LT(ldcp, 1, 2) &&
4234 		    (ldcp->lane_out.xfer_mode == VIO_DRING_MODE_V1_0))))) {
4235 			rc = vsw_reclaim_dring(dp, dp->end_idx);
4236 		} else {
4237 			/*
4238 			 * If there is no dring or the xfer_mode is
4239 			 * set to DESC_MODE(ie., OBP), then simply break here.
4240 			 */
4241 			RW_EXIT(&ldcp->lane_out.dlistrw);
4242 			break;
4243 		}
4244 		RW_EXIT(&ldcp->lane_out.dlistrw);
4245 
4246 		/*
4247 		 * Delay only if none were reclaimed
4248 		 * and its not the last retry.
4249 		 */
4250 		if ((rc == 0) && (i < retries)) {
4251 			delay(drv_usectohz(vsw_ldc_tx_delay));
4252 		}
4253 	}
4254 	freemsg(mp);
4255 	return (status);
4256 }
4257 
4258 /*
4259  * Send packet out via descriptor ring to a logical device.
4260  */
4261 static int
4262 vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp)
4263 {
4264 	vio_dring_msg_t		dring_pkt;
4265 	dring_info_t		*dp = NULL;
4266 	vsw_private_desc_t	*priv_desc = NULL;
4267 	vnet_public_desc_t	*pub = NULL;
4268 	vsw_t			*vswp = ldcp->ldc_vswp;
4269 	mblk_t			*bp;
4270 	size_t			n, size;
4271 	caddr_t			bufp;
4272 	int			idx;
4273 	int			status = LDC_TX_SUCCESS;
4274 	struct ether_header	*ehp = (struct ether_header *)mp->b_rptr;
4275 	lane_t			*lp = &ldcp->lane_out;
4276 
4277 	D1(vswp, "%s(%lld): enter\n", __func__, ldcp->ldc_id);
4278 
4279 	/* TODO: make test a macro */
4280 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
4281 	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
4282 		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
4283 		    "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
4284 		    ldcp->lane_out.lstate);
4285 		ldcp->ldc_stats.oerrors++;
4286 		return (LDC_TX_FAILURE);
4287 	}
4288 
4289 	/*
4290 	 * Note - using first ring only, this may change
4291 	 * in the future.
4292 	 */
4293 	READ_ENTER(&ldcp->lane_out.dlistrw);
4294 	if ((dp = ldcp->lane_out.dringp) == NULL) {
4295 		RW_EXIT(&ldcp->lane_out.dlistrw);
4296 		DERR(vswp, "%s(%lld): no dring for outbound lane on"
4297 		    " channel %d", __func__, ldcp->ldc_id, ldcp->ldc_id);
4298 		ldcp->ldc_stats.oerrors++;
4299 		return (LDC_TX_FAILURE);
4300 	}
4301 
4302 	size = msgsize(mp);
4303 	if (size > (size_t)lp->mtu) {
4304 		RW_EXIT(&ldcp->lane_out.dlistrw);
4305 		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
4306 		    ldcp->ldc_id, size);
4307 		ldcp->ldc_stats.oerrors++;
4308 		return (LDC_TX_FAILURE);
4309 	}
4310 
4311 	/*
4312 	 * Find a free descriptor
4313 	 *
4314 	 * Note: for the moment we are assuming that we will only
4315 	 * have one dring going from the switch to each of its
4316 	 * peers. This may change in the future.
4317 	 */
4318 	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
4319 		D2(vswp, "%s(%lld): no descriptor available for ring "
4320 		    "at 0x%llx", __func__, ldcp->ldc_id, dp);
4321 
4322 		/* nothing more we can do */
4323 		status = LDC_TX_NORESOURCES;
4324 		ldcp->ldc_stats.tx_no_desc++;
4325 		goto vsw_dringsend_free_exit;
4326 	} else {
4327 		D2(vswp, "%s(%lld): free private descriptor found at pos %ld "
4328 		    "addr 0x%llx\n", __func__, ldcp->ldc_id, idx, priv_desc);
4329 	}
4330 
4331 	/* copy data into the descriptor */
4332 	bufp = priv_desc->datap;
4333 	bufp += VNET_IPALIGN;
4334 	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
4335 		n = MBLKL(bp);
4336 		bcopy(bp->b_rptr, bufp, n);
4337 		bufp += n;
4338 	}
4339 
4340 	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
4341 
4342 	pub = priv_desc->descp;
4343 	pub->nbytes = priv_desc->datalen;
4344 
4345 	/* update statistics */
4346 	if (IS_BROADCAST(ehp))
4347 		ldcp->ldc_stats.brdcstxmt++;
4348 	else if (IS_MULTICAST(ehp))
4349 		ldcp->ldc_stats.multixmt++;
4350 	ldcp->ldc_stats.opackets++;
4351 	ldcp->ldc_stats.obytes += priv_desc->datalen;
4352 
4353 	mutex_enter(&priv_desc->dstate_lock);
4354 	pub->hdr.dstate = VIO_DESC_READY;
4355 	mutex_exit(&priv_desc->dstate_lock);
4356 
4357 	/*
4358 	 * Determine whether or not we need to send a message to our
4359 	 * peer prompting them to read our newly updated descriptor(s).
4360 	 */
4361 	mutex_enter(&dp->restart_lock);
4362 	if (dp->restart_reqd) {
4363 		dp->restart_reqd = B_FALSE;
4364 		ldcp->ldc_stats.dring_data_msgs++;
4365 		mutex_exit(&dp->restart_lock);
4366 
4367 		/*
4368 		 * Send a vio_dring_msg to peer to prompt them to read
4369 		 * the updated descriptor ring.
4370 		 */
4371 		dring_pkt.tag.vio_msgtype = VIO_TYPE_DATA;
4372 		dring_pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
4373 		dring_pkt.tag.vio_subtype_env = VIO_DRING_DATA;
4374 		dring_pkt.tag.vio_sid = ldcp->local_session;
4375 
4376 		/* Note - for now using first ring */
4377 		dring_pkt.dring_ident = dp->ident;
4378 
4379 		/*
4380 		 * If last_ack_recv is -1 then we know we've not
4381 		 * received any ack's yet, so this must be the first
4382 		 * msg sent, so set the start to the begining of the ring.
4383 		 */
4384 		mutex_enter(&dp->dlock);
4385 		if (dp->last_ack_recv == -1) {
4386 			dring_pkt.start_idx = 0;
4387 		} else {
4388 			dring_pkt.start_idx =
4389 			    (dp->last_ack_recv + 1) % dp->num_descriptors;
4390 		}
4391 		dring_pkt.end_idx = -1;
4392 		mutex_exit(&dp->dlock);
4393 
4394 		D3(vswp, "%s(%lld): dring 0x%llx : ident 0x%llx\n", __func__,
4395 		    ldcp->ldc_id, dp, dring_pkt.dring_ident);
4396 		D3(vswp, "%s(%lld): start %lld : end %lld :\n",
4397 		    __func__, ldcp->ldc_id, dring_pkt.start_idx,
4398 		    dring_pkt.end_idx);
4399 
4400 		RW_EXIT(&ldcp->lane_out.dlistrw);
4401 
4402 		(void) vsw_send_msg(ldcp, (void *)&dring_pkt,
4403 		    sizeof (vio_dring_msg_t), B_TRUE);
4404 
4405 		return (status);
4406 
4407 	} else {
4408 		mutex_exit(&dp->restart_lock);
4409 		D2(vswp, "%s(%lld): updating descp %d", __func__,
4410 		    ldcp->ldc_id, idx);
4411 	}
4412 
4413 vsw_dringsend_free_exit:
4414 
4415 	RW_EXIT(&ldcp->lane_out.dlistrw);
4416 
4417 	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
4418 	return (status);
4419 }
4420 
4421 /*
4422  * Send an in-band descriptor message over ldc.
4423  */
4424 static int
4425 vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp)
4426 {
4427 	vsw_t			*vswp = ldcp->ldc_vswp;
4428 	vnet_ibnd_desc_t	ibnd_msg;
4429 	vsw_private_desc_t	*priv_desc = NULL;
4430 	dring_info_t		*dp = NULL;
4431 	size_t			n, size = 0;
4432 	caddr_t			bufp;
4433 	mblk_t			*bp;
4434 	int			idx, i;
4435 	int			status = LDC_TX_SUCCESS;
4436 	static int		warn_msg = 1;
4437 	lane_t			*lp = &ldcp->lane_out;
4438 
4439 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4440 
4441 	ASSERT(mp != NULL);
4442 
4443 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
4444 	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
4445 		DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt",
4446 		    __func__, ldcp->ldc_id, ldcp->ldc_status,
4447 		    ldcp->lane_out.lstate);
4448 		ldcp->ldc_stats.oerrors++;
4449 		return (LDC_TX_FAILURE);
4450 	}
4451 
4452 	/*
4453 	 * only expect single dring to exist, which we use
4454 	 * as an internal buffer, rather than a transfer channel.
4455 	 */
4456 	READ_ENTER(&ldcp->lane_out.dlistrw);
4457 	if ((dp = ldcp->lane_out.dringp) == NULL) {
4458 		DERR(vswp, "%s(%lld): no dring for outbound lane",
4459 		    __func__, ldcp->ldc_id);
4460 		DERR(vswp, "%s(%lld) status(%d) state (0x%llx)", __func__,
4461 		    ldcp->ldc_id, ldcp->ldc_status, ldcp->lane_out.lstate);
4462 		RW_EXIT(&ldcp->lane_out.dlistrw);
4463 		ldcp->ldc_stats.oerrors++;
4464 		return (LDC_TX_FAILURE);
4465 	}
4466 
4467 	size = msgsize(mp);
4468 	if (size > (size_t)lp->mtu) {
4469 		RW_EXIT(&ldcp->lane_out.dlistrw);
4470 		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
4471 		    ldcp->ldc_id, size);
4472 		ldcp->ldc_stats.oerrors++;
4473 		return (LDC_TX_FAILURE);
4474 	}
4475 
4476 	/*
4477 	 * Find a free descriptor in our buffer ring
4478 	 */
4479 	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
4480 		RW_EXIT(&ldcp->lane_out.dlistrw);
4481 		if (warn_msg) {
4482 			DERR(vswp, "%s(%lld): no descriptor available for ring "
4483 			    "at 0x%llx", __func__, ldcp->ldc_id, dp);
4484 			warn_msg = 0;
4485 		}
4486 
4487 		/* nothing more we can do */
4488 		status = LDC_TX_NORESOURCES;
4489 		goto vsw_descrsend_free_exit;
4490 	} else {
4491 		D2(vswp, "%s(%lld): free private descriptor found at pos "
4492 		    "%ld addr 0x%x\n", __func__, ldcp->ldc_id, idx, priv_desc);
4493 		warn_msg = 1;
4494 	}
4495 
4496 	/* copy data into the descriptor */
4497 	bufp = priv_desc->datap;
4498 	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
4499 		n = MBLKL(bp);
4500 		bcopy(bp->b_rptr, bufp, n);
4501 		bufp += n;
4502 	}
4503 
4504 	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
4505 
4506 	/* create and send the in-band descp msg */
4507 	ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA;
4508 	ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO;
4509 	ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA;
4510 	ibnd_msg.hdr.tag.vio_sid = ldcp->local_session;
4511 
4512 	/*
4513 	 * Copy the mem cookies describing the data from the
4514 	 * private region of the descriptor ring into the inband
4515 	 * descriptor.
4516 	 */
4517 	for (i = 0; i < priv_desc->ncookies; i++) {
4518 		bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i],
4519 		    sizeof (ldc_mem_cookie_t));
4520 	}
4521 
4522 	ibnd_msg.hdr.desc_handle = idx;
4523 	ibnd_msg.ncookies = priv_desc->ncookies;
4524 	ibnd_msg.nbytes = size;
4525 
4526 	ldcp->ldc_stats.opackets++;
4527 	ldcp->ldc_stats.obytes += size;
4528 
4529 	RW_EXIT(&ldcp->lane_out.dlistrw);
4530 
4531 	(void) vsw_send_msg(ldcp, (void *)&ibnd_msg,
4532 	    sizeof (vnet_ibnd_desc_t), B_TRUE);
4533 
4534 vsw_descrsend_free_exit:
4535 
4536 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4537 	return (status);
4538 }
4539 
4540 static void
4541 vsw_send_ver(void *arg)
4542 {
4543 	vsw_ldc_t	*ldcp = (vsw_ldc_t *)arg;
4544 	vsw_t		*vswp = ldcp->ldc_vswp;
4545 	lane_t		*lp = &ldcp->lane_out;
4546 	vio_ver_msg_t	ver_msg;
4547 
4548 	D1(vswp, "%s enter", __func__);
4549 
4550 	ver_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4551 	ver_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4552 	ver_msg.tag.vio_subtype_env = VIO_VER_INFO;
4553 	ver_msg.tag.vio_sid = ldcp->local_session;
4554 
4555 	if (vsw_obp_ver_proto_workaround == B_FALSE) {
4556 		ver_msg.ver_major = vsw_versions[0].ver_major;
4557 		ver_msg.ver_minor = vsw_versions[0].ver_minor;
4558 	} else {
4559 		/* use the major,minor that we've ack'd */
4560 		lane_t	*lpi = &ldcp->lane_in;
4561 		ver_msg.ver_major = lpi->ver_major;
4562 		ver_msg.ver_minor = lpi->ver_minor;
4563 	}
4564 	ver_msg.dev_class = VDEV_NETWORK_SWITCH;
4565 
4566 	lp->lstate |= VSW_VER_INFO_SENT;
4567 	lp->ver_major = ver_msg.ver_major;
4568 	lp->ver_minor = ver_msg.ver_minor;
4569 
4570 	DUMP_TAG(ver_msg.tag);
4571 
4572 	(void) vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t), B_TRUE);
4573 
4574 	D1(vswp, "%s (%d): exit", __func__, ldcp->ldc_id);
4575 }
4576 
4577 static void
4578 vsw_send_attr(vsw_ldc_t *ldcp)
4579 {
4580 	vsw_t			*vswp = ldcp->ldc_vswp;
4581 	lane_t			*lp = &ldcp->lane_out;
4582 	vnet_attr_msg_t		attr_msg;
4583 
4584 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
4585 
4586 	/*
4587 	 * Subtype is set to INFO by default
4588 	 */
4589 	attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4590 	attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4591 	attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO;
4592 	attr_msg.tag.vio_sid = ldcp->local_session;
4593 
4594 	/* payload copied from default settings for lane */
4595 	attr_msg.mtu = lp->mtu;
4596 	attr_msg.addr_type = lp->addr_type;
4597 	attr_msg.xfer_mode = lp->xfer_mode;
4598 	attr_msg.ack_freq = lp->xfer_mode;
4599 
4600 	READ_ENTER(&vswp->if_lockrw);
4601 	attr_msg.addr = vnet_macaddr_strtoul((vswp->if_addr).ether_addr_octet);
4602 	RW_EXIT(&vswp->if_lockrw);
4603 
4604 	ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT;
4605 
4606 	DUMP_TAG(attr_msg.tag);
4607 
4608 	(void) vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t), B_TRUE);
4609 
4610 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
4611 }
4612 
4613 /*
4614  * Create dring info msg (which also results in the creation of
4615  * a dring).
4616  */
4617 static vio_dring_reg_msg_t *
4618 vsw_create_dring_info_pkt(vsw_ldc_t *ldcp)
4619 {
4620 	vio_dring_reg_msg_t	*mp;
4621 	dring_info_t		*dp;
4622 	vsw_t			*vswp = ldcp->ldc_vswp;
4623 
4624 	D1(vswp, "vsw_create_dring_info_pkt enter\n");
4625 
4626 	/*
4627 	 * If we can't create a dring, obviously no point sending
4628 	 * a message.
4629 	 */
4630 	if ((dp = vsw_create_dring(ldcp)) == NULL)
4631 		return (NULL);
4632 
4633 	mp = kmem_zalloc(sizeof (vio_dring_reg_msg_t), KM_SLEEP);
4634 
4635 	mp->tag.vio_msgtype = VIO_TYPE_CTRL;
4636 	mp->tag.vio_subtype = VIO_SUBTYPE_INFO;
4637 	mp->tag.vio_subtype_env = VIO_DRING_REG;
4638 	mp->tag.vio_sid = ldcp->local_session;
4639 
4640 	/* payload */
4641 	mp->num_descriptors = dp->num_descriptors;
4642 	mp->descriptor_size = dp->descriptor_size;
4643 	mp->options = dp->options;
4644 	mp->ncookies = dp->ncookies;
4645 	bcopy(&dp->cookie[0], &mp->cookie[0], sizeof (ldc_mem_cookie_t));
4646 
4647 	mp->dring_ident = 0;
4648 
4649 	D1(vswp, "vsw_create_dring_info_pkt exit\n");
4650 
4651 	return (mp);
4652 }
4653 
4654 static void
4655 vsw_send_dring_info(vsw_ldc_t *ldcp)
4656 {
4657 	vio_dring_reg_msg_t	*dring_msg;
4658 	vsw_t			*vswp = ldcp->ldc_vswp;
4659 
4660 	D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id);
4661 
4662 	dring_msg = vsw_create_dring_info_pkt(ldcp);
4663 	if (dring_msg == NULL) {
4664 		cmn_err(CE_WARN, "!vsw%d: %s: error creating msg",
4665 		    vswp->instance, __func__);
4666 		return;
4667 	}
4668 
4669 	ldcp->lane_out.lstate |= VSW_DRING_INFO_SENT;
4670 
4671 	DUMP_TAG_PTR((vio_msg_tag_t *)dring_msg);
4672 
4673 	(void) vsw_send_msg(ldcp, dring_msg,
4674 	    sizeof (vio_dring_reg_msg_t), B_TRUE);
4675 
4676 	kmem_free(dring_msg, sizeof (vio_dring_reg_msg_t));
4677 
4678 	D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id);
4679 }
4680 
4681 static void
4682 vsw_send_rdx(vsw_ldc_t *ldcp)
4683 {
4684 	vsw_t		*vswp = ldcp->ldc_vswp;
4685 	vio_rdx_msg_t	rdx_msg;
4686 
4687 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
4688 
4689 	rdx_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4690 	rdx_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4691 	rdx_msg.tag.vio_subtype_env = VIO_RDX;
4692 	rdx_msg.tag.vio_sid = ldcp->local_session;
4693 
4694 	ldcp->lane_in.lstate |= VSW_RDX_INFO_SENT;
4695 
4696 	DUMP_TAG(rdx_msg.tag);
4697 
4698 	(void) vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t), B_TRUE);
4699 
4700 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
4701 }
4702 
4703 /*
4704  * Generic routine to send message out over ldc channel.
4705  *
4706  * It is possible that when we attempt to write over the ldc channel
4707  * that we get notified that it has been reset. Depending on the value
4708  * of the handle_reset flag we either handle that event here or simply
4709  * notify the caller that the channel was reset.
4710  */
4711 static int
4712 vsw_send_msg(vsw_ldc_t *ldcp, void *msgp, int size, boolean_t handle_reset)
4713 {
4714 	int			rv;
4715 	size_t			msglen = size;
4716 	vio_msg_tag_t		*tag = (vio_msg_tag_t *)msgp;
4717 	vsw_t			*vswp = ldcp->ldc_vswp;
4718 	vio_dring_msg_t		*dmsg;
4719 	vio_raw_data_msg_t	*rmsg;
4720 	vnet_ibnd_desc_t	*imsg;
4721 	boolean_t		data_msg = B_FALSE;
4722 
4723 	D1(vswp, "vsw_send_msg (%lld) enter : sending %d bytes",
4724 	    ldcp->ldc_id, size);
4725 
4726 	D2(vswp, "send_msg: type 0x%llx", tag->vio_msgtype);
4727 	D2(vswp, "send_msg: stype 0x%llx", tag->vio_subtype);
4728 	D2(vswp, "send_msg: senv 0x%llx", tag->vio_subtype_env);
4729 
4730 	mutex_enter(&ldcp->ldc_txlock);
4731 
4732 	if (tag->vio_subtype == VIO_SUBTYPE_INFO) {
4733 		if (tag->vio_subtype_env == VIO_DRING_DATA) {
4734 			dmsg = (vio_dring_msg_t *)tag;
4735 			dmsg->seq_num = ldcp->lane_out.seq_num;
4736 			data_msg = B_TRUE;
4737 		} else if (tag->vio_subtype_env == VIO_PKT_DATA) {
4738 			rmsg = (vio_raw_data_msg_t *)tag;
4739 			rmsg->seq_num = ldcp->lane_out.seq_num;
4740 			data_msg = B_TRUE;
4741 		} else if (tag->vio_subtype_env == VIO_DESC_DATA) {
4742 			imsg = (vnet_ibnd_desc_t *)tag;
4743 			imsg->hdr.seq_num = ldcp->lane_out.seq_num;
4744 			data_msg = B_TRUE;
4745 		}
4746 	}
4747 
4748 	do {
4749 		msglen = size;
4750 		rv = ldc_write(ldcp->ldc_handle, (caddr_t)msgp, &msglen);
4751 	} while (rv == EWOULDBLOCK && --vsw_wretries > 0);
4752 
4753 	if (rv == 0 && data_msg == B_TRUE) {
4754 		ldcp->lane_out.seq_num++;
4755 	}
4756 
4757 	if ((rv != 0) || (msglen != size)) {
4758 		DERR(vswp, "vsw_send_msg:ldc_write failed: chan(%lld) rv(%d) "
4759 		    "size (%d) msglen(%d)\n", ldcp->ldc_id, rv, size, msglen);
4760 		ldcp->ldc_stats.oerrors++;
4761 	}
4762 
4763 	mutex_exit(&ldcp->ldc_txlock);
4764 
4765 	/*
4766 	 * If channel has been reset we either handle it here or
4767 	 * simply report back that it has been reset and let caller
4768 	 * decide what to do.
4769 	 */
4770 	if (rv == ECONNRESET) {
4771 		DWARN(vswp, "%s (%lld) channel reset", __func__, ldcp->ldc_id);
4772 
4773 		/*
4774 		 * N.B - must never be holding the dlistrw lock when
4775 		 * we do a reset of the channel.
4776 		 */
4777 		if (handle_reset) {
4778 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
4779 		}
4780 	}
4781 
4782 	return (rv);
4783 }
4784 
4785 /*
4786  * Remove the specified address from the list of address maintained
4787  * in this port node.
4788  */
4789 mcst_addr_t *
4790 vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr)
4791 {
4792 	vsw_t		*vswp = NULL;
4793 	vsw_port_t	*port = NULL;
4794 	mcst_addr_t	*prev_p = NULL;
4795 	mcst_addr_t	*curr_p = NULL;
4796 
4797 	D1(NULL, "%s: enter : devtype %d : addr 0x%llx",
4798 	    __func__, devtype, addr);
4799 
4800 	if (devtype == VSW_VNETPORT) {
4801 		port = (vsw_port_t *)arg;
4802 		mutex_enter(&port->mca_lock);
4803 		prev_p = curr_p = port->mcap;
4804 	} else {
4805 		vswp = (vsw_t *)arg;
4806 		mutex_enter(&vswp->mca_lock);
4807 		prev_p = curr_p = vswp->mcap;
4808 	}
4809 
4810 	while (curr_p != NULL) {
4811 		if (curr_p->addr == addr) {
4812 			D2(NULL, "%s: address found", __func__);
4813 			/* match found */
4814 			if (prev_p == curr_p) {
4815 				/* list head */
4816 				if (devtype == VSW_VNETPORT)
4817 					port->mcap = curr_p->nextp;
4818 				else
4819 					vswp->mcap = curr_p->nextp;
4820 			} else {
4821 				prev_p->nextp = curr_p->nextp;
4822 			}
4823 			break;
4824 		} else {
4825 			prev_p = curr_p;
4826 			curr_p = curr_p->nextp;
4827 		}
4828 	}
4829 
4830 	if (devtype == VSW_VNETPORT)
4831 		mutex_exit(&port->mca_lock);
4832 	else
4833 		mutex_exit(&vswp->mca_lock);
4834 
4835 	D1(NULL, "%s: exit", __func__);
4836 
4837 	return (curr_p);
4838 }
4839 
4840 /*
4841  * Creates a descriptor ring (dring) and links it into the
4842  * link of outbound drings for this channel.
4843  *
4844  * Returns NULL if creation failed.
4845  */
4846 static dring_info_t *
4847 vsw_create_dring(vsw_ldc_t *ldcp)
4848 {
4849 	vsw_private_desc_t	*priv_addr = NULL;
4850 	vsw_t			*vswp = ldcp->ldc_vswp;
4851 	ldc_mem_info_t		minfo;
4852 	dring_info_t		*dp, *tp;
4853 	int			i;
4854 
4855 	dp = (dring_info_t *)kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
4856 
4857 	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
4858 
4859 	/* create public section of ring */
4860 	if ((ldc_mem_dring_create(vsw_ntxds,
4861 	    VSW_PUB_SIZE, &dp->handle)) != 0) {
4862 
4863 		DERR(vswp, "vsw_create_dring(%lld): ldc dring create "
4864 		    "failed", ldcp->ldc_id);
4865 		goto create_fail_exit;
4866 	}
4867 
4868 	ASSERT(dp->handle != NULL);
4869 
4870 	/*
4871 	 * Get the base address of the public section of the ring.
4872 	 */
4873 	if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
4874 		DERR(vswp, "vsw_create_dring(%lld): dring info failed\n",
4875 		    ldcp->ldc_id);
4876 		goto dring_fail_exit;
4877 	} else {
4878 		ASSERT(minfo.vaddr != 0);
4879 		dp->pub_addr = minfo.vaddr;
4880 	}
4881 
4882 	dp->num_descriptors = vsw_ntxds;
4883 	dp->descriptor_size = VSW_PUB_SIZE;
4884 	dp->options = VIO_TX_DRING;
4885 	dp->ncookies = 1;	/* guaranteed by ldc */
4886 
4887 	/*
4888 	 * create private portion of ring
4889 	 */
4890 	dp->priv_addr = (vsw_private_desc_t *)kmem_zalloc(
4891 	    (sizeof (vsw_private_desc_t) * vsw_ntxds), KM_SLEEP);
4892 
4893 	if (vsw_setup_ring(ldcp, dp)) {
4894 		DERR(vswp, "%s: unable to setup ring", __func__);
4895 		goto dring_fail_exit;
4896 	}
4897 
4898 	/* haven't used any descriptors yet */
4899 	dp->end_idx = 0;
4900 	dp->last_ack_recv = -1;
4901 
4902 	/* bind dring to the channel */
4903 	if ((ldc_mem_dring_bind(ldcp->ldc_handle, dp->handle,
4904 	    LDC_SHADOW_MAP, LDC_MEM_RW,
4905 	    &dp->cookie[0], &dp->ncookies)) != 0) {
4906 		DERR(vswp, "vsw_create_dring: unable to bind to channel "
4907 		    "%lld", ldcp->ldc_id);
4908 		goto dring_fail_exit;
4909 	}
4910 
4911 	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
4912 	dp->restart_reqd = B_TRUE;
4913 
4914 	/*
4915 	 * Only ever create rings for outgoing lane. Link it onto
4916 	 * end of list.
4917 	 */
4918 	WRITE_ENTER(&ldcp->lane_out.dlistrw);
4919 	if (ldcp->lane_out.dringp == NULL) {
4920 		D2(vswp, "vsw_create_dring: adding first outbound ring");
4921 		ldcp->lane_out.dringp = dp;
4922 	} else {
4923 		tp = ldcp->lane_out.dringp;
4924 		while (tp->next != NULL)
4925 			tp = tp->next;
4926 
4927 		tp->next = dp;
4928 	}
4929 	RW_EXIT(&ldcp->lane_out.dlistrw);
4930 
4931 	return (dp);
4932 
4933 dring_fail_exit:
4934 	(void) ldc_mem_dring_destroy(dp->handle);
4935 
4936 create_fail_exit:
4937 	if (dp->priv_addr != NULL) {
4938 		priv_addr = dp->priv_addr;
4939 		for (i = 0; i < vsw_ntxds; i++) {
4940 			if (priv_addr->memhandle != NULL)
4941 				(void) ldc_mem_free_handle(
4942 				    priv_addr->memhandle);
4943 			priv_addr++;
4944 		}
4945 		kmem_free(dp->priv_addr,
4946 		    (sizeof (vsw_private_desc_t) * vsw_ntxds));
4947 	}
4948 	mutex_destroy(&dp->dlock);
4949 
4950 	kmem_free(dp, sizeof (dring_info_t));
4951 	return (NULL);
4952 }
4953 
4954 /*
4955  * Create a ring consisting of just a private portion and link
4956  * it into the list of rings for the outbound lane.
4957  *
4958  * These type of rings are used primarily for temporary data
4959  * storage (i.e. as data buffers).
4960  */
4961 void
4962 vsw_create_privring(vsw_ldc_t *ldcp)
4963 {
4964 	dring_info_t		*dp, *tp;
4965 	vsw_t			*vswp = ldcp->ldc_vswp;
4966 
4967 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4968 
4969 	dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
4970 
4971 	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
4972 
4973 	/* no public section */
4974 	dp->pub_addr = NULL;
4975 
4976 	dp->priv_addr = kmem_zalloc(
4977 	    (sizeof (vsw_private_desc_t) * vsw_ntxds), KM_SLEEP);
4978 
4979 	dp->num_descriptors = vsw_ntxds;
4980 
4981 	if (vsw_setup_ring(ldcp, dp)) {
4982 		DERR(vswp, "%s: setup of ring failed", __func__);
4983 		kmem_free(dp->priv_addr,
4984 		    (sizeof (vsw_private_desc_t) * vsw_ntxds));
4985 		mutex_destroy(&dp->dlock);
4986 		kmem_free(dp, sizeof (dring_info_t));
4987 		return;
4988 	}
4989 
4990 	/* haven't used any descriptors yet */
4991 	dp->end_idx = 0;
4992 
4993 	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
4994 	dp->restart_reqd = B_TRUE;
4995 
4996 	/*
4997 	 * Only ever create rings for outgoing lane. Link it onto
4998 	 * end of list.
4999 	 */
5000 	WRITE_ENTER(&ldcp->lane_out.dlistrw);
5001 	if (ldcp->lane_out.dringp == NULL) {
5002 		D2(vswp, "%s: adding first outbound privring", __func__);
5003 		ldcp->lane_out.dringp = dp;
5004 	} else {
5005 		tp = ldcp->lane_out.dringp;
5006 		while (tp->next != NULL)
5007 			tp = tp->next;
5008 
5009 		tp->next = dp;
5010 	}
5011 	RW_EXIT(&ldcp->lane_out.dlistrw);
5012 
5013 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
5014 }
5015 
5016 /*
5017  * Setup the descriptors in the dring. Returns 0 on success, 1 on
5018  * failure.
5019  */
5020 int
5021 vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp)
5022 {
5023 	vnet_public_desc_t	*pub_addr = NULL;
5024 	vsw_private_desc_t	*priv_addr = NULL;
5025 	vsw_t			*vswp = ldcp->ldc_vswp;
5026 	uint64_t		*tmpp;
5027 	uint64_t		offset = 0;
5028 	uint32_t		ncookies = 0;
5029 	static char		*name = "vsw_setup_ring";
5030 	int			i, j, nc, rv;
5031 	size_t			data_sz;
5032 
5033 	priv_addr = dp->priv_addr;
5034 	pub_addr = dp->pub_addr;
5035 
5036 	/* public section may be null but private should never be */
5037 	ASSERT(priv_addr != NULL);
5038 
5039 	/*
5040 	 * Allocate the region of memory which will be used to hold
5041 	 * the data the descriptors will refer to.
5042 	 */
5043 	data_sz = vswp->max_frame_size + VNET_IPALIGN + VNET_LDCALIGN;
5044 	data_sz = VNET_ROUNDUP_2K(data_sz);
5045 	dp->desc_data_sz = data_sz;
5046 	dp->data_sz = vsw_ntxds * data_sz;
5047 	dp->data_addr = kmem_alloc(dp->data_sz, KM_SLEEP);
5048 
5049 	D2(vswp, "%s: allocated %lld bytes at 0x%llx\n", name,
5050 	    dp->data_sz, dp->data_addr);
5051 
5052 	tmpp = (uint64_t *)dp->data_addr;
5053 	offset = dp->desc_data_sz/sizeof (tmpp);
5054 
5055 	/*
5056 	 * Initialise some of the private and public (if they exist)
5057 	 * descriptor fields.
5058 	 */
5059 	for (i = 0; i < vsw_ntxds; i++) {
5060 		mutex_init(&priv_addr->dstate_lock, NULL, MUTEX_DRIVER, NULL);
5061 
5062 		if ((ldc_mem_alloc_handle(ldcp->ldc_handle,
5063 		    &priv_addr->memhandle)) != 0) {
5064 			DERR(vswp, "%s: alloc mem handle failed", name);
5065 			goto setup_ring_cleanup;
5066 		}
5067 
5068 		priv_addr->datap = (void *)tmpp;
5069 
5070 		rv = ldc_mem_bind_handle(priv_addr->memhandle,
5071 		    (caddr_t)priv_addr->datap, dp->desc_data_sz,
5072 		    LDC_SHADOW_MAP, LDC_MEM_R|LDC_MEM_W,
5073 		    &(priv_addr->memcookie[0]), &ncookies);
5074 		if (rv != 0) {
5075 			DERR(vswp, "%s(%lld): ldc_mem_bind_handle failed "
5076 			    "(rv %d)", name, ldcp->ldc_id, rv);
5077 			goto setup_ring_cleanup;
5078 		}
5079 		priv_addr->bound = 1;
5080 
5081 		D2(vswp, "%s: %d: memcookie 0 : addr 0x%llx : size 0x%llx",
5082 		    name, i, priv_addr->memcookie[0].addr,
5083 		    priv_addr->memcookie[0].size);
5084 
5085 		if (ncookies >= (uint32_t)(VSW_MAX_COOKIES + 1)) {
5086 			DERR(vswp, "%s(%lld) ldc_mem_bind_handle returned "
5087 			    "invalid num of cookies (%d) for size 0x%llx",
5088 			    name, ldcp->ldc_id, ncookies, VSW_RING_EL_DATA_SZ);
5089 
5090 			goto setup_ring_cleanup;
5091 		} else {
5092 			for (j = 1; j < ncookies; j++) {
5093 				rv = ldc_mem_nextcookie(priv_addr->memhandle,
5094 				    &(priv_addr->memcookie[j]));
5095 				if (rv != 0) {
5096 					DERR(vswp, "%s: ldc_mem_nextcookie "
5097 					    "failed rv (%d)", name, rv);
5098 					goto setup_ring_cleanup;
5099 				}
5100 				D3(vswp, "%s: memcookie %d : addr 0x%llx : "
5101 				    "size 0x%llx", name, j,
5102 				    priv_addr->memcookie[j].addr,
5103 				    priv_addr->memcookie[j].size);
5104 			}
5105 
5106 		}
5107 		priv_addr->ncookies = ncookies;
5108 		priv_addr->dstate = VIO_DESC_FREE;
5109 
5110 		if (pub_addr != NULL) {
5111 
5112 			/* link pub and private sides */
5113 			priv_addr->descp = pub_addr;
5114 
5115 			pub_addr->ncookies = priv_addr->ncookies;
5116 
5117 			for (nc = 0; nc < pub_addr->ncookies; nc++) {
5118 				bcopy(&priv_addr->memcookie[nc],
5119 				    &pub_addr->memcookie[nc],
5120 				    sizeof (ldc_mem_cookie_t));
5121 			}
5122 
5123 			pub_addr->hdr.dstate = VIO_DESC_FREE;
5124 			pub_addr++;
5125 		}
5126 
5127 		/*
5128 		 * move to next element in the dring and the next
5129 		 * position in the data buffer.
5130 		 */
5131 		priv_addr++;
5132 		tmpp += offset;
5133 	}
5134 
5135 	return (0);
5136 
5137 setup_ring_cleanup:
5138 	priv_addr = dp->priv_addr;
5139 
5140 	for (j = 0; j < i; j++) {
5141 		(void) ldc_mem_unbind_handle(priv_addr->memhandle);
5142 		(void) ldc_mem_free_handle(priv_addr->memhandle);
5143 
5144 		mutex_destroy(&priv_addr->dstate_lock);
5145 
5146 		priv_addr++;
5147 	}
5148 	kmem_free(dp->data_addr, dp->data_sz);
5149 
5150 	return (1);
5151 }
5152 
5153 /*
5154  * Searches the private section of a ring for a free descriptor,
5155  * starting at the location of the last free descriptor found
5156  * previously.
5157  *
5158  * Returns 0 if free descriptor is available, and updates state
5159  * of private descriptor to VIO_DESC_READY,  otherwise returns 1.
5160  *
5161  * FUTURE: might need to return contiguous range of descriptors
5162  * as dring info msg assumes all will be contiguous.
5163  */
5164 static int
5165 vsw_dring_find_free_desc(dring_info_t *dringp,
5166 		vsw_private_desc_t **priv_p, int *idx)
5167 {
5168 	vsw_private_desc_t	*addr = NULL;
5169 	int			num = vsw_ntxds;
5170 	int			ret = 1;
5171 
5172 	D1(NULL, "%s enter\n", __func__);
5173 
5174 	ASSERT(dringp->priv_addr != NULL);
5175 
5176 	D2(NULL, "%s: searching ring, dringp 0x%llx : start pos %lld",
5177 	    __func__, dringp, dringp->end_idx);
5178 
5179 	addr = (vsw_private_desc_t *)dringp->priv_addr + dringp->end_idx;
5180 
5181 	mutex_enter(&addr->dstate_lock);
5182 	if (addr->dstate == VIO_DESC_FREE) {
5183 		addr->dstate = VIO_DESC_READY;
5184 		*priv_p = addr;
5185 		*idx = dringp->end_idx;
5186 		dringp->end_idx = (dringp->end_idx + 1) % num;
5187 		ret = 0;
5188 
5189 	}
5190 	mutex_exit(&addr->dstate_lock);
5191 
5192 	/* ring full */
5193 	if (ret == 1) {
5194 		D2(NULL, "%s: no desp free: started at %d", __func__,
5195 		    dringp->end_idx);
5196 	}
5197 
5198 	D1(NULL, "%s: exit\n", __func__);
5199 
5200 	return (ret);
5201 }
5202 
5203 /*
5204  * Map from a dring identifier to the ring itself. Returns
5205  * pointer to ring or NULL if no match found.
5206  *
5207  * Should be called with dlistrw rwlock held as reader.
5208  */
5209 static dring_info_t *
5210 vsw_ident2dring(lane_t *lane, uint64_t ident)
5211 {
5212 	dring_info_t	*dp = NULL;
5213 
5214 	if ((dp = lane->dringp) == NULL) {
5215 		return (NULL);
5216 	} else {
5217 		if (dp->ident == ident)
5218 			return (dp);
5219 
5220 		while (dp != NULL) {
5221 			if (dp->ident == ident)
5222 				break;
5223 			dp = dp->next;
5224 		}
5225 	}
5226 
5227 	return (dp);
5228 }
5229 
5230 /*
5231  * Set the default lane attributes. These are copied into
5232  * the attr msg we send to our peer. If they are not acceptable
5233  * then (currently) the handshake ends.
5234  */
5235 static void
5236 vsw_set_lane_attr(vsw_t *vswp, lane_t *lp)
5237 {
5238 	bzero(lp, sizeof (lane_t));
5239 
5240 	READ_ENTER(&vswp->if_lockrw);
5241 	ether_copy(&(vswp->if_addr), &(lp->addr));
5242 	RW_EXIT(&vswp->if_lockrw);
5243 
5244 	lp->mtu = vswp->max_frame_size;
5245 	lp->addr_type = ADDR_TYPE_MAC;
5246 	lp->xfer_mode = VIO_DRING_MODE_V1_0;
5247 	lp->ack_freq = 0;	/* for shared mode */
5248 	lp->seq_num = VNET_ISS;
5249 }
5250 
5251 /*
5252  * Verify that the attributes are acceptable.
5253  *
5254  * FUTURE: If some attributes are not acceptable, change them
5255  * our desired values.
5256  */
5257 static int
5258 vsw_check_attr(vnet_attr_msg_t *pkt, vsw_ldc_t *ldcp)
5259 {
5260 	int			ret = 0;
5261 	struct ether_addr	ea;
5262 	vsw_port_t		*port = ldcp->ldc_port;
5263 	lane_t			*lp = &ldcp->lane_out;
5264 
5265 	D1(NULL, "vsw_check_attr enter\n");
5266 
5267 	if ((pkt->xfer_mode != VIO_DESC_MODE) &&
5268 	    (pkt->xfer_mode != lp->xfer_mode)) {
5269 		D2(NULL, "vsw_check_attr: unknown mode %x\n", pkt->xfer_mode);
5270 		ret = 1;
5271 	}
5272 
5273 	/* Only support MAC addresses at moment. */
5274 	if ((pkt->addr_type != ADDR_TYPE_MAC) || (pkt->addr == 0)) {
5275 		D2(NULL, "vsw_check_attr: invalid addr_type %x, "
5276 		    "or address 0x%llx\n", pkt->addr_type, pkt->addr);
5277 		ret = 1;
5278 	}
5279 
5280 	/*
5281 	 * MAC address supplied by device should match that stored
5282 	 * in the vsw-port OBP node. Need to decide what to do if they
5283 	 * don't match, for the moment just warn but don't fail.
5284 	 */
5285 	vnet_macaddr_ultostr(pkt->addr, ea.ether_addr_octet);
5286 	if (ether_cmp(&ea, &port->p_macaddr) != 0) {
5287 		DERR(NULL, "vsw_check_attr: device supplied address "
5288 		    "0x%llx doesn't match node address 0x%llx\n",
5289 		    pkt->addr, port->p_macaddr);
5290 	}
5291 
5292 	/*
5293 	 * Ack freq only makes sense in pkt mode, in shared
5294 	 * mode the ring descriptors say whether or not to
5295 	 * send back an ACK.
5296 	 */
5297 	if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
5298 	    (pkt->xfer_mode & VIO_DRING_MODE_V1_2)) ||
5299 	    (VSW_VER_LT(ldcp, 1, 2) &&
5300 	    (pkt->xfer_mode == VIO_DRING_MODE_V1_0))) {
5301 		if (pkt->ack_freq > 0) {
5302 			D2(NULL, "vsw_check_attr: non zero ack freq "
5303 			    " in SHM mode\n");
5304 			ret = 1;
5305 		}
5306 	}
5307 
5308 	/*
5309 	 * Note: for the moment we only support ETHER
5310 	 * frames. This may change in the future.
5311 	 */
5312 	if ((pkt->mtu > lp->mtu) || (pkt->mtu <= 0)) {
5313 		D2(NULL, "vsw_check_attr: invalid MTU (0x%llx)\n",
5314 		    pkt->mtu);
5315 		ret = 1;
5316 	}
5317 
5318 	D1(NULL, "vsw_check_attr exit\n");
5319 
5320 	return (ret);
5321 }
5322 
5323 /*
5324  * Returns 1 if there is a problem, 0 otherwise.
5325  */
5326 static int
5327 vsw_check_dring_info(vio_dring_reg_msg_t *pkt)
5328 {
5329 	_NOTE(ARGUNUSED(pkt))
5330 
5331 	int	ret = 0;
5332 
5333 	D1(NULL, "vsw_check_dring_info enter\n");
5334 
5335 	if ((pkt->num_descriptors == 0) ||
5336 	    (pkt->descriptor_size == 0) ||
5337 	    (pkt->ncookies != 1)) {
5338 		DERR(NULL, "vsw_check_dring_info: invalid dring msg");
5339 		ret = 1;
5340 	}
5341 
5342 	D1(NULL, "vsw_check_dring_info exit\n");
5343 
5344 	return (ret);
5345 }
5346 
5347 /*
5348  * Returns 1 if two memory cookies match. Otherwise returns 0.
5349  */
5350 static int
5351 vsw_mem_cookie_match(ldc_mem_cookie_t *m1, ldc_mem_cookie_t *m2)
5352 {
5353 	if ((m1->addr != m2->addr) ||
5354 	    (m2->size != m2->size)) {
5355 		return (0);
5356 	} else {
5357 		return (1);
5358 	}
5359 }
5360 
5361 /*
5362  * Returns 1 if ring described in reg message matches that
5363  * described by dring_info structure. Otherwise returns 0.
5364  */
5365 static int
5366 vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg)
5367 {
5368 	if ((msg->descriptor_size != dp->descriptor_size) ||
5369 	    (msg->num_descriptors != dp->num_descriptors) ||
5370 	    (msg->ncookies != dp->ncookies) ||
5371 	    !(vsw_mem_cookie_match(&msg->cookie[0], &dp->cookie[0]))) {
5372 		return (0);
5373 	} else {
5374 		return (1);
5375 	}
5376 
5377 }
5378 
5379 static caddr_t
5380 vsw_print_ethaddr(uint8_t *a, char *ebuf)
5381 {
5382 	(void) sprintf(ebuf, "%x:%x:%x:%x:%x:%x",
5383 	    a[0], a[1], a[2], a[3], a[4], a[5]);
5384 	return (ebuf);
5385 }
5386 
5387 /*
5388  * Reset and free all the resources associated with
5389  * the channel.
5390  */
5391 static void
5392 vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir)
5393 {
5394 	dring_info_t		*dp, *dpp;
5395 	lane_t			*lp = NULL;
5396 	int			rv = 0;
5397 
5398 	ASSERT(ldcp != NULL);
5399 
5400 	D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id);
5401 
5402 	if (dir == INBOUND) {
5403 		D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane"
5404 		    " of channel %lld", __func__, ldcp->ldc_id);
5405 		lp = &ldcp->lane_in;
5406 	} else {
5407 		D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane"
5408 		    " of channel %lld", __func__, ldcp->ldc_id);
5409 		lp = &ldcp->lane_out;
5410 	}
5411 
5412 	lp->lstate = VSW_LANE_INACTIV;
5413 	lp->seq_num = VNET_ISS;
5414 
5415 	if (lp->dringp) {
5416 		if (dir == INBOUND) {
5417 			WRITE_ENTER(&lp->dlistrw);
5418 			dp = lp->dringp;
5419 			while (dp != NULL) {
5420 				dpp = dp->next;
5421 				if (dp->handle != NULL)
5422 					(void) ldc_mem_dring_unmap(dp->handle);
5423 				kmem_free(dp, sizeof (dring_info_t));
5424 				dp = dpp;
5425 			}
5426 			RW_EXIT(&lp->dlistrw);
5427 		} else {
5428 			/*
5429 			 * unbind, destroy exported dring, free dring struct
5430 			 */
5431 			WRITE_ENTER(&lp->dlistrw);
5432 			dp = lp->dringp;
5433 			rv = vsw_free_ring(dp);
5434 			RW_EXIT(&lp->dlistrw);
5435 		}
5436 		if (rv == 0) {
5437 			lp->dringp = NULL;
5438 		}
5439 	}
5440 
5441 	D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id);
5442 }
5443 
5444 /*
5445  * Free ring and all associated resources.
5446  *
5447  * Should be called with dlistrw rwlock held as writer.
5448  */
5449 static int
5450 vsw_free_ring(dring_info_t *dp)
5451 {
5452 	vsw_private_desc_t	*paddr = NULL;
5453 	dring_info_t		*dpp;
5454 	int			i, rv = 1;
5455 
5456 	while (dp != NULL) {
5457 		mutex_enter(&dp->dlock);
5458 		dpp = dp->next;
5459 		if (dp->priv_addr != NULL) {
5460 			/*
5461 			 * First unbind and free the memory handles
5462 			 * stored in each descriptor within the ring.
5463 			 */
5464 			for (i = 0; i < vsw_ntxds; i++) {
5465 				paddr = (vsw_private_desc_t *)
5466 				    dp->priv_addr + i;
5467 				if (paddr->memhandle != NULL) {
5468 					if (paddr->bound == 1) {
5469 						rv = ldc_mem_unbind_handle(
5470 						    paddr->memhandle);
5471 
5472 						if (rv != 0) {
5473 							DERR(NULL, "error "
5474 							"unbinding handle for "
5475 							"ring 0x%llx at pos %d",
5476 							    dp, i);
5477 							mutex_exit(&dp->dlock);
5478 							return (rv);
5479 						}
5480 						paddr->bound = 0;
5481 					}
5482 
5483 					rv = ldc_mem_free_handle(
5484 					    paddr->memhandle);
5485 					if (rv != 0) {
5486 						DERR(NULL, "error freeing "
5487 						    "handle for ring 0x%llx "
5488 						    "at pos %d", dp, i);
5489 						mutex_exit(&dp->dlock);
5490 						return (rv);
5491 					}
5492 					paddr->memhandle = NULL;
5493 				}
5494 				mutex_destroy(&paddr->dstate_lock);
5495 			}
5496 			kmem_free(dp->priv_addr,
5497 			    (sizeof (vsw_private_desc_t) * vsw_ntxds));
5498 		}
5499 
5500 		/*
5501 		 * Now unbind and destroy the ring itself.
5502 		 */
5503 		if (dp->handle != NULL) {
5504 			(void) ldc_mem_dring_unbind(dp->handle);
5505 			(void) ldc_mem_dring_destroy(dp->handle);
5506 		}
5507 
5508 		if (dp->data_addr != NULL) {
5509 			kmem_free(dp->data_addr, dp->data_sz);
5510 		}
5511 
5512 		mutex_exit(&dp->dlock);
5513 		mutex_destroy(&dp->dlock);
5514 		mutex_destroy(&dp->restart_lock);
5515 		kmem_free(dp, sizeof (dring_info_t));
5516 
5517 		dp = dpp;
5518 	}
5519 	return (0);
5520 }
5521 
5522 /*
5523  * vsw_ldc_rx_worker -- A per LDC worker thread to receive data.
5524  * This thread is woken up by the LDC interrupt handler to process
5525  * LDC packets and receive data.
5526  */
5527 static void
5528 vsw_ldc_rx_worker(void *arg)
5529 {
5530 	callb_cpr_t	cprinfo;
5531 	vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
5532 	vsw_t *vswp = ldcp->ldc_vswp;
5533 
5534 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
5535 	CALLB_CPR_INIT(&cprinfo, &ldcp->rx_thr_lock, callb_generic_cpr,
5536 	    "vsw_rx_thread");
5537 	mutex_enter(&ldcp->rx_thr_lock);
5538 	ldcp->rx_thr_flags |= VSW_WTHR_RUNNING;
5539 	while (!(ldcp->rx_thr_flags & VSW_WTHR_STOP)) {
5540 
5541 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
5542 		/*
5543 		 * Wait until the data is received or a stop
5544 		 * request is received.
5545 		 */
5546 		while (!(ldcp->rx_thr_flags &
5547 		    (VSW_WTHR_DATARCVD | VSW_WTHR_STOP))) {
5548 			cv_wait(&ldcp->rx_thr_cv, &ldcp->rx_thr_lock);
5549 		}
5550 		CALLB_CPR_SAFE_END(&cprinfo, &ldcp->rx_thr_lock)
5551 
5552 		/*
5553 		 * First process the stop request.
5554 		 */
5555 		if (ldcp->rx_thr_flags & VSW_WTHR_STOP) {
5556 			D2(vswp, "%s(%lld):Rx thread stopped\n",
5557 			    __func__, ldcp->ldc_id);
5558 			break;
5559 		}
5560 		ldcp->rx_thr_flags &= ~VSW_WTHR_DATARCVD;
5561 		mutex_exit(&ldcp->rx_thr_lock);
5562 		D1(vswp, "%s(%lld):calling vsw_process_pkt\n",
5563 		    __func__, ldcp->ldc_id);
5564 		mutex_enter(&ldcp->ldc_cblock);
5565 		vsw_process_pkt(ldcp);
5566 		mutex_exit(&ldcp->ldc_cblock);
5567 		mutex_enter(&ldcp->rx_thr_lock);
5568 	}
5569 
5570 	/*
5571 	 * Update the run status and wakeup the thread that
5572 	 * has sent the stop request.
5573 	 */
5574 	ldcp->rx_thr_flags &= ~VSW_WTHR_RUNNING;
5575 	cv_signal(&ldcp->rx_thr_cv);
5576 	CALLB_CPR_EXIT(&cprinfo);
5577 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
5578 	thread_exit();
5579 }
5580 
5581 /* vsw_stop_rx_thread -- Co-ordinate with receive thread to stop it */
5582 static void
5583 vsw_stop_rx_thread(vsw_ldc_t *ldcp)
5584 {
5585 	vsw_t *vswp = ldcp->ldc_vswp;
5586 
5587 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
5588 	/*
5589 	 * Send a stop request by setting the stop flag and
5590 	 * wait until the receive thread stops.
5591 	 */
5592 	mutex_enter(&ldcp->rx_thr_lock);
5593 	if (ldcp->rx_thr_flags & VSW_WTHR_RUNNING) {
5594 		ldcp->rx_thr_flags |= VSW_WTHR_STOP;
5595 		cv_signal(&ldcp->rx_thr_cv);
5596 		while (ldcp->rx_thr_flags & VSW_WTHR_RUNNING) {
5597 			cv_wait(&ldcp->rx_thr_cv, &ldcp->rx_thr_lock);
5598 		}
5599 	}
5600 	mutex_exit(&ldcp->rx_thr_lock);
5601 	ldcp->rx_thread = NULL;
5602 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
5603 }
5604 
5605 /*
5606  * vsw_ldc_tx_worker -- A per LDC worker thread to transmit data.
5607  * This thread is woken up by the vsw_portsend to transmit
5608  * packets.
5609  */
5610 static void
5611 vsw_ldc_tx_worker(void *arg)
5612 {
5613 	callb_cpr_t	cprinfo;
5614 	vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
5615 	vsw_t *vswp = ldcp->ldc_vswp;
5616 	mblk_t *mp;
5617 	mblk_t *tmp;
5618 
5619 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
5620 	CALLB_CPR_INIT(&cprinfo, &ldcp->tx_thr_lock, callb_generic_cpr,
5621 	    "vnet_tx_thread");
5622 	mutex_enter(&ldcp->tx_thr_lock);
5623 	ldcp->tx_thr_flags |= VSW_WTHR_RUNNING;
5624 	while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP)) {
5625 
5626 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
5627 		/*
5628 		 * Wait until the data is received or a stop
5629 		 * request is received.
5630 		 */
5631 		while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP) &&
5632 		    (ldcp->tx_mhead == NULL)) {
5633 			cv_wait(&ldcp->tx_thr_cv, &ldcp->tx_thr_lock);
5634 		}
5635 		CALLB_CPR_SAFE_END(&cprinfo, &ldcp->tx_thr_lock)
5636 
5637 		/*
5638 		 * First process the stop request.
5639 		 */
5640 		if (ldcp->tx_thr_flags & VSW_WTHR_STOP) {
5641 			D2(vswp, "%s(%lld):tx thread stopped\n",
5642 			    __func__, ldcp->ldc_id);
5643 			break;
5644 		}
5645 		mp = ldcp->tx_mhead;
5646 		ldcp->tx_mhead = ldcp->tx_mtail = NULL;
5647 		ldcp->tx_cnt = 0;
5648 		mutex_exit(&ldcp->tx_thr_lock);
5649 		D2(vswp, "%s(%lld):calling vsw_ldcsend\n",
5650 		    __func__, ldcp->ldc_id);
5651 		while (mp != NULL) {
5652 			tmp = mp->b_next;
5653 			mp->b_next = mp->b_prev = NULL;
5654 			(void) vsw_ldcsend(ldcp, mp, vsw_ldc_tx_retries);
5655 			mp = tmp;
5656 		}
5657 		mutex_enter(&ldcp->tx_thr_lock);
5658 	}
5659 
5660 	/*
5661 	 * Update the run status and wakeup the thread that
5662 	 * has sent the stop request.
5663 	 */
5664 	ldcp->tx_thr_flags &= ~VSW_WTHR_RUNNING;
5665 	cv_signal(&ldcp->tx_thr_cv);
5666 	CALLB_CPR_EXIT(&cprinfo);
5667 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
5668 	thread_exit();
5669 }
5670 
5671 /* vsw_stop_tx_thread -- Co-ordinate with receive thread to stop it */
5672 static void
5673 vsw_stop_tx_thread(vsw_ldc_t *ldcp)
5674 {
5675 	vsw_t *vswp = ldcp->ldc_vswp;
5676 
5677 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
5678 	/*
5679 	 * Send a stop request by setting the stop flag and
5680 	 * wait until the receive thread stops.
5681 	 */
5682 	mutex_enter(&ldcp->tx_thr_lock);
5683 	if (ldcp->tx_thr_flags & VSW_WTHR_RUNNING) {
5684 		ldcp->tx_thr_flags |= VSW_WTHR_STOP;
5685 		cv_signal(&ldcp->tx_thr_cv);
5686 		while (ldcp->tx_thr_flags & VSW_WTHR_RUNNING) {
5687 			cv_wait(&ldcp->tx_thr_cv, &ldcp->tx_thr_lock);
5688 		}
5689 	}
5690 	mutex_exit(&ldcp->tx_thr_lock);
5691 	ldcp->tx_thread = NULL;
5692 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
5693 }
5694 
5695 /* vsw_reclaim_dring -- reclaim descriptors */
5696 static int
5697 vsw_reclaim_dring(dring_info_t *dp, int start)
5698 {
5699 	int i, j, len;
5700 	vsw_private_desc_t *priv_addr;
5701 	vnet_public_desc_t *pub_addr;
5702 
5703 	pub_addr = (vnet_public_desc_t *)dp->pub_addr;
5704 	priv_addr = (vsw_private_desc_t *)dp->priv_addr;
5705 	len = dp->num_descriptors;
5706 
5707 	D2(NULL, "%s: start index %ld\n", __func__, start);
5708 
5709 	j = 0;
5710 	for (i = start; j < len; i = (i + 1) % len, j++) {
5711 		pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
5712 		priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
5713 
5714 		mutex_enter(&priv_addr->dstate_lock);
5715 		if (pub_addr->hdr.dstate != VIO_DESC_DONE) {
5716 			mutex_exit(&priv_addr->dstate_lock);
5717 			break;
5718 		}
5719 		pub_addr->hdr.dstate = VIO_DESC_FREE;
5720 		priv_addr->dstate = VIO_DESC_FREE;
5721 		/* clear all the fields */
5722 		priv_addr->datalen = 0;
5723 		pub_addr->hdr.ack = 0;
5724 		mutex_exit(&priv_addr->dstate_lock);
5725 
5726 		D3(NULL, "claiming descp:%d pub state:0x%llx priv state 0x%llx",
5727 		    i, pub_addr->hdr.dstate, priv_addr->dstate);
5728 	}
5729 	return (j);
5730 }
5731 
5732 /*
5733  * Debugging routines
5734  */
5735 static void
5736 display_state(void)
5737 {
5738 	vsw_t		*vswp;
5739 	vsw_port_list_t	*plist;
5740 	vsw_port_t 	*port;
5741 	vsw_ldc_list_t	*ldcl;
5742 	vsw_ldc_t 	*ldcp;
5743 	extern vsw_t 	*vsw_head;
5744 
5745 	cmn_err(CE_NOTE, "***** system state *****");
5746 
5747 	for (vswp = vsw_head; vswp; vswp = vswp->next) {
5748 		plist = &vswp->plist;
5749 		READ_ENTER(&plist->lockrw);
5750 		cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n",
5751 		    vswp->instance, plist->num_ports);
5752 
5753 		for (port = plist->head; port != NULL; port = port->p_next) {
5754 			ldcl = &port->p_ldclist;
5755 			cmn_err(CE_CONT, "port %d : %d ldcs attached\n",
5756 			    port->p_instance, port->num_ldcs);
5757 			READ_ENTER(&ldcl->lockrw);
5758 			ldcp = ldcl->head;
5759 			for (; ldcp != NULL; ldcp = ldcp->ldc_next) {
5760 				cmn_err(CE_CONT, "chan %lu : dev %d : "
5761 				    "status %d : phase %u\n",
5762 				    ldcp->ldc_id, ldcp->dev_class,
5763 				    ldcp->ldc_status, ldcp->hphase);
5764 				cmn_err(CE_CONT, "chan %lu : lsession %lu : "
5765 				    "psession %lu\n", ldcp->ldc_id,
5766 				    ldcp->local_session, ldcp->peer_session);
5767 
5768 				cmn_err(CE_CONT, "Inbound lane:\n");
5769 				display_lane(&ldcp->lane_in);
5770 				cmn_err(CE_CONT, "Outbound lane:\n");
5771 				display_lane(&ldcp->lane_out);
5772 			}
5773 			RW_EXIT(&ldcl->lockrw);
5774 		}
5775 		RW_EXIT(&plist->lockrw);
5776 	}
5777 	cmn_err(CE_NOTE, "***** system state *****");
5778 }
5779 
5780 static void
5781 display_lane(lane_t *lp)
5782 {
5783 	dring_info_t	*drp;
5784 
5785 	cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n",
5786 	    lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu);
5787 	cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n",
5788 	    lp->addr_type, lp->addr, lp->xfer_mode);
5789 	cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp);
5790 
5791 	cmn_err(CE_CONT, "Dring info:\n");
5792 	for (drp = lp->dringp; drp != NULL; drp = drp->next) {
5793 		cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n",
5794 		    drp->num_descriptors, drp->descriptor_size);
5795 		cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->handle);
5796 		cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n",
5797 		    (uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr);
5798 		cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n",
5799 		    drp->ident, drp->end_idx);
5800 		display_ring(drp);
5801 	}
5802 }
5803 
5804 static void
5805 display_ring(dring_info_t *dringp)
5806 {
5807 	uint64_t		i;
5808 	uint64_t		priv_count = 0;
5809 	uint64_t		pub_count = 0;
5810 	vnet_public_desc_t	*pub_addr = NULL;
5811 	vsw_private_desc_t	*priv_addr = NULL;
5812 
5813 	for (i = 0; i < vsw_ntxds; i++) {
5814 		if (dringp->pub_addr != NULL) {
5815 			pub_addr = (vnet_public_desc_t *)dringp->pub_addr + i;
5816 
5817 			if (pub_addr->hdr.dstate == VIO_DESC_FREE)
5818 				pub_count++;
5819 		}
5820 
5821 		if (dringp->priv_addr != NULL) {
5822 			priv_addr = (vsw_private_desc_t *)dringp->priv_addr + i;
5823 
5824 			if (priv_addr->dstate == VIO_DESC_FREE)
5825 				priv_count++;
5826 		}
5827 	}
5828 	cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n",
5829 	    i, priv_count, pub_count);
5830 }
5831 
5832 static void
5833 dump_flags(uint64_t state)
5834 {
5835 	int	i;
5836 
5837 	typedef struct flag_name {
5838 		int	flag_val;
5839 		char	*flag_name;
5840 	} flag_name_t;
5841 
5842 	flag_name_t	flags[] = {
5843 		VSW_VER_INFO_SENT, "VSW_VER_INFO_SENT",
5844 		VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV",
5845 		VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV",
5846 		VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT",
5847 		VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV",
5848 		VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT",
5849 		VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT",
5850 		VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV",
5851 		VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT",
5852 		VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV",
5853 		VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT",
5854 		VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV",
5855 		VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT",
5856 		VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV",
5857 		VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT",
5858 		VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV",
5859 		VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT",
5860 		VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV",
5861 		VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT",
5862 		VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV",
5863 		VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT",
5864 		VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV",
5865 		VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT",
5866 		VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV",
5867 		VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT",
5868 		VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV",
5869 		VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT",
5870 		VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV",
5871 		VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT",
5872 		VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV",
5873 		VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE"};
5874 
5875 	DERR(NULL, "DUMP_FLAGS: %llx\n", state);
5876 	for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) {
5877 		if (state & flags[i].flag_val)
5878 			DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name);
5879 	}
5880 }
5881