xref: /illumos-gate/usr/src/uts/sun4v/io/vsw_ldc.c (revision 160abee025ef30c34521b981edd40ffcaab560aa)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/errno.h>
31 #include <sys/debug.h>
32 #include <sys/time.h>
33 #include <sys/sysmacros.h>
34 #include <sys/systm.h>
35 #include <sys/user.h>
36 #include <sys/stropts.h>
37 #include <sys/stream.h>
38 #include <sys/strlog.h>
39 #include <sys/strsubr.h>
40 #include <sys/cmn_err.h>
41 #include <sys/cpu.h>
42 #include <sys/kmem.h>
43 #include <sys/conf.h>
44 #include <sys/ddi.h>
45 #include <sys/sunddi.h>
46 #include <sys/ksynch.h>
47 #include <sys/stat.h>
48 #include <sys/kstat.h>
49 #include <sys/vtrace.h>
50 #include <sys/strsun.h>
51 #include <sys/dlpi.h>
52 #include <sys/ethernet.h>
53 #include <net/if.h>
54 #include <sys/varargs.h>
55 #include <sys/machsystm.h>
56 #include <sys/modctl.h>
57 #include <sys/modhash.h>
58 #include <sys/mac.h>
59 #include <sys/mac_ether.h>
60 #include <sys/taskq.h>
61 #include <sys/note.h>
62 #include <sys/mach_descrip.h>
63 #include <sys/mac.h>
64 #include <sys/mdeg.h>
65 #include <sys/ldc.h>
66 #include <sys/vsw_fdb.h>
67 #include <sys/vsw.h>
68 #include <sys/vio_mailbox.h>
69 #include <sys/vnet_mailbox.h>
70 #include <sys/vnet_common.h>
71 #include <sys/vio_util.h>
72 #include <sys/sdt.h>
73 #include <sys/atomic.h>
74 #include <sys/callb.h>
75 
76 /* Port add/deletion/etc routines */
77 static	int vsw_port_delete(vsw_port_t *port);
78 static	int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id);
79 static	int vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id);
80 static	int vsw_init_ldcs(vsw_port_t *port);
81 static	int vsw_uninit_ldcs(vsw_port_t *port);
82 static	int vsw_ldc_init(vsw_ldc_t *ldcp);
83 static	int vsw_ldc_uninit(vsw_ldc_t *ldcp);
84 static	int vsw_drain_ldcs(vsw_port_t *port);
85 static	int vsw_drain_port_taskq(vsw_port_t *port);
86 static	void vsw_marker_task(void *);
87 static	int vsw_plist_del_node(vsw_t *, vsw_port_t *port);
88 int vsw_detach_ports(vsw_t *vswp);
89 int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
90 mcst_addr_t *vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr);
91 int vsw_port_detach(vsw_t *vswp, int p_instance);
92 int vsw_portsend(vsw_port_t *port, mblk_t *mp, mblk_t *mpt);
93 int vsw_port_attach(vsw_t *vswp, int p_instance,
94 	uint64_t *ldcids, int nids, struct ether_addr *macaddr);
95 vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);
96 
97 
98 /* Interrupt routines */
99 static	uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg);
100 
101 /* Handshake routines */
102 static	void vsw_ldc_reinit(vsw_ldc_t *);
103 static	void vsw_process_conn_evt(vsw_ldc_t *, uint16_t);
104 static	void vsw_conn_task(void *);
105 static	int vsw_check_flag(vsw_ldc_t *, int, uint64_t);
106 static	void vsw_next_milestone(vsw_ldc_t *);
107 static	int vsw_supported_version(vio_ver_msg_t *);
108 
109 /* Data processing routines */
110 static void vsw_process_pkt(void *);
111 static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t);
112 static void vsw_process_ctrl_pkt(void *);
113 static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *);
114 static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *);
115 static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *);
116 static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *);
117 static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *);
118 static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *);
119 static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t);
120 static void vsw_process_data_dring_pkt(vsw_ldc_t *, void *);
121 static void vsw_process_data_raw_pkt(vsw_ldc_t *, void *);
122 static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *);
123 static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t);
124 
125 /* Switching/data transmit routines */
126 static	int vsw_dringsend(vsw_ldc_t *, mblk_t *);
127 static	int vsw_descrsend(vsw_ldc_t *, mblk_t *);
128 static int vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, int retries);
129 
130 /* Packet creation routines */
131 static void vsw_send_ver(void *);
132 static void vsw_send_attr(vsw_ldc_t *);
133 static vio_dring_reg_msg_t *vsw_create_dring_info_pkt(vsw_ldc_t *);
134 static void vsw_send_dring_info(vsw_ldc_t *);
135 static void vsw_send_rdx(vsw_ldc_t *);
136 static int vsw_send_msg(vsw_ldc_t *, void *, int, boolean_t);
137 
138 /* Dring routines */
139 static dring_info_t *vsw_create_dring(vsw_ldc_t *);
140 static void vsw_create_privring(vsw_ldc_t *);
141 static int vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp);
142 static int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **,
143     int *);
144 static dring_info_t *vsw_ident2dring(lane_t *, uint64_t);
145 static int vsw_reclaim_dring(dring_info_t *dp, int start);
146 
147 static void vsw_set_lane_attr(vsw_t *, lane_t *);
148 static int vsw_check_attr(vnet_attr_msg_t *, vsw_port_t *);
149 static int vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg);
150 static int vsw_mem_cookie_match(ldc_mem_cookie_t *, ldc_mem_cookie_t *);
151 static int vsw_check_dring_info(vio_dring_reg_msg_t *);
152 
153 /* Rcv/Tx thread routines */
154 static void vsw_stop_tx_thread(vsw_ldc_t *ldcp);
155 static void vsw_ldc_tx_worker(void *arg);
156 static uint_t vsw_rx_softintr(caddr_t arg1, caddr_t arg2);
157 static void vsw_stop_rx_thread(vsw_ldc_t *ldcp);
158 static void vsw_ldc_rx_worker(void *arg);
159 
160 /* Misc support routines */
161 static	caddr_t vsw_print_ethaddr(uint8_t *addr, char *ebuf);
162 static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t);
163 static int vsw_free_ring(dring_info_t *);
164 static void vsw_save_lmacaddr(vsw_t *vswp, uint64_t macaddr);
165 static int vsw_get_same_dest_list(struct ether_header *ehp,
166     mblk_t **rhead, mblk_t **rtail, mblk_t **mpp);
167 static mblk_t *vsw_dupmsgchain(mblk_t *mp);
168 static void vsw_mac_rx(vsw_t *vswp, int caller, mac_resource_handle_t mrh,
169     mblk_t *mp, mblk_t *mpt, vsw_macrx_flags_t flags);
170 
171 /* Debugging routines */
172 static void dump_flags(uint64_t);
173 static void display_state(void);
174 static void display_lane(lane_t *);
175 static void display_ring(dring_info_t *);
176 
177 /*
178  * Functions imported from other files.
179  */
180 extern int vsw_set_hw(vsw_t *, vsw_port_t *, int);
181 extern int vsw_unset_hw(vsw_t *, vsw_port_t *, int);
182 extern void vsw_reconfig_hw(vsw_t *);
183 extern int vsw_add_fdb(vsw_t *vswp, vsw_port_t *port);
184 extern int vsw_del_fdb(vsw_t *vswp, vsw_port_t *port);
185 extern int vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port);
186 extern void vsw_del_mcst_port(vsw_port_t *port);
187 extern int vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
188 extern int vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
189 
190 #define	VSW_NUM_VMPOOLS		3	/* number of vio mblk pools */
191 #define	VSW_PORT_REF_DELAY	30	/* delay for port ref_cnt to become 0 */
192 
193 /*
194  * Tunables used in this file.
195  */
196 extern int vsw_num_handshakes;
197 extern int vsw_wretries;
198 extern int vsw_desc_delay;
199 extern int vsw_read_attempts;
200 extern int vsw_ldc_tx_delay;
201 extern int vsw_ldc_tx_retries;
202 extern int vsw_ldc_tx_max_failures;
203 extern boolean_t vsw_ldc_rxthr_enabled;
204 extern boolean_t vsw_ldc_txthr_enabled;
205 extern uint32_t vsw_chain_len;
206 extern uint32_t vsw_mblk_size1;
207 extern uint32_t vsw_mblk_size2;
208 extern uint32_t vsw_mblk_size3;
209 extern uint32_t vsw_num_mblks1;
210 extern uint32_t vsw_num_mblks2;
211 extern uint32_t vsw_num_mblks3;
212 
213 
/*
 * Take / drop the three per-channel locks as a group.
 *
 * The locks are always acquired in the order ldc_cblock, ldc_rxlock,
 * ldc_txlock and released in the reverse order.  Each macro is wrapped
 * in do { } while (0) so that it expands to exactly one statement and
 * can be used safely as the body of an unbraced if/else.
 */
#define	LDC_ENTER_LOCK(ldcp)	\
	do {	\
		mutex_enter(&((ldcp)->ldc_cblock));	\
		mutex_enter(&((ldcp)->ldc_rxlock));	\
		mutex_enter(&((ldcp)->ldc_txlock));	\
	} while (0)
#define	LDC_EXIT_LOCK(ldcp)	\
	do {	\
		mutex_exit(&((ldcp)->ldc_txlock));	\
		mutex_exit(&((ldcp)->ldc_rxlock));	\
		mutex_exit(&((ldcp)->ldc_cblock));	\
	} while (0)
222 
223 
224 /* supported versions */
225 static	ver_sup_t	vsw_versions[] = { {1, 0} };
226 
/*
 * For the moment the state dump routines have their own
 * private flag.  Set DUMP_STATE to a non-zero value to compile in
 * the tag/flag/state dump helpers; when it is zero they expand to
 * nothing.
 */
#define	DUMP_STATE	0

#if DUMP_STATE

/*
 * Print the three tag fields of a message tag passed by value.
 * Wrapped in do { } while (0) so the macro is a single statement.
 */
#define	DUMP_TAG(tag) \
do {			\
	D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype); \
	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype);	\
	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env);	\
} while (0)

/* As DUMP_TAG, but for a tag passed by pointer. */
#define	DUMP_TAG_PTR(tag) \
do {			\
	D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype); \
	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype);	\
	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env);	\
} while (0)

/*
 * NOTE(review): the trailing semicolon is kept as in the original
 * definition because the call sites are not all visible here.
 */
#define	DUMP_FLAGS(flags) dump_flags(flags);
#define	DISPLAY_STATE()	display_state()

#else

#define	DUMP_TAG(tag)
#define	DUMP_TAG_PTR(tag)
#define	DUMP_FLAGS(state)
#define	DISPLAY_STATE()

#endif	/* DUMP_STATE */
260 
261 /*
262  * Attach the specified port.
263  *
264  * Returns 0 on success, 1 on failure.
265  */
266 int
267 vsw_port_attach(vsw_t *vswp, int p_instance, uint64_t *ldcids, int nids,
268 struct ether_addr *macaddr)
269 {
270 	vsw_port_list_t		*plist = &vswp->plist;
271 	vsw_port_t		*port, **prev_port;
272 	int			i;
273 
274 	D1(vswp, "%s: enter : port %d", __func__, p_instance);
275 
276 	/* port already exists? */
277 	READ_ENTER(&plist->lockrw);
278 	for (port = plist->head; port != NULL; port = port->p_next) {
279 		if (port->p_instance == p_instance) {
280 			DWARN(vswp, "%s: port instance %d already attached",
281 			    __func__, p_instance);
282 			RW_EXIT(&plist->lockrw);
283 			return (1);
284 		}
285 	}
286 	RW_EXIT(&plist->lockrw);
287 
288 	port = kmem_zalloc(sizeof (vsw_port_t), KM_SLEEP);
289 	port->p_vswp = vswp;
290 	port->p_instance = p_instance;
291 	port->p_ldclist.num_ldcs = 0;
292 	port->p_ldclist.head = NULL;
293 	port->addr_set = VSW_ADDR_UNSET;
294 
295 	rw_init(&port->p_ldclist.lockrw, NULL, RW_DRIVER, NULL);
296 
297 	mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL);
298 	mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL);
299 
300 	mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL);
301 	cv_init(&port->state_cv, NULL, CV_DRIVER, NULL);
302 	port->state = VSW_PORT_INIT;
303 
304 	if (nids > VSW_PORT_MAX_LDCS) {
305 		D2(vswp, "%s: using first of %d ldc ids",
306 		    __func__, nids);
307 		nids = VSW_PORT_MAX_LDCS;
308 	}
309 
310 	D2(vswp, "%s: %d nids", __func__, nids);
311 	for (i = 0; i < nids; i++) {
312 		D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[i]);
313 		if (vsw_ldc_attach(port, (uint64_t)ldcids[i]) != 0) {
314 			DERR(vswp, "%s: ldc_attach failed", __func__);
315 
316 			rw_destroy(&port->p_ldclist.lockrw);
317 
318 			cv_destroy(&port->state_cv);
319 			mutex_destroy(&port->state_lock);
320 
321 			mutex_destroy(&port->tx_lock);
322 			mutex_destroy(&port->mca_lock);
323 			kmem_free(port, sizeof (vsw_port_t));
324 			return (1);
325 		}
326 	}
327 
328 	ether_copy(macaddr, &port->p_macaddr);
329 
330 	if (vswp->switching_setup_done == B_TRUE) {
331 		/*
332 		 * If the underlying physical device has been setup,
333 		 * program the mac address of this port in it.
334 		 * Otherwise, port macaddr will be set after the physical
335 		 * device is successfully setup by the timeout handler.
336 		 */
337 		mutex_enter(&vswp->hw_lock);
338 		(void) vsw_set_hw(vswp, port, VSW_VNETPORT);
339 		mutex_exit(&vswp->hw_lock);
340 	}
341 
342 	WRITE_ENTER(&plist->lockrw);
343 
344 	/* create the fdb entry for this port/mac address */
345 	(void) vsw_add_fdb(vswp, port);
346 
347 	/* link it into the list of ports for this vsw instance */
348 	prev_port = (vsw_port_t **)(&plist->head);
349 	port->p_next = *prev_port;
350 	*prev_port = port;
351 	plist->num_ports++;
352 
353 	RW_EXIT(&plist->lockrw);
354 
355 	/*
356 	 * Initialise the port and any ldc's under it.
357 	 */
358 	(void) vsw_init_ldcs(port);
359 
360 	D1(vswp, "%s: exit", __func__);
361 	return (0);
362 }
363 
364 /*
365  * Detach the specified port.
366  *
367  * Returns 0 on success, 1 on failure.
368  */
369 int
370 vsw_port_detach(vsw_t *vswp, int p_instance)
371 {
372 	vsw_port_t	*port = NULL;
373 	vsw_port_list_t	*plist = &vswp->plist;
374 
375 	D1(vswp, "%s: enter: port id %d", __func__, p_instance);
376 
377 	WRITE_ENTER(&plist->lockrw);
378 
379 	if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) {
380 		RW_EXIT(&plist->lockrw);
381 		return (1);
382 	}
383 
384 	if (vsw_plist_del_node(vswp, port)) {
385 		RW_EXIT(&plist->lockrw);
386 		return (1);
387 	}
388 
389 	/* Remove the fdb entry for this port/mac address */
390 	(void) vsw_del_fdb(vswp, port);
391 
392 	/* Remove any multicast addresses.. */
393 	vsw_del_mcst_port(port);
394 
395 	/*
396 	 * No longer need to hold writer lock on port list now
397 	 * that we have unlinked the target port from the list.
398 	 */
399 	RW_EXIT(&plist->lockrw);
400 
401 	/* Remove address if was programmed into HW. */
402 	mutex_enter(&vswp->hw_lock);
403 
404 	/*
405 	 * Port's address may not have been set in hardware. This could
406 	 * happen if the underlying physical device is not yet available and
407 	 * vsw_setup_switching_timeout() may be in progress.
408 	 * We remove its addr from hardware only if it has been set before.
409 	 */
410 	if (port->addr_set != VSW_ADDR_UNSET)
411 		(void) vsw_unset_hw(vswp, port, VSW_VNETPORT);
412 
413 	if (vswp->recfg_reqd)
414 		vsw_reconfig_hw(vswp);
415 
416 	mutex_exit(&vswp->hw_lock);
417 
418 	if (vsw_port_delete(port)) {
419 		return (1);
420 	}
421 
422 	D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance);
423 	return (0);
424 }
425 
426 /*
427  * Detach all active ports.
428  *
429  * Returns 0 on success, 1 on failure.
430  */
431 int
432 vsw_detach_ports(vsw_t *vswp)
433 {
434 	vsw_port_list_t 	*plist = &vswp->plist;
435 	vsw_port_t		*port = NULL;
436 
437 	D1(vswp, "%s: enter", __func__);
438 
439 	WRITE_ENTER(&plist->lockrw);
440 
441 	while ((port = plist->head) != NULL) {
442 		if (vsw_plist_del_node(vswp, port)) {
443 			DERR(vswp, "%s: Error deleting port %d"
444 			    " from port list", __func__, port->p_instance);
445 			RW_EXIT(&plist->lockrw);
446 			return (1);
447 		}
448 
449 		/* Remove address if was programmed into HW. */
450 		mutex_enter(&vswp->hw_lock);
451 		(void) vsw_unset_hw(vswp, port, VSW_VNETPORT);
452 		mutex_exit(&vswp->hw_lock);
453 
454 		/* Remove the fdb entry for this port/mac address */
455 		(void) vsw_del_fdb(vswp, port);
456 
457 		/* Remove any multicast addresses.. */
458 		vsw_del_mcst_port(port);
459 
460 		/*
461 		 * No longer need to hold the lock on the port list
462 		 * now that we have unlinked the target port from the
463 		 * list.
464 		 */
465 		RW_EXIT(&plist->lockrw);
466 		if (vsw_port_delete(port)) {
467 			DERR(vswp, "%s: Error deleting port %d",
468 			    __func__, port->p_instance);
469 			return (1);
470 		}
471 		WRITE_ENTER(&plist->lockrw);
472 	}
473 	RW_EXIT(&plist->lockrw);
474 
475 	D1(vswp, "%s: exit", __func__);
476 
477 	return (0);
478 }
479 
480 /*
481  * Delete the specified port.
482  *
483  * Returns 0 on success, 1 on failure.
484  */
485 static int
486 vsw_port_delete(vsw_port_t *port)
487 {
488 	vsw_ldc_list_t 		*ldcl;
489 	vsw_t			*vswp = port->p_vswp;
490 
491 	D1(vswp, "%s: enter : port id %d", __func__, port->p_instance);
492 
493 	(void) vsw_uninit_ldcs(port);
494 
495 	/*
496 	 * Wait for any pending ctrl msg tasks which reference this
497 	 * port to finish.
498 	 */
499 	if (vsw_drain_port_taskq(port))
500 		return (1);
501 
502 	/*
503 	 * Wait for port reference count to hit zero.
504 	 */
505 	while (port->ref_cnt != 0) {
506 		delay(drv_usectohz(VSW_PORT_REF_DELAY));
507 	}
508 
509 	/*
510 	 * Wait for any active callbacks to finish
511 	 */
512 	if (vsw_drain_ldcs(port))
513 		return (1);
514 
515 	ldcl = &port->p_ldclist;
516 	WRITE_ENTER(&ldcl->lockrw);
517 	while (ldcl->num_ldcs > 0) {
518 		if (vsw_ldc_detach(port, ldcl->head->ldc_id) != 0) {
519 			cmn_err(CE_WARN, "!vsw%d: unable to detach ldc %ld",
520 			    vswp->instance, ldcl->head->ldc_id);
521 			RW_EXIT(&ldcl->lockrw);
522 			return (1);
523 		}
524 	}
525 	RW_EXIT(&ldcl->lockrw);
526 
527 	rw_destroy(&port->p_ldclist.lockrw);
528 
529 	mutex_destroy(&port->mca_lock);
530 	mutex_destroy(&port->tx_lock);
531 	cv_destroy(&port->state_cv);
532 	mutex_destroy(&port->state_lock);
533 
534 	kmem_free(port, sizeof (vsw_port_t));
535 
536 	D1(vswp, "%s: exit", __func__);
537 
538 	return (0);
539 }
540 
541 /*
542  * Attach a logical domain channel (ldc) under a specified port.
543  *
544  * Returns 0 on success, 1 on failure.
545  */
546 static int
547 vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id)
548 {
549 	vsw_t 		*vswp = port->p_vswp;
550 	vsw_ldc_list_t *ldcl = &port->p_ldclist;
551 	vsw_ldc_t 	*ldcp = NULL;
552 	ldc_attr_t 	attr;
553 	ldc_status_t	istatus;
554 	int 		status = DDI_FAILURE;
555 	int		rv;
556 	char		kname[MAXNAMELEN];
557 	enum		{ PROG_init = 0x0, PROG_mblks = 0x1,
558 			    PROG_callback = 0x2, PROG_rx_thread = 0x4,
559 			    PROG_tx_thread = 0x8}
560 			progress;
561 
562 	progress = PROG_init;
563 
564 	D1(vswp, "%s: enter", __func__);
565 
566 	ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP);
567 	if (ldcp == NULL) {
568 		DERR(vswp, "%s: kmem_zalloc failed", __func__);
569 		return (1);
570 	}
571 	ldcp->ldc_id = ldc_id;
572 
573 	/* Allocate pools of receive mblks */
574 	rv = vio_init_multipools(&ldcp->vmp, VSW_NUM_VMPOOLS,
575 	    vsw_mblk_size1, vsw_mblk_size2, vsw_mblk_size3,
576 	    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3);
577 	if (rv) {
578 		DWARN(vswp, "%s: unable to create free mblk pools for"
579 		    " channel %ld (rv %d)", __func__, ldc_id, rv);
580 		kmem_free(ldcp, sizeof (vsw_ldc_t));
581 		return (1);
582 	}
583 
584 	progress |= PROG_mblks;
585 
586 	mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL);
587 	mutex_init(&ldcp->ldc_rxlock, NULL, MUTEX_DRIVER, NULL);
588 	mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL);
589 	mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL);
590 	cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL);
591 	rw_init(&ldcp->lane_in.dlistrw, NULL, RW_DRIVER, NULL);
592 	rw_init(&ldcp->lane_out.dlistrw, NULL, RW_DRIVER, NULL);
593 
594 	/* required for handshake with peer */
595 	ldcp->local_session = (uint64_t)ddi_get_lbolt();
596 	ldcp->peer_session = 0;
597 	ldcp->session_status = 0;
598 	ldcp->hss_id = 1;	/* Initial handshake session id */
599 
600 	/* only set for outbound lane, inbound set by peer */
601 	vsw_set_lane_attr(vswp, &ldcp->lane_out);
602 
603 	attr.devclass = LDC_DEV_NT_SVC;
604 	attr.instance = ddi_get_instance(vswp->dip);
605 	attr.mode = LDC_MODE_UNRELIABLE;
606 	attr.mtu = VSW_LDC_MTU;
607 	status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle);
608 	if (status != 0) {
609 		DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)",
610 		    __func__, ldc_id, status);
611 		goto ldc_attach_fail;
612 	}
613 
614 	if (vsw_ldc_rxthr_enabled) {
615 		ldcp->rx_thr_flags = 0;
616 
617 		mutex_init(&ldcp->rx_thr_lock, NULL, MUTEX_DRIVER, NULL);
618 		cv_init(&ldcp->rx_thr_cv, NULL, CV_DRIVER, NULL);
619 		ldcp->rx_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
620 		    vsw_ldc_rx_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
621 
622 		progress |= PROG_rx_thread;
623 		if (ldcp->rx_thread == NULL) {
624 			DWARN(vswp, "%s(%lld): Failed to create worker thread",
625 			    __func__, ldc_id);
626 			goto ldc_attach_fail;
627 		}
628 	}
629 
630 	if (vsw_ldc_txthr_enabled) {
631 		ldcp->tx_thr_flags = 0;
632 		ldcp->tx_mhead = ldcp->tx_mtail = NULL;
633 
634 		mutex_init(&ldcp->tx_thr_lock, NULL, MUTEX_DRIVER, NULL);
635 		cv_init(&ldcp->tx_thr_cv, NULL, CV_DRIVER, NULL);
636 		ldcp->tx_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
637 		    vsw_ldc_tx_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
638 
639 		progress |= PROG_tx_thread;
640 		if (ldcp->tx_thread == NULL) {
641 			DWARN(vswp, "%s(%lld): Failed to create worker thread",
642 			    __func__, ldc_id);
643 			goto ldc_attach_fail;
644 		}
645 	}
646 
647 	status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp);
648 	if (status != 0) {
649 		DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)",
650 		    __func__, ldc_id, status);
651 		(void) ldc_fini(ldcp->ldc_handle);
652 		goto ldc_attach_fail;
653 	}
654 
655 	progress |= PROG_callback;
656 
657 	mutex_init(&ldcp->status_lock, NULL, MUTEX_DRIVER, NULL);
658 
659 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
660 		DERR(vswp, "%s: ldc_status failed", __func__);
661 		mutex_destroy(&ldcp->status_lock);
662 		goto ldc_attach_fail;
663 	}
664 
665 	ldcp->ldc_status = istatus;
666 	ldcp->ldc_port = port;
667 	ldcp->ldc_vswp = vswp;
668 
669 	(void) sprintf(kname, "%sldc0x%lx", DRV_NAME, ldcp->ldc_id);
670 	ldcp->ksp = vgen_setup_kstats(DRV_NAME, vswp->instance,
671 	    kname, &ldcp->ldc_stats);
672 	if (ldcp->ksp == NULL) {
673 		DERR(vswp, "%s: kstats setup failed", __func__);
674 		goto ldc_attach_fail;
675 	}
676 
677 	/* link it into the list of channels for this port */
678 	WRITE_ENTER(&ldcl->lockrw);
679 	ldcp->ldc_next = ldcl->head;
680 	ldcl->head = ldcp;
681 	ldcl->num_ldcs++;
682 	RW_EXIT(&ldcl->lockrw);
683 
684 	D1(vswp, "%s: exit", __func__);
685 	return (0);
686 
687 ldc_attach_fail:
688 
689 	if (progress & PROG_callback) {
690 		(void) ldc_unreg_callback(ldcp->ldc_handle);
691 	}
692 
693 	if (progress & PROG_rx_thread) {
694 		if (ldcp->rx_thread != NULL) {
695 			vsw_stop_rx_thread(ldcp);
696 		}
697 		mutex_destroy(&ldcp->rx_thr_lock);
698 		cv_destroy(&ldcp->rx_thr_cv);
699 	}
700 
701 	if (progress & PROG_tx_thread) {
702 		if (ldcp->tx_thread != NULL) {
703 			vsw_stop_tx_thread(ldcp);
704 		}
705 		mutex_destroy(&ldcp->tx_thr_lock);
706 		cv_destroy(&ldcp->tx_thr_cv);
707 	}
708 	if (ldcp->ksp != NULL) {
709 		vgen_destroy_kstats(ldcp->ksp);
710 	}
711 	mutex_destroy(&ldcp->ldc_txlock);
712 	mutex_destroy(&ldcp->ldc_rxlock);
713 	mutex_destroy(&ldcp->ldc_cblock);
714 	mutex_destroy(&ldcp->drain_cv_lock);
715 
716 	cv_destroy(&ldcp->drain_cv);
717 
718 	rw_destroy(&ldcp->lane_in.dlistrw);
719 	rw_destroy(&ldcp->lane_out.dlistrw);
720 
721 	if (progress & PROG_mblks) {
722 		vio_destroy_multipools(&ldcp->vmp, &vswp->rxh);
723 	}
724 	kmem_free(ldcp, sizeof (vsw_ldc_t));
725 
726 	return (1);
727 }
728 
729 /*
730  * Detach a logical domain channel (ldc) belonging to a
731  * particular port.
732  *
733  * Returns 0 on success, 1 on failure.
734  */
735 static int
736 vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id)
737 {
738 	vsw_t 		*vswp = port->p_vswp;
739 	vsw_ldc_t 	*ldcp, *prev_ldcp;
740 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
741 	int 		rv;
742 
743 	prev_ldcp = ldcl->head;
744 	for (; (ldcp = prev_ldcp) != NULL; prev_ldcp = ldcp->ldc_next) {
745 		if (ldcp->ldc_id == ldc_id) {
746 			break;
747 		}
748 	}
749 
750 	/* specified ldc id not found */
751 	if (ldcp == NULL) {
752 		DERR(vswp, "%s: ldcp = NULL", __func__);
753 		return (1);
754 	}
755 
756 	D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id);
757 
758 	/* Stop the receive thread */
759 	if (ldcp->rx_thread != NULL) {
760 		vsw_stop_rx_thread(ldcp);
761 		mutex_destroy(&ldcp->rx_thr_lock);
762 		cv_destroy(&ldcp->rx_thr_cv);
763 	}
764 
765 	/* Stop the tx thread */
766 	if (ldcp->tx_thread != NULL) {
767 		vsw_stop_tx_thread(ldcp);
768 		mutex_destroy(&ldcp->tx_thr_lock);
769 		cv_destroy(&ldcp->tx_thr_cv);
770 		if (ldcp->tx_mhead != NULL) {
771 			freemsgchain(ldcp->tx_mhead);
772 			ldcp->tx_mhead = ldcp->tx_mtail = NULL;
773 		}
774 	}
775 
776 	/* Destory kstats */
777 	vgen_destroy_kstats(ldcp->ksp);
778 
779 	/*
780 	 * Before we can close the channel we must release any mapped
781 	 * resources (e.g. drings).
782 	 */
783 	vsw_free_lane_resources(ldcp, INBOUND);
784 	vsw_free_lane_resources(ldcp, OUTBOUND);
785 
786 	/*
787 	 * If the close fails we are in serious trouble, as won't
788 	 * be able to delete the parent port.
789 	 */
790 	if ((rv = ldc_close(ldcp->ldc_handle)) != 0) {
791 		DERR(vswp, "%s: error %d closing channel %lld",
792 		    __func__, rv, ldcp->ldc_id);
793 		return (1);
794 	}
795 
796 	(void) ldc_fini(ldcp->ldc_handle);
797 
798 	ldcp->ldc_status = LDC_INIT;
799 	ldcp->ldc_handle = NULL;
800 	ldcp->ldc_vswp = NULL;
801 
802 
803 	/*
804 	 * Most likely some mblks are still in use and
805 	 * have not been returned to the pool. These mblks are
806 	 * added to the pool that is maintained in the device instance.
807 	 * Another attempt will be made to destroy the pool
808 	 * when the device detaches.
809 	 */
810 	vio_destroy_multipools(&ldcp->vmp, &vswp->rxh);
811 
812 	/* unlink it from the list */
813 	prev_ldcp = ldcp->ldc_next;
814 	ldcl->num_ldcs--;
815 
816 	mutex_destroy(&ldcp->ldc_txlock);
817 	mutex_destroy(&ldcp->ldc_rxlock);
818 	mutex_destroy(&ldcp->ldc_cblock);
819 	cv_destroy(&ldcp->drain_cv);
820 	mutex_destroy(&ldcp->drain_cv_lock);
821 	mutex_destroy(&ldcp->status_lock);
822 	rw_destroy(&ldcp->lane_in.dlistrw);
823 	rw_destroy(&ldcp->lane_out.dlistrw);
824 
825 	kmem_free(ldcp, sizeof (vsw_ldc_t));
826 
827 	return (0);
828 }
829 
830 /*
831  * Open and attempt to bring up the channel. Note that channel
832  * can only be brought up if peer has also opened channel.
833  *
834  * Returns 0 if can open and bring up channel, otherwise
835  * returns 1.
836  */
837 static int
838 vsw_ldc_init(vsw_ldc_t *ldcp)
839 {
840 	vsw_t 		*vswp = ldcp->ldc_vswp;
841 	ldc_status_t	istatus = 0;
842 	int		rv;
843 
844 	D1(vswp, "%s: enter", __func__);
845 
846 	LDC_ENTER_LOCK(ldcp);
847 
848 	/* don't start at 0 in case clients don't like that */
849 	ldcp->next_ident = 1;
850 
851 	rv = ldc_open(ldcp->ldc_handle);
852 	if (rv != 0) {
853 		DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)",
854 		    __func__, ldcp->ldc_id, rv);
855 		LDC_EXIT_LOCK(ldcp);
856 		return (1);
857 	}
858 
859 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
860 		DERR(vswp, "%s: unable to get status", __func__);
861 		LDC_EXIT_LOCK(ldcp);
862 		return (1);
863 
864 	} else if (istatus != LDC_OPEN && istatus != LDC_READY) {
865 		DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY",
866 		    __func__, ldcp->ldc_id, istatus);
867 		LDC_EXIT_LOCK(ldcp);
868 		return (1);
869 	}
870 
871 	mutex_enter(&ldcp->status_lock);
872 	ldcp->ldc_status = istatus;
873 	mutex_exit(&ldcp->status_lock);
874 
875 	rv = ldc_up(ldcp->ldc_handle);
876 	if (rv != 0) {
877 		/*
878 		 * Not a fatal error for ldc_up() to fail, as peer
879 		 * end point may simply not be ready yet.
880 		 */
881 		D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__,
882 		    ldcp->ldc_id, rv);
883 		LDC_EXIT_LOCK(ldcp);
884 		return (1);
885 	}
886 
887 	/*
888 	 * ldc_up() call is non-blocking so need to explicitly
889 	 * check channel status to see if in fact the channel
890 	 * is UP.
891 	 */
892 	mutex_enter(&ldcp->status_lock);
893 	if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) {
894 		DERR(vswp, "%s: unable to get status", __func__);
895 		mutex_exit(&ldcp->status_lock);
896 		LDC_EXIT_LOCK(ldcp);
897 		return (1);
898 
899 	}
900 
901 	if (ldcp->ldc_status == LDC_UP) {
902 		D2(vswp, "%s: channel %ld now UP (%ld)", __func__,
903 		    ldcp->ldc_id, istatus);
904 		mutex_exit(&ldcp->status_lock);
905 		LDC_EXIT_LOCK(ldcp);
906 
907 		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
908 		return (0);
909 	}
910 
911 	mutex_exit(&ldcp->status_lock);
912 	LDC_EXIT_LOCK(ldcp);
913 
914 	D1(vswp, "%s: exit", __func__);
915 	return (0);
916 }
917 
918 /* disable callbacks on the channel */
919 static int
920 vsw_ldc_uninit(vsw_ldc_t *ldcp)
921 {
922 	vsw_t	*vswp = ldcp->ldc_vswp;
923 	int	rv;
924 
925 	D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id);
926 
927 	LDC_ENTER_LOCK(ldcp);
928 
929 	rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE);
930 	if (rv != 0) {
931 		DERR(vswp, "vsw_ldc_uninit(%lld): error disabling "
932 		    "interrupts (rv = %d)\n", ldcp->ldc_id, rv);
933 		LDC_EXIT_LOCK(ldcp);
934 		return (1);
935 	}
936 
937 	mutex_enter(&ldcp->status_lock);
938 	ldcp->ldc_status = LDC_INIT;
939 	mutex_exit(&ldcp->status_lock);
940 
941 	LDC_EXIT_LOCK(ldcp);
942 
943 	D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id);
944 
945 	return (0);
946 }
947 
948 static int
949 vsw_init_ldcs(vsw_port_t *port)
950 {
951 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
952 	vsw_ldc_t	*ldcp;
953 
954 	READ_ENTER(&ldcl->lockrw);
955 	ldcp =  ldcl->head;
956 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
957 		(void) vsw_ldc_init(ldcp);
958 	}
959 	RW_EXIT(&ldcl->lockrw);
960 
961 	return (0);
962 }
963 
964 static int
965 vsw_uninit_ldcs(vsw_port_t *port)
966 {
967 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
968 	vsw_ldc_t	*ldcp;
969 
970 	D1(NULL, "vsw_uninit_ldcs: enter\n");
971 
972 	READ_ENTER(&ldcl->lockrw);
973 	ldcp =  ldcl->head;
974 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
975 		(void) vsw_ldc_uninit(ldcp);
976 	}
977 	RW_EXIT(&ldcl->lockrw);
978 
979 	D1(NULL, "vsw_uninit_ldcs: exit\n");
980 
981 	return (0);
982 }
983 
984 /*
985  * Wait until the callback(s) associated with the ldcs under the specified
986  * port have completed.
987  *
988  * Prior to this function being invoked each channel under this port
989  * should have been quiesced via ldc_set_cb_mode(DISABLE).
990  *
991  * A short explaination of what we are doing below..
992  *
993  * The simplest approach would be to have a reference counter in
994  * the ldc structure which is increment/decremented by the callbacks as
995  * they use the channel. The drain function could then simply disable any
996  * further callbacks and do a cv_wait for the ref to hit zero. Unfortunately
997  * there is a tiny window here - before the callback is able to get the lock
998  * on the channel it is interrupted and this function gets to execute. It
999  * sees that the ref count is zero and believes its free to delete the
1000  * associated data structures.
1001  *
1002  * We get around this by taking advantage of the fact that before the ldc
1003  * framework invokes a callback it sets a flag to indicate that there is a
1004  * callback active (or about to become active). If when we attempt to
1005  * unregister a callback when this active flag is set then the unregister
1006  * will fail with EWOULDBLOCK.
1007  *
1008  * If the unregister fails we do a cv_timedwait. We will either be signaled
1009  * by the callback as it is exiting (note we have to wait a short period to
1010  * allow the callback to return fully to the ldc framework and it to clear
1011  * the active flag), or by the timer expiring. In either case we again attempt
1012  * the unregister. We repeat this until we can succesfully unregister the
1013  * callback.
1014  *
1015  * The reason we use a cv_timedwait rather than a simple cv_wait is to catch
1016  * the case where the callback has finished but the ldc framework has not yet
1017  * cleared the active flag. In this case we would never get a cv_signal.
1018  */
1019 static int
1020 vsw_drain_ldcs(vsw_port_t *port)
1021 {
1022 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
1023 	vsw_ldc_t	*ldcp;
1024 	vsw_t		*vswp = port->p_vswp;
1025 
1026 	D1(vswp, "%s: enter", __func__);
1027 
1028 	READ_ENTER(&ldcl->lockrw);
1029 
1030 	ldcp = ldcl->head;
1031 
1032 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
1033 		/*
1034 		 * If we can unregister the channel callback then we
1035 		 * know that there is no callback either running or
1036 		 * scheduled to run for this channel so move on to next
1037 		 * channel in the list.
1038 		 */
1039 		mutex_enter(&ldcp->drain_cv_lock);
1040 
1041 		/* prompt active callbacks to quit */
1042 		ldcp->drain_state = VSW_LDC_DRAINING;
1043 
1044 		if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) {
1045 			D2(vswp, "%s: unreg callback for chan %ld", __func__,
1046 			    ldcp->ldc_id);
1047 			mutex_exit(&ldcp->drain_cv_lock);
1048 			continue;
1049 		} else {
1050 			/*
1051 			 * If we end up here we know that either 1) a callback
1052 			 * is currently executing, 2) is about to start (i.e.
1053 			 * the ldc framework has set the active flag but
1054 			 * has not actually invoked the callback yet, or 3)
1055 			 * has finished and has returned to the ldc framework
1056 			 * but the ldc framework has not yet cleared the
1057 			 * active bit.
1058 			 *
1059 			 * Wait for it to finish.
1060 			 */
1061 			while (ldc_unreg_callback(ldcp->ldc_handle)
1062 			    == EWOULDBLOCK)
1063 				(void) cv_timedwait(&ldcp->drain_cv,
1064 				    &ldcp->drain_cv_lock, lbolt + hz);
1065 
1066 			mutex_exit(&ldcp->drain_cv_lock);
1067 			D2(vswp, "%s: unreg callback for chan %ld after "
1068 			    "timeout", __func__, ldcp->ldc_id);
1069 		}
1070 	}
1071 	RW_EXIT(&ldcl->lockrw);
1072 
1073 	D1(vswp, "%s: exit", __func__);
1074 	return (0);
1075 }
1076 
1077 /*
1078  * Wait until all tasks which reference this port have completed.
1079  *
1080  * Prior to this function being invoked each channel under this port
1081  * should have been quiesced via ldc_set_cb_mode(DISABLE).
1082  */
1083 static int
1084 vsw_drain_port_taskq(vsw_port_t *port)
1085 {
1086 	vsw_t		*vswp = port->p_vswp;
1087 
1088 	D1(vswp, "%s: enter", __func__);
1089 
1090 	/*
1091 	 * Mark the port as in the process of being detached, and
1092 	 * dispatch a marker task to the queue so we know when all
1093 	 * relevant tasks have completed.
1094 	 */
1095 	mutex_enter(&port->state_lock);
1096 	port->state = VSW_PORT_DETACHING;
1097 
1098 	if ((vswp->taskq_p == NULL) ||
1099 	    (ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task,
1100 	    port, DDI_NOSLEEP) != DDI_SUCCESS)) {
1101 		DERR(vswp, "%s: unable to dispatch marker task",
1102 		    __func__);
1103 		mutex_exit(&port->state_lock);
1104 		return (1);
1105 	}
1106 
1107 	/*
1108 	 * Wait for the marker task to finish.
1109 	 */
1110 	while (port->state != VSW_PORT_DETACHABLE)
1111 		cv_wait(&port->state_cv, &port->state_lock);
1112 
1113 	mutex_exit(&port->state_lock);
1114 
1115 	D1(vswp, "%s: exit", __func__);
1116 
1117 	return (0);
1118 }
1119 
1120 static void
1121 vsw_marker_task(void *arg)
1122 {
1123 	vsw_port_t	*port = arg;
1124 	vsw_t		*vswp = port->p_vswp;
1125 
1126 	D1(vswp, "%s: enter", __func__);
1127 
1128 	mutex_enter(&port->state_lock);
1129 
1130 	/*
1131 	 * No further tasks should be dispatched which reference
1132 	 * this port so ok to mark it as safe to detach.
1133 	 */
1134 	port->state = VSW_PORT_DETACHABLE;
1135 
1136 	cv_signal(&port->state_cv);
1137 
1138 	mutex_exit(&port->state_lock);
1139 
1140 	D1(vswp, "%s: exit", __func__);
1141 }
1142 
1143 vsw_port_t *
1144 vsw_lookup_port(vsw_t *vswp, int p_instance)
1145 {
1146 	vsw_port_list_t *plist = &vswp->plist;
1147 	vsw_port_t	*port;
1148 
1149 	for (port = plist->head; port != NULL; port = port->p_next) {
1150 		if (port->p_instance == p_instance) {
1151 			D2(vswp, "vsw_lookup_port: found p_instance\n");
1152 			return (port);
1153 		}
1154 	}
1155 
1156 	return (NULL);
1157 }
1158 
1159 /*
1160  * Search for and remove the specified port from the port
1161  * list. Returns 0 if able to locate and remove port, otherwise
1162  * returns 1.
1163  */
1164 static int
1165 vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port)
1166 {
1167 	vsw_port_list_t *plist = &vswp->plist;
1168 	vsw_port_t	*curr_p, *prev_p;
1169 
1170 	if (plist->head == NULL)
1171 		return (1);
1172 
1173 	curr_p = prev_p = plist->head;
1174 
1175 	while (curr_p != NULL) {
1176 		if (curr_p == port) {
1177 			if (prev_p == curr_p) {
1178 				plist->head = curr_p->p_next;
1179 			} else {
1180 				prev_p->p_next = curr_p->p_next;
1181 			}
1182 			plist->num_ports--;
1183 			break;
1184 		} else {
1185 			prev_p = curr_p;
1186 			curr_p = curr_p->p_next;
1187 		}
1188 	}
1189 	return (0);
1190 }
1191 
1192 /*
1193  * Interrupt handler for ldc messages.
1194  */
1195 static uint_t
1196 vsw_ldc_cb(uint64_t event, caddr_t arg)
1197 {
1198 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
1199 	vsw_t 		*vswp = ldcp->ldc_vswp;
1200 
1201 	D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
1202 
1203 	mutex_enter(&ldcp->ldc_cblock);
1204 	ldcp->ldc_stats.callbacks++;
1205 
1206 	mutex_enter(&ldcp->status_lock);
1207 	if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) {
1208 		mutex_exit(&ldcp->status_lock);
1209 		mutex_exit(&ldcp->ldc_cblock);
1210 		return (LDC_SUCCESS);
1211 	}
1212 	mutex_exit(&ldcp->status_lock);
1213 
1214 	if (event & LDC_EVT_UP) {
1215 		/*
1216 		 * Channel has come up.
1217 		 */
1218 		D2(vswp, "%s: id(%ld) event(%llx) UP: status(%ld)",
1219 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1220 
1221 		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
1222 
1223 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
1224 	}
1225 
1226 	if (event & LDC_EVT_READ) {
1227 		/*
1228 		 * Data available for reading.
1229 		 */
1230 		D2(vswp, "%s: id(ld) event(%llx) data READ",
1231 		    __func__, ldcp->ldc_id, event);
1232 
1233 		if (ldcp->rx_thread != NULL) {
1234 			/*
1235 			 * If the receive thread is enabled, then
1236 			 * wakeup the receive thread to process the
1237 			 * LDC messages.
1238 			 */
1239 			mutex_exit(&ldcp->ldc_cblock);
1240 			mutex_enter(&ldcp->rx_thr_lock);
1241 			if (!(ldcp->rx_thr_flags & VSW_WTHR_DATARCVD)) {
1242 				ldcp->rx_thr_flags |= VSW_WTHR_DATARCVD;
1243 				cv_signal(&ldcp->rx_thr_cv);
1244 			}
1245 			mutex_exit(&ldcp->rx_thr_lock);
1246 			mutex_enter(&ldcp->ldc_cblock);
1247 		} else {
1248 			vsw_process_pkt(ldcp);
1249 		}
1250 
1251 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
1252 
1253 		goto vsw_cb_exit;
1254 	}
1255 
1256 	if (event & (LDC_EVT_DOWN | LDC_EVT_RESET)) {
1257 		D2(vswp, "%s: id(%ld) event (%lx) DOWN/RESET: status(%ld)",
1258 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1259 
1260 		vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
1261 	}
1262 
1263 	/*
1264 	 * Catch either LDC_EVT_WRITE which we don't support or any
1265 	 * unknown event.
1266 	 */
1267 	if (event &
1268 	    ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ)) {
1269 		DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)",
1270 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1271 	}
1272 
1273 vsw_cb_exit:
1274 	mutex_exit(&ldcp->ldc_cblock);
1275 
1276 	/*
1277 	 * Let the drain function know we are finishing if it
1278 	 * is waiting.
1279 	 */
1280 	mutex_enter(&ldcp->drain_cv_lock);
1281 	if (ldcp->drain_state == VSW_LDC_DRAINING)
1282 		cv_signal(&ldcp->drain_cv);
1283 	mutex_exit(&ldcp->drain_cv_lock);
1284 
1285 	return (LDC_SUCCESS);
1286 }
1287 
1288 /*
1289  * Reinitialise data structures associated with the channel.
1290  */
1291 static void
1292 vsw_ldc_reinit(vsw_ldc_t *ldcp)
1293 {
1294 	vsw_t		*vswp = ldcp->ldc_vswp;
1295 	vsw_port_t	*port;
1296 	vsw_ldc_list_t	*ldcl;
1297 
1298 	D1(vswp, "%s: enter", __func__);
1299 
1300 	port = ldcp->ldc_port;
1301 	ldcl = &port->p_ldclist;
1302 
1303 	READ_ENTER(&ldcl->lockrw);
1304 
1305 	D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__,
1306 	    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
1307 
1308 	vsw_free_lane_resources(ldcp, INBOUND);
1309 	vsw_free_lane_resources(ldcp, OUTBOUND);
1310 	RW_EXIT(&ldcl->lockrw);
1311 
1312 	ldcp->lane_in.lstate = 0;
1313 	ldcp->lane_out.lstate = 0;
1314 
1315 	/*
1316 	 * Remove parent port from any multicast groups
1317 	 * it may have registered with. Client must resend
1318 	 * multicast add command after handshake completes.
1319 	 */
1320 	(void) vsw_del_fdb(vswp, port);
1321 
1322 	vsw_del_mcst_port(port);
1323 
1324 	ldcp->peer_session = 0;
1325 	ldcp->session_status = 0;
1326 	ldcp->hcnt = 0;
1327 	ldcp->hphase = VSW_MILESTONE0;
1328 	ldcp->tx_failures = 0;
1329 
1330 	D1(vswp, "%s: exit", __func__);
1331 }
1332 
1333 /*
1334  * Process a connection event.
1335  *
1336  * Note - care must be taken to ensure that this function is
1337  * not called with the dlistrw lock held.
1338  */
1339 static void
1340 vsw_process_conn_evt(vsw_ldc_t *ldcp, uint16_t evt)
1341 {
1342 	vsw_t		*vswp = ldcp->ldc_vswp;
1343 	vsw_conn_evt_t	*conn = NULL;
1344 
1345 	D1(vswp, "%s: enter", __func__);
1346 
1347 	/*
1348 	 * Check if either a reset or restart event is pending
1349 	 * or in progress. If so just return.
1350 	 *
1351 	 * A VSW_CONN_RESET event originates either with a LDC_RESET_EVT
1352 	 * being received by the callback handler, or a ECONNRESET error
1353 	 * code being returned from a ldc_read() or ldc_write() call.
1354 	 *
1355 	 * A VSW_CONN_RESTART event occurs when some error checking code
1356 	 * decides that there is a problem with data from the channel,
1357 	 * and that the handshake should be restarted.
1358 	 */
1359 	if (((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART)) &&
1360 	    (ldstub((uint8_t *)&ldcp->reset_active)))
1361 		return;
1362 
1363 	/*
1364 	 * If it is an LDC_UP event we first check the recorded
1365 	 * state of the channel. If this is UP then we know that
1366 	 * the channel moving to the UP state has already been dealt
1367 	 * with and don't need to dispatch a  new task.
1368 	 *
1369 	 * The reason for this check is that when we do a ldc_up(),
1370 	 * depending on the state of the peer, we may or may not get
1371 	 * a LDC_UP event. As we can't depend on getting a LDC_UP evt
1372 	 * every time we do ldc_up() we explicitly check the channel
1373 	 * status to see has it come up (ldc_up() is asynch and will
1374 	 * complete at some undefined time), and take the appropriate
1375 	 * action.
1376 	 *
1377 	 * The flip side of this is that we may get a LDC_UP event
1378 	 * when we have already seen that the channel is up and have
1379 	 * dealt with that.
1380 	 */
1381 	mutex_enter(&ldcp->status_lock);
1382 	if (evt == VSW_CONN_UP) {
1383 		if ((ldcp->ldc_status == LDC_UP) || (ldcp->reset_active != 0)) {
1384 			mutex_exit(&ldcp->status_lock);
1385 			return;
1386 		}
1387 	}
1388 	mutex_exit(&ldcp->status_lock);
1389 
1390 	/*
1391 	 * The transaction group id allows us to identify and discard
1392 	 * any tasks which are still pending on the taskq and refer
1393 	 * to the handshake session we are about to restart or reset.
1394 	 * These stale messages no longer have any real meaning.
1395 	 */
1396 	(void) atomic_inc_32(&ldcp->hss_id);
1397 
1398 	ASSERT(vswp->taskq_p != NULL);
1399 
1400 	if ((conn = kmem_zalloc(sizeof (vsw_conn_evt_t), KM_NOSLEEP)) == NULL) {
1401 		cmn_err(CE_WARN, "!vsw%d: unable to allocate memory for"
1402 		    " connection event", vswp->instance);
1403 		goto err_exit;
1404 	}
1405 
1406 	conn->evt = evt;
1407 	conn->ldcp = ldcp;
1408 
1409 	if (ddi_taskq_dispatch(vswp->taskq_p, vsw_conn_task, conn,
1410 	    DDI_NOSLEEP) != DDI_SUCCESS) {
1411 		cmn_err(CE_WARN, "!vsw%d: Can't dispatch connection task",
1412 		    vswp->instance);
1413 
1414 		kmem_free(conn, sizeof (vsw_conn_evt_t));
1415 		goto err_exit;
1416 	}
1417 
1418 	D1(vswp, "%s: exit", __func__);
1419 	return;
1420 
1421 err_exit:
1422 	/*
1423 	 * Have mostly likely failed due to memory shortage. Clear the flag so
1424 	 * that future requests will at least be attempted and will hopefully
1425 	 * succeed.
1426 	 */
1427 	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
1428 		ldcp->reset_active = 0;
1429 }
1430 
1431 /*
1432  * Deal with events relating to a connection. Invoked from a taskq.
1433  */
1434 static void
1435 vsw_conn_task(void *arg)
1436 {
1437 	vsw_conn_evt_t	*conn = (vsw_conn_evt_t *)arg;
1438 	vsw_ldc_t	*ldcp = NULL;
1439 	vsw_t		*vswp = NULL;
1440 	uint16_t	evt;
1441 	ldc_status_t	curr_status;
1442 
1443 	ldcp = conn->ldcp;
1444 	evt = conn->evt;
1445 	vswp = ldcp->ldc_vswp;
1446 
1447 	D1(vswp, "%s: enter", __func__);
1448 
1449 	/* can safely free now have copied out data */
1450 	kmem_free(conn, sizeof (vsw_conn_evt_t));
1451 
1452 	mutex_enter(&ldcp->status_lock);
1453 	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
1454 		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
1455 		    "channel %ld", vswp->instance, ldcp->ldc_id);
1456 		mutex_exit(&ldcp->status_lock);
1457 		return;
1458 	}
1459 
1460 	/*
1461 	 * If we wish to restart the handshake on this channel, then if
1462 	 * the channel is UP we bring it DOWN to flush the underlying
1463 	 * ldc queue.
1464 	 */
1465 	if ((evt == VSW_CONN_RESTART) && (curr_status == LDC_UP))
1466 		(void) ldc_down(ldcp->ldc_handle);
1467 
1468 	/*
1469 	 * re-init all the associated data structures.
1470 	 */
1471 	vsw_ldc_reinit(ldcp);
1472 
1473 	/*
1474 	 * Bring the channel back up (note it does no harm to
1475 	 * do this even if the channel is already UP, Just
1476 	 * becomes effectively a no-op).
1477 	 */
1478 	(void) ldc_up(ldcp->ldc_handle);
1479 
1480 	/*
1481 	 * Check if channel is now UP. This will only happen if
1482 	 * peer has also done a ldc_up().
1483 	 */
1484 	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
1485 		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
1486 		    "channel %ld", vswp->instance, ldcp->ldc_id);
1487 		mutex_exit(&ldcp->status_lock);
1488 		return;
1489 	}
1490 
1491 	ldcp->ldc_status = curr_status;
1492 
1493 	/* channel UP so restart handshake by sending version info */
1494 	if (curr_status == LDC_UP) {
1495 		if (ldcp->hcnt++ > vsw_num_handshakes) {
1496 			cmn_err(CE_WARN, "!vsw%d: exceeded number of permitted"
1497 			    " handshake attempts (%d) on channel %ld",
1498 			    vswp->instance, ldcp->hcnt, ldcp->ldc_id);
1499 			mutex_exit(&ldcp->status_lock);
1500 			return;
1501 		}
1502 
1503 		if (ddi_taskq_dispatch(vswp->taskq_p, vsw_send_ver, ldcp,
1504 		    DDI_NOSLEEP) != DDI_SUCCESS) {
1505 			cmn_err(CE_WARN, "!vsw%d: Can't dispatch version task",
1506 			    vswp->instance);
1507 
1508 			/*
1509 			 * Don't count as valid restart attempt if couldn't
1510 			 * send version msg.
1511 			 */
1512 			if (ldcp->hcnt > 0)
1513 				ldcp->hcnt--;
1514 		}
1515 	}
1516 
1517 	/*
1518 	 * Mark that the process is complete by clearing the flag.
1519 	 *
1520 	 * Note is it possible that the taskq dispatch above may have failed,
1521 	 * most likely due to memory shortage. We still clear the flag so
1522 	 * future attempts will at least be attempted and will hopefully
1523 	 * succeed.
1524 	 */
1525 	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
1526 		ldcp->reset_active = 0;
1527 
1528 	mutex_exit(&ldcp->status_lock);
1529 
1530 	D1(vswp, "%s: exit", __func__);
1531 }
1532 
1533 /*
1534  * returns 0 if legal for event signified by flag to have
1535  * occured at the time it did. Otherwise returns 1.
1536  */
1537 int
1538 vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag)
1539 {
1540 	vsw_t		*vswp = ldcp->ldc_vswp;
1541 	uint64_t	state;
1542 	uint64_t	phase;
1543 
1544 	if (dir == INBOUND)
1545 		state = ldcp->lane_in.lstate;
1546 	else
1547 		state = ldcp->lane_out.lstate;
1548 
1549 	phase = ldcp->hphase;
1550 
1551 	switch (flag) {
1552 	case VSW_VER_INFO_RECV:
1553 		if (phase > VSW_MILESTONE0) {
1554 			DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV"
1555 			    " when in state %d\n", ldcp->ldc_id, phase);
1556 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1557 			return (1);
1558 		}
1559 		break;
1560 
1561 	case VSW_VER_ACK_RECV:
1562 	case VSW_VER_NACK_RECV:
1563 		if (!(state & VSW_VER_INFO_SENT)) {
1564 			DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK or "
1565 			    "VER_NACK when in state %d\n", ldcp->ldc_id, phase);
1566 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1567 			return (1);
1568 		} else
1569 			state &= ~VSW_VER_INFO_SENT;
1570 		break;
1571 
1572 	case VSW_ATTR_INFO_RECV:
1573 		if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) {
1574 			DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV"
1575 			    " when in state %d\n", ldcp->ldc_id, phase);
1576 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1577 			return (1);
1578 		}
1579 		break;
1580 
1581 	case VSW_ATTR_ACK_RECV:
1582 	case VSW_ATTR_NACK_RECV:
1583 		if (!(state & VSW_ATTR_INFO_SENT)) {
1584 			DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK"
1585 			    " or ATTR_NACK when in state %d\n",
1586 			    ldcp->ldc_id, phase);
1587 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1588 			return (1);
1589 		} else
1590 			state &= ~VSW_ATTR_INFO_SENT;
1591 		break;
1592 
1593 	case VSW_DRING_INFO_RECV:
1594 		if (phase < VSW_MILESTONE1) {
1595 			DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV"
1596 			    " when in state %d\n", ldcp->ldc_id, phase);
1597 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1598 			return (1);
1599 		}
1600 		break;
1601 
1602 	case VSW_DRING_ACK_RECV:
1603 	case VSW_DRING_NACK_RECV:
1604 		if (!(state & VSW_DRING_INFO_SENT)) {
1605 			DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK "
1606 			    " or DRING_NACK when in state %d\n",
1607 			    ldcp->ldc_id, phase);
1608 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1609 			return (1);
1610 		} else
1611 			state &= ~VSW_DRING_INFO_SENT;
1612 		break;
1613 
1614 	case VSW_RDX_INFO_RECV:
1615 		if (phase < VSW_MILESTONE3) {
1616 			DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV"
1617 			    " when in state %d\n", ldcp->ldc_id, phase);
1618 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1619 			return (1);
1620 		}
1621 		break;
1622 
1623 	case VSW_RDX_ACK_RECV:
1624 	case VSW_RDX_NACK_RECV:
1625 		if (!(state & VSW_RDX_INFO_SENT)) {
1626 			DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK or "
1627 			    "RDX_NACK when in state %d\n", ldcp->ldc_id, phase);
1628 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1629 			return (1);
1630 		} else
1631 			state &= ~VSW_RDX_INFO_SENT;
1632 		break;
1633 
1634 	case VSW_MCST_INFO_RECV:
1635 		if (phase < VSW_MILESTONE3) {
1636 			DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV"
1637 			    " when in state %d\n", ldcp->ldc_id, phase);
1638 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1639 			return (1);
1640 		}
1641 		break;
1642 
1643 	default:
1644 		DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)",
1645 		    ldcp->ldc_id, flag);
1646 		return (1);
1647 	}
1648 
1649 	if (dir == INBOUND)
1650 		ldcp->lane_in.lstate = state;
1651 	else
1652 		ldcp->lane_out.lstate = state;
1653 
1654 	D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id);
1655 
1656 	return (0);
1657 }
1658 
1659 void
1660 vsw_next_milestone(vsw_ldc_t *ldcp)
1661 {
1662 	vsw_t		*vswp = ldcp->ldc_vswp;
1663 
1664 	D1(vswp, "%s (chan %lld): enter (phase %ld)", __func__,
1665 	    ldcp->ldc_id, ldcp->hphase);
1666 
1667 	DUMP_FLAGS(ldcp->lane_in.lstate);
1668 	DUMP_FLAGS(ldcp->lane_out.lstate);
1669 
1670 	switch (ldcp->hphase) {
1671 
1672 	case VSW_MILESTONE0:
1673 		/*
1674 		 * If we haven't started to handshake with our peer,
1675 		 * start to do so now.
1676 		 */
1677 		if (ldcp->lane_out.lstate == 0) {
1678 			D2(vswp, "%s: (chan %lld) starting handshake "
1679 			    "with peer", __func__, ldcp->ldc_id);
1680 			vsw_process_conn_evt(ldcp, VSW_CONN_UP);
1681 		}
1682 
1683 		/*
1684 		 * Only way to pass this milestone is to have successfully
1685 		 * negotiated version info.
1686 		 */
1687 		if ((ldcp->lane_in.lstate & VSW_VER_ACK_SENT) &&
1688 		    (ldcp->lane_out.lstate & VSW_VER_ACK_RECV)) {
1689 
1690 			D2(vswp, "%s: (chan %lld) leaving milestone 0",
1691 			    __func__, ldcp->ldc_id);
1692 
1693 			/*
1694 			 * Next milestone is passed when attribute
1695 			 * information has been successfully exchanged.
1696 			 */
1697 			ldcp->hphase = VSW_MILESTONE1;
1698 			vsw_send_attr(ldcp);
1699 
1700 		}
1701 		break;
1702 
1703 	case VSW_MILESTONE1:
1704 		/*
1705 		 * Only way to pass this milestone is to have successfully
1706 		 * negotiated attribute information.
1707 		 */
1708 		if (ldcp->lane_in.lstate & VSW_ATTR_ACK_SENT) {
1709 
1710 			ldcp->hphase = VSW_MILESTONE2;
1711 
1712 			/*
1713 			 * If the peer device has said it wishes to
1714 			 * use descriptor rings then we send it our ring
1715 			 * info, otherwise we just set up a private ring
1716 			 * which we use an internal buffer
1717 			 */
1718 			if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE)
1719 				vsw_send_dring_info(ldcp);
1720 		}
1721 		break;
1722 
1723 	case VSW_MILESTONE2:
1724 		/*
1725 		 * If peer has indicated in its attribute message that
1726 		 * it wishes to use descriptor rings then the only way
1727 		 * to pass this milestone is for us to have received
1728 		 * valid dring info.
1729 		 *
1730 		 * If peer is not using descriptor rings then just fall
1731 		 * through.
1732 		 */
1733 		if ((ldcp->lane_in.xfer_mode == VIO_DRING_MODE) &&
1734 		    (!(ldcp->lane_in.lstate & VSW_DRING_ACK_SENT)))
1735 			break;
1736 
1737 		D2(vswp, "%s: (chan %lld) leaving milestone 2",
1738 		    __func__, ldcp->ldc_id);
1739 
1740 		ldcp->hphase = VSW_MILESTONE3;
1741 		vsw_send_rdx(ldcp);
1742 		break;
1743 
1744 	case VSW_MILESTONE3:
1745 		/*
1746 		 * Pass this milestone when all paramaters have been
1747 		 * successfully exchanged and RDX sent in both directions.
1748 		 *
1749 		 * Mark outbound lane as available to transmit data.
1750 		 */
1751 		if ((ldcp->lane_out.lstate & VSW_RDX_ACK_SENT) &&
1752 		    (ldcp->lane_in.lstate & VSW_RDX_ACK_RECV)) {
1753 
1754 			D2(vswp, "%s: (chan %lld) leaving milestone 3",
1755 			    __func__, ldcp->ldc_id);
1756 			D2(vswp, "%s: ** handshake complete (0x%llx : "
1757 			    "0x%llx) **", __func__, ldcp->lane_in.lstate,
1758 			    ldcp->lane_out.lstate);
1759 			ldcp->lane_out.lstate |= VSW_LANE_ACTIVE;
1760 			ldcp->hphase = VSW_MILESTONE4;
1761 			ldcp->hcnt = 0;
1762 			DISPLAY_STATE();
1763 		} else {
1764 			D2(vswp, "%s: still in milestone 3 (0x%llx : 0x%llx)",
1765 			    __func__, ldcp->lane_in.lstate,
1766 			    ldcp->lane_out.lstate);
1767 		}
1768 		break;
1769 
1770 	case VSW_MILESTONE4:
1771 		D2(vswp, "%s: (chan %lld) in milestone 4", __func__,
1772 		    ldcp->ldc_id);
1773 		break;
1774 
1775 	default:
1776 		DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__,
1777 		    ldcp->ldc_id, ldcp->hphase);
1778 	}
1779 
1780 	D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id,
1781 	    ldcp->hphase);
1782 }
1783 
1784 /*
1785  * Check if major version is supported.
1786  *
1787  * Returns 0 if finds supported major number, and if necessary
1788  * adjusts the minor field.
1789  *
1790  * Returns 1 if can't match major number exactly. Sets mjor/minor
1791  * to next lowest support values, or to zero if no other values possible.
1792  */
1793 static int
1794 vsw_supported_version(vio_ver_msg_t *vp)
1795 {
1796 	int	i;
1797 
1798 	D1(NULL, "vsw_supported_version: enter");
1799 
1800 	for (i = 0; i < VSW_NUM_VER; i++) {
1801 		if (vsw_versions[i].ver_major == vp->ver_major) {
1802 			/*
1803 			 * Matching or lower major version found. Update
1804 			 * minor number if necessary.
1805 			 */
1806 			if (vp->ver_minor > vsw_versions[i].ver_minor) {
1807 				D2(NULL, "%s: adjusting minor value from %d "
1808 				    "to %d", __func__, vp->ver_minor,
1809 				    vsw_versions[i].ver_minor);
1810 				vp->ver_minor = vsw_versions[i].ver_minor;
1811 			}
1812 
1813 			return (0);
1814 		}
1815 
1816 		if (vsw_versions[i].ver_major < vp->ver_major) {
1817 			if (vp->ver_minor > vsw_versions[i].ver_minor) {
1818 				D2(NULL, "%s: adjusting minor value from %d "
1819 				    "to %d", __func__, vp->ver_minor,
1820 				    vsw_versions[i].ver_minor);
1821 				vp->ver_minor = vsw_versions[i].ver_minor;
1822 			}
1823 			return (1);
1824 		}
1825 	}
1826 
1827 	/* No match was possible, zero out fields */
1828 	vp->ver_major = 0;
1829 	vp->ver_minor = 0;
1830 
1831 	D1(NULL, "vsw_supported_version: exit");
1832 
1833 	return (1);
1834 }
1835 
1836 /*
1837  * Main routine for processing messages received over LDC.
1838  */
1839 static void
1840 vsw_process_pkt(void *arg)
1841 {
1842 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
1843 	vsw_t 		*vswp = ldcp->ldc_vswp;
1844 	size_t		msglen;
1845 	vio_msg_tag_t	tag;
1846 	def_msg_t	dmsg;
1847 	int 		rv = 0;
1848 
1849 
1850 	D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
1851 
1852 	ASSERT(MUTEX_HELD(&ldcp->ldc_cblock));
1853 
1854 	/*
1855 	 * If channel is up read messages until channel is empty.
1856 	 */
1857 	do {
1858 		msglen = sizeof (dmsg);
1859 		rv = ldc_read(ldcp->ldc_handle, (caddr_t)&dmsg, &msglen);
1860 
1861 		if (rv != 0) {
1862 			DERR(vswp, "%s :ldc_read err id(%lld) rv(%d) len(%d)\n",
1863 			    __func__, ldcp->ldc_id, rv, msglen);
1864 		}
1865 
1866 		/* channel has been reset */
1867 		if (rv == ECONNRESET) {
1868 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
1869 			break;
1870 		}
1871 
1872 		if (msglen == 0) {
1873 			D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__,
1874 			    ldcp->ldc_id);
1875 			break;
1876 		}
1877 
1878 		D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__,
1879 		    ldcp->ldc_id, msglen);
1880 
1881 		/*
1882 		 * Figure out what sort of packet we have gotten by
1883 		 * examining the msg tag, and then switch it appropriately.
1884 		 */
1885 		bcopy(&dmsg, &tag, sizeof (vio_msg_tag_t));
1886 
1887 		switch (tag.vio_msgtype) {
1888 		case VIO_TYPE_CTRL:
1889 			vsw_dispatch_ctrl_task(ldcp, &dmsg, tag);
1890 			break;
1891 		case VIO_TYPE_DATA:
1892 			vsw_process_data_pkt(ldcp, &dmsg, tag);
1893 			break;
1894 		case VIO_TYPE_ERR:
1895 			vsw_process_err_pkt(ldcp, &dmsg, tag);
1896 			break;
1897 		default:
1898 			DERR(vswp, "%s: Unknown tag(%lx) ", __func__,
1899 			    "id(%lx)\n", tag.vio_msgtype, ldcp->ldc_id);
1900 			break;
1901 		}
1902 	} while (msglen);
1903 
1904 	D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id);
1905 }
1906 
1907 /*
1908  * Dispatch a task to process a VIO control message.
1909  */
1910 static void
1911 vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t tag)
1912 {
1913 	vsw_ctrl_task_t		*ctaskp = NULL;
1914 	vsw_port_t		*port = ldcp->ldc_port;
1915 	vsw_t			*vswp = port->p_vswp;
1916 
1917 	D1(vswp, "%s: enter", __func__);
1918 
1919 	/*
1920 	 * We need to handle RDX ACK messages in-band as once they
1921 	 * are exchanged it is possible that we will get an
1922 	 * immediate (legitimate) data packet.
1923 	 */
1924 	if ((tag.vio_subtype_env == VIO_RDX) &&
1925 	    (tag.vio_subtype == VIO_SUBTYPE_ACK)) {
1926 
1927 		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_ACK_RECV))
1928 			return;
1929 
1930 		ldcp->lane_in.lstate |= VSW_RDX_ACK_RECV;
1931 		D2(vswp, "%s (%ld) handling RDX_ACK in place "
1932 		    "(ostate 0x%llx : hphase %d)", __func__,
1933 		    ldcp->ldc_id, ldcp->lane_in.lstate, ldcp->hphase);
1934 		vsw_next_milestone(ldcp);
1935 		return;
1936 	}
1937 
1938 	ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP);
1939 
1940 	if (ctaskp == NULL) {
1941 		DERR(vswp, "%s: unable to alloc space for ctrl msg", __func__);
1942 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1943 		return;
1944 	}
1945 
1946 	ctaskp->ldcp = ldcp;
1947 	bcopy((def_msg_t *)cpkt, &ctaskp->pktp, sizeof (def_msg_t));
1948 	ctaskp->hss_id = ldcp->hss_id;
1949 
1950 	/*
1951 	 * Dispatch task to processing taskq if port is not in
1952 	 * the process of being detached.
1953 	 */
1954 	mutex_enter(&port->state_lock);
1955 	if (port->state == VSW_PORT_INIT) {
1956 		if ((vswp->taskq_p == NULL) ||
1957 		    (ddi_taskq_dispatch(vswp->taskq_p, vsw_process_ctrl_pkt,
1958 		    ctaskp, DDI_NOSLEEP) != DDI_SUCCESS)) {
1959 			DERR(vswp, "%s: unable to dispatch task to taskq",
1960 			    __func__);
1961 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
1962 			mutex_exit(&port->state_lock);
1963 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1964 			return;
1965 		}
1966 	} else {
1967 		DWARN(vswp, "%s: port %d detaching, not dispatching "
1968 		    "task", __func__, port->p_instance);
1969 	}
1970 
1971 	mutex_exit(&port->state_lock);
1972 
1973 	D2(vswp, "%s: dispatched task to taskq for chan %d", __func__,
1974 	    ldcp->ldc_id);
1975 	D1(vswp, "%s: exit", __func__);
1976 }
1977 
1978 /*
1979  * Process a VIO ctrl message. Invoked from taskq.
1980  */
1981 static void
1982 vsw_process_ctrl_pkt(void *arg)
1983 {
1984 	vsw_ctrl_task_t	*ctaskp = (vsw_ctrl_task_t *)arg;
1985 	vsw_ldc_t	*ldcp = ctaskp->ldcp;
1986 	vsw_t 		*vswp = ldcp->ldc_vswp;
1987 	vio_msg_tag_t	tag;
1988 	uint16_t	env;
1989 
1990 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
1991 
1992 	bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t));
1993 	env = tag.vio_subtype_env;
1994 
1995 	/* stale pkt check */
1996 	if (ctaskp->hss_id < ldcp->hss_id) {
1997 		DWARN(vswp, "%s: discarding stale packet belonging to earlier"
1998 		    " (%ld) handshake session", __func__, ctaskp->hss_id);
1999 		return;
2000 	}
2001 
2002 	/* session id check */
2003 	if (ldcp->session_status & VSW_PEER_SESSION) {
2004 		if (ldcp->peer_session != tag.vio_sid) {
2005 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
2006 			    __func__, ldcp->ldc_id, tag.vio_sid);
2007 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2008 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2009 			return;
2010 		}
2011 	}
2012 
2013 	/*
2014 	 * Switch on vio_subtype envelope, then let lower routines
2015 	 * decide if its an INFO, ACK or NACK packet.
2016 	 */
2017 	switch (env) {
2018 	case VIO_VER_INFO:
2019 		vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp);
2020 		break;
2021 	case VIO_DRING_REG:
2022 		vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp);
2023 		break;
2024 	case VIO_DRING_UNREG:
2025 		vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp);
2026 		break;
2027 	case VIO_ATTR_INFO:
2028 		vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp);
2029 		break;
2030 	case VNET_MCAST_INFO:
2031 		vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp);
2032 		break;
2033 	case VIO_RDX:
2034 		vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp);
2035 		break;
2036 	default:
2037 		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env);
2038 	}
2039 
2040 	kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2041 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
2042 }
2043 
2044 /*
2045  * Version negotiation. We can end up here either because our peer
2046  * has responded to a handshake message we have sent it, or our peer
 * has initiated a handshake with us. If it's the former then it can only
 * be an ACK or NACK; if it's the latter it can only be an INFO.
2049  *
2050  * If its an ACK we move to the next stage of the handshake, namely
2051  * attribute exchange. If its a NACK we see if we can specify another
2052  * version, if we can't we stop.
2053  *
2054  * If it is an INFO we reset all params associated with communication
2055  * in that direction over this channel (remember connection is
2056  * essentially 2 independent simplex channels).
2057  */
2058 void
2059 vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt)
2060 {
2061 	vio_ver_msg_t	*ver_pkt;
2062 	vsw_t 		*vswp = ldcp->ldc_vswp;
2063 
2064 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2065 
2066 	/*
2067 	 * We know this is a ctrl/version packet so
2068 	 * cast it into the correct structure.
2069 	 */
2070 	ver_pkt = (vio_ver_msg_t *)pkt;
2071 
2072 	switch (ver_pkt->tag.vio_subtype) {
2073 	case VIO_SUBTYPE_INFO:
2074 		D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n");
2075 
2076 		/*
2077 		 * Record the session id, which we will use from now
2078 		 * until we see another VER_INFO msg. Even then the
2079 		 * session id in most cases will be unchanged, execpt
2080 		 * if channel was reset.
2081 		 */
2082 		if ((ldcp->session_status & VSW_PEER_SESSION) &&
2083 		    (ldcp->peer_session != ver_pkt->tag.vio_sid)) {
2084 			DERR(vswp, "%s: updating session id for chan %lld "
2085 			    "from %llx to %llx", __func__, ldcp->ldc_id,
2086 			    ldcp->peer_session, ver_pkt->tag.vio_sid);
2087 		}
2088 
2089 		ldcp->peer_session = ver_pkt->tag.vio_sid;
2090 		ldcp->session_status |= VSW_PEER_SESSION;
2091 
2092 		/* Legal message at this time ? */
2093 		if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV))
2094 			return;
2095 
2096 		/*
2097 		 * First check the device class. Currently only expect
2098 		 * to be talking to a network device. In the future may
2099 		 * also talk to another switch.
2100 		 */
2101 		if (ver_pkt->dev_class != VDEV_NETWORK) {
2102 			DERR(vswp, "%s: illegal device class %d", __func__,
2103 			    ver_pkt->dev_class);
2104 
2105 			ver_pkt->tag.vio_sid = ldcp->local_session;
2106 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2107 
2108 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2109 
2110 			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2111 			    sizeof (vio_ver_msg_t), B_TRUE);
2112 
2113 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
2114 			vsw_next_milestone(ldcp);
2115 			return;
2116 		} else {
2117 			ldcp->dev_class = ver_pkt->dev_class;
2118 		}
2119 
2120 		/*
2121 		 * Now check the version.
2122 		 */
2123 		if (vsw_supported_version(ver_pkt) == 0) {
2124 			/*
2125 			 * Support this major version and possibly
2126 			 * adjusted minor version.
2127 			 */
2128 
2129 			D2(vswp, "%s: accepted ver %d:%d", __func__,
2130 			    ver_pkt->ver_major, ver_pkt->ver_minor);
2131 
2132 			/* Store accepted values */
2133 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
2134 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
2135 
2136 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2137 
2138 			ldcp->lane_in.lstate |= VSW_VER_ACK_SENT;
2139 		} else {
2140 			/*
2141 			 * NACK back with the next lower major/minor
2142 			 * pairing we support (if don't suuport any more
2143 			 * versions then they will be set to zero.
2144 			 */
2145 
2146 			D2(vswp, "%s: replying with ver %d:%d", __func__,
2147 			    ver_pkt->ver_major, ver_pkt->ver_minor);
2148 
2149 			/* Store updated values */
2150 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
2151 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
2152 
2153 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2154 
2155 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
2156 		}
2157 
2158 		DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2159 		ver_pkt->tag.vio_sid = ldcp->local_session;
2160 		(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2161 		    sizeof (vio_ver_msg_t), B_TRUE);
2162 
2163 		vsw_next_milestone(ldcp);
2164 		break;
2165 
2166 	case VIO_SUBTYPE_ACK:
2167 		D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__);
2168 
2169 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV))
2170 			return;
2171 
2172 		/* Store updated values */
2173 		ldcp->lane_in.ver_major = ver_pkt->ver_major;
2174 		ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
2175 
2176 		ldcp->lane_out.lstate |= VSW_VER_ACK_RECV;
2177 		vsw_next_milestone(ldcp);
2178 
2179 		break;
2180 
2181 	case VIO_SUBTYPE_NACK:
2182 		D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__);
2183 
2184 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV))
2185 			return;
2186 
2187 		/*
2188 		 * If our peer sent us a NACK with the ver fields set to
2189 		 * zero then there is nothing more we can do. Otherwise see
2190 		 * if we support either the version suggested, or a lesser
2191 		 * one.
2192 		 */
2193 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
2194 			DERR(vswp, "%s: peer unable to negotiate any "
2195 			    "further.", __func__);
2196 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
2197 			vsw_next_milestone(ldcp);
2198 			return;
2199 		}
2200 
2201 		/*
2202 		 * Check to see if we support this major version or
2203 		 * a lower one. If we don't then maj/min will be set
2204 		 * to zero.
2205 		 */
2206 		(void) vsw_supported_version(ver_pkt);
2207 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
2208 			/* Nothing more we can do */
2209 			DERR(vswp, "%s: version negotiation failed.\n",
2210 			    __func__);
2211 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
2212 			vsw_next_milestone(ldcp);
2213 		} else {
2214 			/* found a supported major version */
2215 			ldcp->lane_out.ver_major = ver_pkt->ver_major;
2216 			ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
2217 
2218 			D2(vswp, "%s: resending with updated values (%x, %x)",
2219 			    __func__, ver_pkt->ver_major, ver_pkt->ver_minor);
2220 
2221 			ldcp->lane_out.lstate |= VSW_VER_INFO_SENT;
2222 			ver_pkt->tag.vio_sid = ldcp->local_session;
2223 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
2224 
2225 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2226 
2227 			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2228 			    sizeof (vio_ver_msg_t), B_TRUE);
2229 
2230 			vsw_next_milestone(ldcp);
2231 
2232 		}
2233 		break;
2234 
2235 	default:
2236 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
2237 		    ver_pkt->tag.vio_subtype);
2238 	}
2239 
2240 	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
2241 }
2242 
2243 /*
2244  * Process an attribute packet. We can end up here either because our peer
2245  * has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or our
2246  * peer has sent us an attribute INFO message
2247  *
2248  * If its an ACK we then move to the next stage of the handshake which
2249  * is to send our descriptor ring info to our peer. If its a NACK then
2250  * there is nothing more we can (currently) do.
2251  *
2252  * If we get a valid/acceptable INFO packet (and we have already negotiated
2253  * a version) we ACK back and set channel state to ATTR_RECV, otherwise we
2254  * NACK back and reset channel state to INACTIV.
2255  *
2256  * FUTURE: in time we will probably negotiate over attributes, but for
2257  * the moment unacceptable attributes are regarded as a fatal error.
2258  *
2259  */
2260 void
2261 vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt)
2262 {
2263 	vnet_attr_msg_t		*attr_pkt;
2264 	vsw_t			*vswp = ldcp->ldc_vswp;
2265 	vsw_port_t		*port = ldcp->ldc_port;
2266 	uint64_t		macaddr = 0;
2267 	int			i;
2268 
2269 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
2270 
2271 	/*
2272 	 * We know this is a ctrl/attr packet so
2273 	 * cast it into the correct structure.
2274 	 */
2275 	attr_pkt = (vnet_attr_msg_t *)pkt;
2276 
2277 	switch (attr_pkt->tag.vio_subtype) {
2278 	case VIO_SUBTYPE_INFO:
2279 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2280 
2281 		if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV))
2282 			return;
2283 
2284 		/*
2285 		 * If the attributes are unacceptable then we NACK back.
2286 		 */
2287 		if (vsw_check_attr(attr_pkt, ldcp->ldc_port)) {
2288 
2289 			DERR(vswp, "%s (chan %d): invalid attributes",
2290 			    __func__, ldcp->ldc_id);
2291 
2292 			vsw_free_lane_resources(ldcp, INBOUND);
2293 
2294 			attr_pkt->tag.vio_sid = ldcp->local_session;
2295 			attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2296 
2297 			DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
2298 			ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT;
2299 			(void) vsw_send_msg(ldcp, (void *)attr_pkt,
2300 			    sizeof (vnet_attr_msg_t), B_TRUE);
2301 
2302 			vsw_next_milestone(ldcp);
2303 			return;
2304 		}
2305 
2306 		/*
2307 		 * Otherwise store attributes for this lane and update
2308 		 * lane state.
2309 		 */
2310 		ldcp->lane_in.mtu = attr_pkt->mtu;
2311 		ldcp->lane_in.addr = attr_pkt->addr;
2312 		ldcp->lane_in.addr_type = attr_pkt->addr_type;
2313 		ldcp->lane_in.xfer_mode = attr_pkt->xfer_mode;
2314 		ldcp->lane_in.ack_freq = attr_pkt->ack_freq;
2315 
2316 		macaddr = ldcp->lane_in.addr;
2317 		for (i = ETHERADDRL - 1; i >= 0; i--) {
2318 			port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF;
2319 			macaddr >>= 8;
2320 		}
2321 
2322 		/* create the fdb entry for this port/mac address */
2323 		(void) vsw_add_fdb(vswp, port);
2324 
2325 		/* setup device specifc xmit routines */
2326 		mutex_enter(&port->tx_lock);
2327 		if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE) {
2328 			D2(vswp, "%s: mode = VIO_DRING_MODE", __func__);
2329 			port->transmit = vsw_dringsend;
2330 			ldcp->lane_out.xfer_mode = VIO_DRING_MODE;
2331 		} else if (ldcp->lane_in.xfer_mode == VIO_DESC_MODE) {
2332 			D2(vswp, "%s: mode = VIO_DESC_MODE", __func__);
2333 			vsw_create_privring(ldcp);
2334 			port->transmit = vsw_descrsend;
2335 			ldcp->lane_out.xfer_mode = VIO_DESC_MODE;
2336 		}
2337 		mutex_exit(&port->tx_lock);
2338 
2339 		attr_pkt->tag.vio_sid = ldcp->local_session;
2340 		attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2341 
2342 		DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
2343 
2344 		ldcp->lane_in.lstate |= VSW_ATTR_ACK_SENT;
2345 
2346 		(void) vsw_send_msg(ldcp, (void *)attr_pkt,
2347 		    sizeof (vnet_attr_msg_t), B_TRUE);
2348 
2349 		vsw_next_milestone(ldcp);
2350 		break;
2351 
2352 	case VIO_SUBTYPE_ACK:
2353 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
2354 
2355 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV))
2356 			return;
2357 
2358 		ldcp->lane_out.lstate |= VSW_ATTR_ACK_RECV;
2359 		vsw_next_milestone(ldcp);
2360 		break;
2361 
2362 	case VIO_SUBTYPE_NACK:
2363 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
2364 
2365 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV))
2366 			return;
2367 
2368 		ldcp->lane_out.lstate |= VSW_ATTR_NACK_RECV;
2369 		vsw_next_milestone(ldcp);
2370 		break;
2371 
2372 	default:
2373 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
2374 		    attr_pkt->tag.vio_subtype);
2375 	}
2376 
2377 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
2378 }
2379 
2380 /*
2381  * Process a dring info packet. We can end up here either because our peer
2382  * has ACK/NACK'ed back to an earlier DRING msg we had sent it, or our
2383  * peer has sent us a dring INFO message.
2384  *
2385  * If we get a valid/acceptable INFO packet (and we have already negotiated
2386  * a version) we ACK back and update the lane state, otherwise we NACK back.
2387  *
2388  * FUTURE: nothing to stop client from sending us info on multiple dring's
2389  * but for the moment we will just use the first one we are given.
2390  *
2391  */
2392 void
2393 vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt)
2394 {
2395 	vio_dring_reg_msg_t	*dring_pkt;
2396 	vsw_t			*vswp = ldcp->ldc_vswp;
2397 	ldc_mem_info_t		minfo;
2398 	dring_info_t		*dp, *dbp;
2399 	int			dring_found = 0;
2400 
2401 	/*
2402 	 * We know this is a ctrl/dring packet so
2403 	 * cast it into the correct structure.
2404 	 */
2405 	dring_pkt = (vio_dring_reg_msg_t *)pkt;
2406 
2407 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
2408 
2409 	switch (dring_pkt->tag.vio_subtype) {
2410 	case VIO_SUBTYPE_INFO:
2411 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2412 
2413 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
2414 			return;
2415 
2416 		/*
2417 		 * If the dring params are unacceptable then we NACK back.
2418 		 */
2419 		if (vsw_check_dring_info(dring_pkt)) {
2420 
2421 			DERR(vswp, "%s (%lld): invalid dring info",
2422 			    __func__, ldcp->ldc_id);
2423 
2424 			vsw_free_lane_resources(ldcp, INBOUND);
2425 
2426 			dring_pkt->tag.vio_sid = ldcp->local_session;
2427 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2428 
2429 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
2430 
2431 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
2432 
2433 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
2434 			    sizeof (vio_dring_reg_msg_t), B_TRUE);
2435 
2436 			vsw_next_milestone(ldcp);
2437 			return;
2438 		}
2439 
2440 		/*
2441 		 * Otherwise, attempt to map in the dring using the
2442 		 * cookie. If that succeeds we send back a unique dring
2443 		 * identifier that the sending side will use in future
2444 		 * to refer to this descriptor ring.
2445 		 */
2446 		dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
2447 
2448 		dp->num_descriptors = dring_pkt->num_descriptors;
2449 		dp->descriptor_size = dring_pkt->descriptor_size;
2450 		dp->options = dring_pkt->options;
2451 		dp->ncookies = dring_pkt->ncookies;
2452 
2453 		/*
2454 		 * Note: should only get one cookie. Enforced in
2455 		 * the ldc layer.
2456 		 */
2457 		bcopy(&dring_pkt->cookie[0], &dp->cookie[0],
2458 		    sizeof (ldc_mem_cookie_t));
2459 
2460 		D2(vswp, "%s: num_desc %ld : desc_size %ld", __func__,
2461 		    dp->num_descriptors, dp->descriptor_size);
2462 		D2(vswp, "%s: options 0x%lx: ncookies %ld", __func__,
2463 		    dp->options, dp->ncookies);
2464 
2465 		if ((ldc_mem_dring_map(ldcp->ldc_handle, &dp->cookie[0],
2466 		    dp->ncookies, dp->num_descriptors, dp->descriptor_size,
2467 		    LDC_SHADOW_MAP, &(dp->handle))) != 0) {
2468 
2469 			DERR(vswp, "%s: dring_map failed\n", __func__);
2470 
2471 			kmem_free(dp, sizeof (dring_info_t));
2472 			vsw_free_lane_resources(ldcp, INBOUND);
2473 
2474 			dring_pkt->tag.vio_sid = ldcp->local_session;
2475 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2476 
2477 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
2478 
2479 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
2480 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
2481 			    sizeof (vio_dring_reg_msg_t), B_TRUE);
2482 
2483 			vsw_next_milestone(ldcp);
2484 			return;
2485 		}
2486 
2487 		if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
2488 
2489 			DERR(vswp, "%s: dring_addr failed\n", __func__);
2490 
2491 			kmem_free(dp, sizeof (dring_info_t));
2492 			vsw_free_lane_resources(ldcp, INBOUND);
2493 
2494 			dring_pkt->tag.vio_sid = ldcp->local_session;
2495 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2496 
2497 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
2498 
2499 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
2500 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
2501 			    sizeof (vio_dring_reg_msg_t), B_TRUE);
2502 
2503 			vsw_next_milestone(ldcp);
2504 			return;
2505 		} else {
2506 			/* store the address of the pub part of ring */
2507 			dp->pub_addr = minfo.vaddr;
2508 		}
2509 
2510 		/* no private section as we are importing */
2511 		dp->priv_addr = NULL;
2512 
2513 		/*
2514 		 * Using simple mono increasing int for ident at
2515 		 * the moment.
2516 		 */
2517 		dp->ident = ldcp->next_ident;
2518 		ldcp->next_ident++;
2519 
2520 		dp->end_idx = 0;
2521 		dp->next = NULL;
2522 
2523 		/*
2524 		 * Link it onto the end of the list of drings
2525 		 * for this lane.
2526 		 */
2527 		if (ldcp->lane_in.dringp == NULL) {
2528 			D2(vswp, "%s: adding first INBOUND dring", __func__);
2529 			ldcp->lane_in.dringp = dp;
2530 		} else {
2531 			dbp = ldcp->lane_in.dringp;
2532 
2533 			while (dbp->next != NULL)
2534 				dbp = dbp->next;
2535 
2536 			dbp->next = dp;
2537 		}
2538 
2539 		/* acknowledge it */
2540 		dring_pkt->tag.vio_sid = ldcp->local_session;
2541 		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2542 		dring_pkt->dring_ident = dp->ident;
2543 
2544 		(void) vsw_send_msg(ldcp, (void *)dring_pkt,
2545 		    sizeof (vio_dring_reg_msg_t), B_TRUE);
2546 
2547 		ldcp->lane_in.lstate |= VSW_DRING_ACK_SENT;
2548 		vsw_next_milestone(ldcp);
2549 		break;
2550 
2551 	case VIO_SUBTYPE_ACK:
2552 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
2553 
2554 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV))
2555 			return;
2556 
2557 		/*
2558 		 * Peer is acknowledging our dring info and will have
2559 		 * sent us a dring identifier which we will use to
2560 		 * refer to this ring w.r.t. our peer.
2561 		 */
2562 		dp = ldcp->lane_out.dringp;
2563 		if (dp != NULL) {
2564 			/*
2565 			 * Find the ring this ident should be associated
2566 			 * with.
2567 			 */
2568 			if (vsw_dring_match(dp, dring_pkt)) {
2569 				dring_found = 1;
2570 
2571 			} else while (dp != NULL) {
2572 				if (vsw_dring_match(dp, dring_pkt)) {
2573 					dring_found = 1;
2574 					break;
2575 				}
2576 				dp = dp->next;
2577 			}
2578 
2579 			if (dring_found == 0) {
2580 				DERR(NULL, "%s: unrecognised ring cookie",
2581 				    __func__);
2582 				vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2583 				return;
2584 			}
2585 
2586 		} else {
2587 			DERR(vswp, "%s: DRING ACK received but no drings "
2588 			    "allocated", __func__);
2589 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2590 			return;
2591 		}
2592 
2593 		/* store ident */
2594 		dp->ident = dring_pkt->dring_ident;
2595 		ldcp->lane_out.lstate |= VSW_DRING_ACK_RECV;
2596 		vsw_next_milestone(ldcp);
2597 		break;
2598 
2599 	case VIO_SUBTYPE_NACK:
2600 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
2601 
2602 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV))
2603 			return;
2604 
2605 		ldcp->lane_out.lstate |= VSW_DRING_NACK_RECV;
2606 		vsw_next_milestone(ldcp);
2607 		break;
2608 
2609 	default:
2610 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
2611 		    dring_pkt->tag.vio_subtype);
2612 	}
2613 
2614 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
2615 }
2616 
2617 /*
2618  * Process a request from peer to unregister a dring.
2619  *
2620  * For the moment we just restart the handshake if our
2621  * peer endpoint attempts to unregister a dring.
2622  */
2623 void
2624 vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt)
2625 {
2626 	vsw_t			*vswp = ldcp->ldc_vswp;
2627 	vio_dring_unreg_msg_t	*dring_pkt;
2628 
2629 	/*
2630 	 * We know this is a ctrl/dring packet so
2631 	 * cast it into the correct structure.
2632 	 */
2633 	dring_pkt = (vio_dring_unreg_msg_t *)pkt;
2634 
2635 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2636 
2637 	switch (dring_pkt->tag.vio_subtype) {
2638 	case VIO_SUBTYPE_INFO:
2639 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2640 
2641 		DWARN(vswp, "%s: restarting handshake..", __func__);
2642 		break;
2643 
2644 	case VIO_SUBTYPE_ACK:
2645 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
2646 
2647 		DWARN(vswp, "%s: restarting handshake..", __func__);
2648 		break;
2649 
2650 	case VIO_SUBTYPE_NACK:
2651 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
2652 
2653 		DWARN(vswp, "%s: restarting handshake..", __func__);
2654 		break;
2655 
2656 	default:
2657 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
2658 		    dring_pkt->tag.vio_subtype);
2659 	}
2660 
2661 	vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2662 
2663 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
2664 }
2665 
/*
 * Turn the multicast message 'pkt' into a NACK carrying our session id
 * and send it back over 'ldcp'.
 *
 * Wrapped in do { } while (0) so that it behaves as a single statement
 * (safe in an unbraced if/else); invoke it with a trailing semicolon.
 * Each argument is evaluated more than once.
 */
#define	SND_MCST_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		(void) vsw_send_msg((ldcp), (void *)(pkt), \
		    sizeof (vnet_mcast_msg_t), B_TRUE); \
	} while (0)
2671 
2672 /*
2673  * Process a multicast request from a vnet.
2674  *
2675  * Vnet's specify a multicast address that they are interested in. This
2676  * address is used as a key into the hash table which forms the multicast
2677  * forwarding database (mFDB).
2678  *
2679  * The table keys are the multicast addresses, while the table entries
2680  * are pointers to lists of ports which wish to receive packets for the
2681  * specified multicast address.
2682  *
2683  * When a multicast packet is being switched we use the address as a key
2684  * into the hash table, and then walk the appropriate port list forwarding
2685  * the pkt to each port in turn.
2686  *
2687  * If a vnet is no longer interested in a particular multicast grouping
2688  * we simply find the correct location in the hash table and then delete
2689  * the relevant port from the port list.
2690  *
2691  * To deal with the case whereby a port is being deleted without first
2692  * removing itself from the lists in the hash table, we maintain a list
2693  * of multicast addresses the port has registered an interest in, within
2694  * the port structure itself. We then simply walk that list of addresses
2695  * using them as keys into the hash table and remove the port from the
2696  * appropriate lists.
2697  */
2698 static void
2699 vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt)
2700 {
2701 	vnet_mcast_msg_t	*mcst_pkt;
2702 	vsw_port_t		*port = ldcp->ldc_port;
2703 	vsw_t			*vswp = ldcp->ldc_vswp;
2704 	int			i;
2705 
2706 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2707 
2708 	/*
2709 	 * We know this is a ctrl/mcast packet so
2710 	 * cast it into the correct structure.
2711 	 */
2712 	mcst_pkt = (vnet_mcast_msg_t *)pkt;
2713 
2714 	switch (mcst_pkt->tag.vio_subtype) {
2715 	case VIO_SUBTYPE_INFO:
2716 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2717 
2718 		/*
2719 		 * Check if in correct state to receive a multicast
2720 		 * message (i.e. handshake complete). If not reset
2721 		 * the handshake.
2722 		 */
2723 		if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV))
2724 			return;
2725 
2726 		/*
2727 		 * Before attempting to add or remove address check
2728 		 * that they are valid multicast addresses.
2729 		 * If not, then NACK back.
2730 		 */
2731 		for (i = 0; i < mcst_pkt->count; i++) {
2732 			if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) {
2733 				DERR(vswp, "%s: invalid multicast address",
2734 				    __func__);
2735 				SND_MCST_NACK(ldcp, mcst_pkt);
2736 				return;
2737 			}
2738 		}
2739 
2740 		/*
2741 		 * Now add/remove the addresses. If this fails we
2742 		 * NACK back.
2743 		 */
2744 		if (vsw_add_rem_mcst(mcst_pkt, port) != 0) {
2745 			SND_MCST_NACK(ldcp, mcst_pkt);
2746 			return;
2747 		}
2748 
2749 		mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2750 		mcst_pkt->tag.vio_sid = ldcp->local_session;
2751 
2752 		DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt);
2753 
2754 		(void) vsw_send_msg(ldcp, (void *)mcst_pkt,
2755 		    sizeof (vnet_mcast_msg_t), B_TRUE);
2756 		break;
2757 
2758 	case VIO_SUBTYPE_ACK:
2759 		DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
2760 
2761 		/*
2762 		 * We shouldn't ever get a multicast ACK message as
2763 		 * at the moment we never request multicast addresses
2764 		 * to be set on some other device. This may change in
2765 		 * the future if we have cascading switches.
2766 		 */
2767 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV))
2768 			return;
2769 
2770 				/* Do nothing */
2771 		break;
2772 
2773 	case VIO_SUBTYPE_NACK:
2774 		DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
2775 
2776 		/*
2777 		 * We shouldn't get a multicast NACK packet for the
2778 		 * same reasons as we shouldn't get a ACK packet.
2779 		 */
2780 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV))
2781 			return;
2782 
2783 				/* Do nothing */
2784 		break;
2785 
2786 	default:
2787 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
2788 		    mcst_pkt->tag.vio_subtype);
2789 	}
2790 
2791 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
2792 }
2793 
2794 static void
2795 vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt)
2796 {
2797 	vio_rdx_msg_t	*rdx_pkt;
2798 	vsw_t		*vswp = ldcp->ldc_vswp;
2799 
2800 	/*
2801 	 * We know this is a ctrl/rdx packet so
2802 	 * cast it into the correct structure.
2803 	 */
2804 	rdx_pkt = (vio_rdx_msg_t *)pkt;
2805 
2806 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
2807 
2808 	switch (rdx_pkt->tag.vio_subtype) {
2809 	case VIO_SUBTYPE_INFO:
2810 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2811 
2812 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_INFO_RECV))
2813 			return;
2814 
2815 		rdx_pkt->tag.vio_sid = ldcp->local_session;
2816 		rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2817 
2818 		DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt);
2819 
2820 		ldcp->lane_out.lstate |= VSW_RDX_ACK_SENT;
2821 
2822 		(void) vsw_send_msg(ldcp, (void *)rdx_pkt,
2823 		    sizeof (vio_rdx_msg_t), B_TRUE);
2824 
2825 		vsw_next_milestone(ldcp);
2826 		break;
2827 
2828 	case VIO_SUBTYPE_ACK:
2829 		/*
2830 		 * Should be handled in-band by callback handler.
2831 		 */
2832 		DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__);
2833 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2834 		break;
2835 
2836 	case VIO_SUBTYPE_NACK:
2837 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
2838 
2839 		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_NACK_RECV))
2840 			return;
2841 
2842 		ldcp->lane_in.lstate |= VSW_RDX_NACK_RECV;
2843 		vsw_next_milestone(ldcp);
2844 		break;
2845 
2846 	default:
2847 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
2848 		    rdx_pkt->tag.vio_subtype);
2849 	}
2850 
2851 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
2852 }
2853 
2854 static void
2855 vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t tag)
2856 {
2857 	uint16_t	env = tag.vio_subtype_env;
2858 	vsw_t		*vswp = ldcp->ldc_vswp;
2859 
2860 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2861 
2862 	/* session id check */
2863 	if (ldcp->session_status & VSW_PEER_SESSION) {
2864 		if (ldcp->peer_session != tag.vio_sid) {
2865 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
2866 			    __func__, ldcp->ldc_id, tag.vio_sid);
2867 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2868 			return;
2869 		}
2870 	}
2871 
2872 	/*
2873 	 * It is an error for us to be getting data packets
2874 	 * before the handshake has completed.
2875 	 */
2876 	if (ldcp->hphase != VSW_MILESTONE4) {
2877 		DERR(vswp, "%s: got data packet before handshake complete "
2878 		    "hphase %d (%x: %x)", __func__, ldcp->hphase,
2879 		    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
2880 		DUMP_FLAGS(ldcp->lane_in.lstate);
2881 		DUMP_FLAGS(ldcp->lane_out.lstate);
2882 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2883 		return;
2884 	}
2885 
2886 	/*
2887 	 * To reduce the locking contention, release the
2888 	 * ldc_cblock here and re-acquire it once we are done
2889 	 * receiving packets.
2890 	 */
2891 	mutex_exit(&ldcp->ldc_cblock);
2892 	mutex_enter(&ldcp->ldc_rxlock);
2893 
2894 	/*
2895 	 * Switch on vio_subtype envelope, then let lower routines
2896 	 * decide if its an INFO, ACK or NACK packet.
2897 	 */
2898 	if (env == VIO_DRING_DATA) {
2899 		vsw_process_data_dring_pkt(ldcp, dpkt);
2900 	} else if (env == VIO_PKT_DATA) {
2901 		vsw_process_data_raw_pkt(ldcp, dpkt);
2902 	} else if (env == VIO_DESC_DATA) {
2903 		vsw_process_data_ibnd_pkt(ldcp, dpkt);
2904 	} else {
2905 		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env);
2906 	}
2907 
2908 	mutex_exit(&ldcp->ldc_rxlock);
2909 	mutex_enter(&ldcp->ldc_cblock);
2910 
2911 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
2912 }
2913 
/*
 * Turn the dring data message 'pkt' into a NACK carrying our session id
 * and send it back over 'ldcp'.
 *
 * Wrapped in do { } while (0) so that it behaves as a single statement
 * (safe in an unbraced if/else); invoke it with a trailing semicolon.
 * Each argument is evaluated more than once.
 */
#define	SND_DRING_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		(void) vsw_send_msg((ldcp), (void *)(pkt), \
		    sizeof (vio_dring_msg_t), B_TRUE); \
	} while (0)
2919 
2920 static void
2921 vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt)
2922 {
2923 	vio_dring_msg_t		*dring_pkt;
2924 	vnet_public_desc_t	*pub_addr = NULL;
2925 	vsw_private_desc_t	*priv_addr = NULL;
2926 	dring_info_t		*dp = NULL;
2927 	vsw_t			*vswp = ldcp->ldc_vswp;
2928 	mblk_t			*mp = NULL;
2929 	mblk_t			*bp = NULL;
2930 	mblk_t			*bpt = NULL;
2931 	size_t			nbytes = 0;
2932 	uint64_t		ncookies = 0;
2933 	uint64_t		chain = 0;
2934 	uint64_t		len;
2935 	uint32_t		pos, start, datalen;
2936 	uint32_t		range_start, range_end;
2937 	int32_t			end, num, cnt = 0;
2938 	int			i, rv, msg_rv = 0;
2939 	boolean_t		ack_needed = B_FALSE;
2940 	boolean_t		prev_desc_ack = B_FALSE;
2941 	int			read_attempts = 0;
2942 	struct ether_header	*ehp;
2943 
2944 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2945 
2946 	/*
2947 	 * We know this is a data/dring packet so
2948 	 * cast it into the correct structure.
2949 	 */
2950 	dring_pkt = (vio_dring_msg_t *)dpkt;
2951 
2952 	/*
2953 	 * Switch on the vio_subtype. If its INFO then we need to
2954 	 * process the data. If its an ACK we need to make sure
2955 	 * it makes sense (i.e did we send an earlier data/info),
2956 	 * and if its a NACK then we maybe attempt a retry.
2957 	 */
2958 	switch (dring_pkt->tag.vio_subtype) {
2959 	case VIO_SUBTYPE_INFO:
2960 		D2(vswp, "%s(%lld): VIO_SUBTYPE_INFO", __func__, ldcp->ldc_id);
2961 
2962 		READ_ENTER(&ldcp->lane_in.dlistrw);
2963 		if ((dp = vsw_ident2dring(&ldcp->lane_in,
2964 		    dring_pkt->dring_ident)) == NULL) {
2965 			RW_EXIT(&ldcp->lane_in.dlistrw);
2966 
2967 			DERR(vswp, "%s(%lld): unable to find dring from "
2968 			    "ident 0x%llx", __func__, ldcp->ldc_id,
2969 			    dring_pkt->dring_ident);
2970 
2971 			SND_DRING_NACK(ldcp, dring_pkt);
2972 			return;
2973 		}
2974 
2975 		start = pos = dring_pkt->start_idx;
2976 		end = dring_pkt->end_idx;
2977 		len = dp->num_descriptors;
2978 
2979 		range_start = range_end = pos;
2980 
2981 		D2(vswp, "%s(%lld): start index %ld : end %ld\n",
2982 		    __func__, ldcp->ldc_id, start, end);
2983 
2984 		if (end == -1) {
2985 			num = -1;
2986 		} else if (end >= 0) {
2987 			num = end >= pos ? end - pos + 1: (len - pos + 1) + end;
2988 
2989 			/* basic sanity check */
2990 			if (end > len) {
2991 				RW_EXIT(&ldcp->lane_in.dlistrw);
2992 				DERR(vswp, "%s(%lld): endpoint %lld outside "
2993 				    "ring length %lld", __func__,
2994 				    ldcp->ldc_id, end, len);
2995 
2996 				SND_DRING_NACK(ldcp, dring_pkt);
2997 				return;
2998 			}
2999 		} else {
3000 			RW_EXIT(&ldcp->lane_in.dlistrw);
3001 			DERR(vswp, "%s(%lld): invalid endpoint %lld",
3002 			    __func__, ldcp->ldc_id, end);
3003 			SND_DRING_NACK(ldcp, dring_pkt);
3004 			return;
3005 		}
3006 
3007 		while (cnt != num) {
3008 vsw_recheck_desc:
3009 			if ((rv = ldc_mem_dring_acquire(dp->handle,
3010 			    pos, pos)) != 0) {
3011 				RW_EXIT(&ldcp->lane_in.dlistrw);
3012 				DERR(vswp, "%s(%lld): unable to acquire "
3013 				    "descriptor at pos %d: err %d",
3014 				    __func__, pos, ldcp->ldc_id, rv);
3015 				SND_DRING_NACK(ldcp, dring_pkt);
3016 				ldcp->ldc_stats.ierrors++;
3017 				return;
3018 			}
3019 
3020 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + pos;
3021 
3022 			/*
3023 			 * When given a bounded range of descriptors
3024 			 * to process, its an error to hit a descriptor
3025 			 * which is not ready. In the non-bounded case
3026 			 * (end_idx == -1) this simply indicates we have
3027 			 * reached the end of the current active range.
3028 			 */
3029 			if (pub_addr->hdr.dstate != VIO_DESC_READY) {
3030 				/* unbound - no error */
3031 				if (end == -1) {
3032 					if (read_attempts == vsw_read_attempts)
3033 						break;
3034 
3035 					delay(drv_usectohz(vsw_desc_delay));
3036 					read_attempts++;
3037 					goto vsw_recheck_desc;
3038 				}
3039 
3040 				/* bounded - error - so NACK back */
3041 				RW_EXIT(&ldcp->lane_in.dlistrw);
3042 				DERR(vswp, "%s(%lld): descriptor not READY "
3043 				    "(%d)", __func__, ldcp->ldc_id,
3044 				    pub_addr->hdr.dstate);
3045 				SND_DRING_NACK(ldcp, dring_pkt);
3046 				return;
3047 			}
3048 
3049 			DTRACE_PROBE1(read_attempts, int, read_attempts);
3050 
3051 			range_end = pos;
3052 
3053 			/*
3054 			 * If we ACK'd the previous descriptor then now
3055 			 * record the new range start position for later
3056 			 * ACK's.
3057 			 */
3058 			if (prev_desc_ack) {
3059 				range_start = pos;
3060 
3061 				D2(vswp, "%s(%lld): updating range start to be "
3062 				    "%d", __func__, ldcp->ldc_id, range_start);
3063 
3064 				prev_desc_ack = B_FALSE;
3065 			}
3066 
3067 			/*
3068 			 * Data is padded to align on 8 byte boundary,
3069 			 * datalen is actual data length, i.e. minus that
3070 			 * padding.
3071 			 */
3072 			datalen = pub_addr->nbytes;
3073 
3074 			/*
3075 			 * Does peer wish us to ACK when we have finished
3076 			 * with this descriptor ?
3077 			 */
3078 			if (pub_addr->hdr.ack)
3079 				ack_needed = B_TRUE;
3080 
3081 			D2(vswp, "%s(%lld): processing desc %lld at pos"
3082 			    " 0x%llx : dstate 0x%lx : datalen 0x%lx",
3083 			    __func__, ldcp->ldc_id, pos, pub_addr,
3084 			    pub_addr->hdr.dstate, datalen);
3085 
3086 			/*
3087 			 * Mark that we are starting to process descriptor.
3088 			 */
3089 			pub_addr->hdr.dstate = VIO_DESC_ACCEPTED;
3090 
3091 			/*
3092 			 * Ensure that we ask ldc for an aligned
3093 			 * number of bytes.
3094 			 */
3095 			nbytes = (datalen + VNET_IPALIGN + 7) & ~7;
3096 
3097 			mp = vio_multipool_allocb(&ldcp->vmp, nbytes);
3098 			if (mp == NULL) {
3099 				ldcp->ldc_stats.rx_vio_allocb_fail++;
3100 				/*
3101 				 * No free receive buffers available, so
3102 				 * fallback onto allocb(9F). Make sure that
3103 				 * we get a data buffer which is a multiple
3104 				 * of 8 as this is required by ldc_mem_copy.
3105 				 */
3106 				DTRACE_PROBE(allocb);
3107 				if ((mp = allocb(datalen + VNET_IPALIGN + 8,
3108 				    BPRI_MED)) == NULL) {
3109 					DERR(vswp, "%s(%ld): allocb failed",
3110 					    __func__, ldcp->ldc_id);
3111 					pub_addr->hdr.dstate = VIO_DESC_DONE;
3112 					(void) ldc_mem_dring_release(dp->handle,
3113 					    pos, pos);
3114 					ldcp->ldc_stats.ierrors++;
3115 					ldcp->ldc_stats.rx_allocb_fail++;
3116 					break;
3117 				}
3118 			}
3119 
3120 			ncookies = pub_addr->ncookies;
3121 			rv = ldc_mem_copy(ldcp->ldc_handle,
3122 			    (caddr_t)mp->b_rptr, 0, &nbytes,
3123 			    pub_addr->memcookie, ncookies, LDC_COPY_IN);
3124 
3125 			if (rv != 0) {
3126 				DERR(vswp, "%s(%d): unable to copy in data "
3127 				    "from %d cookies in desc %d (rv %d)",
3128 				    __func__, ldcp->ldc_id, ncookies, pos, rv);
3129 				freemsg(mp);
3130 
3131 				pub_addr->hdr.dstate = VIO_DESC_DONE;
3132 				(void) ldc_mem_dring_release(dp->handle,
3133 				    pos, pos);
3134 				ldcp->ldc_stats.ierrors++;
3135 				break;
3136 			} else {
3137 				D2(vswp, "%s(%d): copied in %ld bytes"
3138 				    " using %d cookies", __func__,
3139 				    ldcp->ldc_id, nbytes, ncookies);
3140 			}
3141 
3142 			/* adjust the read pointer to skip over the padding */
3143 			mp->b_rptr += VNET_IPALIGN;
3144 
3145 			/* point to the actual end of data */
3146 			mp->b_wptr = mp->b_rptr + datalen;
3147 
3148 			/* update statistics */
3149 			ehp = (struct ether_header *)mp->b_rptr;
3150 			if (IS_BROADCAST(ehp))
3151 				ldcp->ldc_stats.brdcstrcv++;
3152 			else if (IS_MULTICAST(ehp))
3153 				ldcp->ldc_stats.multircv++;
3154 
3155 			ldcp->ldc_stats.ipackets++;
3156 			ldcp->ldc_stats.rbytes += datalen;
3157 
3158 			/* build a chain of received packets */
3159 			if (bp == NULL) {
3160 				/* first pkt */
3161 				bp = mp;
3162 				bp->b_next = bp->b_prev = NULL;
3163 				bpt = bp;
3164 				chain = 1;
3165 			} else {
3166 				mp->b_next = mp->b_prev = NULL;
3167 				bpt->b_next = mp;
3168 				bpt = mp;
3169 				chain++;
3170 			}
3171 
3172 			/* mark we are finished with this descriptor */
3173 			pub_addr->hdr.dstate = VIO_DESC_DONE;
3174 
3175 			(void) ldc_mem_dring_release(dp->handle, pos, pos);
3176 
3177 			/*
3178 			 * Send an ACK back to peer if requested.
3179 			 */
3180 			if (ack_needed) {
3181 				ack_needed = B_FALSE;
3182 
3183 				dring_pkt->start_idx = range_start;
3184 				dring_pkt->end_idx = range_end;
3185 
3186 				DERR(vswp, "%s(%lld): processed %d %d, ACK"
3187 				    " requested", __func__, ldcp->ldc_id,
3188 				    dring_pkt->start_idx, dring_pkt->end_idx);
3189 
3190 				dring_pkt->dring_process_state = VIO_DP_ACTIVE;
3191 				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3192 				dring_pkt->tag.vio_sid = ldcp->local_session;
3193 
3194 				msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
3195 				    sizeof (vio_dring_msg_t), B_FALSE);
3196 
3197 				/*
3198 				 * Check if ACK was successfully sent. If not
3199 				 * we break and deal with that below.
3200 				 */
3201 				if (msg_rv != 0)
3202 					break;
3203 
3204 				prev_desc_ack = B_TRUE;
3205 				range_start = pos;
3206 			}
3207 
3208 			/* next descriptor */
3209 			pos = (pos + 1) % len;
3210 			cnt++;
3211 
3212 			/*
3213 			 * Break out of loop here and stop processing to
3214 			 * allow some other network device (or disk) to
3215 			 * get access to the cpu.
3216 			 */
3217 			if (chain > vsw_chain_len) {
3218 				D3(vswp, "%s(%lld): switching chain of %d "
3219 				    "msgs", __func__, ldcp->ldc_id, chain);
3220 				break;
3221 			}
3222 		}
3223 		RW_EXIT(&ldcp->lane_in.dlistrw);
3224 
3225 		/*
3226 		 * If when we attempted to send the ACK we found that the
3227 		 * channel had been reset then now handle this. We deal with
3228 		 * it here as we cannot reset the channel while holding the
3229 		 * dlistrw lock, and we don't want to acquire/release it
3230 		 * continuously in the above loop, as a channel reset should
3231 		 * be a rare event.
3232 		 */
3233 		if (msg_rv == ECONNRESET) {
3234 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
3235 			break;
3236 		}
3237 
3238 		/* send the chain of packets to be switched */
3239 		if (bp != NULL) {
3240 			DTRACE_PROBE1(vsw_rcv_msgs, int, chain);
3241 			D3(vswp, "%s(%lld): switching chain of %d msgs",
3242 			    __func__, ldcp->ldc_id, chain);
3243 			vswp->vsw_switch_frame(vswp, bp, VSW_VNETPORT,
3244 			    ldcp->ldc_port, NULL);
3245 		}
3246 
3247 		DTRACE_PROBE1(msg_cnt, int, cnt);
3248 
3249 		/*
3250 		 * We are now finished so ACK back with the state
3251 		 * set to STOPPING so our peer knows we are finished
3252 		 */
3253 		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3254 		dring_pkt->tag.vio_sid = ldcp->local_session;
3255 
3256 		dring_pkt->dring_process_state = VIO_DP_STOPPED;
3257 
3258 		DTRACE_PROBE(stop_process_sent);
3259 
3260 		/*
3261 		 * We have not processed any more descriptors beyond
3262 		 * the last one we ACK'd.
3263 		 */
3264 		if (prev_desc_ack)
3265 			range_start = range_end;
3266 
3267 		dring_pkt->start_idx = range_start;
3268 		dring_pkt->end_idx = range_end;
3269 
3270 		D2(vswp, "%s(%lld) processed : %d : %d, now stopping",
3271 		    __func__, ldcp->ldc_id, dring_pkt->start_idx,
3272 		    dring_pkt->end_idx);
3273 
3274 		(void) vsw_send_msg(ldcp, (void *)dring_pkt,
3275 		    sizeof (vio_dring_msg_t), B_TRUE);
3276 		break;
3277 
3278 	case VIO_SUBTYPE_ACK:
3279 		D2(vswp, "%s(%lld): VIO_SUBTYPE_ACK", __func__, ldcp->ldc_id);
3280 		/*
3281 		 * Verify that the relevant descriptors are all
3282 		 * marked as DONE
3283 		 */
3284 		READ_ENTER(&ldcp->lane_out.dlistrw);
3285 		if ((dp = vsw_ident2dring(&ldcp->lane_out,
3286 		    dring_pkt->dring_ident)) == NULL) {
3287 			RW_EXIT(&ldcp->lane_out.dlistrw);
3288 			DERR(vswp, "%s: unknown ident in ACK", __func__);
3289 			return;
3290 		}
3291 
3292 		start = end = 0;
3293 		start = dring_pkt->start_idx;
3294 		end = dring_pkt->end_idx;
3295 		len = dp->num_descriptors;
3296 
3297 
3298 		mutex_enter(&dp->dlock);
3299 		dp->last_ack_recv = end;
3300 		ldcp->ldc_stats.dring_data_acks++;
3301 		mutex_exit(&dp->dlock);
3302 
3303 		(void) vsw_reclaim_dring(dp, start);
3304 
3305 		/*
3306 		 * If our peer is stopping processing descriptors then
3307 		 * we check to make sure it has processed all the descriptors
3308 		 * we have updated. If not then we send it a new message
3309 		 * to prompt it to restart.
3310 		 */
3311 		if (dring_pkt->dring_process_state == VIO_DP_STOPPED) {
3312 			DTRACE_PROBE(stop_process_recv);
3313 			D2(vswp, "%s(%lld): got stopping msg : %d : %d",
3314 			    __func__, ldcp->ldc_id, dring_pkt->start_idx,
3315 			    dring_pkt->end_idx);
3316 
3317 			/*
3318 			 * Check next descriptor in public section of ring.
3319 			 * If its marked as READY then we need to prompt our
3320 			 * peer to start processing the ring again.
3321 			 */
3322 			i = (end + 1) % len;
3323 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
3324 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
3325 
3326 			/*
3327 			 * Hold the restart lock across all of this to
3328 			 * make sure that its not possible for us to
3329 			 * decide that a msg needs to be sent in the future
3330 			 * but the sending code having already checked is
3331 			 * about to exit.
3332 			 */
3333 			mutex_enter(&dp->restart_lock);
3334 			ldcp->ldc_stats.dring_stopped_acks++;
3335 			mutex_enter(&priv_addr->dstate_lock);
3336 			if (pub_addr->hdr.dstate == VIO_DESC_READY) {
3337 
3338 				mutex_exit(&priv_addr->dstate_lock);
3339 
3340 				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
3341 				dring_pkt->tag.vio_sid = ldcp->local_session;
3342 
3343 				dring_pkt->seq_num =
3344 				    atomic_inc_64_nv(&ldcp->lane_out.seq_num);
3345 
3346 				dring_pkt->start_idx = (end + 1) % len;
3347 				dring_pkt->end_idx = -1;
3348 
3349 				D2(vswp, "%s(%lld) : sending restart msg:"
3350 				    " %d : %d", __func__, ldcp->ldc_id,
3351 				    dring_pkt->start_idx, dring_pkt->end_idx);
3352 
3353 				msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
3354 				    sizeof (vio_dring_msg_t), B_FALSE);
3355 				ldcp->ldc_stats.dring_data_msgs++;
3356 
3357 			} else {
3358 				mutex_exit(&priv_addr->dstate_lock);
3359 				dp->restart_reqd = B_TRUE;
3360 			}
3361 			mutex_exit(&dp->restart_lock);
3362 		}
3363 		RW_EXIT(&ldcp->lane_out.dlistrw);
3364 
3365 		/* only do channel reset after dropping dlistrw lock */
3366 		if (msg_rv == ECONNRESET)
3367 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
3368 
3369 		break;
3370 
3371 	case VIO_SUBTYPE_NACK:
3372 		DWARN(vswp, "%s(%lld): VIO_SUBTYPE_NACK",
3373 		    __func__, ldcp->ldc_id);
3374 		/*
3375 		 * Something is badly wrong if we are getting NACK's
3376 		 * for our data pkts. So reset the channel.
3377 		 */
3378 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3379 
3380 		break;
3381 
3382 	default:
3383 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
3384 		    ldcp->ldc_id, dring_pkt->tag.vio_subtype);
3385 	}
3386 
3387 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
3388 }
3389 
3390 /*
3391  * VIO_PKT_DATA (a.k.a raw data mode )
3392  *
3393  * Note - currently not supported. Do nothing.
3394  */
3395 static void
3396 vsw_process_data_raw_pkt(vsw_ldc_t *ldcp, void *dpkt)
3397 {
3398 	_NOTE(ARGUNUSED(dpkt))
3399 
3400 	D1(NULL, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
3401 	DERR(NULL, "%s (%lld): currently unsupported", __func__, ldcp->ldc_id);
3402 	D1(NULL, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
3403 }
3404 
3405 /*
3406  * Process an in-band descriptor message (most likely from
3407  * OBP).
3408  */
3409 static void
3410 vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt)
3411 {
3412 	vnet_ibnd_desc_t	*ibnd_desc;
3413 	dring_info_t		*dp = NULL;
3414 	vsw_private_desc_t	*priv_addr = NULL;
3415 	vsw_t			*vswp = ldcp->ldc_vswp;
3416 	mblk_t			*mp = NULL;
3417 	size_t			nbytes = 0;
3418 	size_t			off = 0;
3419 	uint64_t		idx = 0;
3420 	uint32_t		num = 1, len, datalen = 0;
3421 	uint64_t		ncookies = 0;
3422 	int			i, rv;
3423 	int			j = 0;
3424 
3425 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3426 
3427 	ibnd_desc = (vnet_ibnd_desc_t *)pkt;
3428 
3429 	switch (ibnd_desc->hdr.tag.vio_subtype) {
3430 	case VIO_SUBTYPE_INFO:
3431 		D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3432 
3433 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
3434 			return;
3435 
3436 		/*
3437 		 * Data is padded to align on a 8 byte boundary,
3438 		 * nbytes is actual data length, i.e. minus that
3439 		 * padding.
3440 		 */
3441 		datalen = ibnd_desc->nbytes;
3442 
3443 		D2(vswp, "%s(%lld): processing inband desc : "
3444 		    ": datalen 0x%lx", __func__, ldcp->ldc_id, datalen);
3445 
3446 		ncookies = ibnd_desc->ncookies;
3447 
3448 		/*
3449 		 * allocb(9F) returns an aligned data block. We
3450 		 * need to ensure that we ask ldc for an aligned
3451 		 * number of bytes also.
3452 		 */
3453 		nbytes = datalen;
3454 		if (nbytes & 0x7) {
3455 			off = 8 - (nbytes & 0x7);
3456 			nbytes += off;
3457 		}
3458 
3459 		mp = allocb(datalen, BPRI_MED);
3460 		if (mp == NULL) {
3461 			DERR(vswp, "%s(%lld): allocb failed",
3462 			    __func__, ldcp->ldc_id);
3463 			ldcp->ldc_stats.rx_allocb_fail++;
3464 			return;
3465 		}
3466 
3467 		rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr,
3468 		    0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies,
3469 		    LDC_COPY_IN);
3470 
3471 		if (rv != 0) {
3472 			DERR(vswp, "%s(%d): unable to copy in data from "
3473 			    "%d cookie(s)", __func__, ldcp->ldc_id, ncookies);
3474 			freemsg(mp);
3475 			ldcp->ldc_stats.ierrors++;
3476 			return;
3477 		}
3478 
3479 		D2(vswp, "%s(%d): copied in %ld bytes using %d cookies",
3480 		    __func__, ldcp->ldc_id, nbytes, ncookies);
3481 
3482 		/* point to the actual end of data */
3483 		mp->b_wptr = mp->b_rptr + datalen;
3484 		ldcp->ldc_stats.ipackets++;
3485 		ldcp->ldc_stats.rbytes += datalen;
3486 
3487 		/*
3488 		 * We ACK back every in-band descriptor message we process
3489 		 */
3490 		ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK;
3491 		ibnd_desc->hdr.tag.vio_sid = ldcp->local_session;
3492 		(void) vsw_send_msg(ldcp, (void *)ibnd_desc,
3493 		    sizeof (vnet_ibnd_desc_t), B_TRUE);
3494 
3495 		/* send the packet to be switched */
3496 		vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT,
3497 		    ldcp->ldc_port, NULL);
3498 
3499 		break;
3500 
3501 	case VIO_SUBTYPE_ACK:
3502 		D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3503 
3504 		/* Verify the ACK is valid */
3505 		idx = ibnd_desc->hdr.desc_handle;
3506 
3507 		if (idx >= VSW_RING_NUM_EL) {
3508 			cmn_err(CE_WARN, "!vsw%d: corrupted ACK received "
3509 			    "(idx %ld)", vswp->instance, idx);
3510 			return;
3511 		}
3512 
3513 		if ((dp = ldcp->lane_out.dringp) == NULL) {
3514 			DERR(vswp, "%s: no dring found", __func__);
3515 			return;
3516 		}
3517 
3518 		len = dp->num_descriptors;
3519 		/*
3520 		 * If the descriptor we are being ACK'ed for is not the
3521 		 * one we expected, then pkts were lost somwhere, either
3522 		 * when we tried to send a msg, or a previous ACK msg from
3523 		 * our peer. In either case we now reclaim the descriptors
3524 		 * in the range from the last ACK we received up to the
3525 		 * current ACK.
3526 		 */
3527 		if (idx != dp->last_ack_recv) {
3528 			DWARN(vswp, "%s: dropped pkts detected, (%ld, %ld)",
3529 			    __func__, dp->last_ack_recv, idx);
3530 			num = idx >= dp->last_ack_recv ?
3531 			    idx - dp->last_ack_recv + 1:
3532 			    (len - dp->last_ack_recv + 1) + idx;
3533 		}
3534 
3535 		/*
3536 		 * When we sent the in-band message to our peer we
3537 		 * marked the copy in our private ring as READY. We now
3538 		 * check that the descriptor we are being ACK'ed for is in
3539 		 * fact READY, i.e. it is one we have shared with our peer.
3540 		 *
3541 		 * If its not we flag an error, but still reset the descr
3542 		 * back to FREE.
3543 		 */
3544 		for (i = dp->last_ack_recv; j < num; i = (i + 1) % len, j++) {
3545 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
3546 			mutex_enter(&priv_addr->dstate_lock);
3547 			if (priv_addr->dstate != VIO_DESC_READY) {
3548 				DERR(vswp, "%s: (%ld) desc at index %ld not "
3549 				    "READY (0x%lx)", __func__,
3550 				    ldcp->ldc_id, idx, priv_addr->dstate);
3551 				DERR(vswp, "%s: bound %d: ncookies %ld : "
3552 				    "datalen %ld", __func__,
3553 				    priv_addr->bound, priv_addr->ncookies,
3554 				    priv_addr->datalen);
3555 			}
3556 			D2(vswp, "%s: (%lld) freeing descp at %lld", __func__,
3557 			    ldcp->ldc_id, idx);
3558 			/* release resources associated with sent msg */
3559 			priv_addr->datalen = 0;
3560 			priv_addr->dstate = VIO_DESC_FREE;
3561 			mutex_exit(&priv_addr->dstate_lock);
3562 		}
3563 		/* update to next expected value */
3564 		dp->last_ack_recv = (idx + 1) % dp->num_descriptors;
3565 
3566 		break;
3567 
3568 	case VIO_SUBTYPE_NACK:
3569 		DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3570 
3571 		/*
3572 		 * We should only get a NACK if our peer doesn't like
3573 		 * something about a message we have sent it. If this
3574 		 * happens we just release the resources associated with
3575 		 * the message. (We are relying on higher layers to decide
3576 		 * whether or not to resend.
3577 		 */
3578 
3579 		/* limit check */
3580 		idx = ibnd_desc->hdr.desc_handle;
3581 
3582 		if (idx >= VSW_RING_NUM_EL) {
3583 			DERR(vswp, "%s: corrupted NACK received (idx %lld)",
3584 			    __func__, idx);
3585 			return;
3586 		}
3587 
3588 		if ((dp = ldcp->lane_out.dringp) == NULL) {
3589 			DERR(vswp, "%s: no dring found", __func__);
3590 			return;
3591 		}
3592 
3593 		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
3594 
3595 		/* move to correct location in ring */
3596 		priv_addr += idx;
3597 
3598 		/* release resources associated with sent msg */
3599 		mutex_enter(&priv_addr->dstate_lock);
3600 		priv_addr->datalen = 0;
3601 		priv_addr->dstate = VIO_DESC_FREE;
3602 		mutex_exit(&priv_addr->dstate_lock);
3603 
3604 		break;
3605 
3606 	default:
3607 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
3608 		    ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype);
3609 	}
3610 
3611 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
3612 }
3613 
3614 static void
3615 vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t tag)
3616 {
3617 	_NOTE(ARGUNUSED(epkt))
3618 
3619 	vsw_t		*vswp = ldcp->ldc_vswp;
3620 	uint16_t	env = tag.vio_subtype_env;
3621 
3622 	D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
3623 
3624 	/*
3625 	 * Error vio_subtypes have yet to be defined. So for
3626 	 * the moment we can't do anything.
3627 	 */
3628 	D2(vswp, "%s: (%x) vio_subtype env", __func__, env);
3629 
3630 	D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
3631 }
3632 
3633 /* transmit the packet over the given port */
3634 int
3635 vsw_portsend(vsw_port_t *port, mblk_t *mp, mblk_t *mpt)
3636 {
3637 	vsw_ldc_list_t 	*ldcl = &port->p_ldclist;
3638 	vsw_ldc_t 	*ldcp;
3639 	mblk_t		*tmp;
3640 	int		status = 0;
3641 
3642 	READ_ENTER(&ldcl->lockrw);
3643 	/*
3644 	 * Note for now, we have a single channel.
3645 	 */
3646 	ldcp = ldcl->head;
3647 	if (ldcp == NULL) {
3648 		DERR(port->p_vswp, "vsw_portsend: no ldc: dropping packet\n");
3649 		freemsg(mp);
3650 		RW_EXIT(&ldcl->lockrw);
3651 		return (1);
3652 	}
3653 
3654 	/*
3655 	 * If the TX thread is enabled, then queue the packets
3656 	 * and signal the tx thread.
3657 	 */
3658 	if (ldcp->tx_thread != NULL) {
3659 		mutex_enter(&ldcp->tx_thr_lock);
3660 		if (ldcp->tx_mhead == NULL) {
3661 			ldcp->tx_mhead = mp;
3662 			ldcp->tx_mtail = mpt;
3663 			cv_signal(&ldcp->tx_thr_cv);
3664 		} else {
3665 			ldcp->tx_mtail->b_next = mp;
3666 			ldcp->tx_mtail = mpt;
3667 		}
3668 		mutex_exit(&ldcp->tx_thr_lock);
3669 	} else {
3670 		while (mp != NULL) {
3671 			tmp = mp->b_next;
3672 			mp->b_next = mp->b_prev = NULL;
3673 			(void) vsw_ldcsend(ldcp, mp, 1);
3674 			mp = tmp;
3675 		}
3676 	}
3677 
3678 	RW_EXIT(&ldcl->lockrw);
3679 
3680 	return (status);
3681 }
3682 
3683 /*
3684  * Transmit the packet over the given LDC channel.
3685  *
3686  * The 'retries' argument indicates how many times a packet
3687  * is retried before it is dropped. Note, the retry is done
3688  * only for a resource related failure, for all other failures
3689  * the packet is dropped immediately.
3690  *
3691  * The 'tx_failure' counter is used as mechanism to track
3692  * continuous failures. Once these failures are more than
3693  * 'vsw_ldc_tx_max_failures' tunable, the packets are tried only
3694  * once and then they are dropped. This is done to avoid
3695  * buffering too many packets.
3696  */
3697 static int
3698 vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, int retries)
3699 {
3700 	int i;
3701 	int rc;
3702 	int status = 0;
3703 	vsw_port_t *port = ldcp->ldc_port;
3704 	dring_info_t *dp = NULL;
3705 
3706 
3707 	for (i = 0; i < retries; ) {
3708 		/*
3709 		 * Send the message out using the appropriate
3710 		 * transmit function which will free mblock when it
3711 		 * is finished with it.
3712 		 */
3713 		mutex_enter(&port->tx_lock);
3714 		if (port->transmit != NULL) {
3715 			status = (*port->transmit)(ldcp, mp);
3716 		}
3717 		if (status == LDC_TX_SUCCESS) {
3718 			ldcp->tx_failures = 0;
3719 			mutex_exit(&port->tx_lock);
3720 			break;
3721 		} else if (ldcp->tx_failures > vsw_ldc_tx_max_failures) {
3722 			/*
3723 			 * If the failures crossed the threshold then
3724 			 * break here.
3725 			 */
3726 			ldcp->ldc_stats.oerrors++;
3727 			mutex_exit(&port->tx_lock);
3728 			break;
3729 		} else {
3730 			ldcp->tx_failures++;
3731 		}
3732 		i++;	/* increment the counter here */
3733 
3734 		/* If its the last retry, then update the oerror */
3735 		if ((i == retries) && (status == LDC_TX_NORESOURCES)) {
3736 			ldcp->ldc_stats.oerrors++;
3737 		}
3738 		mutex_exit(&port->tx_lock);
3739 
3740 		if (status != LDC_TX_NORESOURCES) {
3741 			/*
3742 			 * No retrying required for errors un-related
3743 			 * to resources.
3744 			 */
3745 			break;
3746 		}
3747 		READ_ENTER(&ldcp->lane_out.dlistrw);
3748 		if (((dp = ldcp->lane_out.dringp) != NULL) &&
3749 		    (ldcp->lane_out.xfer_mode == VIO_DRING_MODE)) {
3750 			rc = vsw_reclaim_dring(dp, dp->end_idx);
3751 		} else {
3752 			/*
3753 			 * If there is no dring or the xfer_mode is
3754 			 * set to DESC_MODE(ie., OBP), then simply break here.
3755 			 */
3756 			RW_EXIT(&ldcp->lane_out.dlistrw);
3757 			break;
3758 		}
3759 		RW_EXIT(&ldcp->lane_out.dlistrw);
3760 
3761 		/*
3762 		 * Delay only if none were reclaimed
3763 		 * and its not the last retry.
3764 		 */
3765 		if ((rc == 0) && (i < retries)) {
3766 			delay(drv_usectohz(vsw_ldc_tx_delay));
3767 		}
3768 	}
3769 	freemsg(mp);
3770 	return (status);
3771 }
3772 
3773 /*
3774  * Send packet out via descriptor ring to a logical device.
3775  */
3776 static int
3777 vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp)
3778 {
3779 	vio_dring_msg_t		dring_pkt;
3780 	dring_info_t		*dp = NULL;
3781 	vsw_private_desc_t	*priv_desc = NULL;
3782 	vnet_public_desc_t	*pub = NULL;
3783 	vsw_t			*vswp = ldcp->ldc_vswp;
3784 	mblk_t			*bp;
3785 	size_t			n, size;
3786 	caddr_t			bufp;
3787 	int			idx;
3788 	int			status = LDC_TX_SUCCESS;
3789 	struct ether_header	*ehp = (struct ether_header *)mp->b_rptr;
3790 
3791 	D1(vswp, "%s(%lld): enter\n", __func__, ldcp->ldc_id);
3792 
3793 	/* TODO: make test a macro */
3794 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
3795 	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
3796 		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
3797 		    "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
3798 		    ldcp->lane_out.lstate);
3799 		ldcp->ldc_stats.oerrors++;
3800 		return (LDC_TX_FAILURE);
3801 	}
3802 
3803 	/*
3804 	 * Note - using first ring only, this may change
3805 	 * in the future.
3806 	 */
3807 	READ_ENTER(&ldcp->lane_out.dlistrw);
3808 	if ((dp = ldcp->lane_out.dringp) == NULL) {
3809 		RW_EXIT(&ldcp->lane_out.dlistrw);
3810 		DERR(vswp, "%s(%lld): no dring for outbound lane on"
3811 		    " channel %d", __func__, ldcp->ldc_id, ldcp->ldc_id);
3812 		ldcp->ldc_stats.oerrors++;
3813 		return (LDC_TX_FAILURE);
3814 	}
3815 
3816 	size = msgsize(mp);
3817 	if (size > (size_t)ETHERMAX) {
3818 		RW_EXIT(&ldcp->lane_out.dlistrw);
3819 		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
3820 		    ldcp->ldc_id, size);
3821 		ldcp->ldc_stats.oerrors++;
3822 		return (LDC_TX_FAILURE);
3823 	}
3824 
3825 	/*
3826 	 * Find a free descriptor
3827 	 *
3828 	 * Note: for the moment we are assuming that we will only
3829 	 * have one dring going from the switch to each of its
3830 	 * peers. This may change in the future.
3831 	 */
3832 	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
3833 		D2(vswp, "%s(%lld): no descriptor available for ring "
3834 		    "at 0x%llx", __func__, ldcp->ldc_id, dp);
3835 
3836 		/* nothing more we can do */
3837 		status = LDC_TX_NORESOURCES;
3838 		ldcp->ldc_stats.tx_no_desc++;
3839 		goto vsw_dringsend_free_exit;
3840 	} else {
3841 		D2(vswp, "%s(%lld): free private descriptor found at pos %ld "
3842 		    "addr 0x%llx\n", __func__, ldcp->ldc_id, idx, priv_desc);
3843 	}
3844 
3845 	/* copy data into the descriptor */
3846 	bufp = priv_desc->datap;
3847 	bufp += VNET_IPALIGN;
3848 	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
3849 		n = MBLKL(bp);
3850 		bcopy(bp->b_rptr, bufp, n);
3851 		bufp += n;
3852 	}
3853 
3854 	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
3855 
3856 	pub = priv_desc->descp;
3857 	pub->nbytes = priv_desc->datalen;
3858 
3859 	/* update statistics */
3860 	if (IS_BROADCAST(ehp))
3861 		ldcp->ldc_stats.brdcstxmt++;
3862 	else if (IS_MULTICAST(ehp))
3863 		ldcp->ldc_stats.multixmt++;
3864 	ldcp->ldc_stats.opackets++;
3865 	ldcp->ldc_stats.obytes += priv_desc->datalen;
3866 
3867 	mutex_enter(&priv_desc->dstate_lock);
3868 	pub->hdr.dstate = VIO_DESC_READY;
3869 	mutex_exit(&priv_desc->dstate_lock);
3870 
3871 	/*
3872 	 * Determine whether or not we need to send a message to our
3873 	 * peer prompting them to read our newly updated descriptor(s).
3874 	 */
3875 	mutex_enter(&dp->restart_lock);
3876 	if (dp->restart_reqd) {
3877 		dp->restart_reqd = B_FALSE;
3878 		ldcp->ldc_stats.dring_data_msgs++;
3879 		mutex_exit(&dp->restart_lock);
3880 
3881 		/*
3882 		 * Send a vio_dring_msg to peer to prompt them to read
3883 		 * the updated descriptor ring.
3884 		 */
3885 		dring_pkt.tag.vio_msgtype = VIO_TYPE_DATA;
3886 		dring_pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
3887 		dring_pkt.tag.vio_subtype_env = VIO_DRING_DATA;
3888 		dring_pkt.tag.vio_sid = ldcp->local_session;
3889 
3890 		/* Note - for now using first ring */
3891 		dring_pkt.dring_ident = dp->ident;
3892 		dring_pkt.seq_num = atomic_inc_64_nv(&ldcp->lane_out.seq_num);
3893 
3894 		/*
3895 		 * If last_ack_recv is -1 then we know we've not
3896 		 * received any ack's yet, so this must be the first
3897 		 * msg sent, so set the start to the begining of the ring.
3898 		 */
3899 		mutex_enter(&dp->dlock);
3900 		if (dp->last_ack_recv == -1) {
3901 			dring_pkt.start_idx = 0;
3902 		} else {
3903 			dring_pkt.start_idx =
3904 			    (dp->last_ack_recv + 1) % dp->num_descriptors;
3905 		}
3906 		dring_pkt.end_idx = -1;
3907 		mutex_exit(&dp->dlock);
3908 
3909 		D3(vswp, "%s(%lld): dring 0x%llx : ident 0x%llx\n", __func__,
3910 		    ldcp->ldc_id, dp, dring_pkt.dring_ident);
3911 		D3(vswp, "%s(%lld): start %lld : end %lld : seq %lld\n",
3912 		    __func__, ldcp->ldc_id, dring_pkt.start_idx,
3913 		    dring_pkt.end_idx, dring_pkt.seq_num);
3914 
3915 		RW_EXIT(&ldcp->lane_out.dlistrw);
3916 
3917 		(void) vsw_send_msg(ldcp, (void *)&dring_pkt,
3918 		    sizeof (vio_dring_msg_t), B_TRUE);
3919 
3920 		return (status);
3921 
3922 	} else {
3923 		mutex_exit(&dp->restart_lock);
3924 		D2(vswp, "%s(%lld): updating descp %d", __func__,
3925 		    ldcp->ldc_id, idx);
3926 	}
3927 
3928 vsw_dringsend_free_exit:
3929 
3930 	RW_EXIT(&ldcp->lane_out.dlistrw);
3931 
3932 	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
3933 	return (status);
3934 }
3935 
3936 /*
3937  * Send an in-band descriptor message over ldc.
3938  */
3939 static int
3940 vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp)
3941 {
3942 	vsw_t			*vswp = ldcp->ldc_vswp;
3943 	vnet_ibnd_desc_t	ibnd_msg;
3944 	vsw_private_desc_t	*priv_desc = NULL;
3945 	dring_info_t		*dp = NULL;
3946 	size_t			n, size = 0;
3947 	caddr_t			bufp;
3948 	mblk_t			*bp;
3949 	int			idx, i;
3950 	int			status = LDC_TX_SUCCESS;
3951 	static int		warn_msg = 1;
3952 
3953 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3954 
3955 	ASSERT(mp != NULL);
3956 
3957 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
3958 	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
3959 		DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt",
3960 		    __func__, ldcp->ldc_id, ldcp->ldc_status,
3961 		    ldcp->lane_out.lstate);
3962 		ldcp->ldc_stats.oerrors++;
3963 		return (LDC_TX_FAILURE);
3964 	}
3965 
3966 	/*
3967 	 * only expect single dring to exist, which we use
3968 	 * as an internal buffer, rather than a transfer channel.
3969 	 */
3970 	READ_ENTER(&ldcp->lane_out.dlistrw);
3971 	if ((dp = ldcp->lane_out.dringp) == NULL) {
3972 		DERR(vswp, "%s(%lld): no dring for outbound lane",
3973 		    __func__, ldcp->ldc_id);
3974 		DERR(vswp, "%s(%lld) status(%d) state (0x%llx)", __func__,
3975 		    ldcp->ldc_id, ldcp->ldc_status, ldcp->lane_out.lstate);
3976 		RW_EXIT(&ldcp->lane_out.dlistrw);
3977 		ldcp->ldc_stats.oerrors++;
3978 		return (LDC_TX_FAILURE);
3979 	}
3980 
3981 	size = msgsize(mp);
3982 	if (size > (size_t)ETHERMAX) {
3983 		RW_EXIT(&ldcp->lane_out.dlistrw);
3984 		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
3985 		    ldcp->ldc_id, size);
3986 		ldcp->ldc_stats.oerrors++;
3987 		return (LDC_TX_FAILURE);
3988 	}
3989 
3990 	/*
3991 	 * Find a free descriptor in our buffer ring
3992 	 */
3993 	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
3994 		RW_EXIT(&ldcp->lane_out.dlistrw);
3995 		if (warn_msg) {
3996 			DERR(vswp, "%s(%lld): no descriptor available for ring "
3997 			    "at 0x%llx", __func__, ldcp->ldc_id, dp);
3998 			warn_msg = 0;
3999 		}
4000 
4001 		/* nothing more we can do */
4002 		status = LDC_TX_NORESOURCES;
4003 		goto vsw_descrsend_free_exit;
4004 	} else {
4005 		D2(vswp, "%s(%lld): free private descriptor found at pos "
4006 		    "%ld addr 0x%x\n", __func__, ldcp->ldc_id, idx, priv_desc);
4007 		warn_msg = 1;
4008 	}
4009 
4010 	/* copy data into the descriptor */
4011 	bufp = priv_desc->datap;
4012 	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
4013 		n = MBLKL(bp);
4014 		bcopy(bp->b_rptr, bufp, n);
4015 		bufp += n;
4016 	}
4017 
4018 	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
4019 
4020 	/* create and send the in-band descp msg */
4021 	ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA;
4022 	ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO;
4023 	ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA;
4024 	ibnd_msg.hdr.tag.vio_sid = ldcp->local_session;
4025 
4026 	ibnd_msg.hdr.seq_num = atomic_inc_64_nv(&ldcp->lane_out.seq_num);
4027 
4028 	/*
4029 	 * Copy the mem cookies describing the data from the
4030 	 * private region of the descriptor ring into the inband
4031 	 * descriptor.
4032 	 */
4033 	for (i = 0; i < priv_desc->ncookies; i++) {
4034 		bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i],
4035 		    sizeof (ldc_mem_cookie_t));
4036 	}
4037 
4038 	ibnd_msg.hdr.desc_handle = idx;
4039 	ibnd_msg.ncookies = priv_desc->ncookies;
4040 	ibnd_msg.nbytes = size;
4041 
4042 	ldcp->ldc_stats.opackets++;
4043 	ldcp->ldc_stats.obytes += size;
4044 
4045 	RW_EXIT(&ldcp->lane_out.dlistrw);
4046 
4047 	(void) vsw_send_msg(ldcp, (void *)&ibnd_msg,
4048 	    sizeof (vnet_ibnd_desc_t), B_TRUE);
4049 
4050 vsw_descrsend_free_exit:
4051 
4052 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4053 	return (status);
4054 }
4055 
4056 static void
4057 vsw_send_ver(void *arg)
4058 {
4059 	vsw_ldc_t	*ldcp = (vsw_ldc_t *)arg;
4060 	vsw_t		*vswp = ldcp->ldc_vswp;
4061 	lane_t		*lp = &ldcp->lane_out;
4062 	vio_ver_msg_t	ver_msg;
4063 
4064 	D1(vswp, "%s enter", __func__);
4065 
4066 	ver_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4067 	ver_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4068 	ver_msg.tag.vio_subtype_env = VIO_VER_INFO;
4069 	ver_msg.tag.vio_sid = ldcp->local_session;
4070 
4071 	ver_msg.ver_major = vsw_versions[0].ver_major;
4072 	ver_msg.ver_minor = vsw_versions[0].ver_minor;
4073 	ver_msg.dev_class = VDEV_NETWORK_SWITCH;
4074 
4075 	lp->lstate |= VSW_VER_INFO_SENT;
4076 	lp->ver_major = ver_msg.ver_major;
4077 	lp->ver_minor = ver_msg.ver_minor;
4078 
4079 	DUMP_TAG(ver_msg.tag);
4080 
4081 	(void) vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t), B_TRUE);
4082 
4083 	D1(vswp, "%s (%d): exit", __func__, ldcp->ldc_id);
4084 }
4085 
4086 static void
4087 vsw_send_attr(vsw_ldc_t *ldcp)
4088 {
4089 	vsw_t			*vswp = ldcp->ldc_vswp;
4090 	lane_t			*lp = &ldcp->lane_out;
4091 	vnet_attr_msg_t		attr_msg;
4092 
4093 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
4094 
4095 	/*
4096 	 * Subtype is set to INFO by default
4097 	 */
4098 	attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4099 	attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4100 	attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO;
4101 	attr_msg.tag.vio_sid = ldcp->local_session;
4102 
4103 	/* payload copied from default settings for lane */
4104 	attr_msg.mtu = lp->mtu;
4105 	attr_msg.addr_type = lp->addr_type;
4106 	attr_msg.xfer_mode = lp->xfer_mode;
4107 	attr_msg.ack_freq = lp->xfer_mode;
4108 
4109 	READ_ENTER(&vswp->if_lockrw);
4110 	attr_msg.addr = vnet_macaddr_strtoul((vswp->if_addr).ether_addr_octet);
4111 	RW_EXIT(&vswp->if_lockrw);
4112 
4113 	ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT;
4114 
4115 	DUMP_TAG(attr_msg.tag);
4116 
4117 	(void) vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t), B_TRUE);
4118 
4119 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
4120 }
4121 
4122 /*
4123  * Create dring info msg (which also results in the creation of
4124  * a dring).
4125  */
4126 static vio_dring_reg_msg_t *
4127 vsw_create_dring_info_pkt(vsw_ldc_t *ldcp)
4128 {
4129 	vio_dring_reg_msg_t	*mp;
4130 	dring_info_t		*dp;
4131 	vsw_t			*vswp = ldcp->ldc_vswp;
4132 
4133 	D1(vswp, "vsw_create_dring_info_pkt enter\n");
4134 
4135 	/*
4136 	 * If we can't create a dring, obviously no point sending
4137 	 * a message.
4138 	 */
4139 	if ((dp = vsw_create_dring(ldcp)) == NULL)
4140 		return (NULL);
4141 
4142 	mp = kmem_zalloc(sizeof (vio_dring_reg_msg_t), KM_SLEEP);
4143 
4144 	mp->tag.vio_msgtype = VIO_TYPE_CTRL;
4145 	mp->tag.vio_subtype = VIO_SUBTYPE_INFO;
4146 	mp->tag.vio_subtype_env = VIO_DRING_REG;
4147 	mp->tag.vio_sid = ldcp->local_session;
4148 
4149 	/* payload */
4150 	mp->num_descriptors = dp->num_descriptors;
4151 	mp->descriptor_size = dp->descriptor_size;
4152 	mp->options = dp->options;
4153 	mp->ncookies = dp->ncookies;
4154 	bcopy(&dp->cookie[0], &mp->cookie[0], sizeof (ldc_mem_cookie_t));
4155 
4156 	mp->dring_ident = 0;
4157 
4158 	D1(vswp, "vsw_create_dring_info_pkt exit\n");
4159 
4160 	return (mp);
4161 }
4162 
4163 static void
4164 vsw_send_dring_info(vsw_ldc_t *ldcp)
4165 {
4166 	vio_dring_reg_msg_t	*dring_msg;
4167 	vsw_t			*vswp = ldcp->ldc_vswp;
4168 
4169 	D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id);
4170 
4171 	dring_msg = vsw_create_dring_info_pkt(ldcp);
4172 	if (dring_msg == NULL) {
4173 		cmn_err(CE_WARN, "!vsw%d: %s: error creating msg",
4174 		    vswp->instance, __func__);
4175 		return;
4176 	}
4177 
4178 	ldcp->lane_out.lstate |= VSW_DRING_INFO_SENT;
4179 
4180 	DUMP_TAG_PTR((vio_msg_tag_t *)dring_msg);
4181 
4182 	(void) vsw_send_msg(ldcp, dring_msg,
4183 	    sizeof (vio_dring_reg_msg_t), B_TRUE);
4184 
4185 	kmem_free(dring_msg, sizeof (vio_dring_reg_msg_t));
4186 
4187 	D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id);
4188 }
4189 
4190 static void
4191 vsw_send_rdx(vsw_ldc_t *ldcp)
4192 {
4193 	vsw_t		*vswp = ldcp->ldc_vswp;
4194 	vio_rdx_msg_t	rdx_msg;
4195 
4196 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
4197 
4198 	rdx_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4199 	rdx_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4200 	rdx_msg.tag.vio_subtype_env = VIO_RDX;
4201 	rdx_msg.tag.vio_sid = ldcp->local_session;
4202 
4203 	ldcp->lane_in.lstate |= VSW_RDX_INFO_SENT;
4204 
4205 	DUMP_TAG(rdx_msg.tag);
4206 
4207 	(void) vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t), B_TRUE);
4208 
4209 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
4210 }
4211 
4212 /*
4213  * Generic routine to send message out over ldc channel.
4214  *
4215  * It is possible that when we attempt to write over the ldc channel
4216  * that we get notified that it has been reset. Depending on the value
4217  * of the handle_reset flag we either handle that event here or simply
4218  * notify the caller that the channel was reset.
4219  */
4220 static int
4221 vsw_send_msg(vsw_ldc_t *ldcp, void *msgp, int size, boolean_t handle_reset)
4222 {
4223 	int		rv;
4224 	size_t		msglen = size;
4225 	vio_msg_tag_t	*tag = (vio_msg_tag_t *)msgp;
4226 	vsw_t		*vswp = ldcp->ldc_vswp;
4227 
4228 	D1(vswp, "vsw_send_msg (%lld) enter : sending %d bytes",
4229 	    ldcp->ldc_id, size);
4230 
4231 	D2(vswp, "send_msg: type 0x%llx", tag->vio_msgtype);
4232 	D2(vswp, "send_msg: stype 0x%llx", tag->vio_subtype);
4233 	D2(vswp, "send_msg: senv 0x%llx", tag->vio_subtype_env);
4234 
4235 	mutex_enter(&ldcp->ldc_txlock);
4236 	do {
4237 		msglen = size;
4238 		rv = ldc_write(ldcp->ldc_handle, (caddr_t)msgp, &msglen);
4239 	} while (rv == EWOULDBLOCK && --vsw_wretries > 0);
4240 
4241 	if ((rv != 0) || (msglen != size)) {
4242 		DERR(vswp, "vsw_send_msg:ldc_write failed: chan(%lld) rv(%d) "
4243 		    "size (%d) msglen(%d)\n", ldcp->ldc_id, rv, size, msglen);
4244 		ldcp->ldc_stats.oerrors++;
4245 	}
4246 	mutex_exit(&ldcp->ldc_txlock);
4247 
4248 	/*
4249 	 * If channel has been reset we either handle it here or
4250 	 * simply report back that it has been reset and let caller
4251 	 * decide what to do.
4252 	 */
4253 	if (rv == ECONNRESET) {
4254 		DWARN(vswp, "%s (%lld) channel reset", __func__, ldcp->ldc_id);
4255 
4256 		/*
4257 		 * N.B - must never be holding the dlistrw lock when
4258 		 * we do a reset of the channel.
4259 		 */
4260 		if (handle_reset) {
4261 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
4262 		}
4263 	}
4264 
4265 	return (rv);
4266 }
4267 
4268 /*
4269  * Remove the specified address from the list of address maintained
4270  * in this port node.
4271  */
4272 mcst_addr_t *
4273 vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr)
4274 {
4275 	vsw_t		*vswp = NULL;
4276 	vsw_port_t	*port = NULL;
4277 	mcst_addr_t	*prev_p = NULL;
4278 	mcst_addr_t	*curr_p = NULL;
4279 
4280 	D1(NULL, "%s: enter : devtype %d : addr 0x%llx",
4281 	    __func__, devtype, addr);
4282 
4283 	if (devtype == VSW_VNETPORT) {
4284 		port = (vsw_port_t *)arg;
4285 		mutex_enter(&port->mca_lock);
4286 		prev_p = curr_p = port->mcap;
4287 	} else {
4288 		vswp = (vsw_t *)arg;
4289 		mutex_enter(&vswp->mca_lock);
4290 		prev_p = curr_p = vswp->mcap;
4291 	}
4292 
4293 	while (curr_p != NULL) {
4294 		if (curr_p->addr == addr) {
4295 			D2(NULL, "%s: address found", __func__);
4296 			/* match found */
4297 			if (prev_p == curr_p) {
4298 				/* list head */
4299 				if (devtype == VSW_VNETPORT)
4300 					port->mcap = curr_p->nextp;
4301 				else
4302 					vswp->mcap = curr_p->nextp;
4303 			} else {
4304 				prev_p->nextp = curr_p->nextp;
4305 			}
4306 			break;
4307 		} else {
4308 			prev_p = curr_p;
4309 			curr_p = curr_p->nextp;
4310 		}
4311 	}
4312 
4313 	if (devtype == VSW_VNETPORT)
4314 		mutex_exit(&port->mca_lock);
4315 	else
4316 		mutex_exit(&vswp->mca_lock);
4317 
4318 	D1(NULL, "%s: exit", __func__);
4319 
4320 	return (curr_p);
4321 }
4322 
4323 /*
4324  * Creates a descriptor ring (dring) and links it into the
4325  * link of outbound drings for this channel.
4326  *
4327  * Returns NULL if creation failed.
4328  */
4329 static dring_info_t *
4330 vsw_create_dring(vsw_ldc_t *ldcp)
4331 {
4332 	vsw_private_desc_t	*priv_addr = NULL;
4333 	vsw_t			*vswp = ldcp->ldc_vswp;
4334 	ldc_mem_info_t		minfo;
4335 	dring_info_t		*dp, *tp;
4336 	int			i;
4337 
4338 	dp = (dring_info_t *)kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
4339 
4340 	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
4341 
4342 	/* create public section of ring */
4343 	if ((ldc_mem_dring_create(VSW_RING_NUM_EL,
4344 	    VSW_PUB_SIZE, &dp->handle)) != 0) {
4345 
4346 		DERR(vswp, "vsw_create_dring(%lld): ldc dring create "
4347 		    "failed", ldcp->ldc_id);
4348 		goto create_fail_exit;
4349 	}
4350 
4351 	ASSERT(dp->handle != NULL);
4352 
4353 	/*
4354 	 * Get the base address of the public section of the ring.
4355 	 */
4356 	if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
4357 		DERR(vswp, "vsw_create_dring(%lld): dring info failed\n",
4358 		    ldcp->ldc_id);
4359 		goto dring_fail_exit;
4360 	} else {
4361 		ASSERT(minfo.vaddr != 0);
4362 		dp->pub_addr = minfo.vaddr;
4363 	}
4364 
4365 	dp->num_descriptors = VSW_RING_NUM_EL;
4366 	dp->descriptor_size = VSW_PUB_SIZE;
4367 	dp->options = VIO_TX_DRING;
4368 	dp->ncookies = 1;	/* guaranteed by ldc */
4369 
4370 	/*
4371 	 * create private portion of ring
4372 	 */
4373 	dp->priv_addr = (vsw_private_desc_t *)kmem_zalloc(
4374 	    (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL), KM_SLEEP);
4375 
4376 	if (vsw_setup_ring(ldcp, dp)) {
4377 		DERR(vswp, "%s: unable to setup ring", __func__);
4378 		goto dring_fail_exit;
4379 	}
4380 
4381 	/* haven't used any descriptors yet */
4382 	dp->end_idx = 0;
4383 	dp->last_ack_recv = -1;
4384 
4385 	/* bind dring to the channel */
4386 	if ((ldc_mem_dring_bind(ldcp->ldc_handle, dp->handle,
4387 	    LDC_SHADOW_MAP, LDC_MEM_RW,
4388 	    &dp->cookie[0], &dp->ncookies)) != 0) {
4389 		DERR(vswp, "vsw_create_dring: unable to bind to channel "
4390 		    "%lld", ldcp->ldc_id);
4391 		goto dring_fail_exit;
4392 	}
4393 
4394 	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
4395 	dp->restart_reqd = B_TRUE;
4396 
4397 	/*
4398 	 * Only ever create rings for outgoing lane. Link it onto
4399 	 * end of list.
4400 	 */
4401 	WRITE_ENTER(&ldcp->lane_out.dlistrw);
4402 	if (ldcp->lane_out.dringp == NULL) {
4403 		D2(vswp, "vsw_create_dring: adding first outbound ring");
4404 		ldcp->lane_out.dringp = dp;
4405 	} else {
4406 		tp = ldcp->lane_out.dringp;
4407 		while (tp->next != NULL)
4408 			tp = tp->next;
4409 
4410 		tp->next = dp;
4411 	}
4412 	RW_EXIT(&ldcp->lane_out.dlistrw);
4413 
4414 	return (dp);
4415 
4416 dring_fail_exit:
4417 	(void) ldc_mem_dring_destroy(dp->handle);
4418 
4419 create_fail_exit:
4420 	if (dp->priv_addr != NULL) {
4421 		priv_addr = dp->priv_addr;
4422 		for (i = 0; i < VSW_RING_NUM_EL; i++) {
4423 			if (priv_addr->memhandle != NULL)
4424 				(void) ldc_mem_free_handle(
4425 				    priv_addr->memhandle);
4426 			priv_addr++;
4427 		}
4428 		kmem_free(dp->priv_addr,
4429 		    (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL));
4430 	}
4431 	mutex_destroy(&dp->dlock);
4432 
4433 	kmem_free(dp, sizeof (dring_info_t));
4434 	return (NULL);
4435 }
4436 
4437 /*
4438  * Create a ring consisting of just a private portion and link
4439  * it into the list of rings for the outbound lane.
4440  *
4441  * These type of rings are used primarily for temporary data
4442  * storage (i.e. as data buffers).
4443  */
4444 void
4445 vsw_create_privring(vsw_ldc_t *ldcp)
4446 {
4447 	dring_info_t		*dp, *tp;
4448 	vsw_t			*vswp = ldcp->ldc_vswp;
4449 
4450 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4451 
4452 	dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
4453 
4454 	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
4455 
4456 	/* no public section */
4457 	dp->pub_addr = NULL;
4458 
4459 	dp->priv_addr = kmem_zalloc(
4460 	    (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL), KM_SLEEP);
4461 
4462 	dp->num_descriptors = VSW_RING_NUM_EL;
4463 
4464 	if (vsw_setup_ring(ldcp, dp)) {
4465 		DERR(vswp, "%s: setup of ring failed", __func__);
4466 		kmem_free(dp->priv_addr,
4467 		    (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL));
4468 		mutex_destroy(&dp->dlock);
4469 		kmem_free(dp, sizeof (dring_info_t));
4470 		return;
4471 	}
4472 
4473 	/* haven't used any descriptors yet */
4474 	dp->end_idx = 0;
4475 
4476 	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
4477 	dp->restart_reqd = B_TRUE;
4478 
4479 	/*
4480 	 * Only ever create rings for outgoing lane. Link it onto
4481 	 * end of list.
4482 	 */
4483 	WRITE_ENTER(&ldcp->lane_out.dlistrw);
4484 	if (ldcp->lane_out.dringp == NULL) {
4485 		D2(vswp, "%s: adding first outbound privring", __func__);
4486 		ldcp->lane_out.dringp = dp;
4487 	} else {
4488 		tp = ldcp->lane_out.dringp;
4489 		while (tp->next != NULL)
4490 			tp = tp->next;
4491 
4492 		tp->next = dp;
4493 	}
4494 	RW_EXIT(&ldcp->lane_out.dlistrw);
4495 
4496 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4497 }
4498 
4499 /*
4500  * Setup the descriptors in the dring. Returns 0 on success, 1 on
4501  * failure.
4502  */
4503 int
4504 vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp)
4505 {
4506 	vnet_public_desc_t	*pub_addr = NULL;
4507 	vsw_private_desc_t	*priv_addr = NULL;
4508 	vsw_t			*vswp = ldcp->ldc_vswp;
4509 	uint64_t		*tmpp;
4510 	uint64_t		offset = 0;
4511 	uint32_t		ncookies = 0;
4512 	static char		*name = "vsw_setup_ring";
4513 	int			i, j, nc, rv;
4514 
4515 	priv_addr = dp->priv_addr;
4516 	pub_addr = dp->pub_addr;
4517 
4518 	/* public section may be null but private should never be */
4519 	ASSERT(priv_addr != NULL);
4520 
4521 	/*
4522 	 * Allocate the region of memory which will be used to hold
4523 	 * the data the descriptors will refer to.
4524 	 */
4525 	dp->data_sz = (VSW_RING_NUM_EL * VSW_RING_EL_DATA_SZ);
4526 	dp->data_addr = kmem_alloc(dp->data_sz, KM_SLEEP);
4527 
4528 	D2(vswp, "%s: allocated %lld bytes at 0x%llx\n", name,
4529 	    dp->data_sz, dp->data_addr);
4530 
4531 	tmpp = (uint64_t *)dp->data_addr;
4532 	offset = VSW_RING_EL_DATA_SZ / sizeof (tmpp);
4533 
4534 	/*
4535 	 * Initialise some of the private and public (if they exist)
4536 	 * descriptor fields.
4537 	 */
4538 	for (i = 0; i < VSW_RING_NUM_EL; i++) {
4539 		mutex_init(&priv_addr->dstate_lock, NULL, MUTEX_DRIVER, NULL);
4540 
4541 		if ((ldc_mem_alloc_handle(ldcp->ldc_handle,
4542 		    &priv_addr->memhandle)) != 0) {
4543 			DERR(vswp, "%s: alloc mem handle failed", name);
4544 			goto setup_ring_cleanup;
4545 		}
4546 
4547 		priv_addr->datap = (void *)tmpp;
4548 
4549 		rv = ldc_mem_bind_handle(priv_addr->memhandle,
4550 		    (caddr_t)priv_addr->datap, VSW_RING_EL_DATA_SZ,
4551 		    LDC_SHADOW_MAP, LDC_MEM_R|LDC_MEM_W,
4552 		    &(priv_addr->memcookie[0]), &ncookies);
4553 		if (rv != 0) {
4554 			DERR(vswp, "%s(%lld): ldc_mem_bind_handle failed "
4555 			    "(rv %d)", name, ldcp->ldc_id, rv);
4556 			goto setup_ring_cleanup;
4557 		}
4558 		priv_addr->bound = 1;
4559 
4560 		D2(vswp, "%s: %d: memcookie 0 : addr 0x%llx : size 0x%llx",
4561 		    name, i, priv_addr->memcookie[0].addr,
4562 		    priv_addr->memcookie[0].size);
4563 
4564 		if (ncookies >= (uint32_t)(VSW_MAX_COOKIES + 1)) {
4565 			DERR(vswp, "%s(%lld) ldc_mem_bind_handle returned "
4566 			    "invalid num of cookies (%d) for size 0x%llx",
4567 			    name, ldcp->ldc_id, ncookies, VSW_RING_EL_DATA_SZ);
4568 
4569 			goto setup_ring_cleanup;
4570 		} else {
4571 			for (j = 1; j < ncookies; j++) {
4572 				rv = ldc_mem_nextcookie(priv_addr->memhandle,
4573 				    &(priv_addr->memcookie[j]));
4574 				if (rv != 0) {
4575 					DERR(vswp, "%s: ldc_mem_nextcookie "
4576 					    "failed rv (%d)", name, rv);
4577 					goto setup_ring_cleanup;
4578 				}
4579 				D3(vswp, "%s: memcookie %d : addr 0x%llx : "
4580 				    "size 0x%llx", name, j,
4581 				    priv_addr->memcookie[j].addr,
4582 				    priv_addr->memcookie[j].size);
4583 			}
4584 
4585 		}
4586 		priv_addr->ncookies = ncookies;
4587 		priv_addr->dstate = VIO_DESC_FREE;
4588 
4589 		if (pub_addr != NULL) {
4590 
4591 			/* link pub and private sides */
4592 			priv_addr->descp = pub_addr;
4593 
4594 			pub_addr->ncookies = priv_addr->ncookies;
4595 
4596 			for (nc = 0; nc < pub_addr->ncookies; nc++) {
4597 				bcopy(&priv_addr->memcookie[nc],
4598 				    &pub_addr->memcookie[nc],
4599 				    sizeof (ldc_mem_cookie_t));
4600 			}
4601 
4602 			pub_addr->hdr.dstate = VIO_DESC_FREE;
4603 			pub_addr++;
4604 		}
4605 
4606 		/*
4607 		 * move to next element in the dring and the next
4608 		 * position in the data buffer.
4609 		 */
4610 		priv_addr++;
4611 		tmpp += offset;
4612 	}
4613 
4614 	return (0);
4615 
4616 setup_ring_cleanup:
4617 	priv_addr = dp->priv_addr;
4618 
4619 	for (j = 0; j < i; j++) {
4620 		(void) ldc_mem_unbind_handle(priv_addr->memhandle);
4621 		(void) ldc_mem_free_handle(priv_addr->memhandle);
4622 
4623 		mutex_destroy(&priv_addr->dstate_lock);
4624 
4625 		priv_addr++;
4626 	}
4627 	kmem_free(dp->data_addr, dp->data_sz);
4628 
4629 	return (1);
4630 }
4631 
4632 /*
4633  * Searches the private section of a ring for a free descriptor,
4634  * starting at the location of the last free descriptor found
4635  * previously.
4636  *
4637  * Returns 0 if free descriptor is available, and updates state
4638  * of private descriptor to VIO_DESC_READY,  otherwise returns 1.
4639  *
4640  * FUTURE: might need to return contiguous range of descriptors
4641  * as dring info msg assumes all will be contiguous.
4642  */
4643 static int
4644 vsw_dring_find_free_desc(dring_info_t *dringp,
4645 		vsw_private_desc_t **priv_p, int *idx)
4646 {
4647 	vsw_private_desc_t	*addr = NULL;
4648 	int			num = VSW_RING_NUM_EL;
4649 	int			ret = 1;
4650 
4651 	D1(NULL, "%s enter\n", __func__);
4652 
4653 	ASSERT(dringp->priv_addr != NULL);
4654 
4655 	D2(NULL, "%s: searching ring, dringp 0x%llx : start pos %lld",
4656 	    __func__, dringp, dringp->end_idx);
4657 
4658 	addr = (vsw_private_desc_t *)dringp->priv_addr + dringp->end_idx;
4659 
4660 	mutex_enter(&addr->dstate_lock);
4661 	if (addr->dstate == VIO_DESC_FREE) {
4662 		addr->dstate = VIO_DESC_READY;
4663 		*priv_p = addr;
4664 		*idx = dringp->end_idx;
4665 		dringp->end_idx = (dringp->end_idx + 1) % num;
4666 		ret = 0;
4667 
4668 	}
4669 	mutex_exit(&addr->dstate_lock);
4670 
4671 	/* ring full */
4672 	if (ret == 1) {
4673 		D2(NULL, "%s: no desp free: started at %d", __func__,
4674 		    dringp->end_idx);
4675 	}
4676 
4677 	D1(NULL, "%s: exit\n", __func__);
4678 
4679 	return (ret);
4680 }
4681 
4682 /*
4683  * Map from a dring identifier to the ring itself. Returns
4684  * pointer to ring or NULL if no match found.
4685  *
4686  * Should be called with dlistrw rwlock held as reader.
4687  */
4688 static dring_info_t *
4689 vsw_ident2dring(lane_t *lane, uint64_t ident)
4690 {
4691 	dring_info_t	*dp = NULL;
4692 
4693 	if ((dp = lane->dringp) == NULL) {
4694 		return (NULL);
4695 	} else {
4696 		if (dp->ident == ident)
4697 			return (dp);
4698 
4699 		while (dp != NULL) {
4700 			if (dp->ident == ident)
4701 				break;
4702 			dp = dp->next;
4703 		}
4704 	}
4705 
4706 	return (dp);
4707 }
4708 
4709 /*
4710  * Set the default lane attributes. These are copied into
4711  * the attr msg we send to our peer. If they are not acceptable
4712  * then (currently) the handshake ends.
4713  */
4714 static void
4715 vsw_set_lane_attr(vsw_t *vswp, lane_t *lp)
4716 {
4717 	bzero(lp, sizeof (lane_t));
4718 
4719 	READ_ENTER(&vswp->if_lockrw);
4720 	ether_copy(&(vswp->if_addr), &(lp->addr));
4721 	RW_EXIT(&vswp->if_lockrw);
4722 
4723 	lp->mtu = VSW_MTU;
4724 	lp->addr_type = ADDR_TYPE_MAC;
4725 	lp->xfer_mode = VIO_DRING_MODE;
4726 	lp->ack_freq = 0;	/* for shared mode */
4727 
4728 	/*
4729 	 * As the seq_num is incremented before sending,
4730 	 * initialize it with VNET_ISS - 1.
4731 	 */
4732 	atomic_swap_64(&lp->seq_num, (VNET_ISS - 1));
4733 }
4734 
4735 /*
4736  * Verify that the attributes are acceptable.
4737  *
4738  * FUTURE: If some attributes are not acceptable, change them
4739  * our desired values.
4740  */
4741 static int
4742 vsw_check_attr(vnet_attr_msg_t *pkt, vsw_port_t *port)
4743 {
4744 	int			ret = 0;
4745 	struct ether_addr	ea;
4746 
4747 	D1(NULL, "vsw_check_attr enter\n");
4748 
4749 	/*
4750 	 * Note we currently only support in-band descriptors
4751 	 * and descriptor rings, not packet based transfer (VIO_PKT_MODE)
4752 	 */
4753 	if ((pkt->xfer_mode != VIO_DESC_MODE) &&
4754 	    (pkt->xfer_mode != VIO_DRING_MODE)) {
4755 		D2(NULL, "vsw_check_attr: unknown mode %x\n", pkt->xfer_mode);
4756 		ret = 1;
4757 	}
4758 
4759 	/* Only support MAC addresses at moment. */
4760 	if ((pkt->addr_type != ADDR_TYPE_MAC) || (pkt->addr == 0)) {
4761 		D2(NULL, "vsw_check_attr: invalid addr_type %x, "
4762 		    "or address 0x%llx\n", pkt->addr_type, pkt->addr);
4763 		ret = 1;
4764 	}
4765 
4766 	/*
4767 	 * MAC address supplied by device should match that stored
4768 	 * in the vsw-port OBP node. Need to decide what to do if they
4769 	 * don't match, for the moment just warn but don't fail.
4770 	 */
4771 	vnet_macaddr_ultostr(pkt->addr, ea.ether_addr_octet);
4772 	if (ether_cmp(&ea, &port->p_macaddr) != 0) {
4773 		DERR(NULL, "vsw_check_attr: device supplied address "
4774 		    "0x%llx doesn't match node address 0x%llx\n",
4775 		    pkt->addr, port->p_macaddr);
4776 	}
4777 
4778 	/*
4779 	 * Ack freq only makes sense in pkt mode, in shared
4780 	 * mode the ring descriptors say whether or not to
4781 	 * send back an ACK.
4782 	 */
4783 	if ((pkt->xfer_mode == VIO_DRING_MODE) &&
4784 	    (pkt->ack_freq > 0)) {
4785 		D2(NULL, "vsw_check_attr: non zero ack freq "
4786 		    " in SHM mode\n");
4787 		ret = 1;
4788 	}
4789 
4790 	/*
4791 	 * Note: for the moment we only support ETHER
4792 	 * frames. This may change in the future.
4793 	 */
4794 	if ((pkt->mtu > VSW_MTU) || (pkt->mtu <= 0)) {
4795 		D2(NULL, "vsw_check_attr: invalid MTU (0x%llx)\n",
4796 		    pkt->mtu);
4797 		ret = 1;
4798 	}
4799 
4800 	D1(NULL, "vsw_check_attr exit\n");
4801 
4802 	return (ret);
4803 }
4804 
4805 /*
4806  * Returns 1 if there is a problem, 0 otherwise.
4807  */
4808 static int
4809 vsw_check_dring_info(vio_dring_reg_msg_t *pkt)
4810 {
4811 	_NOTE(ARGUNUSED(pkt))
4812 
4813 	int	ret = 0;
4814 
4815 	D1(NULL, "vsw_check_dring_info enter\n");
4816 
4817 	if ((pkt->num_descriptors == 0) ||
4818 	    (pkt->descriptor_size == 0) ||
4819 	    (pkt->ncookies != 1)) {
4820 		DERR(NULL, "vsw_check_dring_info: invalid dring msg");
4821 		ret = 1;
4822 	}
4823 
4824 	D1(NULL, "vsw_check_dring_info exit\n");
4825 
4826 	return (ret);
4827 }
4828 
4829 /*
4830  * Returns 1 if two memory cookies match. Otherwise returns 0.
4831  */
4832 static int
4833 vsw_mem_cookie_match(ldc_mem_cookie_t *m1, ldc_mem_cookie_t *m2)
4834 {
4835 	if ((m1->addr != m2->addr) ||
4836 	    (m2->size != m2->size)) {
4837 		return (0);
4838 	} else {
4839 		return (1);
4840 	}
4841 }
4842 
4843 /*
4844  * Returns 1 if ring described in reg message matches that
4845  * described by dring_info structure. Otherwise returns 0.
4846  */
4847 static int
4848 vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg)
4849 {
4850 	if ((msg->descriptor_size != dp->descriptor_size) ||
4851 	    (msg->num_descriptors != dp->num_descriptors) ||
4852 	    (msg->ncookies != dp->ncookies) ||
4853 	    !(vsw_mem_cookie_match(&msg->cookie[0], &dp->cookie[0]))) {
4854 		return (0);
4855 	} else {
4856 		return (1);
4857 	}
4858 
4859 }
4860 
4861 static caddr_t
4862 vsw_print_ethaddr(uint8_t *a, char *ebuf)
4863 {
4864 	(void) sprintf(ebuf, "%x:%x:%x:%x:%x:%x",
4865 	    a[0], a[1], a[2], a[3], a[4], a[5]);
4866 	return (ebuf);
4867 }
4868 
4869 /*
4870  * Reset and free all the resources associated with
4871  * the channel.
4872  */
4873 static void
4874 vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir)
4875 {
4876 	dring_info_t		*dp, *dpp;
4877 	lane_t			*lp = NULL;
4878 	int			rv = 0;
4879 
4880 	ASSERT(ldcp != NULL);
4881 
4882 	D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id);
4883 
4884 	if (dir == INBOUND) {
4885 		D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane"
4886 		    " of channel %lld", __func__, ldcp->ldc_id);
4887 		lp = &ldcp->lane_in;
4888 	} else {
4889 		D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane"
4890 		    " of channel %lld", __func__, ldcp->ldc_id);
4891 		lp = &ldcp->lane_out;
4892 	}
4893 
4894 	lp->lstate = VSW_LANE_INACTIV;
4895 
4896 	/*
4897 	 * As the seq_num is incremented before sending,
4898 	 * initialize it with VNET_ISS - 1.
4899 	 */
4900 	atomic_swap_64(&lp->seq_num, (VNET_ISS - 1));
4901 
4902 	if (lp->dringp) {
4903 		if (dir == INBOUND) {
4904 			WRITE_ENTER(&lp->dlistrw);
4905 			dp = lp->dringp;
4906 			while (dp != NULL) {
4907 				dpp = dp->next;
4908 				if (dp->handle != NULL)
4909 					(void) ldc_mem_dring_unmap(dp->handle);
4910 				kmem_free(dp, sizeof (dring_info_t));
4911 				dp = dpp;
4912 			}
4913 			RW_EXIT(&lp->dlistrw);
4914 		} else {
4915 			/*
4916 			 * unbind, destroy exported dring, free dring struct
4917 			 */
4918 			WRITE_ENTER(&lp->dlistrw);
4919 			dp = lp->dringp;
4920 			rv = vsw_free_ring(dp);
4921 			RW_EXIT(&lp->dlistrw);
4922 		}
4923 		if (rv == 0) {
4924 			lp->dringp = NULL;
4925 		}
4926 	}
4927 
4928 	D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id);
4929 }
4930 
4931 /*
4932  * Free ring and all associated resources.
4933  *
4934  * Should be called with dlistrw rwlock held as writer.
4935  */
4936 static int
4937 vsw_free_ring(dring_info_t *dp)
4938 {
4939 	vsw_private_desc_t	*paddr = NULL;
4940 	dring_info_t		*dpp;
4941 	int			i, rv = 1;
4942 
4943 	while (dp != NULL) {
4944 		mutex_enter(&dp->dlock);
4945 		dpp = dp->next;
4946 		if (dp->priv_addr != NULL) {
4947 			/*
4948 			 * First unbind and free the memory handles
4949 			 * stored in each descriptor within the ring.
4950 			 */
4951 			for (i = 0; i < VSW_RING_NUM_EL; i++) {
4952 				paddr = (vsw_private_desc_t *)
4953 				    dp->priv_addr + i;
4954 				if (paddr->memhandle != NULL) {
4955 					if (paddr->bound == 1) {
4956 						rv = ldc_mem_unbind_handle(
4957 						    paddr->memhandle);
4958 
4959 						if (rv != 0) {
4960 							DERR(NULL, "error "
4961 							"unbinding handle for "
4962 							"ring 0x%llx at pos %d",
4963 							    dp, i);
4964 							mutex_exit(&dp->dlock);
4965 							return (rv);
4966 						}
4967 						paddr->bound = 0;
4968 					}
4969 
4970 					rv = ldc_mem_free_handle(
4971 					    paddr->memhandle);
4972 					if (rv != 0) {
4973 						DERR(NULL, "error freeing "
4974 						    "handle for ring 0x%llx "
4975 						    "at pos %d", dp, i);
4976 						mutex_exit(&dp->dlock);
4977 						return (rv);
4978 					}
4979 					paddr->memhandle = NULL;
4980 				}
4981 				mutex_destroy(&paddr->dstate_lock);
4982 			}
4983 			kmem_free(dp->priv_addr,
4984 			    (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL));
4985 		}
4986 
4987 		/*
4988 		 * Now unbind and destroy the ring itself.
4989 		 */
4990 		if (dp->handle != NULL) {
4991 			(void) ldc_mem_dring_unbind(dp->handle);
4992 			(void) ldc_mem_dring_destroy(dp->handle);
4993 		}
4994 
4995 		if (dp->data_addr != NULL) {
4996 			kmem_free(dp->data_addr, dp->data_sz);
4997 		}
4998 
4999 		mutex_exit(&dp->dlock);
5000 		mutex_destroy(&dp->dlock);
5001 		mutex_destroy(&dp->restart_lock);
5002 		kmem_free(dp, sizeof (dring_info_t));
5003 
5004 		dp = dpp;
5005 	}
5006 	return (0);
5007 }
5008 
5009 /*
5010  * vsw_ldc_rx_worker -- A per LDC worker thread to receive data.
5011  * This thread is woken up by the LDC interrupt handler to process
5012  * LDC packets and receive data.
5013  */
5014 static void
5015 vsw_ldc_rx_worker(void *arg)
5016 {
5017 	callb_cpr_t	cprinfo;
5018 	vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
5019 	vsw_t *vswp = ldcp->ldc_vswp;
5020 
5021 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
5022 	CALLB_CPR_INIT(&cprinfo, &ldcp->rx_thr_lock, callb_generic_cpr,
5023 	    "vsw_rx_thread");
5024 	mutex_enter(&ldcp->rx_thr_lock);
5025 	ldcp->rx_thr_flags |= VSW_WTHR_RUNNING;
5026 	while (!(ldcp->rx_thr_flags & VSW_WTHR_STOP)) {
5027 
5028 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
5029 		/*
5030 		 * Wait until the data is received or a stop
5031 		 * request is received.
5032 		 */
5033 		while (!(ldcp->rx_thr_flags &
5034 		    (VSW_WTHR_DATARCVD | VSW_WTHR_STOP))) {
5035 			cv_wait(&ldcp->rx_thr_cv, &ldcp->rx_thr_lock);
5036 		}
5037 		CALLB_CPR_SAFE_END(&cprinfo, &ldcp->rx_thr_lock)
5038 
5039 		/*
5040 		 * First process the stop request.
5041 		 */
5042 		if (ldcp->rx_thr_flags & VSW_WTHR_STOP) {
5043 			D2(vswp, "%s(%lld):Rx thread stopped\n",
5044 			    __func__, ldcp->ldc_id);
5045 			break;
5046 		}
5047 		ldcp->rx_thr_flags &= ~VSW_WTHR_DATARCVD;
5048 		mutex_exit(&ldcp->rx_thr_lock);
5049 		D1(vswp, "%s(%lld):calling vsw_process_pkt\n",
5050 		    __func__, ldcp->ldc_id);
5051 		mutex_enter(&ldcp->ldc_cblock);
5052 		vsw_process_pkt(ldcp);
5053 		mutex_exit(&ldcp->ldc_cblock);
5054 		mutex_enter(&ldcp->rx_thr_lock);
5055 	}
5056 
5057 	/*
5058 	 * Update the run status and wakeup the thread that
5059 	 * has sent the stop request.
5060 	 */
5061 	ldcp->rx_thr_flags &= ~VSW_WTHR_RUNNING;
5062 	cv_signal(&ldcp->rx_thr_cv);
5063 	CALLB_CPR_EXIT(&cprinfo);
5064 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
5065 	thread_exit();
5066 }
5067 
5068 /* vsw_stop_rx_thread -- Co-ordinate with receive thread to stop it */
5069 static void
5070 vsw_stop_rx_thread(vsw_ldc_t *ldcp)
5071 {
5072 	vsw_t *vswp = ldcp->ldc_vswp;
5073 
5074 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
5075 	/*
5076 	 * Send a stop request by setting the stop flag and
5077 	 * wait until the receive thread stops.
5078 	 */
5079 	mutex_enter(&ldcp->rx_thr_lock);
5080 	if (ldcp->rx_thr_flags & VSW_WTHR_RUNNING) {
5081 		ldcp->rx_thr_flags |= VSW_WTHR_STOP;
5082 		cv_signal(&ldcp->rx_thr_cv);
5083 		while (ldcp->rx_thr_flags & VSW_WTHR_RUNNING) {
5084 			cv_wait(&ldcp->rx_thr_cv, &ldcp->rx_thr_lock);
5085 		}
5086 	}
5087 	mutex_exit(&ldcp->rx_thr_lock);
5088 	ldcp->rx_thread = NULL;
5089 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
5090 }
5091 
5092 /*
5093  * vsw_ldc_tx_worker -- A per LDC worker thread to transmit data.
5094  * This thread is woken up by the vsw_portsend to transmit
5095  * packets.
5096  */
5097 static void
5098 vsw_ldc_tx_worker(void *arg)
5099 {
5100 	callb_cpr_t	cprinfo;
5101 	vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
5102 	vsw_t *vswp = ldcp->ldc_vswp;
5103 	mblk_t *mp;
5104 	mblk_t *tmp;
5105 
5106 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
5107 	CALLB_CPR_INIT(&cprinfo, &ldcp->tx_thr_lock, callb_generic_cpr,
5108 	    "vnet_tx_thread");
5109 	mutex_enter(&ldcp->tx_thr_lock);
5110 	ldcp->tx_thr_flags |= VSW_WTHR_RUNNING;
5111 	while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP)) {
5112 
5113 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
5114 		/*
5115 		 * Wait until the data is received or a stop
5116 		 * request is received.
5117 		 */
5118 		while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP) &&
5119 		    (ldcp->tx_mhead == NULL)) {
5120 			cv_wait(&ldcp->tx_thr_cv, &ldcp->tx_thr_lock);
5121 		}
5122 		CALLB_CPR_SAFE_END(&cprinfo, &ldcp->tx_thr_lock)
5123 
5124 		/*
5125 		 * First process the stop request.
5126 		 */
5127 		if (ldcp->tx_thr_flags & VSW_WTHR_STOP) {
5128 			D2(vswp, "%s(%lld):tx thread stopped\n",
5129 			    __func__, ldcp->ldc_id);
5130 			break;
5131 		}
5132 		mp = ldcp->tx_mhead;
5133 		ldcp->tx_mhead = ldcp->tx_mtail = NULL;
5134 		mutex_exit(&ldcp->tx_thr_lock);
5135 		D2(vswp, "%s(%lld):calling vsw_ldcsend\n",
5136 		    __func__, ldcp->ldc_id);
5137 		while (mp != NULL) {
5138 			tmp = mp->b_next;
5139 			mp->b_next = mp->b_prev = NULL;
5140 			(void) vsw_ldcsend(ldcp, mp, vsw_ldc_tx_retries);
5141 			mp = tmp;
5142 		}
5143 		mutex_enter(&ldcp->tx_thr_lock);
5144 	}
5145 
5146 	/*
5147 	 * Update the run status and wakeup the thread that
5148 	 * has sent the stop request.
5149 	 */
5150 	ldcp->tx_thr_flags &= ~VSW_WTHR_RUNNING;
5151 	cv_signal(&ldcp->tx_thr_cv);
5152 	CALLB_CPR_EXIT(&cprinfo);
5153 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
5154 	thread_exit();
5155 }
5156 
5157 /* vsw_stop_tx_thread -- Co-ordinate with receive thread to stop it */
5158 static void
5159 vsw_stop_tx_thread(vsw_ldc_t *ldcp)
5160 {
5161 	vsw_t *vswp = ldcp->ldc_vswp;
5162 
5163 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
5164 	/*
5165 	 * Send a stop request by setting the stop flag and
5166 	 * wait until the receive thread stops.
5167 	 */
5168 	mutex_enter(&ldcp->tx_thr_lock);
5169 	if (ldcp->tx_thr_flags & VSW_WTHR_RUNNING) {
5170 		ldcp->tx_thr_flags |= VSW_WTHR_STOP;
5171 		cv_signal(&ldcp->tx_thr_cv);
5172 		while (ldcp->tx_thr_flags & VSW_WTHR_RUNNING) {
5173 			cv_wait(&ldcp->tx_thr_cv, &ldcp->tx_thr_lock);
5174 		}
5175 	}
5176 	mutex_exit(&ldcp->tx_thr_lock);
5177 	ldcp->tx_thread = NULL;
5178 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
5179 }
5180 
5181 /* vsw_reclaim_dring -- reclaim descriptors */
5182 static int
5183 vsw_reclaim_dring(dring_info_t *dp, int start)
5184 {
5185 	int i, j, len;
5186 	vsw_private_desc_t *priv_addr;
5187 	vnet_public_desc_t *pub_addr;
5188 
5189 	pub_addr = (vnet_public_desc_t *)dp->pub_addr;
5190 	priv_addr = (vsw_private_desc_t *)dp->priv_addr;
5191 	len = dp->num_descriptors;
5192 
5193 	D2(NULL, "%s: start index %ld\n", __func__, start);
5194 
5195 	j = 0;
5196 	for (i = start; j < len; i = (i + 1) % len, j++) {
5197 		pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
5198 		priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
5199 
5200 		mutex_enter(&priv_addr->dstate_lock);
5201 		if (pub_addr->hdr.dstate != VIO_DESC_DONE) {
5202 			mutex_exit(&priv_addr->dstate_lock);
5203 			break;
5204 		}
5205 		pub_addr->hdr.dstate = VIO_DESC_FREE;
5206 		priv_addr->dstate = VIO_DESC_FREE;
5207 		/* clear all the fields */
5208 		priv_addr->datalen = 0;
5209 		pub_addr->hdr.ack = 0;
5210 		mutex_exit(&priv_addr->dstate_lock);
5211 
5212 		D3(NULL, "claiming descp:%d pub state:0x%llx priv state 0x%llx",
5213 		    i, pub_addr->hdr.dstate, priv_addr->dstate);
5214 	}
5215 	return (j);
5216 }
5217 
5218 /*
5219  * Debugging routines
5220  */
5221 static void
5222 display_state(void)
5223 {
5224 	vsw_t		*vswp;
5225 	vsw_port_list_t	*plist;
5226 	vsw_port_t 	*port;
5227 	vsw_ldc_list_t	*ldcl;
5228 	vsw_ldc_t 	*ldcp;
5229 	extern vsw_t 	*vsw_head;
5230 
5231 	cmn_err(CE_NOTE, "***** system state *****");
5232 
5233 	for (vswp = vsw_head; vswp; vswp = vswp->next) {
5234 		plist = &vswp->plist;
5235 		READ_ENTER(&plist->lockrw);
5236 		cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n",
5237 		    vswp->instance, plist->num_ports);
5238 
5239 		for (port = plist->head; port != NULL; port = port->p_next) {
5240 			ldcl = &port->p_ldclist;
5241 			cmn_err(CE_CONT, "port %d : %d ldcs attached\n",
5242 			    port->p_instance, ldcl->num_ldcs);
5243 			READ_ENTER(&ldcl->lockrw);
5244 			ldcp = ldcl->head;
5245 			for (; ldcp != NULL; ldcp = ldcp->ldc_next) {
5246 				cmn_err(CE_CONT, "chan %lu : dev %d : "
5247 				    "status %d : phase %u\n",
5248 				    ldcp->ldc_id, ldcp->dev_class,
5249 				    ldcp->ldc_status, ldcp->hphase);
5250 				cmn_err(CE_CONT, "chan %lu : lsession %lu : "
5251 				    "psession %lu\n", ldcp->ldc_id,
5252 				    ldcp->local_session, ldcp->peer_session);
5253 
5254 				cmn_err(CE_CONT, "Inbound lane:\n");
5255 				display_lane(&ldcp->lane_in);
5256 				cmn_err(CE_CONT, "Outbound lane:\n");
5257 				display_lane(&ldcp->lane_out);
5258 			}
5259 			RW_EXIT(&ldcl->lockrw);
5260 		}
5261 		RW_EXIT(&plist->lockrw);
5262 	}
5263 	cmn_err(CE_NOTE, "***** system state *****");
5264 }
5265 
5266 static void
5267 display_lane(lane_t *lp)
5268 {
5269 	dring_info_t	*drp;
5270 
5271 	cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n",
5272 	    lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu);
5273 	cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n",
5274 	    lp->addr_type, lp->addr, lp->xfer_mode);
5275 	cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp);
5276 
5277 	cmn_err(CE_CONT, "Dring info:\n");
5278 	for (drp = lp->dringp; drp != NULL; drp = drp->next) {
5279 		cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n",
5280 		    drp->num_descriptors, drp->descriptor_size);
5281 		cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->handle);
5282 		cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n",
5283 		    (uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr);
5284 		cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n",
5285 		    drp->ident, drp->end_idx);
5286 		display_ring(drp);
5287 	}
5288 }
5289 
5290 static void
5291 display_ring(dring_info_t *dringp)
5292 {
5293 	uint64_t		i;
5294 	uint64_t		priv_count = 0;
5295 	uint64_t		pub_count = 0;
5296 	vnet_public_desc_t	*pub_addr = NULL;
5297 	vsw_private_desc_t	*priv_addr = NULL;
5298 
5299 	for (i = 0; i < VSW_RING_NUM_EL; i++) {
5300 		if (dringp->pub_addr != NULL) {
5301 			pub_addr = (vnet_public_desc_t *)dringp->pub_addr + i;
5302 
5303 			if (pub_addr->hdr.dstate == VIO_DESC_FREE)
5304 				pub_count++;
5305 		}
5306 
5307 		if (dringp->priv_addr != NULL) {
5308 			priv_addr = (vsw_private_desc_t *)dringp->priv_addr + i;
5309 
5310 			if (priv_addr->dstate == VIO_DESC_FREE)
5311 				priv_count++;
5312 		}
5313 	}
5314 	cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n",
5315 	    i, priv_count, pub_count);
5316 }
5317 
5318 static void
5319 dump_flags(uint64_t state)
5320 {
5321 	int	i;
5322 
5323 	typedef struct flag_name {
5324 		int	flag_val;
5325 		char	*flag_name;
5326 	} flag_name_t;
5327 
5328 	flag_name_t	flags[] = {
5329 		VSW_VER_INFO_SENT, "VSW_VER_INFO_SENT",
5330 		VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV",
5331 		VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV",
5332 		VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT",
5333 		VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV",
5334 		VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT",
5335 		VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT",
5336 		VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV",
5337 		VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT",
5338 		VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV",
5339 		VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT",
5340 		VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV",
5341 		VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT",
5342 		VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV",
5343 		VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT",
5344 		VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV",
5345 		VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT",
5346 		VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV",
5347 		VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT",
5348 		VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV",
5349 		VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT",
5350 		VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV",
5351 		VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT",
5352 		VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV",
5353 		VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT",
5354 		VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV",
5355 		VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT",
5356 		VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV",
5357 		VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT",
5358 		VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV",
5359 		VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE"};
5360 
5361 	DERR(NULL, "DUMP_FLAGS: %llx\n", state);
5362 	for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) {
5363 		if (state & flags[i].flag_val)
5364 			DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name);
5365 	}
5366 }
5367