xref: /titanic_51/usr/src/uts/sun4v/io/vsw_ldc.c (revision d2ec54f7875f7e05edd56195adbeb593c947763f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/errno.h>
31 #include <sys/debug.h>
32 #include <sys/time.h>
33 #include <sys/sysmacros.h>
34 #include <sys/systm.h>
35 #include <sys/user.h>
36 #include <sys/stropts.h>
37 #include <sys/stream.h>
38 #include <sys/strlog.h>
39 #include <sys/strsubr.h>
40 #include <sys/cmn_err.h>
41 #include <sys/cpu.h>
42 #include <sys/kmem.h>
43 #include <sys/conf.h>
44 #include <sys/ddi.h>
45 #include <sys/sunddi.h>
46 #include <sys/ksynch.h>
47 #include <sys/stat.h>
48 #include <sys/kstat.h>
49 #include <sys/vtrace.h>
50 #include <sys/strsun.h>
51 #include <sys/dlpi.h>
52 #include <sys/ethernet.h>
53 #include <net/if.h>
54 #include <sys/varargs.h>
55 #include <sys/machsystm.h>
56 #include <sys/modctl.h>
57 #include <sys/modhash.h>
58 #include <sys/mac.h>
59 #include <sys/mac_ether.h>
60 #include <sys/taskq.h>
61 #include <sys/note.h>
62 #include <sys/mach_descrip.h>
63 #include <sys/mac.h>
64 #include <sys/mdeg.h>
65 #include <sys/ldc.h>
66 #include <sys/vsw_fdb.h>
67 #include <sys/vsw.h>
68 #include <sys/vio_mailbox.h>
69 #include <sys/vnet_mailbox.h>
70 #include <sys/vnet_common.h>
71 #include <sys/vio_util.h>
72 #include <sys/sdt.h>
73 #include <sys/atomic.h>
74 #include <sys/callb.h>
75 #include <sys/vlan.h>
76 
77 /* Port add/deletion/etc routines */
78 static	int vsw_port_delete(vsw_port_t *port);
79 static	int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id);
80 static	int vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id);
81 static	int vsw_init_ldcs(vsw_port_t *port);
82 static	int vsw_uninit_ldcs(vsw_port_t *port);
83 static	int vsw_ldc_init(vsw_ldc_t *ldcp);
84 static	int vsw_ldc_uninit(vsw_ldc_t *ldcp);
85 static	int vsw_drain_ldcs(vsw_port_t *port);
86 static	int vsw_drain_port_taskq(vsw_port_t *port);
87 static	void vsw_marker_task(void *);
88 static	int vsw_plist_del_node(vsw_t *, vsw_port_t *port);
89 int vsw_detach_ports(vsw_t *vswp);
90 int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
91 mcst_addr_t *vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr);
92 int vsw_port_detach(vsw_t *vswp, int p_instance);
93 int vsw_portsend(vsw_port_t *port, mblk_t *mp, mblk_t *mpt, uint32_t count);
94 int vsw_port_attach(vsw_port_t *portp);
95 vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);
96 void vsw_vlan_unaware_port_reset(vsw_port_t *portp);
97 int vsw_send_msg(vsw_ldc_t *, void *, int, boolean_t);
98 void vsw_hio_port_reset(vsw_port_t *portp);
99 
100 /* Interrupt routines */
101 static	uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg);
102 
103 /* Handshake routines */
104 static	void vsw_ldc_reinit(vsw_ldc_t *);
105 static	void vsw_process_conn_evt(vsw_ldc_t *, uint16_t);
106 static	void vsw_conn_task(void *);
107 static	int vsw_check_flag(vsw_ldc_t *, int, uint64_t);
108 static	void vsw_next_milestone(vsw_ldc_t *);
109 static	int vsw_supported_version(vio_ver_msg_t *);
110 static	void vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp);
111 static	void vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp);
112 
113 /* Data processing routines */
114 static void vsw_process_pkt(void *);
115 static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t *);
116 static void vsw_process_ctrl_pkt(void *);
117 static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *);
118 static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *);
119 static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *);
120 static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *);
121 static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *);
122 static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *);
123 static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *,
124 	uint32_t);
125 static void vsw_process_data_dring_pkt(vsw_ldc_t *, void *);
126 static void vsw_process_pkt_data_nop(void *, void *, uint32_t);
127 static void vsw_process_pkt_data(void *, void *, uint32_t);
128 static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *);
129 static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *);
130 
131 /* Switching/data transmit routines */
132 static	int vsw_dringsend(vsw_ldc_t *, mblk_t *);
133 static	int vsw_descrsend(vsw_ldc_t *, mblk_t *);
134 static void vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp);
135 static int vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries);
136 static int vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count);
137 static int vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count);
138 
139 /* Packet creation routines */
140 static void vsw_send_ver(void *);
141 static void vsw_send_attr(vsw_ldc_t *);
142 static vio_dring_reg_msg_t *vsw_create_dring_info_pkt(vsw_ldc_t *);
143 static void vsw_send_dring_info(vsw_ldc_t *);
144 static void vsw_send_rdx(vsw_ldc_t *);
145 
146 /* Dring routines */
147 static dring_info_t *vsw_create_dring(vsw_ldc_t *);
148 static void vsw_create_privring(vsw_ldc_t *);
149 static int vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp);
150 static int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **,
151     int *);
152 static dring_info_t *vsw_ident2dring(lane_t *, uint64_t);
153 static int vsw_reclaim_dring(dring_info_t *dp, int start);
154 
155 static void vsw_set_lane_attr(vsw_t *, lane_t *);
156 static int vsw_check_attr(vnet_attr_msg_t *, vsw_ldc_t *);
157 static int vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg);
158 static int vsw_mem_cookie_match(ldc_mem_cookie_t *, ldc_mem_cookie_t *);
159 static int vsw_check_dring_info(vio_dring_reg_msg_t *);
160 
161 /* Rcv/Tx thread routines */
162 static void vsw_stop_tx_thread(vsw_ldc_t *ldcp);
163 static void vsw_ldc_tx_worker(void *arg);
164 static void vsw_stop_rx_thread(vsw_ldc_t *ldcp);
165 static void vsw_ldc_rx_worker(void *arg);
166 
167 /* Misc support routines */
168 static	caddr_t vsw_print_ethaddr(uint8_t *addr, char *ebuf);
169 static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t);
170 static int vsw_free_ring(dring_info_t *);
171 static void vsw_save_lmacaddr(vsw_t *vswp, uint64_t macaddr);
172 static int vsw_get_same_dest_list(struct ether_header *ehp,
173     mblk_t **rhead, mblk_t **rtail, mblk_t **mpp);
174 static mblk_t *vsw_dupmsgchain(mblk_t *mp);
175 
176 /* Debugging routines */
177 static void dump_flags(uint64_t);
178 static void display_state(void);
179 static void display_lane(lane_t *);
180 static void display_ring(dring_info_t *);
181 
182 /*
183  * Functions imported from other files.
184  */
185 extern int vsw_set_hw(vsw_t *, vsw_port_t *, int);
186 extern int vsw_unset_hw(vsw_t *, vsw_port_t *, int);
187 extern void vsw_reconfig_hw(vsw_t *);
188 extern int vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port);
189 extern void vsw_del_mcst_port(vsw_port_t *port);
190 extern int vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
191 extern int vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
192 extern void vsw_fdbe_add(vsw_t *vswp, void *port);
193 extern void vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr);
194 extern void vsw_create_vlans(void *arg, int type);
195 extern void vsw_destroy_vlans(void *arg, int type);
196 extern void vsw_vlan_add_ids(void *arg, int type);
197 extern void vsw_vlan_remove_ids(void *arg, int type);
198 extern boolean_t vsw_frame_lookup_vid(void *arg, int caller,
199 	struct ether_header *ehp, uint16_t *vidp);
200 extern mblk_t *vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp);
201 extern uint32_t vsw_vlan_frame_untag(void *arg, int type, mblk_t **np,
202 	mblk_t **npt);
203 extern boolean_t vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid);
204 extern void vsw_hio_start(vsw_t *vswp, vsw_ldc_t *ldcp);
205 extern void vsw_hio_stop(vsw_t *vswp, vsw_ldc_t *ldcp);
206 extern void vsw_process_dds_msg(vsw_t *vswp, vsw_ldc_t *ldcp, void *msg);
207 extern void vsw_hio_stop_port(vsw_port_t *portp);
208 
209 #define	VSW_NUM_VMPOOLS		3	/* number of vio mblk pools */
210 
211 /*
212  * Tunables used in this file.
213  */
214 extern int vsw_num_handshakes;
215 extern int vsw_wretries;
216 extern int vsw_desc_delay;
217 extern int vsw_read_attempts;
218 extern int vsw_ldc_tx_delay;
219 extern int vsw_ldc_tx_retries;
220 extern boolean_t vsw_ldc_rxthr_enabled;
221 extern boolean_t vsw_ldc_txthr_enabled;
222 extern uint32_t vsw_ntxds;
223 extern uint32_t vsw_max_tx_qcount;
224 extern uint32_t vsw_chain_len;
225 extern uint32_t vsw_mblk_size1;
226 extern uint32_t vsw_mblk_size2;
227 extern uint32_t vsw_mblk_size3;
228 extern uint32_t vsw_num_mblks1;
229 extern uint32_t vsw_num_mblks2;
230 extern uint32_t vsw_num_mblks3;
231 extern boolean_t vsw_obp_ver_proto_workaround;
232 
/*
 * Acquire / release all three per-channel locks.
 *
 * The locks are taken in the order callback -> receive -> transmit and
 * dropped in the reverse order.
 *
 * Each macro is wrapped in do { } while (0) so that it expands to exactly
 * one statement and can safely be used as the body of an unbraced
 * if/else. The caller supplies the trailing semicolon.
 */
#define	LDC_ENTER_LOCK(ldcp)	\
	do {	\
		mutex_enter(&((ldcp)->ldc_cblock));	\
		mutex_enter(&((ldcp)->ldc_rxlock));	\
		mutex_enter(&((ldcp)->ldc_txlock));	\
		_NOTE(CONSTCOND)	\
	} while (0)
#define	LDC_EXIT_LOCK(ldcp)	\
	do {	\
		mutex_exit(&((ldcp)->ldc_txlock));	\
		mutex_exit(&((ldcp)->ldc_rxlock));	\
		mutex_exit(&((ldcp)->ldc_cblock));	\
		_NOTE(CONSTCOND)	\
	} while (0)
241 
/*
 * Compare the version recorded on a channel's outbound lane
 * (lane_out.ver_major / lane_out.ver_minor) against a given major.minor
 * pair. Each macro evaluates to 1 when the relation holds and 0 otherwise.
 *
 *	VSW_VER_EQ	lane version == major.minor
 *	VSW_VER_LT	lane version <  major.minor
 *	VSW_VER_GTEQ	lane version >= major.minor
 */
#define	VSW_VER_EQ(ldcp, major, minor)	\
	(!((ldcp)->lane_out.ver_major != (major) ||	\
	    (ldcp)->lane_out.ver_minor != (minor)))

#define	VSW_VER_LT(ldcp, major, minor)	\
	(((major) > (ldcp)->lane_out.ver_major) ||	\
	    ((major) == (ldcp)->lane_out.ver_major &&	\
	    (minor) > (ldcp)->lane_out.ver_minor))

#define	VSW_VER_GTEQ(ldcp, major, minor)	\
	(((major) < (ldcp)->lane_out.ver_major) ||	\
	    ((major) == (ldcp)->lane_out.ver_major &&	\
	    (minor) <= (ldcp)->lane_out.ver_minor))
255 
256 /* supported versions */
257 static	ver_sup_t	vsw_versions[] = { {1, 3} };
258 
259 /*
260  * For the moment the state dump routines have their own
261  * private flag.
262  */
263 #define	DUMP_STATE	0
264 
265 #if DUMP_STATE
266 
267 #define	DUMP_TAG(tag) \
268 {			\
269 	D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype); \
270 	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype);	\
271 	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env);	\
272 }
273 
274 #define	DUMP_TAG_PTR(tag) \
275 {			\
276 	D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype); \
277 	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype);	\
278 	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env);	\
279 }
280 
281 #define	DUMP_FLAGS(flags) dump_flags(flags);
282 #define	DISPLAY_STATE()	display_state()
283 
284 #else
285 
286 #define	DUMP_TAG(tag)
287 #define	DUMP_TAG_PTR(tag)
288 #define	DUMP_FLAGS(state)
289 #define	DISPLAY_STATE()
290 
291 #endif	/* DUMP_STATE */
292 
293 /*
294  * Attach the specified port.
295  *
296  * Returns 0 on success, 1 on failure.
297  */
298 int
299 vsw_port_attach(vsw_port_t *port)
300 {
301 	vsw_t			*vswp = port->p_vswp;
302 	vsw_port_list_t		*plist = &vswp->plist;
303 	vsw_port_t		*p, **pp;
304 	int			i;
305 	int			nids = port->num_ldcs;
306 	uint64_t		*ldcids;
307 
308 	D1(vswp, "%s: enter : port %d", __func__, port->p_instance);
309 
310 	/* port already exists? */
311 	READ_ENTER(&plist->lockrw);
312 	for (p = plist->head; p != NULL; p = p->p_next) {
313 		if (p->p_instance == port->p_instance) {
314 			DWARN(vswp, "%s: port instance %d already attached",
315 			    __func__, p->p_instance);
316 			RW_EXIT(&plist->lockrw);
317 			return (1);
318 		}
319 	}
320 	RW_EXIT(&plist->lockrw);
321 
322 	rw_init(&port->p_ldclist.lockrw, NULL, RW_DRIVER, NULL);
323 
324 	mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL);
325 	mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL);
326 
327 	mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL);
328 	cv_init(&port->state_cv, NULL, CV_DRIVER, NULL);
329 	port->state = VSW_PORT_INIT;
330 
331 	D2(vswp, "%s: %d nids", __func__, nids);
332 	ldcids = port->ldc_ids;
333 	for (i = 0; i < nids; i++) {
334 		D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[i]);
335 		if (vsw_ldc_attach(port, (uint64_t)ldcids[i]) != 0) {
336 			DERR(vswp, "%s: ldc_attach failed", __func__);
337 
338 			rw_destroy(&port->p_ldclist.lockrw);
339 
340 			cv_destroy(&port->state_cv);
341 			mutex_destroy(&port->state_lock);
342 
343 			mutex_destroy(&port->tx_lock);
344 			mutex_destroy(&port->mca_lock);
345 			kmem_free(port, sizeof (vsw_port_t));
346 			return (1);
347 		}
348 	}
349 
350 	if (vswp->switching_setup_done == B_TRUE) {
351 		/*
352 		 * If the underlying physical device has been setup,
353 		 * program the mac address of this port in it.
354 		 * Otherwise, port macaddr will be set after the physical
355 		 * device is successfully setup by the timeout handler.
356 		 */
357 		mutex_enter(&vswp->hw_lock);
358 		(void) vsw_set_hw(vswp, port, VSW_VNETPORT);
359 		mutex_exit(&vswp->hw_lock);
360 	}
361 
362 	/* create the fdb entry for this port/mac address */
363 	vsw_fdbe_add(vswp, port);
364 
365 	vsw_create_vlans(port, VSW_VNETPORT);
366 
367 	WRITE_ENTER(&plist->lockrw);
368 
369 	/* link it into the list of ports for this vsw instance */
370 	pp = (vsw_port_t **)(&plist->head);
371 	port->p_next = *pp;
372 	*pp = port;
373 	plist->num_ports++;
374 
375 	RW_EXIT(&plist->lockrw);
376 
377 	/*
378 	 * Initialise the port and any ldc's under it.
379 	 */
380 	(void) vsw_init_ldcs(port);
381 
382 	D1(vswp, "%s: exit", __func__);
383 	return (0);
384 }
385 
386 /*
387  * Detach the specified port.
388  *
389  * Returns 0 on success, 1 on failure.
390  */
391 int
392 vsw_port_detach(vsw_t *vswp, int p_instance)
393 {
394 	vsw_port_t	*port = NULL;
395 	vsw_port_list_t	*plist = &vswp->plist;
396 
397 	D1(vswp, "%s: enter: port id %d", __func__, p_instance);
398 
399 	WRITE_ENTER(&plist->lockrw);
400 
401 	if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) {
402 		RW_EXIT(&plist->lockrw);
403 		return (1);
404 	}
405 
406 	if (vsw_plist_del_node(vswp, port)) {
407 		RW_EXIT(&plist->lockrw);
408 		return (1);
409 	}
410 
411 	/* cleanup any HybridIO for this port */
412 	vsw_hio_stop_port(port);
413 
414 	/*
415 	 * No longer need to hold writer lock on port list now
416 	 * that we have unlinked the target port from the list.
417 	 */
418 	RW_EXIT(&plist->lockrw);
419 
420 	/* Remove the fdb entry for this port/mac address */
421 	vsw_fdbe_del(vswp, &(port->p_macaddr));
422 	vsw_destroy_vlans(port, VSW_VNETPORT);
423 
424 	/* Remove any multicast addresses.. */
425 	vsw_del_mcst_port(port);
426 
427 	/* Remove address if was programmed into HW. */
428 	mutex_enter(&vswp->hw_lock);
429 
430 	/*
431 	 * Port's address may not have been set in hardware. This could
432 	 * happen if the underlying physical device is not yet available and
433 	 * vsw_setup_switching_timeout() may be in progress.
434 	 * We remove its addr from hardware only if it has been set before.
435 	 */
436 	if (port->addr_set != VSW_ADDR_UNSET)
437 		(void) vsw_unset_hw(vswp, port, VSW_VNETPORT);
438 
439 	if (vswp->recfg_reqd)
440 		vsw_reconfig_hw(vswp);
441 
442 	mutex_exit(&vswp->hw_lock);
443 
444 	if (vsw_port_delete(port)) {
445 		return (1);
446 	}
447 
448 	D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance);
449 	return (0);
450 }
451 
452 /*
453  * Detach all active ports.
454  *
455  * Returns 0 on success, 1 on failure.
456  */
457 int
458 vsw_detach_ports(vsw_t *vswp)
459 {
460 	vsw_port_list_t 	*plist = &vswp->plist;
461 	vsw_port_t		*port = NULL;
462 
463 	D1(vswp, "%s: enter", __func__);
464 
465 	WRITE_ENTER(&plist->lockrw);
466 
467 	while ((port = plist->head) != NULL) {
468 		if (vsw_plist_del_node(vswp, port)) {
469 			DERR(vswp, "%s: Error deleting port %d"
470 			    " from port list", __func__, port->p_instance);
471 			RW_EXIT(&plist->lockrw);
472 			return (1);
473 		}
474 
475 		/* Remove address if was programmed into HW. */
476 		mutex_enter(&vswp->hw_lock);
477 		(void) vsw_unset_hw(vswp, port, VSW_VNETPORT);
478 		mutex_exit(&vswp->hw_lock);
479 
480 		/* Remove the fdb entry for this port/mac address */
481 		vsw_fdbe_del(vswp, &(port->p_macaddr));
482 		vsw_destroy_vlans(port, VSW_VNETPORT);
483 
484 		/* Remove any multicast addresses.. */
485 		vsw_del_mcst_port(port);
486 
487 		/*
488 		 * No longer need to hold the lock on the port list
489 		 * now that we have unlinked the target port from the
490 		 * list.
491 		 */
492 		RW_EXIT(&plist->lockrw);
493 		if (vsw_port_delete(port)) {
494 			DERR(vswp, "%s: Error deleting port %d",
495 			    __func__, port->p_instance);
496 			return (1);
497 		}
498 		WRITE_ENTER(&plist->lockrw);
499 	}
500 	RW_EXIT(&plist->lockrw);
501 
502 	D1(vswp, "%s: exit", __func__);
503 
504 	return (0);
505 }
506 
507 /*
508  * Delete the specified port.
509  *
510  * Returns 0 on success, 1 on failure.
511  */
512 static int
513 vsw_port_delete(vsw_port_t *port)
514 {
515 	vsw_ldc_list_t 		*ldcl;
516 	vsw_t			*vswp = port->p_vswp;
517 	int			num_ldcs;
518 
519 	D1(vswp, "%s: enter : port id %d", __func__, port->p_instance);
520 
521 	(void) vsw_uninit_ldcs(port);
522 
523 	/*
524 	 * Wait for any pending ctrl msg tasks which reference this
525 	 * port to finish.
526 	 */
527 	if (vsw_drain_port_taskq(port))
528 		return (1);
529 
530 	/*
531 	 * Wait for any active callbacks to finish
532 	 */
533 	if (vsw_drain_ldcs(port))
534 		return (1);
535 
536 	ldcl = &port->p_ldclist;
537 	num_ldcs = port->num_ldcs;
538 	WRITE_ENTER(&ldcl->lockrw);
539 	while (num_ldcs > 0) {
540 		if (vsw_ldc_detach(port, ldcl->head->ldc_id) != 0) {
541 			cmn_err(CE_WARN, "!vsw%d: unable to detach ldc %ld",
542 			    vswp->instance, ldcl->head->ldc_id);
543 			RW_EXIT(&ldcl->lockrw);
544 			port->num_ldcs = num_ldcs;
545 			return (1);
546 		}
547 		num_ldcs--;
548 	}
549 	RW_EXIT(&ldcl->lockrw);
550 
551 	rw_destroy(&port->p_ldclist.lockrw);
552 
553 	mutex_destroy(&port->mca_lock);
554 	mutex_destroy(&port->tx_lock);
555 
556 	cv_destroy(&port->state_cv);
557 	mutex_destroy(&port->state_lock);
558 
559 	if (port->num_ldcs != 0) {
560 		kmem_free(port->ldc_ids, port->num_ldcs * sizeof (uint64_t));
561 		port->num_ldcs = 0;
562 	}
563 	kmem_free(port, sizeof (vsw_port_t));
564 
565 	D1(vswp, "%s: exit", __func__);
566 
567 	return (0);
568 }
569 
570 /*
571  * Attach a logical domain channel (ldc) under a specified port.
572  *
573  * Returns 0 on success, 1 on failure.
574  */
575 static int
576 vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id)
577 {
578 	vsw_t 		*vswp = port->p_vswp;
579 	vsw_ldc_list_t *ldcl = &port->p_ldclist;
580 	vsw_ldc_t 	*ldcp = NULL;
581 	ldc_attr_t 	attr;
582 	ldc_status_t	istatus;
583 	int 		status = DDI_FAILURE;
584 	int		rv;
585 	char		kname[MAXNAMELEN];
586 	enum		{ PROG_init = 0x0, PROG_mblks = 0x1,
587 			    PROG_callback = 0x2, PROG_rx_thread = 0x4,
588 			    PROG_tx_thread = 0x8}
589 			progress;
590 
591 	progress = PROG_init;
592 
593 	D1(vswp, "%s: enter", __func__);
594 
595 	ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP);
596 	if (ldcp == NULL) {
597 		DERR(vswp, "%s: kmem_zalloc failed", __func__);
598 		return (1);
599 	}
600 	ldcp->ldc_id = ldc_id;
601 
602 	/* Allocate pools of receive mblks */
603 	rv = vio_init_multipools(&ldcp->vmp, VSW_NUM_VMPOOLS,
604 	    vsw_mblk_size1, vsw_mblk_size2, vsw_mblk_size3,
605 	    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3);
606 	if (rv) {
607 		DWARN(vswp, "%s: unable to create free mblk pools for"
608 		    " channel %ld (rv %d)", __func__, ldc_id, rv);
609 		kmem_free(ldcp, sizeof (vsw_ldc_t));
610 		return (1);
611 	}
612 
613 	progress |= PROG_mblks;
614 
615 	mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL);
616 	mutex_init(&ldcp->ldc_rxlock, NULL, MUTEX_DRIVER, NULL);
617 	mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL);
618 	mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL);
619 	cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL);
620 	rw_init(&ldcp->lane_in.dlistrw, NULL, RW_DRIVER, NULL);
621 	rw_init(&ldcp->lane_out.dlistrw, NULL, RW_DRIVER, NULL);
622 
623 	/* required for handshake with peer */
624 	ldcp->local_session = (uint64_t)ddi_get_lbolt();
625 	ldcp->peer_session = 0;
626 	ldcp->session_status = 0;
627 	ldcp->hss_id = 1;	/* Initial handshake session id */
628 
629 	(void) atomic_swap_32(&port->p_hio_capable, B_FALSE);
630 
631 	/* only set for outbound lane, inbound set by peer */
632 	vsw_set_lane_attr(vswp, &ldcp->lane_out);
633 
634 	attr.devclass = LDC_DEV_NT_SVC;
635 	attr.instance = ddi_get_instance(vswp->dip);
636 	attr.mode = LDC_MODE_UNRELIABLE;
637 	attr.mtu = VSW_LDC_MTU;
638 	status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle);
639 	if (status != 0) {
640 		DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)",
641 		    __func__, ldc_id, status);
642 		goto ldc_attach_fail;
643 	}
644 
645 	if (vsw_ldc_rxthr_enabled) {
646 		ldcp->rx_thr_flags = 0;
647 
648 		mutex_init(&ldcp->rx_thr_lock, NULL, MUTEX_DRIVER, NULL);
649 		cv_init(&ldcp->rx_thr_cv, NULL, CV_DRIVER, NULL);
650 		ldcp->rx_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
651 		    vsw_ldc_rx_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
652 
653 		progress |= PROG_rx_thread;
654 		if (ldcp->rx_thread == NULL) {
655 			DWARN(vswp, "%s(%lld): Failed to create worker thread",
656 			    __func__, ldc_id);
657 			goto ldc_attach_fail;
658 		}
659 	}
660 
661 	if (vsw_ldc_txthr_enabled) {
662 		ldcp->tx_thr_flags = 0;
663 		ldcp->tx_mhead = ldcp->tx_mtail = NULL;
664 
665 		mutex_init(&ldcp->tx_thr_lock, NULL, MUTEX_DRIVER, NULL);
666 		cv_init(&ldcp->tx_thr_cv, NULL, CV_DRIVER, NULL);
667 		ldcp->tx_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
668 		    vsw_ldc_tx_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
669 
670 		progress |= PROG_tx_thread;
671 		if (ldcp->tx_thread == NULL) {
672 			DWARN(vswp, "%s(%lld): Failed to create worker thread",
673 			    __func__, ldc_id);
674 			goto ldc_attach_fail;
675 		}
676 	}
677 
678 	status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp);
679 	if (status != 0) {
680 		DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)",
681 		    __func__, ldc_id, status);
682 		(void) ldc_fini(ldcp->ldc_handle);
683 		goto ldc_attach_fail;
684 	}
685 	/*
686 	 * allocate a message for ldc_read()s, big enough to hold ctrl and
687 	 * data msgs, including raw data msgs used to recv priority frames.
688 	 */
689 	ldcp->msglen = VIO_PKT_DATA_HDRSIZE + vswp->max_frame_size;
690 	ldcp->ldcmsg = kmem_alloc(ldcp->msglen, KM_SLEEP);
691 
692 	progress |= PROG_callback;
693 
694 	mutex_init(&ldcp->status_lock, NULL, MUTEX_DRIVER, NULL);
695 
696 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
697 		DERR(vswp, "%s: ldc_status failed", __func__);
698 		mutex_destroy(&ldcp->status_lock);
699 		goto ldc_attach_fail;
700 	}
701 
702 	ldcp->ldc_status = istatus;
703 	ldcp->ldc_port = port;
704 	ldcp->ldc_vswp = vswp;
705 
706 	vsw_reset_vnet_proto_ops(ldcp);
707 
708 	(void) sprintf(kname, "%sldc0x%lx", DRV_NAME, ldcp->ldc_id);
709 	ldcp->ksp = vgen_setup_kstats(DRV_NAME, vswp->instance,
710 	    kname, &ldcp->ldc_stats);
711 	if (ldcp->ksp == NULL) {
712 		DERR(vswp, "%s: kstats setup failed", __func__);
713 		goto ldc_attach_fail;
714 	}
715 
716 	/* link it into the list of channels for this port */
717 	WRITE_ENTER(&ldcl->lockrw);
718 	ldcp->ldc_next = ldcl->head;
719 	ldcl->head = ldcp;
720 	RW_EXIT(&ldcl->lockrw);
721 
722 	D1(vswp, "%s: exit", __func__);
723 	return (0);
724 
725 ldc_attach_fail:
726 
727 	if (progress & PROG_callback) {
728 		(void) ldc_unreg_callback(ldcp->ldc_handle);
729 		kmem_free(ldcp->ldcmsg, ldcp->msglen);
730 	}
731 
732 	if (progress & PROG_rx_thread) {
733 		if (ldcp->rx_thread != NULL) {
734 			vsw_stop_rx_thread(ldcp);
735 		}
736 		mutex_destroy(&ldcp->rx_thr_lock);
737 		cv_destroy(&ldcp->rx_thr_cv);
738 	}
739 
740 	if (progress & PROG_tx_thread) {
741 		if (ldcp->tx_thread != NULL) {
742 			vsw_stop_tx_thread(ldcp);
743 		}
744 		mutex_destroy(&ldcp->tx_thr_lock);
745 		cv_destroy(&ldcp->tx_thr_cv);
746 	}
747 	if (ldcp->ksp != NULL) {
748 		vgen_destroy_kstats(ldcp->ksp);
749 	}
750 	mutex_destroy(&ldcp->ldc_txlock);
751 	mutex_destroy(&ldcp->ldc_rxlock);
752 	mutex_destroy(&ldcp->ldc_cblock);
753 	mutex_destroy(&ldcp->drain_cv_lock);
754 
755 	cv_destroy(&ldcp->drain_cv);
756 
757 	rw_destroy(&ldcp->lane_in.dlistrw);
758 	rw_destroy(&ldcp->lane_out.dlistrw);
759 
760 	if (progress & PROG_mblks) {
761 		vio_destroy_multipools(&ldcp->vmp, &vswp->rxh);
762 	}
763 	kmem_free(ldcp, sizeof (vsw_ldc_t));
764 
765 	return (1);
766 }
767 
768 /*
769  * Detach a logical domain channel (ldc) belonging to a
770  * particular port.
771  *
772  * Returns 0 on success, 1 on failure.
773  */
774 static int
775 vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id)
776 {
777 	vsw_t 		*vswp = port->p_vswp;
778 	vsw_ldc_t 	*ldcp, *prev_ldcp;
779 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
780 	int 		rv;
781 
782 	prev_ldcp = ldcl->head;
783 	for (; (ldcp = prev_ldcp) != NULL; prev_ldcp = ldcp->ldc_next) {
784 		if (ldcp->ldc_id == ldc_id) {
785 			break;
786 		}
787 	}
788 
789 	/* specified ldc id not found */
790 	if (ldcp == NULL) {
791 		DERR(vswp, "%s: ldcp = NULL", __func__);
792 		return (1);
793 	}
794 
795 	D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id);
796 
797 	/* Stop the receive thread */
798 	if (ldcp->rx_thread != NULL) {
799 		vsw_stop_rx_thread(ldcp);
800 		mutex_destroy(&ldcp->rx_thr_lock);
801 		cv_destroy(&ldcp->rx_thr_cv);
802 	}
803 	kmem_free(ldcp->ldcmsg, ldcp->msglen);
804 
805 	/* Stop the tx thread */
806 	if (ldcp->tx_thread != NULL) {
807 		vsw_stop_tx_thread(ldcp);
808 		mutex_destroy(&ldcp->tx_thr_lock);
809 		cv_destroy(&ldcp->tx_thr_cv);
810 		if (ldcp->tx_mhead != NULL) {
811 			freemsgchain(ldcp->tx_mhead);
812 			ldcp->tx_mhead = ldcp->tx_mtail = NULL;
813 			ldcp->tx_cnt = 0;
814 		}
815 	}
816 
817 	/* Destory kstats */
818 	vgen_destroy_kstats(ldcp->ksp);
819 
820 	/*
821 	 * Before we can close the channel we must release any mapped
822 	 * resources (e.g. drings).
823 	 */
824 	vsw_free_lane_resources(ldcp, INBOUND);
825 	vsw_free_lane_resources(ldcp, OUTBOUND);
826 
827 	/*
828 	 * If the close fails we are in serious trouble, as won't
829 	 * be able to delete the parent port.
830 	 */
831 	if ((rv = ldc_close(ldcp->ldc_handle)) != 0) {
832 		DERR(vswp, "%s: error %d closing channel %lld",
833 		    __func__, rv, ldcp->ldc_id);
834 		return (1);
835 	}
836 
837 	(void) ldc_fini(ldcp->ldc_handle);
838 
839 	ldcp->ldc_status = LDC_INIT;
840 	ldcp->ldc_handle = NULL;
841 	ldcp->ldc_vswp = NULL;
842 
843 
844 	/*
845 	 * Most likely some mblks are still in use and
846 	 * have not been returned to the pool. These mblks are
847 	 * added to the pool that is maintained in the device instance.
848 	 * Another attempt will be made to destroy the pool
849 	 * when the device detaches.
850 	 */
851 	vio_destroy_multipools(&ldcp->vmp, &vswp->rxh);
852 
853 	/* unlink it from the list */
854 	prev_ldcp = ldcp->ldc_next;
855 
856 	mutex_destroy(&ldcp->ldc_txlock);
857 	mutex_destroy(&ldcp->ldc_rxlock);
858 	mutex_destroy(&ldcp->ldc_cblock);
859 	cv_destroy(&ldcp->drain_cv);
860 	mutex_destroy(&ldcp->drain_cv_lock);
861 	mutex_destroy(&ldcp->status_lock);
862 	rw_destroy(&ldcp->lane_in.dlistrw);
863 	rw_destroy(&ldcp->lane_out.dlistrw);
864 
865 	kmem_free(ldcp, sizeof (vsw_ldc_t));
866 
867 	return (0);
868 }
869 
870 /*
871  * Open and attempt to bring up the channel. Note that channel
872  * can only be brought up if peer has also opened channel.
873  *
874  * Returns 0 if can open and bring up channel, otherwise
875  * returns 1.
876  */
877 static int
878 vsw_ldc_init(vsw_ldc_t *ldcp)
879 {
880 	vsw_t 		*vswp = ldcp->ldc_vswp;
881 	ldc_status_t	istatus = 0;
882 	int		rv;
883 
884 	D1(vswp, "%s: enter", __func__);
885 
886 	LDC_ENTER_LOCK(ldcp);
887 
888 	/* don't start at 0 in case clients don't like that */
889 	ldcp->next_ident = 1;
890 
891 	rv = ldc_open(ldcp->ldc_handle);
892 	if (rv != 0) {
893 		DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)",
894 		    __func__, ldcp->ldc_id, rv);
895 		LDC_EXIT_LOCK(ldcp);
896 		return (1);
897 	}
898 
899 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
900 		DERR(vswp, "%s: unable to get status", __func__);
901 		LDC_EXIT_LOCK(ldcp);
902 		return (1);
903 
904 	} else if (istatus != LDC_OPEN && istatus != LDC_READY) {
905 		DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY",
906 		    __func__, ldcp->ldc_id, istatus);
907 		LDC_EXIT_LOCK(ldcp);
908 		return (1);
909 	}
910 
911 	mutex_enter(&ldcp->status_lock);
912 	ldcp->ldc_status = istatus;
913 	mutex_exit(&ldcp->status_lock);
914 
915 	rv = ldc_up(ldcp->ldc_handle);
916 	if (rv != 0) {
917 		/*
918 		 * Not a fatal error for ldc_up() to fail, as peer
919 		 * end point may simply not be ready yet.
920 		 */
921 		D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__,
922 		    ldcp->ldc_id, rv);
923 		LDC_EXIT_LOCK(ldcp);
924 		return (1);
925 	}
926 
927 	/*
928 	 * ldc_up() call is non-blocking so need to explicitly
929 	 * check channel status to see if in fact the channel
930 	 * is UP.
931 	 */
932 	mutex_enter(&ldcp->status_lock);
933 	if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) {
934 		DERR(vswp, "%s: unable to get status", __func__);
935 		mutex_exit(&ldcp->status_lock);
936 		LDC_EXIT_LOCK(ldcp);
937 		return (1);
938 
939 	}
940 
941 	if (ldcp->ldc_status == LDC_UP) {
942 		D2(vswp, "%s: channel %ld now UP (%ld)", __func__,
943 		    ldcp->ldc_id, istatus);
944 		mutex_exit(&ldcp->status_lock);
945 		LDC_EXIT_LOCK(ldcp);
946 
947 		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
948 		return (0);
949 	}
950 
951 	mutex_exit(&ldcp->status_lock);
952 	LDC_EXIT_LOCK(ldcp);
953 
954 	D1(vswp, "%s: exit", __func__);
955 	return (0);
956 }
957 
958 /* disable callbacks on the channel */
959 static int
960 vsw_ldc_uninit(vsw_ldc_t *ldcp)
961 {
962 	vsw_t	*vswp = ldcp->ldc_vswp;
963 	int	rv;
964 
965 	D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id);
966 
967 	LDC_ENTER_LOCK(ldcp);
968 
969 	rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE);
970 	if (rv != 0) {
971 		DERR(vswp, "vsw_ldc_uninit(%lld): error disabling "
972 		    "interrupts (rv = %d)\n", ldcp->ldc_id, rv);
973 		LDC_EXIT_LOCK(ldcp);
974 		return (1);
975 	}
976 
977 	mutex_enter(&ldcp->status_lock);
978 	ldcp->ldc_status = LDC_INIT;
979 	mutex_exit(&ldcp->status_lock);
980 
981 	LDC_EXIT_LOCK(ldcp);
982 
983 	D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id);
984 
985 	return (0);
986 }
987 
988 static int
989 vsw_init_ldcs(vsw_port_t *port)
990 {
991 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
992 	vsw_ldc_t	*ldcp;
993 
994 	READ_ENTER(&ldcl->lockrw);
995 	ldcp =  ldcl->head;
996 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
997 		(void) vsw_ldc_init(ldcp);
998 	}
999 	RW_EXIT(&ldcl->lockrw);
1000 
1001 	return (0);
1002 }
1003 
1004 static int
1005 vsw_uninit_ldcs(vsw_port_t *port)
1006 {
1007 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
1008 	vsw_ldc_t	*ldcp;
1009 
1010 	D1(NULL, "vsw_uninit_ldcs: enter\n");
1011 
1012 	READ_ENTER(&ldcl->lockrw);
1013 	ldcp =  ldcl->head;
1014 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
1015 		(void) vsw_ldc_uninit(ldcp);
1016 	}
1017 	RW_EXIT(&ldcl->lockrw);
1018 
1019 	D1(NULL, "vsw_uninit_ldcs: exit\n");
1020 
1021 	return (0);
1022 }
1023 
1024 /*
1025  * Wait until the callback(s) associated with the ldcs under the specified
1026  * port have completed.
1027  *
1028  * Prior to this function being invoked each channel under this port
1029  * should have been quiesced via ldc_set_cb_mode(DISABLE).
1030  *
1031  * A short explaination of what we are doing below..
1032  *
1033  * The simplest approach would be to have a reference counter in
1034  * the ldc structure which is increment/decremented by the callbacks as
1035  * they use the channel. The drain function could then simply disable any
1036  * further callbacks and do a cv_wait for the ref to hit zero. Unfortunately
1037  * there is a tiny window here - before the callback is able to get the lock
1038  * on the channel it is interrupted and this function gets to execute. It
1039  * sees that the ref count is zero and believes its free to delete the
1040  * associated data structures.
1041  *
1042  * We get around this by taking advantage of the fact that before the ldc
1043  * framework invokes a callback it sets a flag to indicate that there is a
1044  * callback active (or about to become active). If when we attempt to
1045  * unregister a callback when this active flag is set then the unregister
1046  * will fail with EWOULDBLOCK.
1047  *
1048  * If the unregister fails we do a cv_timedwait. We will either be signaled
1049  * by the callback as it is exiting (note we have to wait a short period to
1050  * allow the callback to return fully to the ldc framework and it to clear
1051  * the active flag), or by the timer expiring. In either case we again attempt
1052  * the unregister. We repeat this until we can succesfully unregister the
1053  * callback.
1054  *
1055  * The reason we use a cv_timedwait rather than a simple cv_wait is to catch
1056  * the case where the callback has finished but the ldc framework has not yet
1057  * cleared the active flag. In this case we would never get a cv_signal.
1058  */
1059 static int
1060 vsw_drain_ldcs(vsw_port_t *port)
1061 {
1062 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
1063 	vsw_ldc_t	*ldcp;
1064 	vsw_t		*vswp = port->p_vswp;
1065 
1066 	D1(vswp, "%s: enter", __func__);
1067 
1068 	READ_ENTER(&ldcl->lockrw);
1069 
1070 	ldcp = ldcl->head;
1071 
1072 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
1073 		/*
1074 		 * If we can unregister the channel callback then we
1075 		 * know that there is no callback either running or
1076 		 * scheduled to run for this channel so move on to next
1077 		 * channel in the list.
1078 		 */
1079 		mutex_enter(&ldcp->drain_cv_lock);
1080 
1081 		/* prompt active callbacks to quit */
1082 		ldcp->drain_state = VSW_LDC_DRAINING;
1083 
1084 		if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) {
1085 			D2(vswp, "%s: unreg callback for chan %ld", __func__,
1086 			    ldcp->ldc_id);
1087 			mutex_exit(&ldcp->drain_cv_lock);
1088 			continue;
1089 		} else {
1090 			/*
1091 			 * If we end up here we know that either 1) a callback
1092 			 * is currently executing, 2) is about to start (i.e.
1093 			 * the ldc framework has set the active flag but
1094 			 * has not actually invoked the callback yet, or 3)
1095 			 * has finished and has returned to the ldc framework
1096 			 * but the ldc framework has not yet cleared the
1097 			 * active bit.
1098 			 *
1099 			 * Wait for it to finish.
1100 			 */
1101 			while (ldc_unreg_callback(ldcp->ldc_handle)
1102 			    == EWOULDBLOCK)
1103 				(void) cv_timedwait(&ldcp->drain_cv,
1104 				    &ldcp->drain_cv_lock, lbolt + hz);
1105 
1106 			mutex_exit(&ldcp->drain_cv_lock);
1107 			D2(vswp, "%s: unreg callback for chan %ld after "
1108 			    "timeout", __func__, ldcp->ldc_id);
1109 		}
1110 	}
1111 	RW_EXIT(&ldcl->lockrw);
1112 
1113 	D1(vswp, "%s: exit", __func__);
1114 	return (0);
1115 }
1116 
1117 /*
1118  * Wait until all tasks which reference this port have completed.
1119  *
1120  * Prior to this function being invoked each channel under this port
1121  * should have been quiesced via ldc_set_cb_mode(DISABLE).
1122  */
1123 static int
1124 vsw_drain_port_taskq(vsw_port_t *port)
1125 {
1126 	vsw_t		*vswp = port->p_vswp;
1127 
1128 	D1(vswp, "%s: enter", __func__);
1129 
1130 	/*
1131 	 * Mark the port as in the process of being detached, and
1132 	 * dispatch a marker task to the queue so we know when all
1133 	 * relevant tasks have completed.
1134 	 */
1135 	mutex_enter(&port->state_lock);
1136 	port->state = VSW_PORT_DETACHING;
1137 
1138 	if ((vswp->taskq_p == NULL) ||
1139 	    (ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task,
1140 	    port, DDI_NOSLEEP) != DDI_SUCCESS)) {
1141 		DERR(vswp, "%s: unable to dispatch marker task",
1142 		    __func__);
1143 		mutex_exit(&port->state_lock);
1144 		return (1);
1145 	}
1146 
1147 	/*
1148 	 * Wait for the marker task to finish.
1149 	 */
1150 	while (port->state != VSW_PORT_DETACHABLE)
1151 		cv_wait(&port->state_cv, &port->state_lock);
1152 
1153 	mutex_exit(&port->state_lock);
1154 
1155 	D1(vswp, "%s: exit", __func__);
1156 
1157 	return (0);
1158 }
1159 
1160 static void
1161 vsw_marker_task(void *arg)
1162 {
1163 	vsw_port_t	*port = arg;
1164 	vsw_t		*vswp = port->p_vswp;
1165 
1166 	D1(vswp, "%s: enter", __func__);
1167 
1168 	mutex_enter(&port->state_lock);
1169 
1170 	/*
1171 	 * No further tasks should be dispatched which reference
1172 	 * this port so ok to mark it as safe to detach.
1173 	 */
1174 	port->state = VSW_PORT_DETACHABLE;
1175 
1176 	cv_signal(&port->state_cv);
1177 
1178 	mutex_exit(&port->state_lock);
1179 
1180 	D1(vswp, "%s: exit", __func__);
1181 }
1182 
1183 vsw_port_t *
1184 vsw_lookup_port(vsw_t *vswp, int p_instance)
1185 {
1186 	vsw_port_list_t *plist = &vswp->plist;
1187 	vsw_port_t	*port;
1188 
1189 	for (port = plist->head; port != NULL; port = port->p_next) {
1190 		if (port->p_instance == p_instance) {
1191 			D2(vswp, "vsw_lookup_port: found p_instance\n");
1192 			return (port);
1193 		}
1194 	}
1195 
1196 	return (NULL);
1197 }
1198 
1199 void
1200 vsw_vlan_unaware_port_reset(vsw_port_t *portp)
1201 {
1202 	vsw_ldc_list_t 	*ldclp;
1203 	vsw_ldc_t	*ldcp;
1204 
1205 	ldclp = &portp->p_ldclist;
1206 
1207 	READ_ENTER(&ldclp->lockrw);
1208 
1209 	/*
1210 	 * NOTE: for now, we will assume we have a single channel.
1211 	 */
1212 	if (ldclp->head == NULL) {
1213 		RW_EXIT(&ldclp->lockrw);
1214 		return;
1215 	}
1216 	ldcp = ldclp->head;
1217 
1218 	mutex_enter(&ldcp->ldc_cblock);
1219 
1220 	/*
1221 	 * If the peer is vlan_unaware(ver < 1.3), reset channel and terminate
1222 	 * the connection. See comments in vsw_set_vnet_proto_ops().
1223 	 */
1224 	if (ldcp->hphase == VSW_MILESTONE4 && VSW_VER_LT(ldcp, 1, 3) &&
1225 	    portp->nvids != 0) {
1226 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1227 	}
1228 
1229 	mutex_exit(&ldcp->ldc_cblock);
1230 
1231 	RW_EXIT(&ldclp->lockrw);
1232 }
1233 
1234 void
1235 vsw_hio_port_reset(vsw_port_t *portp)
1236 {
1237 	vsw_ldc_list_t	*ldclp;
1238 	vsw_ldc_t	*ldcp;
1239 
1240 	ldclp = &portp->p_ldclist;
1241 
1242 	READ_ENTER(&ldclp->lockrw);
1243 
1244 	/*
1245 	 * NOTE: for now, we will assume we have a single channel.
1246 	 */
1247 	if (ldclp->head == NULL) {
1248 		RW_EXIT(&ldclp->lockrw);
1249 		return;
1250 	}
1251 	ldcp = ldclp->head;
1252 
1253 	mutex_enter(&ldcp->ldc_cblock);
1254 
1255 	/*
1256 	 * If the peer is HybridIO capable (ver >= 1.3), reset channel
1257 	 * to trigger re-negotiation, which inturn trigger HybridIO
1258 	 * setup/cleanup.
1259 	 */
1260 	if ((ldcp->hphase == VSW_MILESTONE4) &&
1261 	    (portp->p_hio_capable == B_TRUE)) {
1262 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1263 	}
1264 
1265 	mutex_exit(&ldcp->ldc_cblock);
1266 
1267 	RW_EXIT(&ldclp->lockrw);
1268 }
1269 
1270 /*
1271  * Search for and remove the specified port from the port
1272  * list. Returns 0 if able to locate and remove port, otherwise
1273  * returns 1.
1274  */
1275 static int
1276 vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port)
1277 {
1278 	vsw_port_list_t *plist = &vswp->plist;
1279 	vsw_port_t	*curr_p, *prev_p;
1280 
1281 	if (plist->head == NULL)
1282 		return (1);
1283 
1284 	curr_p = prev_p = plist->head;
1285 
1286 	while (curr_p != NULL) {
1287 		if (curr_p == port) {
1288 			if (prev_p == curr_p) {
1289 				plist->head = curr_p->p_next;
1290 			} else {
1291 				prev_p->p_next = curr_p->p_next;
1292 			}
1293 			plist->num_ports--;
1294 			break;
1295 		} else {
1296 			prev_p = curr_p;
1297 			curr_p = curr_p->p_next;
1298 		}
1299 	}
1300 	return (0);
1301 }
1302 
1303 /*
1304  * Interrupt handler for ldc messages.
1305  */
1306 static uint_t
1307 vsw_ldc_cb(uint64_t event, caddr_t arg)
1308 {
1309 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
1310 	vsw_t 		*vswp = ldcp->ldc_vswp;
1311 
1312 	D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
1313 
1314 	mutex_enter(&ldcp->ldc_cblock);
1315 	ldcp->ldc_stats.callbacks++;
1316 
1317 	mutex_enter(&ldcp->status_lock);
1318 	if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) {
1319 		mutex_exit(&ldcp->status_lock);
1320 		mutex_exit(&ldcp->ldc_cblock);
1321 		return (LDC_SUCCESS);
1322 	}
1323 	mutex_exit(&ldcp->status_lock);
1324 
1325 	if (event & LDC_EVT_UP) {
1326 		/*
1327 		 * Channel has come up.
1328 		 */
1329 		D2(vswp, "%s: id(%ld) event(%llx) UP: status(%ld)",
1330 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1331 
1332 		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
1333 
1334 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
1335 	}
1336 
1337 	if (event & LDC_EVT_READ) {
1338 		/*
1339 		 * Data available for reading.
1340 		 */
1341 		D2(vswp, "%s: id(ld) event(%llx) data READ",
1342 		    __func__, ldcp->ldc_id, event);
1343 
1344 		if (ldcp->rx_thread != NULL) {
1345 			/*
1346 			 * If the receive thread is enabled, then
1347 			 * wakeup the receive thread to process the
1348 			 * LDC messages.
1349 			 */
1350 			mutex_exit(&ldcp->ldc_cblock);
1351 			mutex_enter(&ldcp->rx_thr_lock);
1352 			if (!(ldcp->rx_thr_flags & VSW_WTHR_DATARCVD)) {
1353 				ldcp->rx_thr_flags |= VSW_WTHR_DATARCVD;
1354 				cv_signal(&ldcp->rx_thr_cv);
1355 			}
1356 			mutex_exit(&ldcp->rx_thr_lock);
1357 			mutex_enter(&ldcp->ldc_cblock);
1358 		} else {
1359 			vsw_process_pkt(ldcp);
1360 		}
1361 
1362 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
1363 
1364 		goto vsw_cb_exit;
1365 	}
1366 
1367 	if (event & (LDC_EVT_DOWN | LDC_EVT_RESET)) {
1368 		D2(vswp, "%s: id(%ld) event (%lx) DOWN/RESET: status(%ld)",
1369 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1370 
1371 		vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
1372 	}
1373 
1374 	/*
1375 	 * Catch either LDC_EVT_WRITE which we don't support or any
1376 	 * unknown event.
1377 	 */
1378 	if (event &
1379 	    ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ)) {
1380 		DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)",
1381 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1382 	}
1383 
1384 vsw_cb_exit:
1385 	mutex_exit(&ldcp->ldc_cblock);
1386 
1387 	/*
1388 	 * Let the drain function know we are finishing if it
1389 	 * is waiting.
1390 	 */
1391 	mutex_enter(&ldcp->drain_cv_lock);
1392 	if (ldcp->drain_state == VSW_LDC_DRAINING)
1393 		cv_signal(&ldcp->drain_cv);
1394 	mutex_exit(&ldcp->drain_cv_lock);
1395 
1396 	return (LDC_SUCCESS);
1397 }
1398 
1399 /*
1400  * Reinitialise data structures associated with the channel.
1401  */
1402 static void
1403 vsw_ldc_reinit(vsw_ldc_t *ldcp)
1404 {
1405 	vsw_t		*vswp = ldcp->ldc_vswp;
1406 	vsw_port_t	*port;
1407 	vsw_ldc_list_t	*ldcl;
1408 
1409 	D1(vswp, "%s: enter", __func__);
1410 
1411 	port = ldcp->ldc_port;
1412 	ldcl = &port->p_ldclist;
1413 
1414 	READ_ENTER(&ldcl->lockrw);
1415 
1416 	D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__,
1417 	    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
1418 
1419 	vsw_free_lane_resources(ldcp, INBOUND);
1420 	vsw_free_lane_resources(ldcp, OUTBOUND);
1421 	RW_EXIT(&ldcl->lockrw);
1422 
1423 	ldcp->lane_in.lstate = 0;
1424 	ldcp->lane_out.lstate = 0;
1425 
1426 	/* Remove the fdb entry for this port/mac address */
1427 	vsw_fdbe_del(vswp, &(port->p_macaddr));
1428 
1429 	/* remove the port from vlans it has been assigned to */
1430 	vsw_vlan_remove_ids(port, VSW_VNETPORT);
1431 
1432 	/*
1433 	 * Remove parent port from any multicast groups
1434 	 * it may have registered with. Client must resend
1435 	 * multicast add command after handshake completes.
1436 	 */
1437 	vsw_del_mcst_port(port);
1438 
1439 	ldcp->peer_session = 0;
1440 	ldcp->session_status = 0;
1441 	ldcp->hcnt = 0;
1442 	ldcp->hphase = VSW_MILESTONE0;
1443 
1444 	vsw_reset_vnet_proto_ops(ldcp);
1445 
1446 	D1(vswp, "%s: exit", __func__);
1447 }
1448 
1449 /*
1450  * Process a connection event.
1451  *
1452  * Note - care must be taken to ensure that this function is
1453  * not called with the dlistrw lock held.
1454  */
1455 static void
1456 vsw_process_conn_evt(vsw_ldc_t *ldcp, uint16_t evt)
1457 {
1458 	vsw_t		*vswp = ldcp->ldc_vswp;
1459 	vsw_conn_evt_t	*conn = NULL;
1460 
1461 	D1(vswp, "%s: enter", __func__);
1462 
1463 	/*
1464 	 * Check if either a reset or restart event is pending
1465 	 * or in progress. If so just return.
1466 	 *
1467 	 * A VSW_CONN_RESET event originates either with a LDC_RESET_EVT
1468 	 * being received by the callback handler, or a ECONNRESET error
1469 	 * code being returned from a ldc_read() or ldc_write() call.
1470 	 *
1471 	 * A VSW_CONN_RESTART event occurs when some error checking code
1472 	 * decides that there is a problem with data from the channel,
1473 	 * and that the handshake should be restarted.
1474 	 */
1475 	if (((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART)) &&
1476 	    (ldstub((uint8_t *)&ldcp->reset_active)))
1477 		return;
1478 
1479 	/*
1480 	 * If it is an LDC_UP event we first check the recorded
1481 	 * state of the channel. If this is UP then we know that
1482 	 * the channel moving to the UP state has already been dealt
1483 	 * with and don't need to dispatch a  new task.
1484 	 *
1485 	 * The reason for this check is that when we do a ldc_up(),
1486 	 * depending on the state of the peer, we may or may not get
1487 	 * a LDC_UP event. As we can't depend on getting a LDC_UP evt
1488 	 * every time we do ldc_up() we explicitly check the channel
1489 	 * status to see has it come up (ldc_up() is asynch and will
1490 	 * complete at some undefined time), and take the appropriate
1491 	 * action.
1492 	 *
1493 	 * The flip side of this is that we may get a LDC_UP event
1494 	 * when we have already seen that the channel is up and have
1495 	 * dealt with that.
1496 	 */
1497 	mutex_enter(&ldcp->status_lock);
1498 	if (evt == VSW_CONN_UP) {
1499 		if ((ldcp->ldc_status == LDC_UP) || (ldcp->reset_active != 0)) {
1500 			mutex_exit(&ldcp->status_lock);
1501 			return;
1502 		}
1503 	}
1504 	mutex_exit(&ldcp->status_lock);
1505 
1506 	/*
1507 	 * The transaction group id allows us to identify and discard
1508 	 * any tasks which are still pending on the taskq and refer
1509 	 * to the handshake session we are about to restart or reset.
1510 	 * These stale messages no longer have any real meaning.
1511 	 */
1512 	(void) atomic_inc_32(&ldcp->hss_id);
1513 
1514 	ASSERT(vswp->taskq_p != NULL);
1515 
1516 	if ((conn = kmem_zalloc(sizeof (vsw_conn_evt_t), KM_NOSLEEP)) == NULL) {
1517 		cmn_err(CE_WARN, "!vsw%d: unable to allocate memory for"
1518 		    " connection event", vswp->instance);
1519 		goto err_exit;
1520 	}
1521 
1522 	conn->evt = evt;
1523 	conn->ldcp = ldcp;
1524 
1525 	if (ddi_taskq_dispatch(vswp->taskq_p, vsw_conn_task, conn,
1526 	    DDI_NOSLEEP) != DDI_SUCCESS) {
1527 		cmn_err(CE_WARN, "!vsw%d: Can't dispatch connection task",
1528 		    vswp->instance);
1529 
1530 		kmem_free(conn, sizeof (vsw_conn_evt_t));
1531 		goto err_exit;
1532 	}
1533 
1534 	D1(vswp, "%s: exit", __func__);
1535 	return;
1536 
1537 err_exit:
1538 	/*
1539 	 * Have mostly likely failed due to memory shortage. Clear the flag so
1540 	 * that future requests will at least be attempted and will hopefully
1541 	 * succeed.
1542 	 */
1543 	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
1544 		ldcp->reset_active = 0;
1545 }
1546 
1547 /*
1548  * Deal with events relating to a connection. Invoked from a taskq.
1549  */
1550 static void
1551 vsw_conn_task(void *arg)
1552 {
1553 	vsw_conn_evt_t	*conn = (vsw_conn_evt_t *)arg;
1554 	vsw_ldc_t	*ldcp = NULL;
1555 	vsw_port_t	*portp;
1556 	vsw_t		*vswp = NULL;
1557 	uint16_t	evt;
1558 	ldc_status_t	curr_status;
1559 
1560 	ldcp = conn->ldcp;
1561 	evt = conn->evt;
1562 	vswp = ldcp->ldc_vswp;
1563 	portp = ldcp->ldc_port;
1564 
1565 	D1(vswp, "%s: enter", __func__);
1566 
1567 	/* can safely free now have copied out data */
1568 	kmem_free(conn, sizeof (vsw_conn_evt_t));
1569 
1570 	mutex_enter(&ldcp->status_lock);
1571 	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
1572 		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
1573 		    "channel %ld", vswp->instance, ldcp->ldc_id);
1574 		mutex_exit(&ldcp->status_lock);
1575 		return;
1576 	}
1577 
1578 	/*
1579 	 * If we wish to restart the handshake on this channel, then if
1580 	 * the channel is UP we bring it DOWN to flush the underlying
1581 	 * ldc queue.
1582 	 */
1583 	if ((evt == VSW_CONN_RESTART) && (curr_status == LDC_UP))
1584 		(void) ldc_down(ldcp->ldc_handle);
1585 
1586 	if ((vswp->hio_capable) && (portp->p_hio_enabled)) {
1587 		vsw_hio_stop(vswp, ldcp);
1588 	}
1589 
1590 	/*
1591 	 * re-init all the associated data structures.
1592 	 */
1593 	vsw_ldc_reinit(ldcp);
1594 
1595 	/*
1596 	 * Bring the channel back up (note it does no harm to
1597 	 * do this even if the channel is already UP, Just
1598 	 * becomes effectively a no-op).
1599 	 */
1600 	(void) ldc_up(ldcp->ldc_handle);
1601 
1602 	/*
1603 	 * Check if channel is now UP. This will only happen if
1604 	 * peer has also done a ldc_up().
1605 	 */
1606 	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
1607 		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
1608 		    "channel %ld", vswp->instance, ldcp->ldc_id);
1609 		mutex_exit(&ldcp->status_lock);
1610 		return;
1611 	}
1612 
1613 	ldcp->ldc_status = curr_status;
1614 
1615 	/* channel UP so restart handshake by sending version info */
1616 	if (curr_status == LDC_UP) {
1617 		if (ldcp->hcnt++ > vsw_num_handshakes) {
1618 			cmn_err(CE_WARN, "!vsw%d: exceeded number of permitted"
1619 			    " handshake attempts (%d) on channel %ld",
1620 			    vswp->instance, ldcp->hcnt, ldcp->ldc_id);
1621 			mutex_exit(&ldcp->status_lock);
1622 			return;
1623 		}
1624 
1625 		if (vsw_obp_ver_proto_workaround == B_FALSE &&
1626 		    (ddi_taskq_dispatch(vswp->taskq_p, vsw_send_ver, ldcp,
1627 		    DDI_NOSLEEP) != DDI_SUCCESS)) {
1628 			cmn_err(CE_WARN, "!vsw%d: Can't dispatch version task",
1629 			    vswp->instance);
1630 
1631 			/*
1632 			 * Don't count as valid restart attempt if couldn't
1633 			 * send version msg.
1634 			 */
1635 			if (ldcp->hcnt > 0)
1636 				ldcp->hcnt--;
1637 		}
1638 	}
1639 
1640 	/*
1641 	 * Mark that the process is complete by clearing the flag.
1642 	 *
1643 	 * Note is it possible that the taskq dispatch above may have failed,
1644 	 * most likely due to memory shortage. We still clear the flag so
1645 	 * future attempts will at least be attempted and will hopefully
1646 	 * succeed.
1647 	 */
1648 	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
1649 		ldcp->reset_active = 0;
1650 
1651 	mutex_exit(&ldcp->status_lock);
1652 
1653 	D1(vswp, "%s: exit", __func__);
1654 }
1655 
1656 /*
1657  * returns 0 if legal for event signified by flag to have
1658  * occured at the time it did. Otherwise returns 1.
1659  */
1660 int
1661 vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag)
1662 {
1663 	vsw_t		*vswp = ldcp->ldc_vswp;
1664 	uint64_t	state;
1665 	uint64_t	phase;
1666 
1667 	if (dir == INBOUND)
1668 		state = ldcp->lane_in.lstate;
1669 	else
1670 		state = ldcp->lane_out.lstate;
1671 
1672 	phase = ldcp->hphase;
1673 
1674 	switch (flag) {
1675 	case VSW_VER_INFO_RECV:
1676 		if (phase > VSW_MILESTONE0) {
1677 			DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV"
1678 			    " when in state %d\n", ldcp->ldc_id, phase);
1679 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1680 			return (1);
1681 		}
1682 		break;
1683 
1684 	case VSW_VER_ACK_RECV:
1685 	case VSW_VER_NACK_RECV:
1686 		if (!(state & VSW_VER_INFO_SENT)) {
1687 			DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK or "
1688 			    "VER_NACK when in state %d\n", ldcp->ldc_id, phase);
1689 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1690 			return (1);
1691 		} else
1692 			state &= ~VSW_VER_INFO_SENT;
1693 		break;
1694 
1695 	case VSW_ATTR_INFO_RECV:
1696 		if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) {
1697 			DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV"
1698 			    " when in state %d\n", ldcp->ldc_id, phase);
1699 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1700 			return (1);
1701 		}
1702 		break;
1703 
1704 	case VSW_ATTR_ACK_RECV:
1705 	case VSW_ATTR_NACK_RECV:
1706 		if (!(state & VSW_ATTR_INFO_SENT)) {
1707 			DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK"
1708 			    " or ATTR_NACK when in state %d\n",
1709 			    ldcp->ldc_id, phase);
1710 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1711 			return (1);
1712 		} else
1713 			state &= ~VSW_ATTR_INFO_SENT;
1714 		break;
1715 
1716 	case VSW_DRING_INFO_RECV:
1717 		if (phase < VSW_MILESTONE1) {
1718 			DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV"
1719 			    " when in state %d\n", ldcp->ldc_id, phase);
1720 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1721 			return (1);
1722 		}
1723 		break;
1724 
1725 	case VSW_DRING_ACK_RECV:
1726 	case VSW_DRING_NACK_RECV:
1727 		if (!(state & VSW_DRING_INFO_SENT)) {
1728 			DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK "
1729 			    " or DRING_NACK when in state %d\n",
1730 			    ldcp->ldc_id, phase);
1731 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1732 			return (1);
1733 		} else
1734 			state &= ~VSW_DRING_INFO_SENT;
1735 		break;
1736 
1737 	case VSW_RDX_INFO_RECV:
1738 		if (phase < VSW_MILESTONE3) {
1739 			DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV"
1740 			    " when in state %d\n", ldcp->ldc_id, phase);
1741 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1742 			return (1);
1743 		}
1744 		break;
1745 
1746 	case VSW_RDX_ACK_RECV:
1747 	case VSW_RDX_NACK_RECV:
1748 		if (!(state & VSW_RDX_INFO_SENT)) {
1749 			DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK or "
1750 			    "RDX_NACK when in state %d\n", ldcp->ldc_id, phase);
1751 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1752 			return (1);
1753 		} else
1754 			state &= ~VSW_RDX_INFO_SENT;
1755 		break;
1756 
1757 	case VSW_MCST_INFO_RECV:
1758 		if (phase < VSW_MILESTONE3) {
1759 			DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV"
1760 			    " when in state %d\n", ldcp->ldc_id, phase);
1761 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1762 			return (1);
1763 		}
1764 		break;
1765 
1766 	default:
1767 		DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)",
1768 		    ldcp->ldc_id, flag);
1769 		return (1);
1770 	}
1771 
1772 	if (dir == INBOUND)
1773 		ldcp->lane_in.lstate = state;
1774 	else
1775 		ldcp->lane_out.lstate = state;
1776 
1777 	D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id);
1778 
1779 	return (0);
1780 }
1781 
1782 void
1783 vsw_next_milestone(vsw_ldc_t *ldcp)
1784 {
1785 	vsw_t		*vswp = ldcp->ldc_vswp;
1786 	vsw_port_t	*portp = ldcp->ldc_port;
1787 
1788 	D1(vswp, "%s (chan %lld): enter (phase %ld)", __func__,
1789 	    ldcp->ldc_id, ldcp->hphase);
1790 
1791 	DUMP_FLAGS(ldcp->lane_in.lstate);
1792 	DUMP_FLAGS(ldcp->lane_out.lstate);
1793 
1794 	switch (ldcp->hphase) {
1795 
1796 	case VSW_MILESTONE0:
1797 		/*
1798 		 * If we haven't started to handshake with our peer,
1799 		 * start to do so now.
1800 		 */
1801 		if (ldcp->lane_out.lstate == 0) {
1802 			D2(vswp, "%s: (chan %lld) starting handshake "
1803 			    "with peer", __func__, ldcp->ldc_id);
1804 			vsw_process_conn_evt(ldcp, VSW_CONN_UP);
1805 		}
1806 
1807 		/*
1808 		 * Only way to pass this milestone is to have successfully
1809 		 * negotiated version info.
1810 		 */
1811 		if ((ldcp->lane_in.lstate & VSW_VER_ACK_SENT) &&
1812 		    (ldcp->lane_out.lstate & VSW_VER_ACK_RECV)) {
1813 
1814 			D2(vswp, "%s: (chan %lld) leaving milestone 0",
1815 			    __func__, ldcp->ldc_id);
1816 
1817 			vsw_set_vnet_proto_ops(ldcp);
1818 
1819 			/*
1820 			 * Next milestone is passed when attribute
1821 			 * information has been successfully exchanged.
1822 			 */
1823 			ldcp->hphase = VSW_MILESTONE1;
1824 			vsw_send_attr(ldcp);
1825 
1826 		}
1827 		break;
1828 
1829 	case VSW_MILESTONE1:
1830 		/*
1831 		 * Only way to pass this milestone is to have successfully
1832 		 * negotiated attribute information.
1833 		 */
1834 		if (ldcp->lane_in.lstate & VSW_ATTR_ACK_SENT) {
1835 
1836 			ldcp->hphase = VSW_MILESTONE2;
1837 
1838 			/*
1839 			 * If the peer device has said it wishes to
1840 			 * use descriptor rings then we send it our ring
1841 			 * info, otherwise we just set up a private ring
1842 			 * which we use an internal buffer
1843 			 */
1844 			if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
1845 			    (ldcp->lane_in.xfer_mode & VIO_DRING_MODE_V1_2)) ||
1846 			    (VSW_VER_LT(ldcp, 1, 2) &&
1847 			    (ldcp->lane_in.xfer_mode ==
1848 			    VIO_DRING_MODE_V1_0))) {
1849 				vsw_send_dring_info(ldcp);
1850 			}
1851 		}
1852 		break;
1853 
1854 	case VSW_MILESTONE2:
1855 		/*
1856 		 * If peer has indicated in its attribute message that
1857 		 * it wishes to use descriptor rings then the only way
1858 		 * to pass this milestone is for us to have received
1859 		 * valid dring info.
1860 		 *
1861 		 * If peer is not using descriptor rings then just fall
1862 		 * through.
1863 		 */
1864 		if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
1865 		    (ldcp->lane_in.xfer_mode & VIO_DRING_MODE_V1_2)) ||
1866 		    (VSW_VER_LT(ldcp, 1, 2) &&
1867 		    (ldcp->lane_in.xfer_mode ==
1868 		    VIO_DRING_MODE_V1_0))) {
1869 			if (!(ldcp->lane_in.lstate & VSW_DRING_ACK_SENT))
1870 				break;
1871 		}
1872 
1873 		D2(vswp, "%s: (chan %lld) leaving milestone 2",
1874 		    __func__, ldcp->ldc_id);
1875 
1876 		ldcp->hphase = VSW_MILESTONE3;
1877 		vsw_send_rdx(ldcp);
1878 		break;
1879 
1880 	case VSW_MILESTONE3:
1881 		/*
1882 		 * Pass this milestone when all paramaters have been
1883 		 * successfully exchanged and RDX sent in both directions.
1884 		 *
1885 		 * Mark outbound lane as available to transmit data.
1886 		 */
1887 		if ((ldcp->lane_out.lstate & VSW_RDX_ACK_SENT) &&
1888 		    (ldcp->lane_in.lstate & VSW_RDX_ACK_RECV)) {
1889 
1890 			D2(vswp, "%s: (chan %lld) leaving milestone 3",
1891 			    __func__, ldcp->ldc_id);
1892 			D2(vswp, "%s: ** handshake complete (0x%llx : "
1893 			    "0x%llx) **", __func__, ldcp->lane_in.lstate,
1894 			    ldcp->lane_out.lstate);
1895 			ldcp->lane_out.lstate |= VSW_LANE_ACTIVE;
1896 			ldcp->hphase = VSW_MILESTONE4;
1897 			ldcp->hcnt = 0;
1898 			DISPLAY_STATE();
1899 			/* Start HIO if enabled and capable */
1900 			if ((portp->p_hio_enabled) && (portp->p_hio_capable)) {
1901 				D2(vswp, "%s: start HybridIO setup", __func__);
1902 				vsw_hio_start(vswp, ldcp);
1903 			}
1904 		} else {
1905 			D2(vswp, "%s: still in milestone 3 (0x%llx : 0x%llx)",
1906 			    __func__, ldcp->lane_in.lstate,
1907 			    ldcp->lane_out.lstate);
1908 		}
1909 		break;
1910 
1911 	case VSW_MILESTONE4:
1912 		D2(vswp, "%s: (chan %lld) in milestone 4", __func__,
1913 		    ldcp->ldc_id);
1914 		break;
1915 
1916 	default:
1917 		DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__,
1918 		    ldcp->ldc_id, ldcp->hphase);
1919 	}
1920 
1921 	D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id,
1922 	    ldcp->hphase);
1923 }
1924 
1925 /*
1926  * Check if major version is supported.
1927  *
1928  * Returns 0 if finds supported major number, and if necessary
1929  * adjusts the minor field.
1930  *
1931  * Returns 1 if can't match major number exactly. Sets mjor/minor
1932  * to next lowest support values, or to zero if no other values possible.
1933  */
1934 static int
1935 vsw_supported_version(vio_ver_msg_t *vp)
1936 {
1937 	int	i;
1938 
1939 	D1(NULL, "vsw_supported_version: enter");
1940 
1941 	for (i = 0; i < VSW_NUM_VER; i++) {
1942 		if (vsw_versions[i].ver_major == vp->ver_major) {
1943 			/*
1944 			 * Matching or lower major version found. Update
1945 			 * minor number if necessary.
1946 			 */
1947 			if (vp->ver_minor > vsw_versions[i].ver_minor) {
1948 				D2(NULL, "%s: adjusting minor value from %d "
1949 				    "to %d", __func__, vp->ver_minor,
1950 				    vsw_versions[i].ver_minor);
1951 				vp->ver_minor = vsw_versions[i].ver_minor;
1952 			}
1953 
1954 			return (0);
1955 		}
1956 
1957 		/*
1958 		 * If the message contains a higher major version number, set
1959 		 * the message's major/minor versions to the current values
1960 		 * and return false, so this message will get resent with
1961 		 * these values.
1962 		 */
1963 		if (vsw_versions[i].ver_major < vp->ver_major) {
1964 			D2(NULL, "%s: adjusting major and minor "
1965 			    "values to %d, %d\n",
1966 			    __func__, vsw_versions[i].ver_major,
1967 			    vsw_versions[i].ver_minor);
1968 			vp->ver_major = vsw_versions[i].ver_major;
1969 			vp->ver_minor = vsw_versions[i].ver_minor;
1970 			return (1);
1971 		}
1972 	}
1973 
1974 	/* No match was possible, zero out fields */
1975 	vp->ver_major = 0;
1976 	vp->ver_minor = 0;
1977 
1978 	D1(NULL, "vsw_supported_version: exit");
1979 
1980 	return (1);
1981 }
1982 
1983 /*
1984  * Set vnet-protocol-version dependent functions based on version.
1985  */
1986 static void
1987 vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp)
1988 {
1989 	vsw_t	*vswp = ldcp->ldc_vswp;
1990 	lane_t	*lp = &ldcp->lane_out;
1991 
1992 	if (VSW_VER_GTEQ(ldcp, 1, 3)) {
1993 		/*
1994 		 * If the version negotiated with peer is >= 1.3,
1995 		 * set the mtu in our attributes to max_frame_size.
1996 		 */
1997 		lp->mtu = vswp->max_frame_size;
1998 	} else {
1999 		vsw_port_t	*portp = ldcp->ldc_port;
2000 		/*
2001 		 * Pre-1.3 peers expect max frame size of ETHERMAX.
2002 		 * We can negotiate that size with those peers provided the
2003 		 * following conditions are true:
2004 		 * - Our max_frame_size is greater only by VLAN_TAGSZ (4).
2005 		 * - Only pvid is defined for our peer and there are no vids.
2006 		 * If the above conditions are true, then we can send/recv only
2007 		 * untagged frames of max size ETHERMAX. Note that pvid of the
2008 		 * peer can be different, as vsw has to serve the vnet in that
2009 		 * vlan even if itself is not assigned to that vlan.
2010 		 */
2011 		if ((vswp->max_frame_size == ETHERMAX + VLAN_TAGSZ) &&
2012 		    portp->nvids == 0) {
2013 			lp->mtu = ETHERMAX;
2014 		}
2015 	}
2016 
2017 	if (VSW_VER_GTEQ(ldcp, 1, 2)) {
2018 		/* Versions >= 1.2 */
2019 
2020 		if (VSW_PRI_ETH_DEFINED(vswp)) {
2021 			/*
2022 			 * enable priority routines and pkt mode only if
2023 			 * at least one pri-eth-type is specified in MD.
2024 			 */
2025 			ldcp->tx = vsw_ldctx_pri;
2026 			ldcp->rx_pktdata = vsw_process_pkt_data;
2027 
2028 			/* set xfer mode for vsw_send_attr() */
2029 			lp->xfer_mode = VIO_PKT_MODE | VIO_DRING_MODE_V1_2;
2030 		} else {
2031 			/* no priority eth types defined in MD */
2032 
2033 			ldcp->tx = vsw_ldctx;
2034 			ldcp->rx_pktdata = vsw_process_pkt_data_nop;
2035 
2036 			/* set xfer mode for vsw_send_attr() */
2037 			lp->xfer_mode = VIO_DRING_MODE_V1_2;
2038 		}
2039 
2040 	} else {
2041 		/* Versions prior to 1.2  */
2042 
2043 		vsw_reset_vnet_proto_ops(ldcp);
2044 	}
2045 }
2046 
2047 /*
2048  * Reset vnet-protocol-version dependent functions to v1.0.
2049  */
2050 static void
2051 vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp)
2052 {
2053 	lane_t	*lp = &ldcp->lane_out;
2054 
2055 	ldcp->tx = vsw_ldctx;
2056 	ldcp->rx_pktdata = vsw_process_pkt_data_nop;
2057 
2058 	/* set xfer mode for vsw_send_attr() */
2059 	lp->xfer_mode = VIO_DRING_MODE_V1_0;
2060 }
2061 
2062 /*
2063  * Main routine for processing messages received over LDC.
2064  */
2065 static void
2066 vsw_process_pkt(void *arg)
2067 {
2068 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
2069 	vsw_t 		*vswp = ldcp->ldc_vswp;
2070 	size_t		msglen;
2071 	vio_msg_tag_t	*tagp;
2072 	uint64_t	*ldcmsg;
2073 	int 		rv = 0;
2074 
2075 
2076 	D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
2077 
2078 	ASSERT(MUTEX_HELD(&ldcp->ldc_cblock));
2079 
2080 	ldcmsg = ldcp->ldcmsg;
2081 	/*
2082 	 * If channel is up read messages until channel is empty.
2083 	 */
2084 	do {
2085 		msglen = ldcp->msglen;
2086 		rv = ldc_read(ldcp->ldc_handle, (caddr_t)ldcmsg, &msglen);
2087 
2088 		if (rv != 0) {
2089 			DERR(vswp, "%s :ldc_read err id(%lld) rv(%d) len(%d)\n",
2090 			    __func__, ldcp->ldc_id, rv, msglen);
2091 		}
2092 
2093 		/* channel has been reset */
2094 		if (rv == ECONNRESET) {
2095 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
2096 			break;
2097 		}
2098 
2099 		if (msglen == 0) {
2100 			D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__,
2101 			    ldcp->ldc_id);
2102 			break;
2103 		}
2104 
2105 		D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__,
2106 		    ldcp->ldc_id, msglen);
2107 
2108 		/*
2109 		 * Figure out what sort of packet we have gotten by
2110 		 * examining the msg tag, and then switch it appropriately.
2111 		 */
2112 		tagp = (vio_msg_tag_t *)ldcmsg;
2113 
2114 		switch (tagp->vio_msgtype) {
2115 		case VIO_TYPE_CTRL:
2116 			vsw_dispatch_ctrl_task(ldcp, ldcmsg, tagp);
2117 			break;
2118 		case VIO_TYPE_DATA:
2119 			vsw_process_data_pkt(ldcp, ldcmsg, tagp, msglen);
2120 			break;
2121 		case VIO_TYPE_ERR:
2122 			vsw_process_err_pkt(ldcp, ldcmsg, tagp);
2123 			break;
2124 		default:
2125 			DERR(vswp, "%s: Unknown tag(%lx) ", __func__,
2126 			    "id(%lx)\n", tagp->vio_msgtype, ldcp->ldc_id);
2127 			break;
2128 		}
2129 	} while (msglen);
2130 
2131 	D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id);
2132 }
2133 
2134 /*
2135  * Dispatch a task to process a VIO control message.
2136  */
2137 static void
2138 vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t *tagp)
2139 {
2140 	vsw_ctrl_task_t		*ctaskp = NULL;
2141 	vsw_port_t		*port = ldcp->ldc_port;
2142 	vsw_t			*vswp = port->p_vswp;
2143 
2144 	D1(vswp, "%s: enter", __func__);
2145 
2146 	/*
2147 	 * We need to handle RDX ACK messages in-band as once they
2148 	 * are exchanged it is possible that we will get an
2149 	 * immediate (legitimate) data packet.
2150 	 */
2151 	if ((tagp->vio_subtype_env == VIO_RDX) &&
2152 	    (tagp->vio_subtype == VIO_SUBTYPE_ACK)) {
2153 
2154 		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_ACK_RECV))
2155 			return;
2156 
2157 		ldcp->lane_in.lstate |= VSW_RDX_ACK_RECV;
2158 		D2(vswp, "%s (%ld) handling RDX_ACK in place "
2159 		    "(ostate 0x%llx : hphase %d)", __func__,
2160 		    ldcp->ldc_id, ldcp->lane_in.lstate, ldcp->hphase);
2161 		vsw_next_milestone(ldcp);
2162 		return;
2163 	}
2164 
2165 	ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP);
2166 
2167 	if (ctaskp == NULL) {
2168 		DERR(vswp, "%s: unable to alloc space for ctrl msg", __func__);
2169 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2170 		return;
2171 	}
2172 
2173 	ctaskp->ldcp = ldcp;
2174 	bcopy((def_msg_t *)cpkt, &ctaskp->pktp, sizeof (def_msg_t));
2175 	ctaskp->hss_id = ldcp->hss_id;
2176 
2177 	/*
2178 	 * Dispatch task to processing taskq if port is not in
2179 	 * the process of being detached.
2180 	 */
2181 	mutex_enter(&port->state_lock);
2182 	if (port->state == VSW_PORT_INIT) {
2183 		if ((vswp->taskq_p == NULL) ||
2184 		    (ddi_taskq_dispatch(vswp->taskq_p, vsw_process_ctrl_pkt,
2185 		    ctaskp, DDI_NOSLEEP) != DDI_SUCCESS)) {
2186 			DERR(vswp, "%s: unable to dispatch task to taskq",
2187 			    __func__);
2188 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2189 			mutex_exit(&port->state_lock);
2190 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2191 			return;
2192 		}
2193 	} else {
2194 		DWARN(vswp, "%s: port %d detaching, not dispatching "
2195 		    "task", __func__, port->p_instance);
2196 	}
2197 
2198 	mutex_exit(&port->state_lock);
2199 
2200 	D2(vswp, "%s: dispatched task to taskq for chan %d", __func__,
2201 	    ldcp->ldc_id);
2202 	D1(vswp, "%s: exit", __func__);
2203 }
2204 
2205 /*
2206  * Process a VIO ctrl message. Invoked from taskq.
2207  */
2208 static void
2209 vsw_process_ctrl_pkt(void *arg)
2210 {
2211 	vsw_ctrl_task_t	*ctaskp = (vsw_ctrl_task_t *)arg;
2212 	vsw_ldc_t	*ldcp = ctaskp->ldcp;
2213 	vsw_t 		*vswp = ldcp->ldc_vswp;
2214 	vio_msg_tag_t	tag;
2215 	uint16_t	env;
2216 
2217 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2218 
2219 	bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t));
2220 	env = tag.vio_subtype_env;
2221 
2222 	/* stale pkt check */
2223 	if (ctaskp->hss_id < ldcp->hss_id) {
2224 		DWARN(vswp, "%s: discarding stale packet belonging to earlier"
2225 		    " (%ld) handshake session", __func__, ctaskp->hss_id);
2226 		return;
2227 	}
2228 
2229 	/* session id check */
2230 	if (ldcp->session_status & VSW_PEER_SESSION) {
2231 		if (ldcp->peer_session != tag.vio_sid) {
2232 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
2233 			    __func__, ldcp->ldc_id, tag.vio_sid);
2234 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2235 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2236 			return;
2237 		}
2238 	}
2239 
2240 	/*
2241 	 * Switch on vio_subtype envelope, then let lower routines
2242 	 * decide if its an INFO, ACK or NACK packet.
2243 	 */
2244 	switch (env) {
2245 	case VIO_VER_INFO:
2246 		vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp);
2247 		break;
2248 	case VIO_DRING_REG:
2249 		vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp);
2250 		break;
2251 	case VIO_DRING_UNREG:
2252 		vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp);
2253 		break;
2254 	case VIO_ATTR_INFO:
2255 		vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp);
2256 		break;
2257 	case VNET_MCAST_INFO:
2258 		vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp);
2259 		break;
2260 	case VIO_RDX:
2261 		vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp);
2262 		break;
2263 	case VIO_DDS_INFO:
2264 		vsw_process_dds_msg(vswp, ldcp, &ctaskp->pktp);
2265 		break;
2266 	default:
2267 		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env);
2268 	}
2269 
2270 	kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2271 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
2272 }
2273 
2274 /*
2275  * Version negotiation. We can end up here either because our peer
2276  * has responded to a handshake message we have sent it, or our peer
2277  * has initiated a handshake with us. If its the former then can only
2278  * be ACK or NACK, if its the later can only be INFO.
2279  *
2280  * If its an ACK we move to the next stage of the handshake, namely
2281  * attribute exchange. If its a NACK we see if we can specify another
2282  * version, if we can't we stop.
2283  *
2284  * If it is an INFO we reset all params associated with communication
2285  * in that direction over this channel (remember connection is
2286  * essentially 2 independent simplex channels).
2287  */
2288 void
2289 vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt)
2290 {
2291 	vio_ver_msg_t	*ver_pkt;
2292 	vsw_t 		*vswp = ldcp->ldc_vswp;
2293 
2294 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2295 
2296 	/*
2297 	 * We know this is a ctrl/version packet so
2298 	 * cast it into the correct structure.
2299 	 */
2300 	ver_pkt = (vio_ver_msg_t *)pkt;
2301 
2302 	switch (ver_pkt->tag.vio_subtype) {
2303 	case VIO_SUBTYPE_INFO:
2304 		D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n");
2305 
2306 		/*
2307 		 * Record the session id, which we will use from now
2308 		 * until we see another VER_INFO msg. Even then the
2309 		 * session id in most cases will be unchanged, execpt
2310 		 * if channel was reset.
2311 		 */
2312 		if ((ldcp->session_status & VSW_PEER_SESSION) &&
2313 		    (ldcp->peer_session != ver_pkt->tag.vio_sid)) {
2314 			DERR(vswp, "%s: updating session id for chan %lld "
2315 			    "from %llx to %llx", __func__, ldcp->ldc_id,
2316 			    ldcp->peer_session, ver_pkt->tag.vio_sid);
2317 		}
2318 
2319 		ldcp->peer_session = ver_pkt->tag.vio_sid;
2320 		ldcp->session_status |= VSW_PEER_SESSION;
2321 
2322 		/* Legal message at this time ? */
2323 		if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV))
2324 			return;
2325 
2326 		/*
2327 		 * First check the device class. Currently only expect
2328 		 * to be talking to a network device. In the future may
2329 		 * also talk to another switch.
2330 		 */
2331 		if (ver_pkt->dev_class != VDEV_NETWORK) {
2332 			DERR(vswp, "%s: illegal device class %d", __func__,
2333 			    ver_pkt->dev_class);
2334 
2335 			ver_pkt->tag.vio_sid = ldcp->local_session;
2336 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2337 
2338 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2339 
2340 			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2341 			    sizeof (vio_ver_msg_t), B_TRUE);
2342 
2343 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
2344 			vsw_next_milestone(ldcp);
2345 			return;
2346 		} else {
2347 			ldcp->dev_class = ver_pkt->dev_class;
2348 		}
2349 
2350 		/*
2351 		 * Now check the version.
2352 		 */
2353 		if (vsw_supported_version(ver_pkt) == 0) {
2354 			/*
2355 			 * Support this major version and possibly
2356 			 * adjusted minor version.
2357 			 */
2358 
2359 			D2(vswp, "%s: accepted ver %d:%d", __func__,
2360 			    ver_pkt->ver_major, ver_pkt->ver_minor);
2361 
2362 			/* Store accepted values */
2363 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
2364 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
2365 
2366 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2367 
2368 			ldcp->lane_in.lstate |= VSW_VER_ACK_SENT;
2369 
2370 			if (vsw_obp_ver_proto_workaround == B_TRUE) {
2371 				/*
2372 				 * Send a version info message
2373 				 * using the accepted version that
2374 				 * we are about to ack. Also note that
2375 				 * we send our ver info before we ack.
2376 				 * Otherwise, as soon as receiving the
2377 				 * ack, obp sends attr info msg, which
2378 				 * breaks vsw_check_flag() invoked
2379 				 * from vsw_process_ctrl_attr_pkt();
2380 				 * as we also need VSW_VER_ACK_RECV to
2381 				 * be set in lane_out.lstate, before
2382 				 * we can receive attr info.
2383 				 */
2384 				vsw_send_ver(ldcp);
2385 			}
2386 		} else {
2387 			/*
2388 			 * NACK back with the next lower major/minor
2389 			 * pairing we support (if don't suuport any more
2390 			 * versions then they will be set to zero.
2391 			 */
2392 
2393 			D2(vswp, "%s: replying with ver %d:%d", __func__,
2394 			    ver_pkt->ver_major, ver_pkt->ver_minor);
2395 
2396 			/* Store updated values */
2397 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
2398 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
2399 
2400 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2401 
2402 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
2403 		}
2404 
2405 		DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2406 		ver_pkt->tag.vio_sid = ldcp->local_session;
2407 		(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2408 		    sizeof (vio_ver_msg_t), B_TRUE);
2409 
2410 		vsw_next_milestone(ldcp);
2411 		break;
2412 
2413 	case VIO_SUBTYPE_ACK:
2414 		D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__);
2415 
2416 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV))
2417 			return;
2418 
2419 		/* Store updated values */
2420 		ldcp->lane_out.ver_major = ver_pkt->ver_major;
2421 		ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
2422 
2423 		ldcp->lane_out.lstate |= VSW_VER_ACK_RECV;
2424 		vsw_next_milestone(ldcp);
2425 
2426 		break;
2427 
2428 	case VIO_SUBTYPE_NACK:
2429 		D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__);
2430 
2431 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV))
2432 			return;
2433 
2434 		/*
2435 		 * If our peer sent us a NACK with the ver fields set to
2436 		 * zero then there is nothing more we can do. Otherwise see
2437 		 * if we support either the version suggested, or a lesser
2438 		 * one.
2439 		 */
2440 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
2441 			DERR(vswp, "%s: peer unable to negotiate any "
2442 			    "further.", __func__);
2443 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
2444 			vsw_next_milestone(ldcp);
2445 			return;
2446 		}
2447 
2448 		/*
2449 		 * Check to see if we support this major version or
2450 		 * a lower one. If we don't then maj/min will be set
2451 		 * to zero.
2452 		 */
2453 		(void) vsw_supported_version(ver_pkt);
2454 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
2455 			/* Nothing more we can do */
2456 			DERR(vswp, "%s: version negotiation failed.\n",
2457 			    __func__);
2458 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
2459 			vsw_next_milestone(ldcp);
2460 		} else {
2461 			/* found a supported major version */
2462 			ldcp->lane_out.ver_major = ver_pkt->ver_major;
2463 			ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
2464 
2465 			D2(vswp, "%s: resending with updated values (%x, %x)",
2466 			    __func__, ver_pkt->ver_major, ver_pkt->ver_minor);
2467 
2468 			ldcp->lane_out.lstate |= VSW_VER_INFO_SENT;
2469 			ver_pkt->tag.vio_sid = ldcp->local_session;
2470 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
2471 
2472 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2473 
2474 			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2475 			    sizeof (vio_ver_msg_t), B_TRUE);
2476 
2477 			vsw_next_milestone(ldcp);
2478 
2479 		}
2480 		break;
2481 
2482 	default:
2483 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
2484 		    ver_pkt->tag.vio_subtype);
2485 	}
2486 
2487 	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
2488 }
2489 
2490 /*
2491  * Process an attribute packet. We can end up here either because our peer
2492  * has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or our
2493  * peer has sent us an attribute INFO message
2494  *
2495  * If its an ACK we then move to the next stage of the handshake which
2496  * is to send our descriptor ring info to our peer. If its a NACK then
2497  * there is nothing more we can (currently) do.
2498  *
2499  * If we get a valid/acceptable INFO packet (and we have already negotiated
2500  * a version) we ACK back and set channel state to ATTR_RECV, otherwise we
2501  * NACK back and reset channel state to INACTIV.
2502  *
2503  * FUTURE: in time we will probably negotiate over attributes, but for
2504  * the moment unacceptable attributes are regarded as a fatal error.
2505  *
2506  */
2507 void
2508 vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt)
2509 {
2510 	vnet_attr_msg_t		*attr_pkt;
2511 	vsw_t			*vswp = ldcp->ldc_vswp;
2512 	vsw_port_t		*port = ldcp->ldc_port;
2513 	uint64_t		macaddr = 0;
2514 	int			i;
2515 
2516 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
2517 
2518 	/*
2519 	 * We know this is a ctrl/attr packet so
2520 	 * cast it into the correct structure.
2521 	 */
2522 	attr_pkt = (vnet_attr_msg_t *)pkt;
2523 
2524 	switch (attr_pkt->tag.vio_subtype) {
2525 	case VIO_SUBTYPE_INFO:
2526 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2527 
2528 		if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV))
2529 			return;
2530 
2531 		/*
2532 		 * If the attributes are unacceptable then we NACK back.
2533 		 */
2534 		if (vsw_check_attr(attr_pkt, ldcp)) {
2535 
2536 			DERR(vswp, "%s (chan %d): invalid attributes",
2537 			    __func__, ldcp->ldc_id);
2538 
2539 			vsw_free_lane_resources(ldcp, INBOUND);
2540 
2541 			attr_pkt->tag.vio_sid = ldcp->local_session;
2542 			attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2543 
2544 			DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
2545 			ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT;
2546 			(void) vsw_send_msg(ldcp, (void *)attr_pkt,
2547 			    sizeof (vnet_attr_msg_t), B_TRUE);
2548 
2549 			vsw_next_milestone(ldcp);
2550 			return;
2551 		}
2552 
2553 		/*
2554 		 * Otherwise store attributes for this lane and update
2555 		 * lane state.
2556 		 */
2557 		ldcp->lane_in.mtu = attr_pkt->mtu;
2558 		ldcp->lane_in.addr = attr_pkt->addr;
2559 		ldcp->lane_in.addr_type = attr_pkt->addr_type;
2560 		ldcp->lane_in.xfer_mode = attr_pkt->xfer_mode;
2561 		ldcp->lane_in.ack_freq = attr_pkt->ack_freq;
2562 
2563 		macaddr = ldcp->lane_in.addr;
2564 		for (i = ETHERADDRL - 1; i >= 0; i--) {
2565 			port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF;
2566 			macaddr >>= 8;
2567 		}
2568 
2569 		/* create the fdb entry for this port/mac address */
2570 		vsw_fdbe_add(vswp, port);
2571 
2572 		/* add the port to the specified vlans */
2573 		vsw_vlan_add_ids(port, VSW_VNETPORT);
2574 
2575 		/* setup device specifc xmit routines */
2576 		mutex_enter(&port->tx_lock);
2577 		if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
2578 		    (ldcp->lane_in.xfer_mode & VIO_DRING_MODE_V1_2)) ||
2579 		    (VSW_VER_LT(ldcp, 1, 2) &&
2580 		    (ldcp->lane_in.xfer_mode == VIO_DRING_MODE_V1_0))) {
2581 			D2(vswp, "%s: mode = VIO_DRING_MODE", __func__);
2582 			port->transmit = vsw_dringsend;
2583 		} else if (ldcp->lane_in.xfer_mode == VIO_DESC_MODE) {
2584 			D2(vswp, "%s: mode = VIO_DESC_MODE", __func__);
2585 			vsw_create_privring(ldcp);
2586 			port->transmit = vsw_descrsend;
2587 			ldcp->lane_out.xfer_mode = VIO_DESC_MODE;
2588 		}
2589 
2590 		/*
2591 		 * HybridIO is supported only vnet, not by OBP.
2592 		 * So, set hio_capable to true only when in DRING mode.
2593 		 */
2594 		if (VSW_VER_GTEQ(ldcp, 1, 3) &&
2595 		    (ldcp->lane_in.xfer_mode != VIO_DESC_MODE)) {
2596 			(void) atomic_swap_32(&port->p_hio_capable, B_TRUE);
2597 		} else {
2598 			(void) atomic_swap_32(&port->p_hio_capable, B_FALSE);
2599 		}
2600 
2601 		mutex_exit(&port->tx_lock);
2602 
2603 		attr_pkt->tag.vio_sid = ldcp->local_session;
2604 		attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2605 
2606 		DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
2607 
2608 		ldcp->lane_in.lstate |= VSW_ATTR_ACK_SENT;
2609 
2610 		(void) vsw_send_msg(ldcp, (void *)attr_pkt,
2611 		    sizeof (vnet_attr_msg_t), B_TRUE);
2612 
2613 		vsw_next_milestone(ldcp);
2614 		break;
2615 
2616 	case VIO_SUBTYPE_ACK:
2617 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
2618 
2619 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV))
2620 			return;
2621 
2622 		ldcp->lane_out.lstate |= VSW_ATTR_ACK_RECV;
2623 		vsw_next_milestone(ldcp);
2624 		break;
2625 
2626 	case VIO_SUBTYPE_NACK:
2627 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
2628 
2629 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV))
2630 			return;
2631 
2632 		ldcp->lane_out.lstate |= VSW_ATTR_NACK_RECV;
2633 		vsw_next_milestone(ldcp);
2634 		break;
2635 
2636 	default:
2637 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
2638 		    attr_pkt->tag.vio_subtype);
2639 	}
2640 
2641 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
2642 }
2643 
2644 /*
2645  * Process a dring info packet. We can end up here either because our peer
2646  * has ACK/NACK'ed back to an earlier DRING msg we had sent it, or our
2647  * peer has sent us a dring INFO message.
2648  *
2649  * If we get a valid/acceptable INFO packet (and we have already negotiated
2650  * a version) we ACK back and update the lane state, otherwise we NACK back.
2651  *
2652  * FUTURE: nothing to stop client from sending us info on multiple dring's
2653  * but for the moment we will just use the first one we are given.
2654  *
2655  */
2656 void
2657 vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt)
2658 {
2659 	vio_dring_reg_msg_t	*dring_pkt;
2660 	vsw_t			*vswp = ldcp->ldc_vswp;
2661 	ldc_mem_info_t		minfo;
2662 	dring_info_t		*dp, *dbp;
2663 	int			dring_found = 0;
2664 
2665 	/*
2666 	 * We know this is a ctrl/dring packet so
2667 	 * cast it into the correct structure.
2668 	 */
2669 	dring_pkt = (vio_dring_reg_msg_t *)pkt;
2670 
2671 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
2672 
2673 	switch (dring_pkt->tag.vio_subtype) {
2674 	case VIO_SUBTYPE_INFO:
2675 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2676 
2677 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
2678 			return;
2679 
2680 		/*
2681 		 * If the dring params are unacceptable then we NACK back.
2682 		 */
2683 		if (vsw_check_dring_info(dring_pkt)) {
2684 
2685 			DERR(vswp, "%s (%lld): invalid dring info",
2686 			    __func__, ldcp->ldc_id);
2687 
2688 			vsw_free_lane_resources(ldcp, INBOUND);
2689 
2690 			dring_pkt->tag.vio_sid = ldcp->local_session;
2691 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2692 
2693 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
2694 
2695 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
2696 
2697 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
2698 			    sizeof (vio_dring_reg_msg_t), B_TRUE);
2699 
2700 			vsw_next_milestone(ldcp);
2701 			return;
2702 		}
2703 
2704 		/*
2705 		 * Otherwise, attempt to map in the dring using the
2706 		 * cookie. If that succeeds we send back a unique dring
2707 		 * identifier that the sending side will use in future
2708 		 * to refer to this descriptor ring.
2709 		 */
2710 		dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
2711 
2712 		dp->num_descriptors = dring_pkt->num_descriptors;
2713 		dp->descriptor_size = dring_pkt->descriptor_size;
2714 		dp->options = dring_pkt->options;
2715 		dp->ncookies = dring_pkt->ncookies;
2716 
2717 		/*
2718 		 * Note: should only get one cookie. Enforced in
2719 		 * the ldc layer.
2720 		 */
2721 		bcopy(&dring_pkt->cookie[0], &dp->cookie[0],
2722 		    sizeof (ldc_mem_cookie_t));
2723 
2724 		D2(vswp, "%s: num_desc %ld : desc_size %ld", __func__,
2725 		    dp->num_descriptors, dp->descriptor_size);
2726 		D2(vswp, "%s: options 0x%lx: ncookies %ld", __func__,
2727 		    dp->options, dp->ncookies);
2728 
2729 		if ((ldc_mem_dring_map(ldcp->ldc_handle, &dp->cookie[0],
2730 		    dp->ncookies, dp->num_descriptors, dp->descriptor_size,
2731 		    LDC_SHADOW_MAP, &(dp->handle))) != 0) {
2732 
2733 			DERR(vswp, "%s: dring_map failed\n", __func__);
2734 
2735 			kmem_free(dp, sizeof (dring_info_t));
2736 			vsw_free_lane_resources(ldcp, INBOUND);
2737 
2738 			dring_pkt->tag.vio_sid = ldcp->local_session;
2739 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2740 
2741 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
2742 
2743 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
2744 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
2745 			    sizeof (vio_dring_reg_msg_t), B_TRUE);
2746 
2747 			vsw_next_milestone(ldcp);
2748 			return;
2749 		}
2750 
2751 		if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
2752 
2753 			DERR(vswp, "%s: dring_addr failed\n", __func__);
2754 
2755 			kmem_free(dp, sizeof (dring_info_t));
2756 			vsw_free_lane_resources(ldcp, INBOUND);
2757 
2758 			dring_pkt->tag.vio_sid = ldcp->local_session;
2759 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2760 
2761 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
2762 
2763 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
2764 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
2765 			    sizeof (vio_dring_reg_msg_t), B_TRUE);
2766 
2767 			vsw_next_milestone(ldcp);
2768 			return;
2769 		} else {
2770 			/* store the address of the pub part of ring */
2771 			dp->pub_addr = minfo.vaddr;
2772 		}
2773 
2774 		/* no private section as we are importing */
2775 		dp->priv_addr = NULL;
2776 
2777 		/*
2778 		 * Using simple mono increasing int for ident at
2779 		 * the moment.
2780 		 */
2781 		dp->ident = ldcp->next_ident;
2782 		ldcp->next_ident++;
2783 
2784 		dp->end_idx = 0;
2785 		dp->next = NULL;
2786 
2787 		/*
2788 		 * Link it onto the end of the list of drings
2789 		 * for this lane.
2790 		 */
2791 		if (ldcp->lane_in.dringp == NULL) {
2792 			D2(vswp, "%s: adding first INBOUND dring", __func__);
2793 			ldcp->lane_in.dringp = dp;
2794 		} else {
2795 			dbp = ldcp->lane_in.dringp;
2796 
2797 			while (dbp->next != NULL)
2798 				dbp = dbp->next;
2799 
2800 			dbp->next = dp;
2801 		}
2802 
2803 		/* acknowledge it */
2804 		dring_pkt->tag.vio_sid = ldcp->local_session;
2805 		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2806 		dring_pkt->dring_ident = dp->ident;
2807 
2808 		(void) vsw_send_msg(ldcp, (void *)dring_pkt,
2809 		    sizeof (vio_dring_reg_msg_t), B_TRUE);
2810 
2811 		ldcp->lane_in.lstate |= VSW_DRING_ACK_SENT;
2812 		vsw_next_milestone(ldcp);
2813 		break;
2814 
2815 	case VIO_SUBTYPE_ACK:
2816 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
2817 
2818 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV))
2819 			return;
2820 
2821 		/*
2822 		 * Peer is acknowledging our dring info and will have
2823 		 * sent us a dring identifier which we will use to
2824 		 * refer to this ring w.r.t. our peer.
2825 		 */
2826 		dp = ldcp->lane_out.dringp;
2827 		if (dp != NULL) {
2828 			/*
2829 			 * Find the ring this ident should be associated
2830 			 * with.
2831 			 */
2832 			if (vsw_dring_match(dp, dring_pkt)) {
2833 				dring_found = 1;
2834 
2835 			} else while (dp != NULL) {
2836 				if (vsw_dring_match(dp, dring_pkt)) {
2837 					dring_found = 1;
2838 					break;
2839 				}
2840 				dp = dp->next;
2841 			}
2842 
2843 			if (dring_found == 0) {
2844 				DERR(NULL, "%s: unrecognised ring cookie",
2845 				    __func__);
2846 				vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2847 				return;
2848 			}
2849 
2850 		} else {
2851 			DERR(vswp, "%s: DRING ACK received but no drings "
2852 			    "allocated", __func__);
2853 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2854 			return;
2855 		}
2856 
2857 		/* store ident */
2858 		dp->ident = dring_pkt->dring_ident;
2859 		ldcp->lane_out.lstate |= VSW_DRING_ACK_RECV;
2860 		vsw_next_milestone(ldcp);
2861 		break;
2862 
2863 	case VIO_SUBTYPE_NACK:
2864 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
2865 
2866 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV))
2867 			return;
2868 
2869 		ldcp->lane_out.lstate |= VSW_DRING_NACK_RECV;
2870 		vsw_next_milestone(ldcp);
2871 		break;
2872 
2873 	default:
2874 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
2875 		    dring_pkt->tag.vio_subtype);
2876 	}
2877 
2878 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
2879 }
2880 
2881 /*
2882  * Process a request from peer to unregister a dring.
2883  *
2884  * For the moment we just restart the handshake if our
2885  * peer endpoint attempts to unregister a dring.
2886  */
2887 void
2888 vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt)
2889 {
2890 	vsw_t			*vswp = ldcp->ldc_vswp;
2891 	vio_dring_unreg_msg_t	*dring_pkt;
2892 
2893 	/*
2894 	 * We know this is a ctrl/dring packet so
2895 	 * cast it into the correct structure.
2896 	 */
2897 	dring_pkt = (vio_dring_unreg_msg_t *)pkt;
2898 
2899 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2900 
2901 	switch (dring_pkt->tag.vio_subtype) {
2902 	case VIO_SUBTYPE_INFO:
2903 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2904 
2905 		DWARN(vswp, "%s: restarting handshake..", __func__);
2906 		break;
2907 
2908 	case VIO_SUBTYPE_ACK:
2909 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
2910 
2911 		DWARN(vswp, "%s: restarting handshake..", __func__);
2912 		break;
2913 
2914 	case VIO_SUBTYPE_NACK:
2915 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
2916 
2917 		DWARN(vswp, "%s: restarting handshake..", __func__);
2918 		break;
2919 
2920 	default:
2921 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
2922 		    dring_pkt->tag.vio_subtype);
2923 	}
2924 
2925 	vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2926 
2927 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
2928 }
2929 
/*
 * Turn the multicast message 'pkt' into a NACK carrying our session id
 * and send it back over 'ldcp'.
 *
 * Wrapped in do { } while (0), with its arguments parenthesised, so that
 * the macro expands to exactly one statement and stays correct when used
 * as the body of an unbraced if/else. Invoke it with a trailing ';'.
 */
#define	SND_MCST_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		(void) vsw_send_msg((ldcp), (void *)(pkt), \
		    sizeof (vnet_mcast_msg_t), B_TRUE); \
	} while (0)
2935 
2936 /*
2937  * Process a multicast request from a vnet.
2938  *
2939  * Vnet's specify a multicast address that they are interested in. This
2940  * address is used as a key into the hash table which forms the multicast
2941  * forwarding database (mFDB).
2942  *
2943  * The table keys are the multicast addresses, while the table entries
2944  * are pointers to lists of ports which wish to receive packets for the
2945  * specified multicast address.
2946  *
2947  * When a multicast packet is being switched we use the address as a key
2948  * into the hash table, and then walk the appropriate port list forwarding
2949  * the pkt to each port in turn.
2950  *
2951  * If a vnet is no longer interested in a particular multicast grouping
2952  * we simply find the correct location in the hash table and then delete
2953  * the relevant port from the port list.
2954  *
2955  * To deal with the case whereby a port is being deleted without first
2956  * removing itself from the lists in the hash table, we maintain a list
2957  * of multicast addresses the port has registered an interest in, within
2958  * the port structure itself. We then simply walk that list of addresses
2959  * using them as keys into the hash table and remove the port from the
2960  * appropriate lists.
2961  */
2962 static void
2963 vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt)
2964 {
2965 	vnet_mcast_msg_t	*mcst_pkt;
2966 	vsw_port_t		*port = ldcp->ldc_port;
2967 	vsw_t			*vswp = ldcp->ldc_vswp;
2968 	int			i;
2969 
2970 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2971 
2972 	/*
2973 	 * We know this is a ctrl/mcast packet so
2974 	 * cast it into the correct structure.
2975 	 */
2976 	mcst_pkt = (vnet_mcast_msg_t *)pkt;
2977 
2978 	switch (mcst_pkt->tag.vio_subtype) {
2979 	case VIO_SUBTYPE_INFO:
2980 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2981 
2982 		/*
2983 		 * Check if in correct state to receive a multicast
2984 		 * message (i.e. handshake complete). If not reset
2985 		 * the handshake.
2986 		 */
2987 		if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV))
2988 			return;
2989 
2990 		/*
2991 		 * Before attempting to add or remove address check
2992 		 * that they are valid multicast addresses.
2993 		 * If not, then NACK back.
2994 		 */
2995 		for (i = 0; i < mcst_pkt->count; i++) {
2996 			if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) {
2997 				DERR(vswp, "%s: invalid multicast address",
2998 				    __func__);
2999 				SND_MCST_NACK(ldcp, mcst_pkt);
3000 				return;
3001 			}
3002 		}
3003 
3004 		/*
3005 		 * Now add/remove the addresses. If this fails we
3006 		 * NACK back.
3007 		 */
3008 		if (vsw_add_rem_mcst(mcst_pkt, port) != 0) {
3009 			SND_MCST_NACK(ldcp, mcst_pkt);
3010 			return;
3011 		}
3012 
3013 		mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3014 		mcst_pkt->tag.vio_sid = ldcp->local_session;
3015 
3016 		DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt);
3017 
3018 		(void) vsw_send_msg(ldcp, (void *)mcst_pkt,
3019 		    sizeof (vnet_mcast_msg_t), B_TRUE);
3020 		break;
3021 
3022 	case VIO_SUBTYPE_ACK:
3023 		DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3024 
3025 		/*
3026 		 * We shouldn't ever get a multicast ACK message as
3027 		 * at the moment we never request multicast addresses
3028 		 * to be set on some other device. This may change in
3029 		 * the future if we have cascading switches.
3030 		 */
3031 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV))
3032 			return;
3033 
3034 				/* Do nothing */
3035 		break;
3036 
3037 	case VIO_SUBTYPE_NACK:
3038 		DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3039 
3040 		/*
3041 		 * We shouldn't get a multicast NACK packet for the
3042 		 * same reasons as we shouldn't get a ACK packet.
3043 		 */
3044 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV))
3045 			return;
3046 
3047 				/* Do nothing */
3048 		break;
3049 
3050 	default:
3051 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
3052 		    mcst_pkt->tag.vio_subtype);
3053 	}
3054 
3055 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3056 }
3057 
3058 static void
3059 vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt)
3060 {
3061 	vio_rdx_msg_t	*rdx_pkt;
3062 	vsw_t		*vswp = ldcp->ldc_vswp;
3063 
3064 	/*
3065 	 * We know this is a ctrl/rdx packet so
3066 	 * cast it into the correct structure.
3067 	 */
3068 	rdx_pkt = (vio_rdx_msg_t *)pkt;
3069 
3070 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
3071 
3072 	switch (rdx_pkt->tag.vio_subtype) {
3073 	case VIO_SUBTYPE_INFO:
3074 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3075 
3076 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_INFO_RECV))
3077 			return;
3078 
3079 		rdx_pkt->tag.vio_sid = ldcp->local_session;
3080 		rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3081 
3082 		DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt);
3083 
3084 		ldcp->lane_out.lstate |= VSW_RDX_ACK_SENT;
3085 
3086 		(void) vsw_send_msg(ldcp, (void *)rdx_pkt,
3087 		    sizeof (vio_rdx_msg_t), B_TRUE);
3088 
3089 		vsw_next_milestone(ldcp);
3090 		break;
3091 
3092 	case VIO_SUBTYPE_ACK:
3093 		/*
3094 		 * Should be handled in-band by callback handler.
3095 		 */
3096 		DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__);
3097 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3098 		break;
3099 
3100 	case VIO_SUBTYPE_NACK:
3101 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3102 
3103 		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_NACK_RECV))
3104 			return;
3105 
3106 		ldcp->lane_in.lstate |= VSW_RDX_NACK_RECV;
3107 		vsw_next_milestone(ldcp);
3108 		break;
3109 
3110 	default:
3111 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3112 		    rdx_pkt->tag.vio_subtype);
3113 	}
3114 
3115 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3116 }
3117 
3118 static void
3119 vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t *tagp,
3120 	uint32_t msglen)
3121 {
3122 	uint16_t	env = tagp->vio_subtype_env;
3123 	vsw_t		*vswp = ldcp->ldc_vswp;
3124 
3125 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3126 
3127 	/* session id check */
3128 	if (ldcp->session_status & VSW_PEER_SESSION) {
3129 		if (ldcp->peer_session != tagp->vio_sid) {
3130 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
3131 			    __func__, ldcp->ldc_id, tagp->vio_sid);
3132 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3133 			return;
3134 		}
3135 	}
3136 
3137 	/*
3138 	 * It is an error for us to be getting data packets
3139 	 * before the handshake has completed.
3140 	 */
3141 	if (ldcp->hphase != VSW_MILESTONE4) {
3142 		DERR(vswp, "%s: got data packet before handshake complete "
3143 		    "hphase %d (%x: %x)", __func__, ldcp->hphase,
3144 		    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
3145 		DUMP_FLAGS(ldcp->lane_in.lstate);
3146 		DUMP_FLAGS(ldcp->lane_out.lstate);
3147 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3148 		return;
3149 	}
3150 
3151 	/*
3152 	 * To reduce the locking contention, release the
3153 	 * ldc_cblock here and re-acquire it once we are done
3154 	 * receiving packets.
3155 	 */
3156 	mutex_exit(&ldcp->ldc_cblock);
3157 	mutex_enter(&ldcp->ldc_rxlock);
3158 
3159 	/*
3160 	 * Switch on vio_subtype envelope, then let lower routines
3161 	 * decide if its an INFO, ACK or NACK packet.
3162 	 */
3163 	if (env == VIO_DRING_DATA) {
3164 		vsw_process_data_dring_pkt(ldcp, dpkt);
3165 	} else if (env == VIO_PKT_DATA) {
3166 		ldcp->rx_pktdata(ldcp, dpkt, msglen);
3167 	} else if (env == VIO_DESC_DATA) {
3168 		vsw_process_data_ibnd_pkt(ldcp, dpkt);
3169 	} else {
3170 		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env);
3171 	}
3172 
3173 	mutex_exit(&ldcp->ldc_rxlock);
3174 	mutex_enter(&ldcp->ldc_cblock);
3175 
3176 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3177 }
3178 
/*
 * Turn the dring message 'pkt' into a NACK carrying our local session id
 * and send it back over 'ldcp'.
 *
 * The body is wrapped in do { } while (0) and carries no trailing
 * semicolon so that "SND_DRING_NACK(l, p);" expands to exactly one
 * statement and is safe under an unbraced if/else. Arguments are
 * parenthesized so that arbitrary expressions may be passed.
 */
#define	SND_DRING_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		(void) vsw_send_msg((ldcp), (void *)(pkt), \
		    sizeof (vio_dring_msg_t), B_TRUE); \
	} while (0)
3184 
3185 static void
3186 vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt)
3187 {
3188 	vio_dring_msg_t		*dring_pkt;
3189 	vnet_public_desc_t	*pub_addr = NULL;
3190 	vsw_private_desc_t	*priv_addr = NULL;
3191 	dring_info_t		*dp = NULL;
3192 	vsw_t			*vswp = ldcp->ldc_vswp;
3193 	mblk_t			*mp = NULL;
3194 	mblk_t			*bp = NULL;
3195 	mblk_t			*bpt = NULL;
3196 	size_t			nbytes = 0;
3197 	uint64_t		ncookies = 0;
3198 	uint64_t		chain = 0;
3199 	uint64_t		len;
3200 	uint32_t		pos, start, datalen;
3201 	uint32_t		range_start, range_end;
3202 	int32_t			end, num, cnt = 0;
3203 	int			i, rv, msg_rv = 0;
3204 	boolean_t		ack_needed = B_FALSE;
3205 	boolean_t		prev_desc_ack = B_FALSE;
3206 	int			read_attempts = 0;
3207 	struct ether_header	*ehp;
3208 
3209 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3210 
3211 	/*
3212 	 * We know this is a data/dring packet so
3213 	 * cast it into the correct structure.
3214 	 */
3215 	dring_pkt = (vio_dring_msg_t *)dpkt;
3216 
3217 	/*
3218 	 * Switch on the vio_subtype. If its INFO then we need to
3219 	 * process the data. If its an ACK we need to make sure
3220 	 * it makes sense (i.e did we send an earlier data/info),
3221 	 * and if its a NACK then we maybe attempt a retry.
3222 	 */
3223 	switch (dring_pkt->tag.vio_subtype) {
3224 	case VIO_SUBTYPE_INFO:
3225 		D2(vswp, "%s(%lld): VIO_SUBTYPE_INFO", __func__, ldcp->ldc_id);
3226 
3227 		READ_ENTER(&ldcp->lane_in.dlistrw);
3228 		if ((dp = vsw_ident2dring(&ldcp->lane_in,
3229 		    dring_pkt->dring_ident)) == NULL) {
3230 			RW_EXIT(&ldcp->lane_in.dlistrw);
3231 
3232 			DERR(vswp, "%s(%lld): unable to find dring from "
3233 			    "ident 0x%llx", __func__, ldcp->ldc_id,
3234 			    dring_pkt->dring_ident);
3235 
3236 			SND_DRING_NACK(ldcp, dring_pkt);
3237 			return;
3238 		}
3239 
3240 		start = pos = dring_pkt->start_idx;
3241 		end = dring_pkt->end_idx;
3242 		len = dp->num_descriptors;
3243 
3244 		range_start = range_end = pos;
3245 
3246 		D2(vswp, "%s(%lld): start index %ld : end %ld\n",
3247 		    __func__, ldcp->ldc_id, start, end);
3248 
3249 		if (end == -1) {
3250 			num = -1;
3251 		} else if (end >= 0) {
3252 			num = end >= pos ? end - pos + 1: (len - pos + 1) + end;
3253 
3254 			/* basic sanity check */
3255 			if (end > len) {
3256 				RW_EXIT(&ldcp->lane_in.dlistrw);
3257 				DERR(vswp, "%s(%lld): endpoint %lld outside "
3258 				    "ring length %lld", __func__,
3259 				    ldcp->ldc_id, end, len);
3260 
3261 				SND_DRING_NACK(ldcp, dring_pkt);
3262 				return;
3263 			}
3264 		} else {
3265 			RW_EXIT(&ldcp->lane_in.dlistrw);
3266 			DERR(vswp, "%s(%lld): invalid endpoint %lld",
3267 			    __func__, ldcp->ldc_id, end);
3268 			SND_DRING_NACK(ldcp, dring_pkt);
3269 			return;
3270 		}
3271 
3272 		while (cnt != num) {
3273 vsw_recheck_desc:
3274 			if ((rv = ldc_mem_dring_acquire(dp->handle,
3275 			    pos, pos)) != 0) {
3276 				RW_EXIT(&ldcp->lane_in.dlistrw);
3277 				DERR(vswp, "%s(%lld): unable to acquire "
3278 				    "descriptor at pos %d: err %d",
3279 				    __func__, pos, ldcp->ldc_id, rv);
3280 				SND_DRING_NACK(ldcp, dring_pkt);
3281 				ldcp->ldc_stats.ierrors++;
3282 				return;
3283 			}
3284 
3285 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + pos;
3286 
3287 			/*
3288 			 * When given a bounded range of descriptors
3289 			 * to process, its an error to hit a descriptor
3290 			 * which is not ready. In the non-bounded case
3291 			 * (end_idx == -1) this simply indicates we have
3292 			 * reached the end of the current active range.
3293 			 */
3294 			if (pub_addr->hdr.dstate != VIO_DESC_READY) {
3295 				/* unbound - no error */
3296 				if (end == -1) {
3297 					if (read_attempts == vsw_read_attempts)
3298 						break;
3299 
3300 					delay(drv_usectohz(vsw_desc_delay));
3301 					read_attempts++;
3302 					goto vsw_recheck_desc;
3303 				}
3304 
3305 				/* bounded - error - so NACK back */
3306 				RW_EXIT(&ldcp->lane_in.dlistrw);
3307 				DERR(vswp, "%s(%lld): descriptor not READY "
3308 				    "(%d)", __func__, ldcp->ldc_id,
3309 				    pub_addr->hdr.dstate);
3310 				SND_DRING_NACK(ldcp, dring_pkt);
3311 				return;
3312 			}
3313 
3314 			DTRACE_PROBE1(read_attempts, int, read_attempts);
3315 
3316 			range_end = pos;
3317 
3318 			/*
3319 			 * If we ACK'd the previous descriptor then now
3320 			 * record the new range start position for later
3321 			 * ACK's.
3322 			 */
3323 			if (prev_desc_ack) {
3324 				range_start = pos;
3325 
3326 				D2(vswp, "%s(%lld): updating range start to be "
3327 				    "%d", __func__, ldcp->ldc_id, range_start);
3328 
3329 				prev_desc_ack = B_FALSE;
3330 			}
3331 
3332 			/*
3333 			 * Data is padded to align on 8 byte boundary,
3334 			 * datalen is actual data length, i.e. minus that
3335 			 * padding.
3336 			 */
3337 			datalen = pub_addr->nbytes;
3338 
3339 			/*
3340 			 * Does peer wish us to ACK when we have finished
3341 			 * with this descriptor ?
3342 			 */
3343 			if (pub_addr->hdr.ack)
3344 				ack_needed = B_TRUE;
3345 
3346 			D2(vswp, "%s(%lld): processing desc %lld at pos"
3347 			    " 0x%llx : dstate 0x%lx : datalen 0x%lx",
3348 			    __func__, ldcp->ldc_id, pos, pub_addr,
3349 			    pub_addr->hdr.dstate, datalen);
3350 
3351 			/*
3352 			 * Mark that we are starting to process descriptor.
3353 			 */
3354 			pub_addr->hdr.dstate = VIO_DESC_ACCEPTED;
3355 
3356 			/*
3357 			 * Ensure that we ask ldc for an aligned
3358 			 * number of bytes.
3359 			 */
3360 			nbytes = (datalen + VNET_IPALIGN + 7) & ~7;
3361 
3362 			mp = vio_multipool_allocb(&ldcp->vmp, nbytes);
3363 			if (mp == NULL) {
3364 				ldcp->ldc_stats.rx_vio_allocb_fail++;
3365 				/*
3366 				 * No free receive buffers available, so
3367 				 * fallback onto allocb(9F). Make sure that
3368 				 * we get a data buffer which is a multiple
3369 				 * of 8 as this is required by ldc_mem_copy.
3370 				 */
3371 				DTRACE_PROBE(allocb);
3372 				if ((mp = allocb(datalen + VNET_IPALIGN + 8,
3373 				    BPRI_MED)) == NULL) {
3374 					DERR(vswp, "%s(%ld): allocb failed",
3375 					    __func__, ldcp->ldc_id);
3376 					pub_addr->hdr.dstate = VIO_DESC_DONE;
3377 					(void) ldc_mem_dring_release(dp->handle,
3378 					    pos, pos);
3379 					ldcp->ldc_stats.ierrors++;
3380 					ldcp->ldc_stats.rx_allocb_fail++;
3381 					break;
3382 				}
3383 			}
3384 
3385 			ncookies = pub_addr->ncookies;
3386 			rv = ldc_mem_copy(ldcp->ldc_handle,
3387 			    (caddr_t)mp->b_rptr, 0, &nbytes,
3388 			    pub_addr->memcookie, ncookies, LDC_COPY_IN);
3389 
3390 			if (rv != 0) {
3391 				DERR(vswp, "%s(%d): unable to copy in data "
3392 				    "from %d cookies in desc %d (rv %d)",
3393 				    __func__, ldcp->ldc_id, ncookies, pos, rv);
3394 				freemsg(mp);
3395 
3396 				pub_addr->hdr.dstate = VIO_DESC_DONE;
3397 				(void) ldc_mem_dring_release(dp->handle,
3398 				    pos, pos);
3399 				ldcp->ldc_stats.ierrors++;
3400 				break;
3401 			} else {
3402 				D2(vswp, "%s(%d): copied in %ld bytes"
3403 				    " using %d cookies", __func__,
3404 				    ldcp->ldc_id, nbytes, ncookies);
3405 			}
3406 
3407 			/* adjust the read pointer to skip over the padding */
3408 			mp->b_rptr += VNET_IPALIGN;
3409 
3410 			/* point to the actual end of data */
3411 			mp->b_wptr = mp->b_rptr + datalen;
3412 
3413 			/* update statistics */
3414 			ehp = (struct ether_header *)mp->b_rptr;
3415 			if (IS_BROADCAST(ehp))
3416 				ldcp->ldc_stats.brdcstrcv++;
3417 			else if (IS_MULTICAST(ehp))
3418 				ldcp->ldc_stats.multircv++;
3419 
3420 			ldcp->ldc_stats.ipackets++;
3421 			ldcp->ldc_stats.rbytes += datalen;
3422 
3423 			/*
3424 			 * IPALIGN space can be used for VLAN_TAG
3425 			 */
3426 			(void) vsw_vlan_frame_pretag(ldcp->ldc_port,
3427 			    VSW_VNETPORT, mp);
3428 
3429 			/* build a chain of received packets */
3430 			if (bp == NULL) {
3431 				/* first pkt */
3432 				bp = mp;
3433 				bp->b_next = bp->b_prev = NULL;
3434 				bpt = bp;
3435 				chain = 1;
3436 			} else {
3437 				mp->b_next = mp->b_prev = NULL;
3438 				bpt->b_next = mp;
3439 				bpt = mp;
3440 				chain++;
3441 			}
3442 
3443 			/* mark we are finished with this descriptor */
3444 			pub_addr->hdr.dstate = VIO_DESC_DONE;
3445 
3446 			(void) ldc_mem_dring_release(dp->handle, pos, pos);
3447 
3448 			/*
3449 			 * Send an ACK back to peer if requested.
3450 			 */
3451 			if (ack_needed) {
3452 				ack_needed = B_FALSE;
3453 
3454 				dring_pkt->start_idx = range_start;
3455 				dring_pkt->end_idx = range_end;
3456 
3457 				DERR(vswp, "%s(%lld): processed %d %d, ACK"
3458 				    " requested", __func__, ldcp->ldc_id,
3459 				    dring_pkt->start_idx, dring_pkt->end_idx);
3460 
3461 				dring_pkt->dring_process_state = VIO_DP_ACTIVE;
3462 				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3463 				dring_pkt->tag.vio_sid = ldcp->local_session;
3464 
3465 				msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
3466 				    sizeof (vio_dring_msg_t), B_FALSE);
3467 
3468 				/*
3469 				 * Check if ACK was successfully sent. If not
3470 				 * we break and deal with that below.
3471 				 */
3472 				if (msg_rv != 0)
3473 					break;
3474 
3475 				prev_desc_ack = B_TRUE;
3476 				range_start = pos;
3477 			}
3478 
3479 			/* next descriptor */
3480 			pos = (pos + 1) % len;
3481 			cnt++;
3482 
3483 			/*
3484 			 * Break out of loop here and stop processing to
3485 			 * allow some other network device (or disk) to
3486 			 * get access to the cpu.
3487 			 */
3488 			if (chain > vsw_chain_len) {
3489 				D3(vswp, "%s(%lld): switching chain of %d "
3490 				    "msgs", __func__, ldcp->ldc_id, chain);
3491 				break;
3492 			}
3493 		}
3494 		RW_EXIT(&ldcp->lane_in.dlistrw);
3495 
3496 		/*
3497 		 * If when we attempted to send the ACK we found that the
3498 		 * channel had been reset then now handle this. We deal with
3499 		 * it here as we cannot reset the channel while holding the
3500 		 * dlistrw lock, and we don't want to acquire/release it
3501 		 * continuously in the above loop, as a channel reset should
3502 		 * be a rare event.
3503 		 */
3504 		if (msg_rv == ECONNRESET) {
3505 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
3506 			break;
3507 		}
3508 
3509 		/* send the chain of packets to be switched */
3510 		if (bp != NULL) {
3511 			DTRACE_PROBE1(vsw_rcv_msgs, int, chain);
3512 			D3(vswp, "%s(%lld): switching chain of %d msgs",
3513 			    __func__, ldcp->ldc_id, chain);
3514 			vswp->vsw_switch_frame(vswp, bp, VSW_VNETPORT,
3515 			    ldcp->ldc_port, NULL);
3516 		}
3517 
3518 		DTRACE_PROBE1(msg_cnt, int, cnt);
3519 
3520 		/*
3521 		 * We are now finished so ACK back with the state
3522 		 * set to STOPPING so our peer knows we are finished
3523 		 */
3524 		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3525 		dring_pkt->tag.vio_sid = ldcp->local_session;
3526 
3527 		dring_pkt->dring_process_state = VIO_DP_STOPPED;
3528 
3529 		DTRACE_PROBE(stop_process_sent);
3530 
3531 		/*
3532 		 * We have not processed any more descriptors beyond
3533 		 * the last one we ACK'd.
3534 		 */
3535 		if (prev_desc_ack)
3536 			range_start = range_end;
3537 
3538 		dring_pkt->start_idx = range_start;
3539 		dring_pkt->end_idx = range_end;
3540 
3541 		D2(vswp, "%s(%lld) processed : %d : %d, now stopping",
3542 		    __func__, ldcp->ldc_id, dring_pkt->start_idx,
3543 		    dring_pkt->end_idx);
3544 
3545 		(void) vsw_send_msg(ldcp, (void *)dring_pkt,
3546 		    sizeof (vio_dring_msg_t), B_TRUE);
3547 		break;
3548 
3549 	case VIO_SUBTYPE_ACK:
3550 		D2(vswp, "%s(%lld): VIO_SUBTYPE_ACK", __func__, ldcp->ldc_id);
3551 		/*
3552 		 * Verify that the relevant descriptors are all
3553 		 * marked as DONE
3554 		 */
3555 		READ_ENTER(&ldcp->lane_out.dlistrw);
3556 		if ((dp = vsw_ident2dring(&ldcp->lane_out,
3557 		    dring_pkt->dring_ident)) == NULL) {
3558 			RW_EXIT(&ldcp->lane_out.dlistrw);
3559 			DERR(vswp, "%s: unknown ident in ACK", __func__);
3560 			return;
3561 		}
3562 
3563 		start = end = 0;
3564 		start = dring_pkt->start_idx;
3565 		end = dring_pkt->end_idx;
3566 		len = dp->num_descriptors;
3567 
3568 
3569 		mutex_enter(&dp->dlock);
3570 		dp->last_ack_recv = end;
3571 		ldcp->ldc_stats.dring_data_acks++;
3572 		mutex_exit(&dp->dlock);
3573 
3574 		(void) vsw_reclaim_dring(dp, start);
3575 
3576 		/*
3577 		 * If our peer is stopping processing descriptors then
3578 		 * we check to make sure it has processed all the descriptors
3579 		 * we have updated. If not then we send it a new message
3580 		 * to prompt it to restart.
3581 		 */
3582 		if (dring_pkt->dring_process_state == VIO_DP_STOPPED) {
3583 			DTRACE_PROBE(stop_process_recv);
3584 			D2(vswp, "%s(%lld): got stopping msg : %d : %d",
3585 			    __func__, ldcp->ldc_id, dring_pkt->start_idx,
3586 			    dring_pkt->end_idx);
3587 
3588 			/*
3589 			 * Check next descriptor in public section of ring.
3590 			 * If its marked as READY then we need to prompt our
3591 			 * peer to start processing the ring again.
3592 			 */
3593 			i = (end + 1) % len;
3594 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
3595 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
3596 
3597 			/*
3598 			 * Hold the restart lock across all of this to
3599 			 * make sure that its not possible for us to
3600 			 * decide that a msg needs to be sent in the future
3601 			 * but the sending code having already checked is
3602 			 * about to exit.
3603 			 */
3604 			mutex_enter(&dp->restart_lock);
3605 			ldcp->ldc_stats.dring_stopped_acks++;
3606 			mutex_enter(&priv_addr->dstate_lock);
3607 			if (pub_addr->hdr.dstate == VIO_DESC_READY) {
3608 
3609 				mutex_exit(&priv_addr->dstate_lock);
3610 
3611 				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
3612 				dring_pkt->tag.vio_sid = ldcp->local_session;
3613 
3614 				dring_pkt->start_idx = (end + 1) % len;
3615 				dring_pkt->end_idx = -1;
3616 
3617 				D2(vswp, "%s(%lld) : sending restart msg:"
3618 				    " %d : %d", __func__, ldcp->ldc_id,
3619 				    dring_pkt->start_idx, dring_pkt->end_idx);
3620 
3621 				msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
3622 				    sizeof (vio_dring_msg_t), B_FALSE);
3623 				ldcp->ldc_stats.dring_data_msgs++;
3624 
3625 			} else {
3626 				mutex_exit(&priv_addr->dstate_lock);
3627 				dp->restart_reqd = B_TRUE;
3628 			}
3629 			mutex_exit(&dp->restart_lock);
3630 		}
3631 		RW_EXIT(&ldcp->lane_out.dlistrw);
3632 
3633 		/* only do channel reset after dropping dlistrw lock */
3634 		if (msg_rv == ECONNRESET)
3635 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
3636 
3637 		break;
3638 
3639 	case VIO_SUBTYPE_NACK:
3640 		DWARN(vswp, "%s(%lld): VIO_SUBTYPE_NACK",
3641 		    __func__, ldcp->ldc_id);
3642 		/*
3643 		 * Something is badly wrong if we are getting NACK's
3644 		 * for our data pkts. So reset the channel.
3645 		 */
3646 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3647 
3648 		break;
3649 
3650 	default:
3651 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
3652 		    ldcp->ldc_id, dring_pkt->tag.vio_subtype);
3653 	}
3654 
3655 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
3656 }
3657 
3658 /*
3659  * dummy pkt data handler function for vnet protocol version 1.0
3660  */
3661 static void
3662 vsw_process_pkt_data_nop(void *arg1, void *arg2, uint32_t msglen)
3663 {
3664 	_NOTE(ARGUNUSED(arg1, arg2, msglen))
3665 }
3666 
3667 /*
3668  * This function handles raw pkt data messages received over the channel.
3669  * Currently, only priority-eth-type frames are received through this mechanism.
3670  * In this case, the frame(data) is present within the message itself which
3671  * is copied into an mblk before switching it.
3672  */
3673 static void
3674 vsw_process_pkt_data(void *arg1, void *arg2, uint32_t msglen)
3675 {
3676 	vsw_ldc_t		*ldcp = (vsw_ldc_t *)arg1;
3677 	vio_raw_data_msg_t	*dpkt = (vio_raw_data_msg_t *)arg2;
3678 	uint32_t		size;
3679 	mblk_t			*mp;
3680 	vsw_t			*vswp = ldcp->ldc_vswp;
3681 	vgen_stats_t		*statsp = &ldcp->ldc_stats;
3682 	lane_t			*lp = &ldcp->lane_out;
3683 
3684 	size = msglen - VIO_PKT_DATA_HDRSIZE;
3685 	if (size < ETHERMIN || size > lp->mtu) {
3686 		(void) atomic_inc_32(&statsp->rx_pri_fail);
3687 		DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__,
3688 		    ldcp->ldc_id, size);
3689 		return;
3690 	}
3691 
3692 	mp = vio_multipool_allocb(&ldcp->vmp, size + VLAN_TAGSZ);
3693 	if (mp == NULL) {
3694 		mp = allocb(size + VLAN_TAGSZ, BPRI_MED);
3695 		if (mp == NULL) {
3696 			(void) atomic_inc_32(&statsp->rx_pri_fail);
3697 			DWARN(vswp, "%s(%lld) allocb failure, "
3698 			    "unable to process priority frame\n", __func__,
3699 			    ldcp->ldc_id);
3700 			return;
3701 		}
3702 	}
3703 
3704 	/* skip over the extra space for vlan tag */
3705 	mp->b_rptr += VLAN_TAGSZ;
3706 
3707 	/* copy the frame from the payload of raw data msg into the mblk */
3708 	bcopy(dpkt->data, mp->b_rptr, size);
3709 	mp->b_wptr = mp->b_rptr + size;
3710 
3711 	/* update stats */
3712 	(void) atomic_inc_64(&statsp->rx_pri_packets);
3713 	(void) atomic_add_64(&statsp->rx_pri_bytes, size);
3714 
3715 	/*
3716 	 * VLAN_TAGSZ of extra space has been pre-alloc'd if tag is needed.
3717 	 */
3718 	(void) vsw_vlan_frame_pretag(ldcp->ldc_port, VSW_VNETPORT, mp);
3719 
3720 	/* switch the frame to destination */
3721 	vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT, ldcp->ldc_port, NULL);
3722 }
3723 
3724 /*
3725  * Process an in-band descriptor message (most likely from
3726  * OBP).
3727  */
3728 static void
3729 vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt)
3730 {
3731 	vnet_ibnd_desc_t	*ibnd_desc;
3732 	dring_info_t		*dp = NULL;
3733 	vsw_private_desc_t	*priv_addr = NULL;
3734 	vsw_t			*vswp = ldcp->ldc_vswp;
3735 	mblk_t			*mp = NULL;
3736 	size_t			nbytes = 0;
3737 	size_t			off = 0;
3738 	uint64_t		idx = 0;
3739 	uint32_t		num = 1, len, datalen = 0;
3740 	uint64_t		ncookies = 0;
3741 	int			i, rv;
3742 	int			j = 0;
3743 
3744 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3745 
3746 	ibnd_desc = (vnet_ibnd_desc_t *)pkt;
3747 
3748 	switch (ibnd_desc->hdr.tag.vio_subtype) {
3749 	case VIO_SUBTYPE_INFO:
3750 		D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3751 
3752 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
3753 			return;
3754 
3755 		/*
3756 		 * Data is padded to align on a 8 byte boundary,
3757 		 * nbytes is actual data length, i.e. minus that
3758 		 * padding.
3759 		 */
3760 		datalen = ibnd_desc->nbytes;
3761 
3762 		D2(vswp, "%s(%lld): processing inband desc : "
3763 		    ": datalen 0x%lx", __func__, ldcp->ldc_id, datalen);
3764 
3765 		ncookies = ibnd_desc->ncookies;
3766 
3767 		/*
3768 		 * allocb(9F) returns an aligned data block. We
3769 		 * need to ensure that we ask ldc for an aligned
3770 		 * number of bytes also.
3771 		 */
3772 		nbytes = datalen;
3773 		if (nbytes & 0x7) {
3774 			off = 8 - (nbytes & 0x7);
3775 			nbytes += off;
3776 		}
3777 
3778 		/* alloc extra space for VLAN_TAG */
3779 		mp = allocb(datalen + 8, BPRI_MED);
3780 		if (mp == NULL) {
3781 			DERR(vswp, "%s(%lld): allocb failed",
3782 			    __func__, ldcp->ldc_id);
3783 			ldcp->ldc_stats.rx_allocb_fail++;
3784 			return;
3785 		}
3786 
3787 		/* skip over the extra space for VLAN_TAG */
3788 		mp->b_rptr += 8;
3789 
3790 		rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr,
3791 		    0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies,
3792 		    LDC_COPY_IN);
3793 
3794 		if (rv != 0) {
3795 			DERR(vswp, "%s(%d): unable to copy in data from "
3796 			    "%d cookie(s)", __func__, ldcp->ldc_id, ncookies);
3797 			freemsg(mp);
3798 			ldcp->ldc_stats.ierrors++;
3799 			return;
3800 		}
3801 
3802 		D2(vswp, "%s(%d): copied in %ld bytes using %d cookies",
3803 		    __func__, ldcp->ldc_id, nbytes, ncookies);
3804 
3805 		/* point to the actual end of data */
3806 		mp->b_wptr = mp->b_rptr + datalen;
3807 		ldcp->ldc_stats.ipackets++;
3808 		ldcp->ldc_stats.rbytes += datalen;
3809 
3810 		/*
3811 		 * We ACK back every in-band descriptor message we process
3812 		 */
3813 		ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK;
3814 		ibnd_desc->hdr.tag.vio_sid = ldcp->local_session;
3815 		(void) vsw_send_msg(ldcp, (void *)ibnd_desc,
3816 		    sizeof (vnet_ibnd_desc_t), B_TRUE);
3817 
3818 		/*
3819 		 * there is extra space alloc'd for VLAN_TAG
3820 		 */
3821 		(void) vsw_vlan_frame_pretag(ldcp->ldc_port, VSW_VNETPORT, mp);
3822 
3823 		/* send the packet to be switched */
3824 		vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT,
3825 		    ldcp->ldc_port, NULL);
3826 
3827 		break;
3828 
3829 	case VIO_SUBTYPE_ACK:
3830 		D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3831 
3832 		/* Verify the ACK is valid */
3833 		idx = ibnd_desc->hdr.desc_handle;
3834 
3835 		if (idx >= vsw_ntxds) {
3836 			cmn_err(CE_WARN, "!vsw%d: corrupted ACK received "
3837 			    "(idx %ld)", vswp->instance, idx);
3838 			return;
3839 		}
3840 
3841 		if ((dp = ldcp->lane_out.dringp) == NULL) {
3842 			DERR(vswp, "%s: no dring found", __func__);
3843 			return;
3844 		}
3845 
3846 		len = dp->num_descriptors;
3847 		/*
3848 		 * If the descriptor we are being ACK'ed for is not the
3849 		 * one we expected, then pkts were lost somwhere, either
3850 		 * when we tried to send a msg, or a previous ACK msg from
3851 		 * our peer. In either case we now reclaim the descriptors
3852 		 * in the range from the last ACK we received up to the
3853 		 * current ACK.
3854 		 */
3855 		if (idx != dp->last_ack_recv) {
3856 			DWARN(vswp, "%s: dropped pkts detected, (%ld, %ld)",
3857 			    __func__, dp->last_ack_recv, idx);
3858 			num = idx >= dp->last_ack_recv ?
3859 			    idx - dp->last_ack_recv + 1:
3860 			    (len - dp->last_ack_recv + 1) + idx;
3861 		}
3862 
3863 		/*
3864 		 * When we sent the in-band message to our peer we
3865 		 * marked the copy in our private ring as READY. We now
3866 		 * check that the descriptor we are being ACK'ed for is in
3867 		 * fact READY, i.e. it is one we have shared with our peer.
3868 		 *
3869 		 * If its not we flag an error, but still reset the descr
3870 		 * back to FREE.
3871 		 */
3872 		for (i = dp->last_ack_recv; j < num; i = (i + 1) % len, j++) {
3873 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
3874 			mutex_enter(&priv_addr->dstate_lock);
3875 			if (priv_addr->dstate != VIO_DESC_READY) {
3876 				DERR(vswp, "%s: (%ld) desc at index %ld not "
3877 				    "READY (0x%lx)", __func__,
3878 				    ldcp->ldc_id, idx, priv_addr->dstate);
3879 				DERR(vswp, "%s: bound %d: ncookies %ld : "
3880 				    "datalen %ld", __func__,
3881 				    priv_addr->bound, priv_addr->ncookies,
3882 				    priv_addr->datalen);
3883 			}
3884 			D2(vswp, "%s: (%lld) freeing descp at %lld", __func__,
3885 			    ldcp->ldc_id, idx);
3886 			/* release resources associated with sent msg */
3887 			priv_addr->datalen = 0;
3888 			priv_addr->dstate = VIO_DESC_FREE;
3889 			mutex_exit(&priv_addr->dstate_lock);
3890 		}
3891 		/* update to next expected value */
3892 		dp->last_ack_recv = (idx + 1) % dp->num_descriptors;
3893 
3894 		break;
3895 
3896 	case VIO_SUBTYPE_NACK:
3897 		DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3898 
3899 		/*
3900 		 * We should only get a NACK if our peer doesn't like
3901 		 * something about a message we have sent it. If this
3902 		 * happens we just release the resources associated with
3903 		 * the message. (We are relying on higher layers to decide
3904 		 * whether or not to resend.
3905 		 */
3906 
3907 		/* limit check */
3908 		idx = ibnd_desc->hdr.desc_handle;
3909 
3910 		if (idx >= vsw_ntxds) {
3911 			DERR(vswp, "%s: corrupted NACK received (idx %lld)",
3912 			    __func__, idx);
3913 			return;
3914 		}
3915 
3916 		if ((dp = ldcp->lane_out.dringp) == NULL) {
3917 			DERR(vswp, "%s: no dring found", __func__);
3918 			return;
3919 		}
3920 
3921 		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
3922 
3923 		/* move to correct location in ring */
3924 		priv_addr += idx;
3925 
3926 		/* release resources associated with sent msg */
3927 		mutex_enter(&priv_addr->dstate_lock);
3928 		priv_addr->datalen = 0;
3929 		priv_addr->dstate = VIO_DESC_FREE;
3930 		mutex_exit(&priv_addr->dstate_lock);
3931 
3932 		break;
3933 
3934 	default:
3935 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
3936 		    ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype);
3937 	}
3938 
3939 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
3940 }
3941 
3942 static void
3943 vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t *tagp)
3944 {
3945 	_NOTE(ARGUNUSED(epkt))
3946 
3947 	vsw_t		*vswp = ldcp->ldc_vswp;
3948 	uint16_t	env = tagp->vio_subtype_env;
3949 
3950 	D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
3951 
3952 	/*
3953 	 * Error vio_subtypes have yet to be defined. So for
3954 	 * the moment we can't do anything.
3955 	 */
3956 	D2(vswp, "%s: (%x) vio_subtype env", __func__, env);
3957 
3958 	D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
3959 }
3960 
3961 /* transmit the packet over the given port */
3962 int
3963 vsw_portsend(vsw_port_t *port, mblk_t *mp, mblk_t *mpt, uint32_t count)
3964 {
3965 	vsw_ldc_list_t 	*ldcl = &port->p_ldclist;
3966 	vsw_ldc_t 	*ldcp;
3967 	int		status = 0;
3968 	uint32_t	n;
3969 
3970 	READ_ENTER(&ldcl->lockrw);
3971 	/*
3972 	 * Note for now, we have a single channel.
3973 	 */
3974 	ldcp = ldcl->head;
3975 	if (ldcp == NULL) {
3976 		DERR(port->p_vswp, "vsw_portsend: no ldc: dropping packet\n");
3977 		freemsgchain(mp);
3978 		RW_EXIT(&ldcl->lockrw);
3979 		return (1);
3980 	}
3981 
3982 	n = vsw_vlan_frame_untag(port, VSW_VNETPORT, &mp, &mpt);
3983 
3984 	count -= n;
3985 	if (count == 0) {
3986 		goto vsw_portsend_exit;
3987 	}
3988 
3989 	status = ldcp->tx(ldcp, mp, mpt, count);
3990 
3991 vsw_portsend_exit:
3992 	RW_EXIT(&ldcl->lockrw);
3993 
3994 	return (status);
3995 }
3996 
3997 /*
3998  * Break up frames into 2 seperate chains: normal and
3999  * priority, based on the frame type. The number of
4000  * priority frames is also counted and returned.
4001  *
4002  * Params:
4003  * 	vswp:	pointer to the instance of vsw
4004  *	np:	head of packet chain to be broken
4005  *	npt:	tail of packet chain to be broken
4006  *
4007  * Returns:
4008  *	np:	head of normal data packets
4009  *	npt:	tail of normal data packets
4010  *	hp:	head of high priority packets
4011  *	hpt:	tail of high priority packets
4012  */
4013 static uint32_t
4014 vsw_get_pri_packets(vsw_t *vswp, mblk_t **np, mblk_t **npt,
4015 	mblk_t **hp, mblk_t **hpt)
4016 {
4017 	mblk_t			*tmp = NULL;
4018 	mblk_t			*smp = NULL;
4019 	mblk_t			*hmp = NULL;	/* high prio pkts head */
4020 	mblk_t			*hmpt = NULL;	/* high prio pkts tail */
4021 	mblk_t			*nmp = NULL;	/* normal pkts head */
4022 	mblk_t			*nmpt = NULL;	/* normal pkts tail */
4023 	uint32_t		count = 0;
4024 	int			i;
4025 	struct ether_header	*ehp;
4026 	uint32_t		num_types;
4027 	uint16_t		*types;
4028 
4029 	tmp = *np;
4030 	while (tmp != NULL) {
4031 
4032 		smp = tmp;
4033 		tmp = tmp->b_next;
4034 		smp->b_next = NULL;
4035 		smp->b_prev = NULL;
4036 
4037 		ehp = (struct ether_header *)smp->b_rptr;
4038 		num_types = vswp->pri_num_types;
4039 		types = vswp->pri_types;
4040 		for (i = 0; i < num_types; i++) {
4041 			if (ehp->ether_type == types[i]) {
4042 				/* high priority frame */
4043 
4044 				if (hmp != NULL) {
4045 					hmpt->b_next = smp;
4046 					hmpt = smp;
4047 				} else {
4048 					hmp = hmpt = smp;
4049 				}
4050 				count++;
4051 				break;
4052 			}
4053 		}
4054 		if (i == num_types) {
4055 			/* normal data frame */
4056 
4057 			if (nmp != NULL) {
4058 				nmpt->b_next = smp;
4059 				nmpt = smp;
4060 			} else {
4061 				nmp = nmpt = smp;
4062 			}
4063 		}
4064 	}
4065 
4066 	*hp = hmp;
4067 	*hpt = hmpt;
4068 	*np = nmp;
4069 	*npt = nmpt;
4070 
4071 	return (count);
4072 }
4073 
4074 /*
4075  * Wrapper function to transmit normal and/or priority frames over the channel.
4076  */
4077 static int
4078 vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count)
4079 {
4080 	vsw_ldc_t 		*ldcp = (vsw_ldc_t *)arg;
4081 	mblk_t			*tmp;
4082 	mblk_t			*smp;
4083 	mblk_t			*hmp;	/* high prio pkts head */
4084 	mblk_t			*hmpt;	/* high prio pkts tail */
4085 	mblk_t			*nmp;	/* normal pkts head */
4086 	mblk_t			*nmpt;	/* normal pkts tail */
4087 	uint32_t		n = 0;
4088 	vsw_t			*vswp = ldcp->ldc_vswp;
4089 
4090 	ASSERT(VSW_PRI_ETH_DEFINED(vswp));
4091 	ASSERT(count != 0);
4092 
4093 	nmp = mp;
4094 	nmpt = mpt;
4095 
4096 	/* gather any priority frames from the chain of packets */
4097 	n = vsw_get_pri_packets(vswp, &nmp, &nmpt, &hmp, &hmpt);
4098 
4099 	/* transmit priority frames */
4100 	tmp = hmp;
4101 	while (tmp != NULL) {
4102 		smp = tmp;
4103 		tmp = tmp->b_next;
4104 		smp->b_next = NULL;
4105 		vsw_ldcsend_pkt(ldcp, smp);
4106 	}
4107 
4108 	count -= n;
4109 
4110 	if (count == 0) {
4111 		/* no normal data frames to process */
4112 		return (0);
4113 	}
4114 
4115 	return (vsw_ldctx(ldcp, nmp, nmpt, count));
4116 }
4117 
4118 /*
4119  * Wrapper function to transmit normal frames over the channel.
4120  */
4121 static int
4122 vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count)
4123 {
4124 	vsw_ldc_t 	*ldcp = (vsw_ldc_t *)arg;
4125 	mblk_t		*tmp = NULL;
4126 
4127 	ASSERT(count != 0);
4128 	/*
4129 	 * If the TX thread is enabled, then queue the
4130 	 * ordinary frames and signal the tx thread.
4131 	 */
4132 	if (ldcp->tx_thread != NULL) {
4133 
4134 		mutex_enter(&ldcp->tx_thr_lock);
4135 
4136 		if ((ldcp->tx_cnt + count) >= vsw_max_tx_qcount) {
4137 			/*
4138 			 * If we reached queue limit,
4139 			 * do not queue new packets,
4140 			 * drop them.
4141 			 */
4142 			ldcp->ldc_stats.tx_qfull += count;
4143 			mutex_exit(&ldcp->tx_thr_lock);
4144 			freemsgchain(mp);
4145 			goto exit;
4146 		}
4147 		if (ldcp->tx_mhead == NULL) {
4148 			ldcp->tx_mhead = mp;
4149 			ldcp->tx_mtail = mpt;
4150 			cv_signal(&ldcp->tx_thr_cv);
4151 		} else {
4152 			ldcp->tx_mtail->b_next = mp;
4153 			ldcp->tx_mtail = mpt;
4154 		}
4155 		ldcp->tx_cnt += count;
4156 		mutex_exit(&ldcp->tx_thr_lock);
4157 	} else {
4158 		while (mp != NULL) {
4159 			tmp = mp->b_next;
4160 			mp->b_next = mp->b_prev = NULL;
4161 			(void) vsw_ldcsend(ldcp, mp, 1);
4162 			mp = tmp;
4163 		}
4164 	}
4165 
4166 exit:
4167 	return (0);
4168 }
4169 
4170 /*
4171  * This function transmits the frame in the payload of a raw data
4172  * (VIO_PKT_DATA) message. Thus, it provides an Out-Of-Band path to
4173  * send special frames with high priorities, without going through
4174  * the normal data path which uses descriptor ring mechanism.
4175  */
4176 static void
4177 vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp)
4178 {
4179 	vio_raw_data_msg_t	*pkt;
4180 	mblk_t			*bp;
4181 	mblk_t			*nmp = NULL;
4182 	caddr_t			dst;
4183 	uint32_t		mblksz;
4184 	uint32_t		size;
4185 	uint32_t		nbytes;
4186 	int			rv;
4187 	vsw_t			*vswp = ldcp->ldc_vswp;
4188 	vgen_stats_t		*statsp = &ldcp->ldc_stats;
4189 
4190 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
4191 	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
4192 		(void) atomic_inc_32(&statsp->tx_pri_fail);
4193 		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
4194 		    "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
4195 		    ldcp->lane_out.lstate);
4196 		goto send_pkt_exit;
4197 	}
4198 
4199 	size = msgsize(mp);
4200 
4201 	/* frame size bigger than available payload len of raw data msg ? */
4202 	if (size > (size_t)(ldcp->msglen - VIO_PKT_DATA_HDRSIZE)) {
4203 		(void) atomic_inc_32(&statsp->tx_pri_fail);
4204 		DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__,
4205 		    ldcp->ldc_id, size);
4206 		goto send_pkt_exit;
4207 	}
4208 
4209 	if (size < ETHERMIN)
4210 		size = ETHERMIN;
4211 
4212 	/* alloc space for a raw data message */
4213 	nmp = vio_allocb(vswp->pri_tx_vmp);
4214 	if (nmp == NULL) {
4215 		(void) atomic_inc_32(&statsp->tx_pri_fail);
4216 		DWARN(vswp, "vio_allocb failed\n");
4217 		goto send_pkt_exit;
4218 	}
4219 	pkt = (vio_raw_data_msg_t *)nmp->b_rptr;
4220 
4221 	/* copy frame into the payload of raw data message */
4222 	dst = (caddr_t)pkt->data;
4223 	for (bp = mp; bp != NULL; bp = bp->b_cont) {
4224 		mblksz = MBLKL(bp);
4225 		bcopy(bp->b_rptr, dst, mblksz);
4226 		dst += mblksz;
4227 	}
4228 
4229 	/* setup the raw data msg */
4230 	pkt->tag.vio_msgtype = VIO_TYPE_DATA;
4231 	pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
4232 	pkt->tag.vio_subtype_env = VIO_PKT_DATA;
4233 	pkt->tag.vio_sid = ldcp->local_session;
4234 	nbytes = VIO_PKT_DATA_HDRSIZE + size;
4235 
4236 	/* send the msg over ldc */
4237 	rv = vsw_send_msg(ldcp, (void *)pkt, nbytes, B_TRUE);
4238 	if (rv != 0) {
4239 		(void) atomic_inc_32(&statsp->tx_pri_fail);
4240 		DWARN(vswp, "%s(%lld) Error sending priority frame\n", __func__,
4241 		    ldcp->ldc_id);
4242 		goto send_pkt_exit;
4243 	}
4244 
4245 	/* update stats */
4246 	(void) atomic_inc_64(&statsp->tx_pri_packets);
4247 	(void) atomic_add_64(&statsp->tx_pri_packets, size);
4248 
4249 send_pkt_exit:
4250 	if (nmp != NULL)
4251 		freemsg(nmp);
4252 	freemsg(mp);
4253 }
4254 
4255 /*
4256  * Transmit the packet over the given LDC channel.
4257  *
4258  * The 'retries' argument indicates how many times a packet
4259  * is retried before it is dropped. Note, the retry is done
4260  * only for a resource related failure, for all other failures
4261  * the packet is dropped immediately.
4262  */
4263 static int
4264 vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries)
4265 {
4266 	int i;
4267 	int rc;
4268 	int status = 0;
4269 	vsw_port_t *port = ldcp->ldc_port;
4270 	dring_info_t *dp = NULL;
4271 
4272 
4273 	for (i = 0; i < retries; ) {
4274 		/*
4275 		 * Send the message out using the appropriate
4276 		 * transmit function which will free mblock when it
4277 		 * is finished with it.
4278 		 */
4279 		mutex_enter(&port->tx_lock);
4280 		if (port->transmit != NULL) {
4281 			status = (*port->transmit)(ldcp, mp);
4282 		}
4283 		if (status == LDC_TX_SUCCESS) {
4284 			mutex_exit(&port->tx_lock);
4285 			break;
4286 		}
4287 		i++;	/* increment the counter here */
4288 
4289 		/* If its the last retry, then update the oerror */
4290 		if ((i == retries) && (status == LDC_TX_NORESOURCES)) {
4291 			ldcp->ldc_stats.oerrors++;
4292 		}
4293 		mutex_exit(&port->tx_lock);
4294 
4295 		if (status != LDC_TX_NORESOURCES) {
4296 			/*
4297 			 * No retrying required for errors un-related
4298 			 * to resources.
4299 			 */
4300 			break;
4301 		}
4302 		READ_ENTER(&ldcp->lane_out.dlistrw);
4303 		if (((dp = ldcp->lane_out.dringp) != NULL) &&
4304 		    ((VSW_VER_GTEQ(ldcp, 1, 2) &&
4305 		    (ldcp->lane_out.xfer_mode & VIO_DRING_MODE_V1_2)) ||
4306 		    ((VSW_VER_LT(ldcp, 1, 2) &&
4307 		    (ldcp->lane_out.xfer_mode == VIO_DRING_MODE_V1_0))))) {
4308 			rc = vsw_reclaim_dring(dp, dp->end_idx);
4309 		} else {
4310 			/*
4311 			 * If there is no dring or the xfer_mode is
4312 			 * set to DESC_MODE(ie., OBP), then simply break here.
4313 			 */
4314 			RW_EXIT(&ldcp->lane_out.dlistrw);
4315 			break;
4316 		}
4317 		RW_EXIT(&ldcp->lane_out.dlistrw);
4318 
4319 		/*
4320 		 * Delay only if none were reclaimed
4321 		 * and its not the last retry.
4322 		 */
4323 		if ((rc == 0) && (i < retries)) {
4324 			delay(drv_usectohz(vsw_ldc_tx_delay));
4325 		}
4326 	}
4327 	freemsg(mp);
4328 	return (status);
4329 }
4330 
4331 /*
4332  * Send packet out via descriptor ring to a logical device.
4333  */
4334 static int
4335 vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp)
4336 {
4337 	vio_dring_msg_t		dring_pkt;
4338 	dring_info_t		*dp = NULL;
4339 	vsw_private_desc_t	*priv_desc = NULL;
4340 	vnet_public_desc_t	*pub = NULL;
4341 	vsw_t			*vswp = ldcp->ldc_vswp;
4342 	mblk_t			*bp;
4343 	size_t			n, size;
4344 	caddr_t			bufp;
4345 	int			idx;
4346 	int			status = LDC_TX_SUCCESS;
4347 	struct ether_header	*ehp = (struct ether_header *)mp->b_rptr;
4348 	lane_t			*lp = &ldcp->lane_out;
4349 
4350 	D1(vswp, "%s(%lld): enter\n", __func__, ldcp->ldc_id);
4351 
4352 	/* TODO: make test a macro */
4353 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
4354 	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
4355 		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
4356 		    "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
4357 		    ldcp->lane_out.lstate);
4358 		ldcp->ldc_stats.oerrors++;
4359 		return (LDC_TX_FAILURE);
4360 	}
4361 
4362 	/*
4363 	 * Note - using first ring only, this may change
4364 	 * in the future.
4365 	 */
4366 	READ_ENTER(&ldcp->lane_out.dlistrw);
4367 	if ((dp = ldcp->lane_out.dringp) == NULL) {
4368 		RW_EXIT(&ldcp->lane_out.dlistrw);
4369 		DERR(vswp, "%s(%lld): no dring for outbound lane on"
4370 		    " channel %d", __func__, ldcp->ldc_id, ldcp->ldc_id);
4371 		ldcp->ldc_stats.oerrors++;
4372 		return (LDC_TX_FAILURE);
4373 	}
4374 
4375 	size = msgsize(mp);
4376 	if (size > (size_t)lp->mtu) {
4377 		RW_EXIT(&ldcp->lane_out.dlistrw);
4378 		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
4379 		    ldcp->ldc_id, size);
4380 		ldcp->ldc_stats.oerrors++;
4381 		return (LDC_TX_FAILURE);
4382 	}
4383 
4384 	/*
4385 	 * Find a free descriptor
4386 	 *
4387 	 * Note: for the moment we are assuming that we will only
4388 	 * have one dring going from the switch to each of its
4389 	 * peers. This may change in the future.
4390 	 */
4391 	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
4392 		D2(vswp, "%s(%lld): no descriptor available for ring "
4393 		    "at 0x%llx", __func__, ldcp->ldc_id, dp);
4394 
4395 		/* nothing more we can do */
4396 		status = LDC_TX_NORESOURCES;
4397 		ldcp->ldc_stats.tx_no_desc++;
4398 		goto vsw_dringsend_free_exit;
4399 	} else {
4400 		D2(vswp, "%s(%lld): free private descriptor found at pos %ld "
4401 		    "addr 0x%llx\n", __func__, ldcp->ldc_id, idx, priv_desc);
4402 	}
4403 
4404 	/* copy data into the descriptor */
4405 	bufp = priv_desc->datap;
4406 	bufp += VNET_IPALIGN;
4407 	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
4408 		n = MBLKL(bp);
4409 		bcopy(bp->b_rptr, bufp, n);
4410 		bufp += n;
4411 	}
4412 
4413 	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
4414 
4415 	pub = priv_desc->descp;
4416 	pub->nbytes = priv_desc->datalen;
4417 
4418 	/* update statistics */
4419 	if (IS_BROADCAST(ehp))
4420 		ldcp->ldc_stats.brdcstxmt++;
4421 	else if (IS_MULTICAST(ehp))
4422 		ldcp->ldc_stats.multixmt++;
4423 	ldcp->ldc_stats.opackets++;
4424 	ldcp->ldc_stats.obytes += priv_desc->datalen;
4425 
4426 	mutex_enter(&priv_desc->dstate_lock);
4427 	pub->hdr.dstate = VIO_DESC_READY;
4428 	mutex_exit(&priv_desc->dstate_lock);
4429 
4430 	/*
4431 	 * Determine whether or not we need to send a message to our
4432 	 * peer prompting them to read our newly updated descriptor(s).
4433 	 */
4434 	mutex_enter(&dp->restart_lock);
4435 	if (dp->restart_reqd) {
4436 		dp->restart_reqd = B_FALSE;
4437 		ldcp->ldc_stats.dring_data_msgs++;
4438 		mutex_exit(&dp->restart_lock);
4439 
4440 		/*
4441 		 * Send a vio_dring_msg to peer to prompt them to read
4442 		 * the updated descriptor ring.
4443 		 */
4444 		dring_pkt.tag.vio_msgtype = VIO_TYPE_DATA;
4445 		dring_pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
4446 		dring_pkt.tag.vio_subtype_env = VIO_DRING_DATA;
4447 		dring_pkt.tag.vio_sid = ldcp->local_session;
4448 
4449 		/* Note - for now using first ring */
4450 		dring_pkt.dring_ident = dp->ident;
4451 
4452 		/*
4453 		 * If last_ack_recv is -1 then we know we've not
4454 		 * received any ack's yet, so this must be the first
4455 		 * msg sent, so set the start to the begining of the ring.
4456 		 */
4457 		mutex_enter(&dp->dlock);
4458 		if (dp->last_ack_recv == -1) {
4459 			dring_pkt.start_idx = 0;
4460 		} else {
4461 			dring_pkt.start_idx =
4462 			    (dp->last_ack_recv + 1) % dp->num_descriptors;
4463 		}
4464 		dring_pkt.end_idx = -1;
4465 		mutex_exit(&dp->dlock);
4466 
4467 		D3(vswp, "%s(%lld): dring 0x%llx : ident 0x%llx\n", __func__,
4468 		    ldcp->ldc_id, dp, dring_pkt.dring_ident);
4469 		D3(vswp, "%s(%lld): start %lld : end %lld :\n",
4470 		    __func__, ldcp->ldc_id, dring_pkt.start_idx,
4471 		    dring_pkt.end_idx);
4472 
4473 		RW_EXIT(&ldcp->lane_out.dlistrw);
4474 
4475 		(void) vsw_send_msg(ldcp, (void *)&dring_pkt,
4476 		    sizeof (vio_dring_msg_t), B_TRUE);
4477 
4478 		return (status);
4479 
4480 	} else {
4481 		mutex_exit(&dp->restart_lock);
4482 		D2(vswp, "%s(%lld): updating descp %d", __func__,
4483 		    ldcp->ldc_id, idx);
4484 	}
4485 
4486 vsw_dringsend_free_exit:
4487 
4488 	RW_EXIT(&ldcp->lane_out.dlistrw);
4489 
4490 	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
4491 	return (status);
4492 }
4493 
4494 /*
4495  * Send an in-band descriptor message over ldc.
4496  */
4497 static int
4498 vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp)
4499 {
4500 	vsw_t			*vswp = ldcp->ldc_vswp;
4501 	vnet_ibnd_desc_t	ibnd_msg;
4502 	vsw_private_desc_t	*priv_desc = NULL;
4503 	dring_info_t		*dp = NULL;
4504 	size_t			n, size = 0;
4505 	caddr_t			bufp;
4506 	mblk_t			*bp;
4507 	int			idx, i;
4508 	int			status = LDC_TX_SUCCESS;
4509 	static int		warn_msg = 1;
4510 	lane_t			*lp = &ldcp->lane_out;
4511 
4512 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4513 
4514 	ASSERT(mp != NULL);
4515 
4516 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
4517 	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
4518 		DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt",
4519 		    __func__, ldcp->ldc_id, ldcp->ldc_status,
4520 		    ldcp->lane_out.lstate);
4521 		ldcp->ldc_stats.oerrors++;
4522 		return (LDC_TX_FAILURE);
4523 	}
4524 
4525 	/*
4526 	 * only expect single dring to exist, which we use
4527 	 * as an internal buffer, rather than a transfer channel.
4528 	 */
4529 	READ_ENTER(&ldcp->lane_out.dlistrw);
4530 	if ((dp = ldcp->lane_out.dringp) == NULL) {
4531 		DERR(vswp, "%s(%lld): no dring for outbound lane",
4532 		    __func__, ldcp->ldc_id);
4533 		DERR(vswp, "%s(%lld) status(%d) state (0x%llx)", __func__,
4534 		    ldcp->ldc_id, ldcp->ldc_status, ldcp->lane_out.lstate);
4535 		RW_EXIT(&ldcp->lane_out.dlistrw);
4536 		ldcp->ldc_stats.oerrors++;
4537 		return (LDC_TX_FAILURE);
4538 	}
4539 
4540 	size = msgsize(mp);
4541 	if (size > (size_t)lp->mtu) {
4542 		RW_EXIT(&ldcp->lane_out.dlistrw);
4543 		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
4544 		    ldcp->ldc_id, size);
4545 		ldcp->ldc_stats.oerrors++;
4546 		return (LDC_TX_FAILURE);
4547 	}
4548 
4549 	/*
4550 	 * Find a free descriptor in our buffer ring
4551 	 */
4552 	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
4553 		RW_EXIT(&ldcp->lane_out.dlistrw);
4554 		if (warn_msg) {
4555 			DERR(vswp, "%s(%lld): no descriptor available for ring "
4556 			    "at 0x%llx", __func__, ldcp->ldc_id, dp);
4557 			warn_msg = 0;
4558 		}
4559 
4560 		/* nothing more we can do */
4561 		status = LDC_TX_NORESOURCES;
4562 		goto vsw_descrsend_free_exit;
4563 	} else {
4564 		D2(vswp, "%s(%lld): free private descriptor found at pos "
4565 		    "%ld addr 0x%x\n", __func__, ldcp->ldc_id, idx, priv_desc);
4566 		warn_msg = 1;
4567 	}
4568 
4569 	/* copy data into the descriptor */
4570 	bufp = priv_desc->datap;
4571 	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
4572 		n = MBLKL(bp);
4573 		bcopy(bp->b_rptr, bufp, n);
4574 		bufp += n;
4575 	}
4576 
4577 	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
4578 
4579 	/* create and send the in-band descp msg */
4580 	ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA;
4581 	ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO;
4582 	ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA;
4583 	ibnd_msg.hdr.tag.vio_sid = ldcp->local_session;
4584 
4585 	/*
4586 	 * Copy the mem cookies describing the data from the
4587 	 * private region of the descriptor ring into the inband
4588 	 * descriptor.
4589 	 */
4590 	for (i = 0; i < priv_desc->ncookies; i++) {
4591 		bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i],
4592 		    sizeof (ldc_mem_cookie_t));
4593 	}
4594 
4595 	ibnd_msg.hdr.desc_handle = idx;
4596 	ibnd_msg.ncookies = priv_desc->ncookies;
4597 	ibnd_msg.nbytes = size;
4598 
4599 	ldcp->ldc_stats.opackets++;
4600 	ldcp->ldc_stats.obytes += size;
4601 
4602 	RW_EXIT(&ldcp->lane_out.dlistrw);
4603 
4604 	(void) vsw_send_msg(ldcp, (void *)&ibnd_msg,
4605 	    sizeof (vnet_ibnd_desc_t), B_TRUE);
4606 
4607 vsw_descrsend_free_exit:
4608 
4609 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4610 	return (status);
4611 }
4612 
4613 static void
4614 vsw_send_ver(void *arg)
4615 {
4616 	vsw_ldc_t	*ldcp = (vsw_ldc_t *)arg;
4617 	vsw_t		*vswp = ldcp->ldc_vswp;
4618 	lane_t		*lp = &ldcp->lane_out;
4619 	vio_ver_msg_t	ver_msg;
4620 
4621 	D1(vswp, "%s enter", __func__);
4622 
4623 	ver_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4624 	ver_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4625 	ver_msg.tag.vio_subtype_env = VIO_VER_INFO;
4626 	ver_msg.tag.vio_sid = ldcp->local_session;
4627 
4628 	if (vsw_obp_ver_proto_workaround == B_FALSE) {
4629 		ver_msg.ver_major = vsw_versions[0].ver_major;
4630 		ver_msg.ver_minor = vsw_versions[0].ver_minor;
4631 	} else {
4632 		/* use the major,minor that we've ack'd */
4633 		lane_t	*lpi = &ldcp->lane_in;
4634 		ver_msg.ver_major = lpi->ver_major;
4635 		ver_msg.ver_minor = lpi->ver_minor;
4636 	}
4637 	ver_msg.dev_class = VDEV_NETWORK_SWITCH;
4638 
4639 	lp->lstate |= VSW_VER_INFO_SENT;
4640 	lp->ver_major = ver_msg.ver_major;
4641 	lp->ver_minor = ver_msg.ver_minor;
4642 
4643 	DUMP_TAG(ver_msg.tag);
4644 
4645 	(void) vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t), B_TRUE);
4646 
4647 	D1(vswp, "%s (%d): exit", __func__, ldcp->ldc_id);
4648 }
4649 
4650 static void
4651 vsw_send_attr(vsw_ldc_t *ldcp)
4652 {
4653 	vsw_t			*vswp = ldcp->ldc_vswp;
4654 	lane_t			*lp = &ldcp->lane_out;
4655 	vnet_attr_msg_t		attr_msg;
4656 
4657 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
4658 
4659 	/*
4660 	 * Subtype is set to INFO by default
4661 	 */
4662 	attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4663 	attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4664 	attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO;
4665 	attr_msg.tag.vio_sid = ldcp->local_session;
4666 
4667 	/* payload copied from default settings for lane */
4668 	attr_msg.mtu = lp->mtu;
4669 	attr_msg.addr_type = lp->addr_type;
4670 	attr_msg.xfer_mode = lp->xfer_mode;
4671 	attr_msg.ack_freq = lp->xfer_mode;
4672 
4673 	READ_ENTER(&vswp->if_lockrw);
4674 	attr_msg.addr = vnet_macaddr_strtoul((vswp->if_addr).ether_addr_octet);
4675 	RW_EXIT(&vswp->if_lockrw);
4676 
4677 	ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT;
4678 
4679 	DUMP_TAG(attr_msg.tag);
4680 
4681 	(void) vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t), B_TRUE);
4682 
4683 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
4684 }
4685 
4686 /*
4687  * Create dring info msg (which also results in the creation of
4688  * a dring).
4689  */
4690 static vio_dring_reg_msg_t *
4691 vsw_create_dring_info_pkt(vsw_ldc_t *ldcp)
4692 {
4693 	vio_dring_reg_msg_t	*mp;
4694 	dring_info_t		*dp;
4695 	vsw_t			*vswp = ldcp->ldc_vswp;
4696 
4697 	D1(vswp, "vsw_create_dring_info_pkt enter\n");
4698 
4699 	/*
4700 	 * If we can't create a dring, obviously no point sending
4701 	 * a message.
4702 	 */
4703 	if ((dp = vsw_create_dring(ldcp)) == NULL)
4704 		return (NULL);
4705 
4706 	mp = kmem_zalloc(sizeof (vio_dring_reg_msg_t), KM_SLEEP);
4707 
4708 	mp->tag.vio_msgtype = VIO_TYPE_CTRL;
4709 	mp->tag.vio_subtype = VIO_SUBTYPE_INFO;
4710 	mp->tag.vio_subtype_env = VIO_DRING_REG;
4711 	mp->tag.vio_sid = ldcp->local_session;
4712 
4713 	/* payload */
4714 	mp->num_descriptors = dp->num_descriptors;
4715 	mp->descriptor_size = dp->descriptor_size;
4716 	mp->options = dp->options;
4717 	mp->ncookies = dp->ncookies;
4718 	bcopy(&dp->cookie[0], &mp->cookie[0], sizeof (ldc_mem_cookie_t));
4719 
4720 	mp->dring_ident = 0;
4721 
4722 	D1(vswp, "vsw_create_dring_info_pkt exit\n");
4723 
4724 	return (mp);
4725 }
4726 
4727 static void
4728 vsw_send_dring_info(vsw_ldc_t *ldcp)
4729 {
4730 	vio_dring_reg_msg_t	*dring_msg;
4731 	vsw_t			*vswp = ldcp->ldc_vswp;
4732 
4733 	D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id);
4734 
4735 	dring_msg = vsw_create_dring_info_pkt(ldcp);
4736 	if (dring_msg == NULL) {
4737 		cmn_err(CE_WARN, "!vsw%d: %s: error creating msg",
4738 		    vswp->instance, __func__);
4739 		return;
4740 	}
4741 
4742 	ldcp->lane_out.lstate |= VSW_DRING_INFO_SENT;
4743 
4744 	DUMP_TAG_PTR((vio_msg_tag_t *)dring_msg);
4745 
4746 	(void) vsw_send_msg(ldcp, dring_msg,
4747 	    sizeof (vio_dring_reg_msg_t), B_TRUE);
4748 
4749 	kmem_free(dring_msg, sizeof (vio_dring_reg_msg_t));
4750 
4751 	D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id);
4752 }
4753 
4754 static void
4755 vsw_send_rdx(vsw_ldc_t *ldcp)
4756 {
4757 	vsw_t		*vswp = ldcp->ldc_vswp;
4758 	vio_rdx_msg_t	rdx_msg;
4759 
4760 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
4761 
4762 	rdx_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4763 	rdx_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4764 	rdx_msg.tag.vio_subtype_env = VIO_RDX;
4765 	rdx_msg.tag.vio_sid = ldcp->local_session;
4766 
4767 	ldcp->lane_in.lstate |= VSW_RDX_INFO_SENT;
4768 
4769 	DUMP_TAG(rdx_msg.tag);
4770 
4771 	(void) vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t), B_TRUE);
4772 
4773 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
4774 }
4775 
4776 /*
4777  * Generic routine to send message out over ldc channel.
4778  *
4779  * It is possible that when we attempt to write over the ldc channel
4780  * that we get notified that it has been reset. Depending on the value
4781  * of the handle_reset flag we either handle that event here or simply
4782  * notify the caller that the channel was reset.
4783  */
4784 int
4785 vsw_send_msg(vsw_ldc_t *ldcp, void *msgp, int size, boolean_t handle_reset)
4786 {
4787 	int			rv;
4788 	size_t			msglen = size;
4789 	vio_msg_tag_t		*tag = (vio_msg_tag_t *)msgp;
4790 	vsw_t			*vswp = ldcp->ldc_vswp;
4791 	vio_dring_msg_t		*dmsg;
4792 	vio_raw_data_msg_t	*rmsg;
4793 	vnet_ibnd_desc_t	*imsg;
4794 	boolean_t		data_msg = B_FALSE;
4795 
4796 	D1(vswp, "vsw_send_msg (%lld) enter : sending %d bytes",
4797 	    ldcp->ldc_id, size);
4798 
4799 	D2(vswp, "send_msg: type 0x%llx", tag->vio_msgtype);
4800 	D2(vswp, "send_msg: stype 0x%llx", tag->vio_subtype);
4801 	D2(vswp, "send_msg: senv 0x%llx", tag->vio_subtype_env);
4802 
4803 	mutex_enter(&ldcp->ldc_txlock);
4804 
4805 	if (tag->vio_subtype == VIO_SUBTYPE_INFO) {
4806 		if (tag->vio_subtype_env == VIO_DRING_DATA) {
4807 			dmsg = (vio_dring_msg_t *)tag;
4808 			dmsg->seq_num = ldcp->lane_out.seq_num;
4809 			data_msg = B_TRUE;
4810 		} else if (tag->vio_subtype_env == VIO_PKT_DATA) {
4811 			rmsg = (vio_raw_data_msg_t *)tag;
4812 			rmsg->seq_num = ldcp->lane_out.seq_num;
4813 			data_msg = B_TRUE;
4814 		} else if (tag->vio_subtype_env == VIO_DESC_DATA) {
4815 			imsg = (vnet_ibnd_desc_t *)tag;
4816 			imsg->hdr.seq_num = ldcp->lane_out.seq_num;
4817 			data_msg = B_TRUE;
4818 		}
4819 	}
4820 
4821 	do {
4822 		msglen = size;
4823 		rv = ldc_write(ldcp->ldc_handle, (caddr_t)msgp, &msglen);
4824 	} while (rv == EWOULDBLOCK && --vsw_wretries > 0);
4825 
4826 	if (rv == 0 && data_msg == B_TRUE) {
4827 		ldcp->lane_out.seq_num++;
4828 	}
4829 
4830 	if ((rv != 0) || (msglen != size)) {
4831 		DERR(vswp, "vsw_send_msg:ldc_write failed: chan(%lld) rv(%d) "
4832 		    "size (%d) msglen(%d)\n", ldcp->ldc_id, rv, size, msglen);
4833 		ldcp->ldc_stats.oerrors++;
4834 	}
4835 
4836 	mutex_exit(&ldcp->ldc_txlock);
4837 
4838 	/*
4839 	 * If channel has been reset we either handle it here or
4840 	 * simply report back that it has been reset and let caller
4841 	 * decide what to do.
4842 	 */
4843 	if (rv == ECONNRESET) {
4844 		DWARN(vswp, "%s (%lld) channel reset", __func__, ldcp->ldc_id);
4845 
4846 		/*
4847 		 * N.B - must never be holding the dlistrw lock when
4848 		 * we do a reset of the channel.
4849 		 */
4850 		if (handle_reset) {
4851 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
4852 		}
4853 	}
4854 
4855 	return (rv);
4856 }
4857 
4858 /*
4859  * Remove the specified address from the list of address maintained
4860  * in this port node.
4861  */
4862 mcst_addr_t *
4863 vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr)
4864 {
4865 	vsw_t		*vswp = NULL;
4866 	vsw_port_t	*port = NULL;
4867 	mcst_addr_t	*prev_p = NULL;
4868 	mcst_addr_t	*curr_p = NULL;
4869 
4870 	D1(NULL, "%s: enter : devtype %d : addr 0x%llx",
4871 	    __func__, devtype, addr);
4872 
4873 	if (devtype == VSW_VNETPORT) {
4874 		port = (vsw_port_t *)arg;
4875 		mutex_enter(&port->mca_lock);
4876 		prev_p = curr_p = port->mcap;
4877 	} else {
4878 		vswp = (vsw_t *)arg;
4879 		mutex_enter(&vswp->mca_lock);
4880 		prev_p = curr_p = vswp->mcap;
4881 	}
4882 
4883 	while (curr_p != NULL) {
4884 		if (curr_p->addr == addr) {
4885 			D2(NULL, "%s: address found", __func__);
4886 			/* match found */
4887 			if (prev_p == curr_p) {
4888 				/* list head */
4889 				if (devtype == VSW_VNETPORT)
4890 					port->mcap = curr_p->nextp;
4891 				else
4892 					vswp->mcap = curr_p->nextp;
4893 			} else {
4894 				prev_p->nextp = curr_p->nextp;
4895 			}
4896 			break;
4897 		} else {
4898 			prev_p = curr_p;
4899 			curr_p = curr_p->nextp;
4900 		}
4901 	}
4902 
4903 	if (devtype == VSW_VNETPORT)
4904 		mutex_exit(&port->mca_lock);
4905 	else
4906 		mutex_exit(&vswp->mca_lock);
4907 
4908 	D1(NULL, "%s: exit", __func__);
4909 
4910 	return (curr_p);
4911 }
4912 
4913 /*
4914  * Creates a descriptor ring (dring) and links it into the
4915  * link of outbound drings for this channel.
4916  *
4917  * Returns NULL if creation failed.
4918  */
4919 static dring_info_t *
4920 vsw_create_dring(vsw_ldc_t *ldcp)
4921 {
4922 	vsw_private_desc_t	*priv_addr = NULL;
4923 	vsw_t			*vswp = ldcp->ldc_vswp;
4924 	ldc_mem_info_t		minfo;
4925 	dring_info_t		*dp, *tp;
4926 	int			i;
4927 
4928 	dp = (dring_info_t *)kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
4929 
4930 	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
4931 
4932 	/* create public section of ring */
4933 	if ((ldc_mem_dring_create(vsw_ntxds,
4934 	    VSW_PUB_SIZE, &dp->handle)) != 0) {
4935 
4936 		DERR(vswp, "vsw_create_dring(%lld): ldc dring create "
4937 		    "failed", ldcp->ldc_id);
4938 		goto create_fail_exit;
4939 	}
4940 
4941 	ASSERT(dp->handle != NULL);
4942 
4943 	/*
4944 	 * Get the base address of the public section of the ring.
4945 	 */
4946 	if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
4947 		DERR(vswp, "vsw_create_dring(%lld): dring info failed\n",
4948 		    ldcp->ldc_id);
4949 		goto dring_fail_exit;
4950 	} else {
4951 		ASSERT(minfo.vaddr != 0);
4952 		dp->pub_addr = minfo.vaddr;
4953 	}
4954 
4955 	dp->num_descriptors = vsw_ntxds;
4956 	dp->descriptor_size = VSW_PUB_SIZE;
4957 	dp->options = VIO_TX_DRING;
4958 	dp->ncookies = 1;	/* guaranteed by ldc */
4959 
4960 	/*
4961 	 * create private portion of ring
4962 	 */
4963 	dp->priv_addr = (vsw_private_desc_t *)kmem_zalloc(
4964 	    (sizeof (vsw_private_desc_t) * vsw_ntxds), KM_SLEEP);
4965 
4966 	if (vsw_setup_ring(ldcp, dp)) {
4967 		DERR(vswp, "%s: unable to setup ring", __func__);
4968 		goto dring_fail_exit;
4969 	}
4970 
4971 	/* haven't used any descriptors yet */
4972 	dp->end_idx = 0;
4973 	dp->last_ack_recv = -1;
4974 
4975 	/* bind dring to the channel */
4976 	if ((ldc_mem_dring_bind(ldcp->ldc_handle, dp->handle,
4977 	    LDC_SHADOW_MAP, LDC_MEM_RW,
4978 	    &dp->cookie[0], &dp->ncookies)) != 0) {
4979 		DERR(vswp, "vsw_create_dring: unable to bind to channel "
4980 		    "%lld", ldcp->ldc_id);
4981 		goto dring_fail_exit;
4982 	}
4983 
4984 	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
4985 	dp->restart_reqd = B_TRUE;
4986 
4987 	/*
4988 	 * Only ever create rings for outgoing lane. Link it onto
4989 	 * end of list.
4990 	 */
4991 	WRITE_ENTER(&ldcp->lane_out.dlistrw);
4992 	if (ldcp->lane_out.dringp == NULL) {
4993 		D2(vswp, "vsw_create_dring: adding first outbound ring");
4994 		ldcp->lane_out.dringp = dp;
4995 	} else {
4996 		tp = ldcp->lane_out.dringp;
4997 		while (tp->next != NULL)
4998 			tp = tp->next;
4999 
5000 		tp->next = dp;
5001 	}
5002 	RW_EXIT(&ldcp->lane_out.dlistrw);
5003 
5004 	return (dp);
5005 
5006 dring_fail_exit:
5007 	(void) ldc_mem_dring_destroy(dp->handle);
5008 
5009 create_fail_exit:
5010 	if (dp->priv_addr != NULL) {
5011 		priv_addr = dp->priv_addr;
5012 		for (i = 0; i < vsw_ntxds; i++) {
5013 			if (priv_addr->memhandle != NULL)
5014 				(void) ldc_mem_free_handle(
5015 				    priv_addr->memhandle);
5016 			priv_addr++;
5017 		}
5018 		kmem_free(dp->priv_addr,
5019 		    (sizeof (vsw_private_desc_t) * vsw_ntxds));
5020 	}
5021 	mutex_destroy(&dp->dlock);
5022 
5023 	kmem_free(dp, sizeof (dring_info_t));
5024 	return (NULL);
5025 }
5026 
5027 /*
5028  * Create a ring consisting of just a private portion and link
5029  * it into the list of rings for the outbound lane.
5030  *
5031  * These type of rings are used primarily for temporary data
5032  * storage (i.e. as data buffers).
5033  */
5034 void
5035 vsw_create_privring(vsw_ldc_t *ldcp)
5036 {
5037 	dring_info_t		*dp, *tp;
5038 	vsw_t			*vswp = ldcp->ldc_vswp;
5039 
5040 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
5041 
5042 	dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
5043 
5044 	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
5045 
5046 	/* no public section */
5047 	dp->pub_addr = NULL;
5048 
5049 	dp->priv_addr = kmem_zalloc(
5050 	    (sizeof (vsw_private_desc_t) * vsw_ntxds), KM_SLEEP);
5051 
5052 	dp->num_descriptors = vsw_ntxds;
5053 
5054 	if (vsw_setup_ring(ldcp, dp)) {
5055 		DERR(vswp, "%s: setup of ring failed", __func__);
5056 		kmem_free(dp->priv_addr,
5057 		    (sizeof (vsw_private_desc_t) * vsw_ntxds));
5058 		mutex_destroy(&dp->dlock);
5059 		kmem_free(dp, sizeof (dring_info_t));
5060 		return;
5061 	}
5062 
5063 	/* haven't used any descriptors yet */
5064 	dp->end_idx = 0;
5065 
5066 	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
5067 	dp->restart_reqd = B_TRUE;
5068 
5069 	/*
5070 	 * Only ever create rings for outgoing lane. Link it onto
5071 	 * end of list.
5072 	 */
5073 	WRITE_ENTER(&ldcp->lane_out.dlistrw);
5074 	if (ldcp->lane_out.dringp == NULL) {
5075 		D2(vswp, "%s: adding first outbound privring", __func__);
5076 		ldcp->lane_out.dringp = dp;
5077 	} else {
5078 		tp = ldcp->lane_out.dringp;
5079 		while (tp->next != NULL)
5080 			tp = tp->next;
5081 
5082 		tp->next = dp;
5083 	}
5084 	RW_EXIT(&ldcp->lane_out.dlistrw);
5085 
5086 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
5087 }
5088 
5089 /*
5090  * Setup the descriptors in the dring. Returns 0 on success, 1 on
5091  * failure.
5092  */
5093 int
5094 vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp)
5095 {
5096 	vnet_public_desc_t	*pub_addr = NULL;
5097 	vsw_private_desc_t	*priv_addr = NULL;
5098 	vsw_t			*vswp = ldcp->ldc_vswp;
5099 	uint64_t		*tmpp;
5100 	uint64_t		offset = 0;
5101 	uint32_t		ncookies = 0;
5102 	static char		*name = "vsw_setup_ring";
5103 	int			i, j, nc, rv;
5104 	size_t			data_sz;
5105 
5106 	priv_addr = dp->priv_addr;
5107 	pub_addr = dp->pub_addr;
5108 
5109 	/* public section may be null but private should never be */
5110 	ASSERT(priv_addr != NULL);
5111 
5112 	/*
5113 	 * Allocate the region of memory which will be used to hold
5114 	 * the data the descriptors will refer to.
5115 	 */
5116 	data_sz = vswp->max_frame_size + VNET_IPALIGN + VNET_LDCALIGN;
5117 	data_sz = VNET_ROUNDUP_2K(data_sz);
5118 	dp->desc_data_sz = data_sz;
5119 	dp->data_sz = vsw_ntxds * data_sz;
5120 	dp->data_addr = kmem_alloc(dp->data_sz, KM_SLEEP);
5121 
5122 	D2(vswp, "%s: allocated %lld bytes at 0x%llx\n", name,
5123 	    dp->data_sz, dp->data_addr);
5124 
5125 	tmpp = (uint64_t *)dp->data_addr;
5126 	offset = dp->desc_data_sz/sizeof (tmpp);
5127 
5128 	/*
5129 	 * Initialise some of the private and public (if they exist)
5130 	 * descriptor fields.
5131 	 */
5132 	for (i = 0; i < vsw_ntxds; i++) {
5133 		mutex_init(&priv_addr->dstate_lock, NULL, MUTEX_DRIVER, NULL);
5134 
5135 		if ((ldc_mem_alloc_handle(ldcp->ldc_handle,
5136 		    &priv_addr->memhandle)) != 0) {
5137 			DERR(vswp, "%s: alloc mem handle failed", name);
5138 			goto setup_ring_cleanup;
5139 		}
5140 
5141 		priv_addr->datap = (void *)tmpp;
5142 
5143 		rv = ldc_mem_bind_handle(priv_addr->memhandle,
5144 		    (caddr_t)priv_addr->datap, dp->desc_data_sz,
5145 		    LDC_SHADOW_MAP, LDC_MEM_R|LDC_MEM_W,
5146 		    &(priv_addr->memcookie[0]), &ncookies);
5147 		if (rv != 0) {
5148 			DERR(vswp, "%s(%lld): ldc_mem_bind_handle failed "
5149 			    "(rv %d)", name, ldcp->ldc_id, rv);
5150 			goto setup_ring_cleanup;
5151 		}
5152 		priv_addr->bound = 1;
5153 
5154 		D2(vswp, "%s: %d: memcookie 0 : addr 0x%llx : size 0x%llx",
5155 		    name, i, priv_addr->memcookie[0].addr,
5156 		    priv_addr->memcookie[0].size);
5157 
5158 		if (ncookies >= (uint32_t)(VSW_MAX_COOKIES + 1)) {
5159 			DERR(vswp, "%s(%lld) ldc_mem_bind_handle returned "
5160 			    "invalid num of cookies (%d) for size 0x%llx",
5161 			    name, ldcp->ldc_id, ncookies, VSW_RING_EL_DATA_SZ);
5162 
5163 			goto setup_ring_cleanup;
5164 		} else {
5165 			for (j = 1; j < ncookies; j++) {
5166 				rv = ldc_mem_nextcookie(priv_addr->memhandle,
5167 				    &(priv_addr->memcookie[j]));
5168 				if (rv != 0) {
5169 					DERR(vswp, "%s: ldc_mem_nextcookie "
5170 					    "failed rv (%d)", name, rv);
5171 					goto setup_ring_cleanup;
5172 				}
5173 				D3(vswp, "%s: memcookie %d : addr 0x%llx : "
5174 				    "size 0x%llx", name, j,
5175 				    priv_addr->memcookie[j].addr,
5176 				    priv_addr->memcookie[j].size);
5177 			}
5178 
5179 		}
5180 		priv_addr->ncookies = ncookies;
5181 		priv_addr->dstate = VIO_DESC_FREE;
5182 
5183 		if (pub_addr != NULL) {
5184 
5185 			/* link pub and private sides */
5186 			priv_addr->descp = pub_addr;
5187 
5188 			pub_addr->ncookies = priv_addr->ncookies;
5189 
5190 			for (nc = 0; nc < pub_addr->ncookies; nc++) {
5191 				bcopy(&priv_addr->memcookie[nc],
5192 				    &pub_addr->memcookie[nc],
5193 				    sizeof (ldc_mem_cookie_t));
5194 			}
5195 
5196 			pub_addr->hdr.dstate = VIO_DESC_FREE;
5197 			pub_addr++;
5198 		}
5199 
5200 		/*
5201 		 * move to next element in the dring and the next
5202 		 * position in the data buffer.
5203 		 */
5204 		priv_addr++;
5205 		tmpp += offset;
5206 	}
5207 
5208 	return (0);
5209 
5210 setup_ring_cleanup:
5211 	priv_addr = dp->priv_addr;
5212 
5213 	for (j = 0; j < i; j++) {
5214 		(void) ldc_mem_unbind_handle(priv_addr->memhandle);
5215 		(void) ldc_mem_free_handle(priv_addr->memhandle);
5216 
5217 		mutex_destroy(&priv_addr->dstate_lock);
5218 
5219 		priv_addr++;
5220 	}
5221 	kmem_free(dp->data_addr, dp->data_sz);
5222 
5223 	return (1);
5224 }
5225 
5226 /*
5227  * Searches the private section of a ring for a free descriptor,
5228  * starting at the location of the last free descriptor found
5229  * previously.
5230  *
5231  * Returns 0 if free descriptor is available, and updates state
5232  * of private descriptor to VIO_DESC_READY,  otherwise returns 1.
5233  *
5234  * FUTURE: might need to return contiguous range of descriptors
5235  * as dring info msg assumes all will be contiguous.
5236  */
5237 static int
5238 vsw_dring_find_free_desc(dring_info_t *dringp,
5239 		vsw_private_desc_t **priv_p, int *idx)
5240 {
5241 	vsw_private_desc_t	*addr = NULL;
5242 	int			num = vsw_ntxds;
5243 	int			ret = 1;
5244 
5245 	D1(NULL, "%s enter\n", __func__);
5246 
5247 	ASSERT(dringp->priv_addr != NULL);
5248 
5249 	D2(NULL, "%s: searching ring, dringp 0x%llx : start pos %lld",
5250 	    __func__, dringp, dringp->end_idx);
5251 
5252 	addr = (vsw_private_desc_t *)dringp->priv_addr + dringp->end_idx;
5253 
5254 	mutex_enter(&addr->dstate_lock);
5255 	if (addr->dstate == VIO_DESC_FREE) {
5256 		addr->dstate = VIO_DESC_READY;
5257 		*priv_p = addr;
5258 		*idx = dringp->end_idx;
5259 		dringp->end_idx = (dringp->end_idx + 1) % num;
5260 		ret = 0;
5261 
5262 	}
5263 	mutex_exit(&addr->dstate_lock);
5264 
5265 	/* ring full */
5266 	if (ret == 1) {
5267 		D2(NULL, "%s: no desp free: started at %d", __func__,
5268 		    dringp->end_idx);
5269 	}
5270 
5271 	D1(NULL, "%s: exit\n", __func__);
5272 
5273 	return (ret);
5274 }
5275 
5276 /*
5277  * Map from a dring identifier to the ring itself. Returns
5278  * pointer to ring or NULL if no match found.
5279  *
5280  * Should be called with dlistrw rwlock held as reader.
5281  */
5282 static dring_info_t *
5283 vsw_ident2dring(lane_t *lane, uint64_t ident)
5284 {
5285 	dring_info_t	*dp = NULL;
5286 
5287 	if ((dp = lane->dringp) == NULL) {
5288 		return (NULL);
5289 	} else {
5290 		if (dp->ident == ident)
5291 			return (dp);
5292 
5293 		while (dp != NULL) {
5294 			if (dp->ident == ident)
5295 				break;
5296 			dp = dp->next;
5297 		}
5298 	}
5299 
5300 	return (dp);
5301 }
5302 
5303 /*
5304  * Set the default lane attributes. These are copied into
5305  * the attr msg we send to our peer. If they are not acceptable
5306  * then (currently) the handshake ends.
5307  */
5308 static void
5309 vsw_set_lane_attr(vsw_t *vswp, lane_t *lp)
5310 {
5311 	bzero(lp, sizeof (lane_t));
5312 
5313 	READ_ENTER(&vswp->if_lockrw);
5314 	ether_copy(&(vswp->if_addr), &(lp->addr));
5315 	RW_EXIT(&vswp->if_lockrw);
5316 
5317 	lp->mtu = vswp->max_frame_size;
5318 	lp->addr_type = ADDR_TYPE_MAC;
5319 	lp->xfer_mode = VIO_DRING_MODE_V1_0;
5320 	lp->ack_freq = 0;	/* for shared mode */
5321 	lp->seq_num = VNET_ISS;
5322 }
5323 
5324 /*
5325  * Verify that the attributes are acceptable.
5326  *
5327  * FUTURE: If some attributes are not acceptable, change them
5328  * our desired values.
5329  */
5330 static int
5331 vsw_check_attr(vnet_attr_msg_t *pkt, vsw_ldc_t *ldcp)
5332 {
5333 	int			ret = 0;
5334 	struct ether_addr	ea;
5335 	vsw_port_t		*port = ldcp->ldc_port;
5336 	lane_t			*lp = &ldcp->lane_out;
5337 
5338 	D1(NULL, "vsw_check_attr enter\n");
5339 
5340 	if ((pkt->xfer_mode != VIO_DESC_MODE) &&
5341 	    (pkt->xfer_mode != lp->xfer_mode)) {
5342 		D2(NULL, "vsw_check_attr: unknown mode %x\n", pkt->xfer_mode);
5343 		ret = 1;
5344 	}
5345 
5346 	/* Only support MAC addresses at moment. */
5347 	if ((pkt->addr_type != ADDR_TYPE_MAC) || (pkt->addr == 0)) {
5348 		D2(NULL, "vsw_check_attr: invalid addr_type %x, "
5349 		    "or address 0x%llx\n", pkt->addr_type, pkt->addr);
5350 		ret = 1;
5351 	}
5352 
5353 	/*
5354 	 * MAC address supplied by device should match that stored
5355 	 * in the vsw-port OBP node. Need to decide what to do if they
5356 	 * don't match, for the moment just warn but don't fail.
5357 	 */
5358 	vnet_macaddr_ultostr(pkt->addr, ea.ether_addr_octet);
5359 	if (ether_cmp(&ea, &port->p_macaddr) != 0) {
5360 		DERR(NULL, "vsw_check_attr: device supplied address "
5361 		    "0x%llx doesn't match node address 0x%llx\n",
5362 		    pkt->addr, port->p_macaddr);
5363 	}
5364 
5365 	/*
5366 	 * Ack freq only makes sense in pkt mode, in shared
5367 	 * mode the ring descriptors say whether or not to
5368 	 * send back an ACK.
5369 	 */
5370 	if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
5371 	    (pkt->xfer_mode & VIO_DRING_MODE_V1_2)) ||
5372 	    (VSW_VER_LT(ldcp, 1, 2) &&
5373 	    (pkt->xfer_mode == VIO_DRING_MODE_V1_0))) {
5374 		if (pkt->ack_freq > 0) {
5375 			D2(NULL, "vsw_check_attr: non zero ack freq "
5376 			    " in SHM mode\n");
5377 			ret = 1;
5378 		}
5379 	}
5380 
5381 	/*
5382 	 * Note: for the moment we only support ETHER
5383 	 * frames. This may change in the future.
5384 	 */
5385 	if ((pkt->mtu > lp->mtu) || (pkt->mtu <= 0)) {
5386 		D2(NULL, "vsw_check_attr: invalid MTU (0x%llx)\n",
5387 		    pkt->mtu);
5388 		ret = 1;
5389 	}
5390 
5391 	D1(NULL, "vsw_check_attr exit\n");
5392 
5393 	return (ret);
5394 }
5395 
5396 /*
5397  * Returns 1 if there is a problem, 0 otherwise.
5398  */
5399 static int
5400 vsw_check_dring_info(vio_dring_reg_msg_t *pkt)
5401 {
5402 	_NOTE(ARGUNUSED(pkt))
5403 
5404 	int	ret = 0;
5405 
5406 	D1(NULL, "vsw_check_dring_info enter\n");
5407 
5408 	if ((pkt->num_descriptors == 0) ||
5409 	    (pkt->descriptor_size == 0) ||
5410 	    (pkt->ncookies != 1)) {
5411 		DERR(NULL, "vsw_check_dring_info: invalid dring msg");
5412 		ret = 1;
5413 	}
5414 
5415 	D1(NULL, "vsw_check_dring_info exit\n");
5416 
5417 	return (ret);
5418 }
5419 
5420 /*
5421  * Returns 1 if two memory cookies match. Otherwise returns 0.
5422  */
5423 static int
5424 vsw_mem_cookie_match(ldc_mem_cookie_t *m1, ldc_mem_cookie_t *m2)
5425 {
5426 	if ((m1->addr != m2->addr) ||
5427 	    (m2->size != m2->size)) {
5428 		return (0);
5429 	} else {
5430 		return (1);
5431 	}
5432 }
5433 
5434 /*
5435  * Returns 1 if ring described in reg message matches that
5436  * described by dring_info structure. Otherwise returns 0.
5437  */
5438 static int
5439 vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg)
5440 {
5441 	if ((msg->descriptor_size != dp->descriptor_size) ||
5442 	    (msg->num_descriptors != dp->num_descriptors) ||
5443 	    (msg->ncookies != dp->ncookies) ||
5444 	    !(vsw_mem_cookie_match(&msg->cookie[0], &dp->cookie[0]))) {
5445 		return (0);
5446 	} else {
5447 		return (1);
5448 	}
5449 
5450 }
5451 
5452 static caddr_t
5453 vsw_print_ethaddr(uint8_t *a, char *ebuf)
5454 {
5455 	(void) sprintf(ebuf, "%x:%x:%x:%x:%x:%x",
5456 	    a[0], a[1], a[2], a[3], a[4], a[5]);
5457 	return (ebuf);
5458 }
5459 
5460 /*
5461  * Reset and free all the resources associated with
5462  * the channel.
5463  */
5464 static void
5465 vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir)
5466 {
5467 	dring_info_t		*dp, *dpp;
5468 	lane_t			*lp = NULL;
5469 	int			rv = 0;
5470 
5471 	ASSERT(ldcp != NULL);
5472 
5473 	D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id);
5474 
5475 	if (dir == INBOUND) {
5476 		D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane"
5477 		    " of channel %lld", __func__, ldcp->ldc_id);
5478 		lp = &ldcp->lane_in;
5479 	} else {
5480 		D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane"
5481 		    " of channel %lld", __func__, ldcp->ldc_id);
5482 		lp = &ldcp->lane_out;
5483 	}
5484 
5485 	lp->lstate = VSW_LANE_INACTIV;
5486 	lp->seq_num = VNET_ISS;
5487 
5488 	if (lp->dringp) {
5489 		if (dir == INBOUND) {
5490 			WRITE_ENTER(&lp->dlistrw);
5491 			dp = lp->dringp;
5492 			while (dp != NULL) {
5493 				dpp = dp->next;
5494 				if (dp->handle != NULL)
5495 					(void) ldc_mem_dring_unmap(dp->handle);
5496 				kmem_free(dp, sizeof (dring_info_t));
5497 				dp = dpp;
5498 			}
5499 			RW_EXIT(&lp->dlistrw);
5500 		} else {
5501 			/*
5502 			 * unbind, destroy exported dring, free dring struct
5503 			 */
5504 			WRITE_ENTER(&lp->dlistrw);
5505 			dp = lp->dringp;
5506 			rv = vsw_free_ring(dp);
5507 			RW_EXIT(&lp->dlistrw);
5508 		}
5509 		if (rv == 0) {
5510 			lp->dringp = NULL;
5511 		}
5512 	}
5513 
5514 	D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id);
5515 }
5516 
5517 /*
5518  * Free ring and all associated resources.
5519  *
5520  * Should be called with dlistrw rwlock held as writer.
5521  */
5522 static int
5523 vsw_free_ring(dring_info_t *dp)
5524 {
5525 	vsw_private_desc_t	*paddr = NULL;
5526 	dring_info_t		*dpp;
5527 	int			i, rv = 1;
5528 
5529 	while (dp != NULL) {
5530 		mutex_enter(&dp->dlock);
5531 		dpp = dp->next;
5532 		if (dp->priv_addr != NULL) {
5533 			/*
5534 			 * First unbind and free the memory handles
5535 			 * stored in each descriptor within the ring.
5536 			 */
5537 			for (i = 0; i < vsw_ntxds; i++) {
5538 				paddr = (vsw_private_desc_t *)
5539 				    dp->priv_addr + i;
5540 				if (paddr->memhandle != NULL) {
5541 					if (paddr->bound == 1) {
5542 						rv = ldc_mem_unbind_handle(
5543 						    paddr->memhandle);
5544 
5545 						if (rv != 0) {
5546 							DERR(NULL, "error "
5547 							"unbinding handle for "
5548 							"ring 0x%llx at pos %d",
5549 							    dp, i);
5550 							mutex_exit(&dp->dlock);
5551 							return (rv);
5552 						}
5553 						paddr->bound = 0;
5554 					}
5555 
5556 					rv = ldc_mem_free_handle(
5557 					    paddr->memhandle);
5558 					if (rv != 0) {
5559 						DERR(NULL, "error freeing "
5560 						    "handle for ring 0x%llx "
5561 						    "at pos %d", dp, i);
5562 						mutex_exit(&dp->dlock);
5563 						return (rv);
5564 					}
5565 					paddr->memhandle = NULL;
5566 				}
5567 				mutex_destroy(&paddr->dstate_lock);
5568 			}
5569 			kmem_free(dp->priv_addr,
5570 			    (sizeof (vsw_private_desc_t) * vsw_ntxds));
5571 		}
5572 
5573 		/*
5574 		 * Now unbind and destroy the ring itself.
5575 		 */
5576 		if (dp->handle != NULL) {
5577 			(void) ldc_mem_dring_unbind(dp->handle);
5578 			(void) ldc_mem_dring_destroy(dp->handle);
5579 		}
5580 
5581 		if (dp->data_addr != NULL) {
5582 			kmem_free(dp->data_addr, dp->data_sz);
5583 		}
5584 
5585 		mutex_exit(&dp->dlock);
5586 		mutex_destroy(&dp->dlock);
5587 		mutex_destroy(&dp->restart_lock);
5588 		kmem_free(dp, sizeof (dring_info_t));
5589 
5590 		dp = dpp;
5591 	}
5592 	return (0);
5593 }
5594 
5595 /*
5596  * vsw_ldc_rx_worker -- A per LDC worker thread to receive data.
5597  * This thread is woken up by the LDC interrupt handler to process
5598  * LDC packets and receive data.
5599  */
5600 static void
5601 vsw_ldc_rx_worker(void *arg)
5602 {
5603 	callb_cpr_t	cprinfo;
5604 	vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
5605 	vsw_t *vswp = ldcp->ldc_vswp;
5606 
5607 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
5608 	CALLB_CPR_INIT(&cprinfo, &ldcp->rx_thr_lock, callb_generic_cpr,
5609 	    "vsw_rx_thread");
5610 	mutex_enter(&ldcp->rx_thr_lock);
5611 	ldcp->rx_thr_flags |= VSW_WTHR_RUNNING;
5612 	while (!(ldcp->rx_thr_flags & VSW_WTHR_STOP)) {
5613 
5614 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
5615 		/*
5616 		 * Wait until the data is received or a stop
5617 		 * request is received.
5618 		 */
5619 		while (!(ldcp->rx_thr_flags &
5620 		    (VSW_WTHR_DATARCVD | VSW_WTHR_STOP))) {
5621 			cv_wait(&ldcp->rx_thr_cv, &ldcp->rx_thr_lock);
5622 		}
5623 		CALLB_CPR_SAFE_END(&cprinfo, &ldcp->rx_thr_lock)
5624 
5625 		/*
5626 		 * First process the stop request.
5627 		 */
5628 		if (ldcp->rx_thr_flags & VSW_WTHR_STOP) {
5629 			D2(vswp, "%s(%lld):Rx thread stopped\n",
5630 			    __func__, ldcp->ldc_id);
5631 			break;
5632 		}
5633 		ldcp->rx_thr_flags &= ~VSW_WTHR_DATARCVD;
5634 		mutex_exit(&ldcp->rx_thr_lock);
5635 		D1(vswp, "%s(%lld):calling vsw_process_pkt\n",
5636 		    __func__, ldcp->ldc_id);
5637 		mutex_enter(&ldcp->ldc_cblock);
5638 		vsw_process_pkt(ldcp);
5639 		mutex_exit(&ldcp->ldc_cblock);
5640 		mutex_enter(&ldcp->rx_thr_lock);
5641 	}
5642 
5643 	/*
5644 	 * Update the run status and wakeup the thread that
5645 	 * has sent the stop request.
5646 	 */
5647 	ldcp->rx_thr_flags &= ~VSW_WTHR_RUNNING;
5648 	cv_signal(&ldcp->rx_thr_cv);
5649 	CALLB_CPR_EXIT(&cprinfo);
5650 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
5651 	thread_exit();
5652 }
5653 
5654 /* vsw_stop_rx_thread -- Co-ordinate with receive thread to stop it */
5655 static void
5656 vsw_stop_rx_thread(vsw_ldc_t *ldcp)
5657 {
5658 	vsw_t *vswp = ldcp->ldc_vswp;
5659 
5660 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
5661 	/*
5662 	 * Send a stop request by setting the stop flag and
5663 	 * wait until the receive thread stops.
5664 	 */
5665 	mutex_enter(&ldcp->rx_thr_lock);
5666 	if (ldcp->rx_thr_flags & VSW_WTHR_RUNNING) {
5667 		ldcp->rx_thr_flags |= VSW_WTHR_STOP;
5668 		cv_signal(&ldcp->rx_thr_cv);
5669 		while (ldcp->rx_thr_flags & VSW_WTHR_RUNNING) {
5670 			cv_wait(&ldcp->rx_thr_cv, &ldcp->rx_thr_lock);
5671 		}
5672 	}
5673 	mutex_exit(&ldcp->rx_thr_lock);
5674 	ldcp->rx_thread = NULL;
5675 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
5676 }
5677 
5678 /*
5679  * vsw_ldc_tx_worker -- A per LDC worker thread to transmit data.
5680  * This thread is woken up by the vsw_portsend to transmit
5681  * packets.
5682  */
5683 static void
5684 vsw_ldc_tx_worker(void *arg)
5685 {
5686 	callb_cpr_t	cprinfo;
5687 	vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
5688 	vsw_t *vswp = ldcp->ldc_vswp;
5689 	mblk_t *mp;
5690 	mblk_t *tmp;
5691 
5692 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
5693 	CALLB_CPR_INIT(&cprinfo, &ldcp->tx_thr_lock, callb_generic_cpr,
5694 	    "vnet_tx_thread");
5695 	mutex_enter(&ldcp->tx_thr_lock);
5696 	ldcp->tx_thr_flags |= VSW_WTHR_RUNNING;
5697 	while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP)) {
5698 
5699 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
5700 		/*
5701 		 * Wait until the data is received or a stop
5702 		 * request is received.
5703 		 */
5704 		while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP) &&
5705 		    (ldcp->tx_mhead == NULL)) {
5706 			cv_wait(&ldcp->tx_thr_cv, &ldcp->tx_thr_lock);
5707 		}
5708 		CALLB_CPR_SAFE_END(&cprinfo, &ldcp->tx_thr_lock)
5709 
5710 		/*
5711 		 * First process the stop request.
5712 		 */
5713 		if (ldcp->tx_thr_flags & VSW_WTHR_STOP) {
5714 			D2(vswp, "%s(%lld):tx thread stopped\n",
5715 			    __func__, ldcp->ldc_id);
5716 			break;
5717 		}
5718 		mp = ldcp->tx_mhead;
5719 		ldcp->tx_mhead = ldcp->tx_mtail = NULL;
5720 		ldcp->tx_cnt = 0;
5721 		mutex_exit(&ldcp->tx_thr_lock);
5722 		D2(vswp, "%s(%lld):calling vsw_ldcsend\n",
5723 		    __func__, ldcp->ldc_id);
5724 		while (mp != NULL) {
5725 			tmp = mp->b_next;
5726 			mp->b_next = mp->b_prev = NULL;
5727 			(void) vsw_ldcsend(ldcp, mp, vsw_ldc_tx_retries);
5728 			mp = tmp;
5729 		}
5730 		mutex_enter(&ldcp->tx_thr_lock);
5731 	}
5732 
5733 	/*
5734 	 * Update the run status and wakeup the thread that
5735 	 * has sent the stop request.
5736 	 */
5737 	ldcp->tx_thr_flags &= ~VSW_WTHR_RUNNING;
5738 	cv_signal(&ldcp->tx_thr_cv);
5739 	CALLB_CPR_EXIT(&cprinfo);
5740 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
5741 	thread_exit();
5742 }
5743 
5744 /* vsw_stop_tx_thread -- Co-ordinate with receive thread to stop it */
5745 static void
5746 vsw_stop_tx_thread(vsw_ldc_t *ldcp)
5747 {
5748 	vsw_t *vswp = ldcp->ldc_vswp;
5749 
5750 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
5751 	/*
5752 	 * Send a stop request by setting the stop flag and
5753 	 * wait until the receive thread stops.
5754 	 */
5755 	mutex_enter(&ldcp->tx_thr_lock);
5756 	if (ldcp->tx_thr_flags & VSW_WTHR_RUNNING) {
5757 		ldcp->tx_thr_flags |= VSW_WTHR_STOP;
5758 		cv_signal(&ldcp->tx_thr_cv);
5759 		while (ldcp->tx_thr_flags & VSW_WTHR_RUNNING) {
5760 			cv_wait(&ldcp->tx_thr_cv, &ldcp->tx_thr_lock);
5761 		}
5762 	}
5763 	mutex_exit(&ldcp->tx_thr_lock);
5764 	ldcp->tx_thread = NULL;
5765 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
5766 }
5767 
5768 /* vsw_reclaim_dring -- reclaim descriptors */
5769 static int
5770 vsw_reclaim_dring(dring_info_t *dp, int start)
5771 {
5772 	int i, j, len;
5773 	vsw_private_desc_t *priv_addr;
5774 	vnet_public_desc_t *pub_addr;
5775 
5776 	pub_addr = (vnet_public_desc_t *)dp->pub_addr;
5777 	priv_addr = (vsw_private_desc_t *)dp->priv_addr;
5778 	len = dp->num_descriptors;
5779 
5780 	D2(NULL, "%s: start index %ld\n", __func__, start);
5781 
5782 	j = 0;
5783 	for (i = start; j < len; i = (i + 1) % len, j++) {
5784 		pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
5785 		priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
5786 
5787 		mutex_enter(&priv_addr->dstate_lock);
5788 		if (pub_addr->hdr.dstate != VIO_DESC_DONE) {
5789 			mutex_exit(&priv_addr->dstate_lock);
5790 			break;
5791 		}
5792 		pub_addr->hdr.dstate = VIO_DESC_FREE;
5793 		priv_addr->dstate = VIO_DESC_FREE;
5794 		/* clear all the fields */
5795 		priv_addr->datalen = 0;
5796 		pub_addr->hdr.ack = 0;
5797 		mutex_exit(&priv_addr->dstate_lock);
5798 
5799 		D3(NULL, "claiming descp:%d pub state:0x%llx priv state 0x%llx",
5800 		    i, pub_addr->hdr.dstate, priv_addr->dstate);
5801 	}
5802 	return (j);
5803 }
5804 
5805 /*
5806  * Debugging routines
5807  */
5808 static void
5809 display_state(void)
5810 {
5811 	vsw_t		*vswp;
5812 	vsw_port_list_t	*plist;
5813 	vsw_port_t 	*port;
5814 	vsw_ldc_list_t	*ldcl;
5815 	vsw_ldc_t 	*ldcp;
5816 	extern vsw_t 	*vsw_head;
5817 
5818 	cmn_err(CE_NOTE, "***** system state *****");
5819 
5820 	for (vswp = vsw_head; vswp; vswp = vswp->next) {
5821 		plist = &vswp->plist;
5822 		READ_ENTER(&plist->lockrw);
5823 		cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n",
5824 		    vswp->instance, plist->num_ports);
5825 
5826 		for (port = plist->head; port != NULL; port = port->p_next) {
5827 			ldcl = &port->p_ldclist;
5828 			cmn_err(CE_CONT, "port %d : %d ldcs attached\n",
5829 			    port->p_instance, port->num_ldcs);
5830 			READ_ENTER(&ldcl->lockrw);
5831 			ldcp = ldcl->head;
5832 			for (; ldcp != NULL; ldcp = ldcp->ldc_next) {
5833 				cmn_err(CE_CONT, "chan %lu : dev %d : "
5834 				    "status %d : phase %u\n",
5835 				    ldcp->ldc_id, ldcp->dev_class,
5836 				    ldcp->ldc_status, ldcp->hphase);
5837 				cmn_err(CE_CONT, "chan %lu : lsession %lu : "
5838 				    "psession %lu\n", ldcp->ldc_id,
5839 				    ldcp->local_session, ldcp->peer_session);
5840 
5841 				cmn_err(CE_CONT, "Inbound lane:\n");
5842 				display_lane(&ldcp->lane_in);
5843 				cmn_err(CE_CONT, "Outbound lane:\n");
5844 				display_lane(&ldcp->lane_out);
5845 			}
5846 			RW_EXIT(&ldcl->lockrw);
5847 		}
5848 		RW_EXIT(&plist->lockrw);
5849 	}
5850 	cmn_err(CE_NOTE, "***** system state *****");
5851 }
5852 
5853 static void
5854 display_lane(lane_t *lp)
5855 {
5856 	dring_info_t	*drp;
5857 
5858 	cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n",
5859 	    lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu);
5860 	cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n",
5861 	    lp->addr_type, lp->addr, lp->xfer_mode);
5862 	cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp);
5863 
5864 	cmn_err(CE_CONT, "Dring info:\n");
5865 	for (drp = lp->dringp; drp != NULL; drp = drp->next) {
5866 		cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n",
5867 		    drp->num_descriptors, drp->descriptor_size);
5868 		cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->handle);
5869 		cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n",
5870 		    (uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr);
5871 		cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n",
5872 		    drp->ident, drp->end_idx);
5873 		display_ring(drp);
5874 	}
5875 }
5876 
5877 static void
5878 display_ring(dring_info_t *dringp)
5879 {
5880 	uint64_t		i;
5881 	uint64_t		priv_count = 0;
5882 	uint64_t		pub_count = 0;
5883 	vnet_public_desc_t	*pub_addr = NULL;
5884 	vsw_private_desc_t	*priv_addr = NULL;
5885 
5886 	for (i = 0; i < vsw_ntxds; i++) {
5887 		if (dringp->pub_addr != NULL) {
5888 			pub_addr = (vnet_public_desc_t *)dringp->pub_addr + i;
5889 
5890 			if (pub_addr->hdr.dstate == VIO_DESC_FREE)
5891 				pub_count++;
5892 		}
5893 
5894 		if (dringp->priv_addr != NULL) {
5895 			priv_addr = (vsw_private_desc_t *)dringp->priv_addr + i;
5896 
5897 			if (priv_addr->dstate == VIO_DESC_FREE)
5898 				priv_count++;
5899 		}
5900 	}
5901 	cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n",
5902 	    i, priv_count, pub_count);
5903 }
5904 
5905 static void
5906 dump_flags(uint64_t state)
5907 {
5908 	int	i;
5909 
5910 	typedef struct flag_name {
5911 		int	flag_val;
5912 		char	*flag_name;
5913 	} flag_name_t;
5914 
5915 	flag_name_t	flags[] = {
5916 		VSW_VER_INFO_SENT, "VSW_VER_INFO_SENT",
5917 		VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV",
5918 		VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV",
5919 		VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT",
5920 		VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV",
5921 		VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT",
5922 		VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT",
5923 		VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV",
5924 		VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT",
5925 		VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV",
5926 		VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT",
5927 		VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV",
5928 		VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT",
5929 		VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV",
5930 		VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT",
5931 		VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV",
5932 		VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT",
5933 		VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV",
5934 		VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT",
5935 		VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV",
5936 		VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT",
5937 		VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV",
5938 		VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT",
5939 		VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV",
5940 		VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT",
5941 		VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV",
5942 		VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT",
5943 		VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV",
5944 		VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT",
5945 		VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV",
5946 		VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE"};
5947 
5948 	DERR(NULL, "DUMP_FLAGS: %llx\n", state);
5949 	for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) {
5950 		if (state & flags[i].flag_val)
5951 			DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name);
5952 	}
5953 }
5954