xref: /illumos-gate/usr/src/uts/sun4v/io/vsw_ldc.c (revision 72cf314316bed51cd2e5fd0cb021a9725316a6b0)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/errno.h>
29 #include <sys/debug.h>
30 #include <sys/time.h>
31 #include <sys/sysmacros.h>
32 #include <sys/systm.h>
33 #include <sys/user.h>
34 #include <sys/stropts.h>
35 #include <sys/stream.h>
36 #include <sys/strlog.h>
37 #include <sys/strsubr.h>
38 #include <sys/cmn_err.h>
39 #include <sys/cpu.h>
40 #include <sys/kmem.h>
41 #include <sys/conf.h>
42 #include <sys/ddi.h>
43 #include <sys/sunddi.h>
44 #include <sys/ksynch.h>
45 #include <sys/stat.h>
46 #include <sys/kstat.h>
47 #include <sys/vtrace.h>
48 #include <sys/strsun.h>
49 #include <sys/dlpi.h>
50 #include <sys/ethernet.h>
51 #include <net/if.h>
52 #include <sys/varargs.h>
53 #include <sys/machsystm.h>
54 #include <sys/modctl.h>
55 #include <sys/modhash.h>
56 #include <sys/mac.h>
57 #include <sys/mac_ether.h>
58 #include <sys/taskq.h>
59 #include <sys/note.h>
60 #include <sys/mach_descrip.h>
61 #include <sys/mdeg.h>
62 #include <sys/ldc.h>
63 #include <sys/vsw_fdb.h>
64 #include <sys/vsw.h>
65 #include <sys/vio_mailbox.h>
66 #include <sys/vnet_mailbox.h>
67 #include <sys/vnet_common.h>
68 #include <sys/vio_util.h>
69 #include <sys/sdt.h>
70 #include <sys/atomic.h>
71 #include <sys/callb.h>
72 #include <sys/vlan.h>
73 
74 /* Port add/deletion/etc routines */
75 static	void vsw_port_delete(vsw_port_t *port);
76 static	int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id);
77 static	void vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id);
78 static	int vsw_init_ldcs(vsw_port_t *port);
79 static	void vsw_uninit_ldcs(vsw_port_t *port);
80 static	int vsw_ldc_init(vsw_ldc_t *ldcp);
81 static	void vsw_ldc_uninit(vsw_ldc_t *ldcp);
82 static	void vsw_drain_ldcs(vsw_port_t *port);
83 static	void vsw_drain_port_taskq(vsw_port_t *port);
84 static	void vsw_marker_task(void *);
85 static	int vsw_plist_del_node(vsw_t *, vsw_port_t *port);
86 void vsw_detach_ports(vsw_t *vswp);
87 int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
88 mcst_addr_t *vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr);
89 int vsw_port_detach(vsw_t *vswp, int p_instance);
90 int vsw_portsend(vsw_port_t *port, mblk_t *mp);
91 int vsw_port_attach(vsw_port_t *portp);
92 vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);
93 void vsw_vlan_unaware_port_reset(vsw_port_t *portp);
94 int vsw_send_msg(vsw_ldc_t *, void *, int, boolean_t);
95 void vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate);
96 void vsw_reset_ports(vsw_t *vswp);
97 void vsw_port_reset(vsw_port_t *portp);
98 void vsw_physlink_update_ports(vsw_t *vswp);
99 static	void vsw_port_physlink_update(vsw_port_t *portp);
100 
101 /* Interrupt routines */
102 static	uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg);
103 
104 /* Handshake routines */
105 static	void vsw_ldc_reinit(vsw_ldc_t *);
106 static	void vsw_process_conn_evt(vsw_ldc_t *, uint16_t);
107 static	void vsw_conn_task(void *);
108 static	int vsw_check_flag(vsw_ldc_t *, int, uint64_t);
109 static	void vsw_next_milestone(vsw_ldc_t *);
110 static	int vsw_supported_version(vio_ver_msg_t *);
111 static	void vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp);
112 static	void vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp);
113 
114 /* Data processing routines */
115 static void vsw_process_pkt(void *);
116 static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t *);
117 static void vsw_process_ctrl_pkt(void *);
118 static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *);
119 static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *);
120 static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *);
121 static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *);
122 static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *);
123 static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *);
124 static void vsw_process_physlink_msg(vsw_ldc_t *, void *);
125 static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *,
126 	uint32_t);
127 static void vsw_process_data_dring_pkt(vsw_ldc_t *, void *);
128 static void vsw_process_pkt_data_nop(void *, void *, uint32_t);
129 static void vsw_process_pkt_data(void *, void *, uint32_t);
130 static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *);
131 static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *);
132 
133 /* Switching/data transmit routines */
134 static	int vsw_dringsend(vsw_ldc_t *, mblk_t *);
135 static	int vsw_descrsend(vsw_ldc_t *, mblk_t *);
136 static void vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp);
137 static int vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries);
138 static int vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count);
139 static int vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count);
140 
141 /* Packet creation routines */
142 static void vsw_send_ver(void *);
143 static void vsw_send_attr(vsw_ldc_t *);
144 static vio_dring_reg_msg_t *vsw_create_dring_info_pkt(vsw_ldc_t *);
145 static void vsw_send_dring_info(vsw_ldc_t *);
146 static void vsw_send_rdx(vsw_ldc_t *);
147 static void vsw_send_physlink_msg(vsw_ldc_t *ldcp, link_state_t plink_state);
148 
149 /* Dring routines */
150 static dring_info_t *vsw_create_dring(vsw_ldc_t *);
151 static void vsw_create_privring(vsw_ldc_t *);
152 static int vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp);
153 static int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **,
154     int *);
155 static dring_info_t *vsw_ident2dring(lane_t *, uint64_t);
156 static int vsw_reclaim_dring(dring_info_t *dp, int start);
157 
158 static void vsw_set_lane_attr(vsw_t *, lane_t *);
159 static int vsw_check_attr(vnet_attr_msg_t *, vsw_ldc_t *);
160 static int vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg);
161 static int vsw_mem_cookie_match(ldc_mem_cookie_t *, ldc_mem_cookie_t *);
162 static int vsw_check_dring_info(vio_dring_reg_msg_t *);
163 
164 /* Rcv/Tx thread routines */
165 static void vsw_stop_tx_thread(vsw_ldc_t *ldcp);
166 static void vsw_ldc_tx_worker(void *arg);
167 static void vsw_stop_rx_thread(vsw_ldc_t *ldcp);
168 static void vsw_ldc_rx_worker(void *arg);
169 
170 /* Misc support routines */
171 static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t);
172 static void vsw_free_ring(dring_info_t *);
173 static void vsw_save_lmacaddr(vsw_t *vswp, uint64_t macaddr);
174 static int vsw_get_same_dest_list(struct ether_header *ehp,
175     mblk_t **rhead, mblk_t **rtail, mblk_t **mpp);
176 static mblk_t *vsw_dupmsgchain(mblk_t *mp);
177 
178 /* Debugging routines */
179 static void dump_flags(uint64_t);
180 static void display_state(void);
181 static void display_lane(lane_t *);
182 static void display_ring(dring_info_t *);
183 
184 /*
185  * Functions imported from other files.
186  */
187 extern int vsw_set_hw(vsw_t *, vsw_port_t *, int);
188 extern void vsw_unset_hw(vsw_t *, vsw_port_t *, int);
189 extern int vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port);
190 extern void vsw_del_mcst_port(vsw_port_t *port);
191 extern int vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
192 extern int vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
193 extern void vsw_fdbe_add(vsw_t *vswp, void *port);
194 extern void vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr);
195 extern void vsw_create_vlans(void *arg, int type);
196 extern void vsw_destroy_vlans(void *arg, int type);
197 extern void vsw_vlan_add_ids(void *arg, int type);
198 extern void vsw_vlan_remove_ids(void *arg, int type);
199 extern boolean_t vsw_frame_lookup_vid(void *arg, int caller,
200 	struct ether_header *ehp, uint16_t *vidp);
201 extern mblk_t *vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp);
202 extern uint32_t vsw_vlan_frame_untag(void *arg, int type, mblk_t **np,
203 	mblk_t **npt);
204 extern boolean_t vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid);
205 extern void vsw_hio_start(vsw_t *vswp, vsw_ldc_t *ldcp);
206 extern void vsw_hio_stop(vsw_t *vswp, vsw_ldc_t *ldcp);
207 extern void vsw_process_dds_msg(vsw_t *vswp, vsw_ldc_t *ldcp, void *msg);
208 extern void vsw_hio_stop_port(vsw_port_t *portp);
209 extern void vsw_publish_macaddr(vsw_t *vswp, vsw_port_t *portp);
210 extern int vsw_mac_client_init(vsw_t *vswp, vsw_port_t *port, int type);
211 extern void vsw_mac_client_cleanup(vsw_t *vswp, vsw_port_t *port, int type);
212 
213 
214 #define	VSW_NUM_VMPOOLS		3	/* number of vio mblk pools */
215 
216 /*
217  * Tunables used in this file.
218  */
219 extern int vsw_num_handshakes;
220 extern int vsw_wretries;
221 extern int vsw_desc_delay;
222 extern int vsw_read_attempts;
223 extern int vsw_ldc_tx_delay;
224 extern int vsw_ldc_tx_retries;
225 extern int vsw_ldc_retries;
226 extern int vsw_ldc_delay;
227 extern boolean_t vsw_ldc_rxthr_enabled;
228 extern boolean_t vsw_ldc_txthr_enabled;
229 extern uint32_t vsw_ntxds;
230 extern uint32_t vsw_max_tx_qcount;
231 extern uint32_t vsw_chain_len;
232 extern uint32_t vsw_mblk_size1;
233 extern uint32_t vsw_mblk_size2;
234 extern uint32_t vsw_mblk_size3;
235 extern uint32_t vsw_mblk_size4;
236 extern uint32_t vsw_num_mblks1;
237 extern uint32_t vsw_num_mblks2;
238 extern uint32_t vsw_num_mblks3;
239 extern uint32_t vsw_num_mblks4;
240 extern boolean_t vsw_obp_ver_proto_workaround;
241 extern uint32_t vsw_publish_macaddr_count;
242 extern boolean_t vsw_jumbo_rxpools;
243 
/*
 * Acquire / release all three per-channel locks in one step.
 *
 * The locks are taken in the order callback -> receive -> transmit and
 * dropped in the reverse order. Each macro is wrapped in do { } while (0)
 * so that it expands to exactly one statement; this keeps all three lock
 * operations under the control of an unbraced if/else. Invoke the macros
 * with a trailing semicolon.
 */
#define	LDC_ENTER_LOCK(ldcp)	\
	do {	\
		mutex_enter(&((ldcp)->ldc_cblock));	\
		mutex_enter(&((ldcp)->ldc_rxlock));	\
		mutex_enter(&((ldcp)->ldc_txlock));	\
	} while (0)
#define	LDC_EXIT_LOCK(ldcp)	\
	do {	\
		mutex_exit(&((ldcp)->ldc_txlock));	\
		mutex_exit(&((ldcp)->ldc_rxlock));	\
		mutex_exit(&((ldcp)->ldc_cblock));	\
	} while (0)
252 
/*
 * Compare the version recorded in a channel's outbound lane
 * (lane_out.ver_major / lane_out.ver_minor) against major.minor.
 * Each macro evaluates to non-zero when the relation holds.
 */
#define	VSW_VER_EQ(ldcp, major, minor)	\
	((major) == (ldcp)->lane_out.ver_major &&	\
	    (minor) == (ldcp)->lane_out.ver_minor)

/* Major numbers decide unless they are equal; then the minors decide. */
#define	VSW_VER_LT(ldcp, major, minor)	\
	((ldcp)->lane_out.ver_major != (major) ?	\
	    ((ldcp)->lane_out.ver_major < (major)) :	\
	    ((ldcp)->lane_out.ver_minor < (minor)))

#define	VSW_VER_GTEQ(ldcp, major, minor)	\
	((ldcp)->lane_out.ver_major != (major) ?	\
	    ((ldcp)->lane_out.ver_major > (major)) :	\
	    ((ldcp)->lane_out.ver_minor >= (minor)))
266 
/*
 * VIO Protocol Version Info:
 *
 * The version specified below represents the version of protocol currently
 * supported in the driver. It means the driver can negotiate with peers with
 * versions <= this version. Here is a summary of the feature(s) that are
 * supported at each version of the protocol:
 *
 * 1.0			Basic VIO protocol.
 * 1.1			vDisk protocol update (no virtual network update).
 * 1.2			Support for priority frames (priority-ether-types).
 * 1.3			VLAN and HybridIO support.
 * 1.4			Jumbo Frame support.
 * 1.5			Link State Notification support with optional support
 * 			for Physical Link information.
 *
 * The table below holds a single entry, for version 1.5.
 */
static	ver_sup_t	vsw_versions[] = { {1, 5} };
284 
/*
 * For the moment the state dump routines have their own
 * private flag.
 *
 * DUMP_STATE is a compile-time switch: when it is non-zero the macros
 * below log message tags, flags and state; when it is zero (the setting
 * here) they all expand to nothing.
 */
#define	DUMP_STATE	0

#if DUMP_STATE

/* Log the three fields of a message tag passed by value. */
#define	DUMP_TAG(tag) \
{			\
	D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype); \
	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype);	\
	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env);	\
}

/* Same as DUMP_TAG, but for a pointer to a message tag. */
#define	DUMP_TAG_PTR(tag) \
{			\
	D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype); \
	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype);	\
	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env);	\
}

/*
 * NOTE(review): DUMP_FLAGS carries its own trailing semicolon, unlike
 * DISPLAY_STATE; a call site that also adds one yields an empty statement.
 */
#define	DUMP_FLAGS(flags) dump_flags(flags);
#define	DISPLAY_STATE()	display_state()

#else

/* Dumping disabled: every macro expands to nothing. */
#define	DUMP_TAG(tag)
#define	DUMP_TAG_PTR(tag)
#define	DUMP_FLAGS(state)
#define	DISPLAY_STATE()

#endif	/* DUMP_STATE */
318 
319 /*
320  * Attach the specified port.
321  *
322  * Returns 0 on success, 1 on failure.
323  */
324 int
325 vsw_port_attach(vsw_port_t *port)
326 {
327 	vsw_t			*vswp = port->p_vswp;
328 	vsw_port_list_t		*plist = &vswp->plist;
329 	vsw_port_t		*p, **pp;
330 	int			i;
331 	int			nids = port->num_ldcs;
332 	uint64_t		*ldcids;
333 	int			rv;
334 
335 	D1(vswp, "%s: enter : port %d", __func__, port->p_instance);
336 
337 	/* port already exists? */
338 	READ_ENTER(&plist->lockrw);
339 	for (p = plist->head; p != NULL; p = p->p_next) {
340 		if (p->p_instance == port->p_instance) {
341 			DWARN(vswp, "%s: port instance %d already attached",
342 			    __func__, p->p_instance);
343 			RW_EXIT(&plist->lockrw);
344 			return (1);
345 		}
346 	}
347 	RW_EXIT(&plist->lockrw);
348 
349 	rw_init(&port->p_ldclist.lockrw, NULL, RW_DRIVER, NULL);
350 
351 	mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL);
352 	mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL);
353 	rw_init(&port->maccl_rwlock, NULL, RW_DRIVER, NULL);
354 
355 	mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL);
356 	cv_init(&port->state_cv, NULL, CV_DRIVER, NULL);
357 	port->state = VSW_PORT_INIT;
358 
359 	D2(vswp, "%s: %d nids", __func__, nids);
360 	ldcids = port->ldc_ids;
361 	for (i = 0; i < nids; i++) {
362 		D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[i]);
363 		if (vsw_ldc_attach(port, (uint64_t)ldcids[i]) != 0) {
364 			DERR(vswp, "%s: ldc_attach failed", __func__);
365 			goto exit_error;
366 		}
367 	}
368 
369 	if (vswp->switching_setup_done == B_TRUE) {
370 		/*
371 		 * If the underlying network device has been setup,
372 		 * then open a mac client and porgram the mac address
373 		 * for this port.
374 		 */
375 		rv = vsw_mac_client_init(vswp, port, VSW_VNETPORT);
376 		if (rv != 0) {
377 			goto exit_error;
378 		}
379 	}
380 
381 	/* create the fdb entry for this port/mac address */
382 	vsw_fdbe_add(vswp, port);
383 
384 	vsw_create_vlans(port, VSW_VNETPORT);
385 
386 	WRITE_ENTER(&plist->lockrw);
387 
388 	/* link it into the list of ports for this vsw instance */
389 	pp = (vsw_port_t **)(&plist->head);
390 	port->p_next = *pp;
391 	*pp = port;
392 	plist->num_ports++;
393 
394 	RW_EXIT(&plist->lockrw);
395 
396 	/*
397 	 * Initialise the port and any ldc's under it.
398 	 */
399 	(void) vsw_init_ldcs(port);
400 
401 	/* announce macaddr of vnet to the physical switch */
402 	if (vsw_publish_macaddr_count != 0) {	/* enabled */
403 		vsw_publish_macaddr(vswp, port);
404 	}
405 
406 	D1(vswp, "%s: exit", __func__);
407 	return (0);
408 
409 exit_error:
410 	rw_destroy(&port->p_ldclist.lockrw);
411 
412 	cv_destroy(&port->state_cv);
413 	mutex_destroy(&port->state_lock);
414 
415 	rw_destroy(&port->maccl_rwlock);
416 	mutex_destroy(&port->tx_lock);
417 	mutex_destroy(&port->mca_lock);
418 	kmem_free(port, sizeof (vsw_port_t));
419 	return (1);
420 }
421 
422 /*
423  * Detach the specified port.
424  *
425  * Returns 0 on success, 1 on failure.
426  */
427 int
428 vsw_port_detach(vsw_t *vswp, int p_instance)
429 {
430 	vsw_port_t	*port = NULL;
431 	vsw_port_list_t	*plist = &vswp->plist;
432 
433 	D1(vswp, "%s: enter: port id %d", __func__, p_instance);
434 
435 	WRITE_ENTER(&plist->lockrw);
436 
437 	if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) {
438 		RW_EXIT(&plist->lockrw);
439 		return (1);
440 	}
441 
442 	if (vsw_plist_del_node(vswp, port)) {
443 		RW_EXIT(&plist->lockrw);
444 		return (1);
445 	}
446 
447 	/* cleanup any HybridIO for this port */
448 	vsw_hio_stop_port(port);
449 
450 	/*
451 	 * No longer need to hold writer lock on port list now
452 	 * that we have unlinked the target port from the list.
453 	 */
454 	RW_EXIT(&plist->lockrw);
455 
456 	/* Cleanup and close the mac client */
457 	vsw_mac_client_cleanup(vswp, port, VSW_VNETPORT);
458 
459 	/* Remove the fdb entry for this port/mac address */
460 	vsw_fdbe_del(vswp, &(port->p_macaddr));
461 	vsw_destroy_vlans(port, VSW_VNETPORT);
462 
463 	/* Remove any multicast addresses.. */
464 	vsw_del_mcst_port(port);
465 
466 	vsw_port_delete(port);
467 
468 	D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance);
469 	return (0);
470 }
471 
472 /*
473  * Detach all active ports.
474  */
475 void
476 vsw_detach_ports(vsw_t *vswp)
477 {
478 	vsw_port_list_t 	*plist = &vswp->plist;
479 	vsw_port_t		*port = NULL;
480 
481 	D1(vswp, "%s: enter", __func__);
482 
483 	WRITE_ENTER(&plist->lockrw);
484 
485 	while ((port = plist->head) != NULL) {
486 		(void) vsw_plist_del_node(vswp, port);
487 
488 		/* cleanup any HybridIO for this port */
489 		vsw_hio_stop_port(port);
490 
491 		/* Cleanup and close the mac client */
492 		vsw_mac_client_cleanup(vswp, port, VSW_VNETPORT);
493 
494 		/* Remove the fdb entry for this port/mac address */
495 		vsw_fdbe_del(vswp, &(port->p_macaddr));
496 		vsw_destroy_vlans(port, VSW_VNETPORT);
497 
498 		/* Remove any multicast addresses.. */
499 		vsw_del_mcst_port(port);
500 
501 		/*
502 		 * No longer need to hold the lock on the port list
503 		 * now that we have unlinked the target port from the
504 		 * list.
505 		 */
506 		RW_EXIT(&plist->lockrw);
507 		vsw_port_delete(port);
508 		WRITE_ENTER(&plist->lockrw);
509 	}
510 	RW_EXIT(&plist->lockrw);
511 
512 	D1(vswp, "%s: exit", __func__);
513 }
514 
515 /*
516  * Delete the specified port.
517  */
518 static void
519 vsw_port_delete(vsw_port_t *port)
520 {
521 	vsw_ldc_list_t 		*ldcl;
522 	vsw_t			*vswp = port->p_vswp;
523 	int			num_ldcs;
524 
525 	D1(vswp, "%s: enter : port id %d", __func__, port->p_instance);
526 
527 	vsw_uninit_ldcs(port);
528 
529 	/*
530 	 * Wait for any pending ctrl msg tasks which reference this
531 	 * port to finish.
532 	 */
533 	vsw_drain_port_taskq(port);
534 
535 	/*
536 	 * Wait for any active callbacks to finish
537 	 */
538 	vsw_drain_ldcs(port);
539 
540 	ldcl = &port->p_ldclist;
541 	num_ldcs = port->num_ldcs;
542 	WRITE_ENTER(&ldcl->lockrw);
543 	while (num_ldcs > 0) {
544 		vsw_ldc_detach(port, ldcl->head->ldc_id);
545 		num_ldcs--;
546 	}
547 	RW_EXIT(&ldcl->lockrw);
548 
549 	rw_destroy(&port->p_ldclist.lockrw);
550 
551 	rw_destroy(&port->maccl_rwlock);
552 	mutex_destroy(&port->mca_lock);
553 	mutex_destroy(&port->tx_lock);
554 
555 	cv_destroy(&port->state_cv);
556 	mutex_destroy(&port->state_lock);
557 
558 	if (port->num_ldcs != 0) {
559 		kmem_free(port->ldc_ids, port->num_ldcs * sizeof (uint64_t));
560 		port->num_ldcs = 0;
561 	}
562 
563 	if (port->nvids != 0) {
564 		kmem_free(port->vids, sizeof (vsw_vlanid_t) * port->nvids);
565 	}
566 
567 	kmem_free(port, sizeof (vsw_port_t));
568 
569 	D1(vswp, "%s: exit", __func__);
570 }
571 
572 static int
573 vsw_init_multipools(vsw_ldc_t *ldcp, vsw_t *vswp)
574 {
575 	size_t		data_sz;
576 	int		rv;
577 	uint32_t	sz1 = 0;
578 	uint32_t	sz2 = 0;
579 	uint32_t	sz3 = 0;
580 	uint32_t	sz4 = 0;
581 
582 	/*
583 	 * We round up the mtu specified to be a multiple of 2K to limit the
584 	 * number of rx buffer pools created for a given mtu.
585 	 */
586 	data_sz = vswp->max_frame_size + VNET_IPALIGN + VNET_LDCALIGN;
587 	data_sz = VNET_ROUNDUP_2K(data_sz);
588 
589 	/*
590 	 * If pool sizes are specified, use them. Note that the presence of
591 	 * the first tunable will be used as a hint.
592 	 */
593 	if (vsw_mblk_size1 != 0) {
594 		sz1 = vsw_mblk_size1;
595 		sz2 = vsw_mblk_size2;
596 		sz3 = vsw_mblk_size3;
597 		sz4 = vsw_mblk_size4;
598 
599 		if (sz4 == 0) { /* need 3 pools */
600 
601 			ldcp->max_rxpool_size = sz3;
602 			rv = vio_init_multipools(&ldcp->vmp,
603 			    VSW_NUM_VMPOOLS, sz1, sz2, sz3,
604 			    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3);
605 
606 		} else {
607 
608 			ldcp->max_rxpool_size = sz4;
609 			rv = vio_init_multipools(&ldcp->vmp,
610 			    VSW_NUM_VMPOOLS + 1, sz1, sz2, sz3, sz4,
611 			    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3,
612 			    vsw_num_mblks4);
613 
614 		}
615 
616 		return (rv);
617 	}
618 
619 	/*
620 	 * Pool sizes are not specified. We select the pool sizes based on the
621 	 * mtu if vnet_jumbo_rxpools is enabled.
622 	 */
623 	if (vsw_jumbo_rxpools == B_FALSE || data_sz == VNET_2K) {
624 		/*
625 		 * Receive buffer pool allocation based on mtu is disabled.
626 		 * Use the default mechanism of standard size pool allocation.
627 		 */
628 		sz1 = VSW_MBLK_SZ_128;
629 		sz2 = VSW_MBLK_SZ_256;
630 		sz3 = VSW_MBLK_SZ_2048;
631 		ldcp->max_rxpool_size = sz3;
632 
633 		rv = vio_init_multipools(&ldcp->vmp, VSW_NUM_VMPOOLS,
634 		    sz1, sz2, sz3,
635 		    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3);
636 
637 		return (rv);
638 	}
639 
640 	switch (data_sz) {
641 
642 	case VNET_4K:
643 
644 		sz1 = VSW_MBLK_SZ_128;
645 		sz2 = VSW_MBLK_SZ_256;
646 		sz3 = VSW_MBLK_SZ_2048;
647 		sz4 = sz3 << 1;			/* 4K */
648 		ldcp->max_rxpool_size = sz4;
649 
650 		rv = vio_init_multipools(&ldcp->vmp, VSW_NUM_VMPOOLS + 1,
651 		    sz1, sz2, sz3, sz4,
652 		    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3,
653 		    vsw_num_mblks4);
654 		break;
655 
656 	default:	/* data_sz:  4K+ to 16K */
657 
658 		sz1 = VSW_MBLK_SZ_256;
659 		sz2 = VSW_MBLK_SZ_2048;
660 		sz3 = data_sz >> 1;	/* Jumbo-size/2 */
661 		sz4 = data_sz;	/* Jumbo-size */
662 		ldcp->max_rxpool_size = sz4;
663 
664 		rv = vio_init_multipools(&ldcp->vmp, VSW_NUM_VMPOOLS + 1,
665 		    sz1, sz2, sz3, sz4,
666 		    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3,
667 		    vsw_num_mblks4);
668 		break;
669 	}
670 
671 	return (rv);
672 
673 }
674 
675 /*
676  * Attach a logical domain channel (ldc) under a specified port.
677  *
678  * Returns 0 on success, 1 on failure.
679  */
680 static int
681 vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id)
682 {
683 	vsw_t 		*vswp = port->p_vswp;
684 	vsw_ldc_list_t *ldcl = &port->p_ldclist;
685 	vsw_ldc_t 	*ldcp = NULL;
686 	ldc_attr_t 	attr;
687 	ldc_status_t	istatus;
688 	int 		status = DDI_FAILURE;
689 	char		kname[MAXNAMELEN];
690 	enum		{ PROG_init = 0x0,
691 			    PROG_callback = 0x1, PROG_rx_thread = 0x2,
692 			    PROG_tx_thread = 0x4}
693 			progress;
694 
695 	progress = PROG_init;
696 
697 	D1(vswp, "%s: enter", __func__);
698 
699 	ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP);
700 	if (ldcp == NULL) {
701 		DERR(vswp, "%s: kmem_zalloc failed", __func__);
702 		return (1);
703 	}
704 	ldcp->ldc_id = ldc_id;
705 
706 	mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL);
707 	mutex_init(&ldcp->ldc_rxlock, NULL, MUTEX_DRIVER, NULL);
708 	mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL);
709 	mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL);
710 	cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL);
711 	rw_init(&ldcp->lane_in.dlistrw, NULL, RW_DRIVER, NULL);
712 	rw_init(&ldcp->lane_out.dlistrw, NULL, RW_DRIVER, NULL);
713 
714 	/* required for handshake with peer */
715 	ldcp->local_session = (uint64_t)ddi_get_lbolt();
716 	ldcp->peer_session = 0;
717 	ldcp->session_status = 0;
718 	ldcp->hss_id = 1;	/* Initial handshake session id */
719 
720 	(void) atomic_swap_32(&port->p_hio_capable, B_FALSE);
721 
722 	/* only set for outbound lane, inbound set by peer */
723 	vsw_set_lane_attr(vswp, &ldcp->lane_out);
724 
725 	attr.devclass = LDC_DEV_NT_SVC;
726 	attr.instance = ddi_get_instance(vswp->dip);
727 	attr.mode = LDC_MODE_UNRELIABLE;
728 	attr.mtu = VSW_LDC_MTU;
729 	status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle);
730 	if (status != 0) {
731 		DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)",
732 		    __func__, ldc_id, status);
733 		goto ldc_attach_fail;
734 	}
735 
736 	if (vsw_ldc_rxthr_enabled) {
737 		ldcp->rx_thr_flags = 0;
738 
739 		mutex_init(&ldcp->rx_thr_lock, NULL, MUTEX_DRIVER, NULL);
740 		cv_init(&ldcp->rx_thr_cv, NULL, CV_DRIVER, NULL);
741 		ldcp->rx_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
742 		    vsw_ldc_rx_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
743 
744 		progress |= PROG_rx_thread;
745 		if (ldcp->rx_thread == NULL) {
746 			DWARN(vswp, "%s(%lld): Failed to create worker thread",
747 			    __func__, ldc_id);
748 			goto ldc_attach_fail;
749 		}
750 	}
751 
752 	if (vsw_ldc_txthr_enabled) {
753 		ldcp->tx_thr_flags = 0;
754 		ldcp->tx_mhead = ldcp->tx_mtail = NULL;
755 
756 		mutex_init(&ldcp->tx_thr_lock, NULL, MUTEX_DRIVER, NULL);
757 		cv_init(&ldcp->tx_thr_cv, NULL, CV_DRIVER, NULL);
758 		ldcp->tx_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
759 		    vsw_ldc_tx_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
760 
761 		progress |= PROG_tx_thread;
762 		if (ldcp->tx_thread == NULL) {
763 			DWARN(vswp, "%s(%lld): Failed to create worker thread",
764 			    __func__, ldc_id);
765 			goto ldc_attach_fail;
766 		}
767 	}
768 
769 	status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp);
770 	if (status != 0) {
771 		DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)",
772 		    __func__, ldc_id, status);
773 		(void) ldc_fini(ldcp->ldc_handle);
774 		goto ldc_attach_fail;
775 	}
776 	/*
777 	 * allocate a message for ldc_read()s, big enough to hold ctrl and
778 	 * data msgs, including raw data msgs used to recv priority frames.
779 	 */
780 	ldcp->msglen = VIO_PKT_DATA_HDRSIZE + vswp->max_frame_size;
781 	ldcp->ldcmsg = kmem_alloc(ldcp->msglen, KM_SLEEP);
782 
783 	progress |= PROG_callback;
784 
785 	mutex_init(&ldcp->status_lock, NULL, MUTEX_DRIVER, NULL);
786 
787 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
788 		DERR(vswp, "%s: ldc_status failed", __func__);
789 		mutex_destroy(&ldcp->status_lock);
790 		goto ldc_attach_fail;
791 	}
792 
793 	ldcp->ldc_status = istatus;
794 	ldcp->ldc_port = port;
795 	ldcp->ldc_vswp = vswp;
796 
797 	vsw_reset_vnet_proto_ops(ldcp);
798 
799 	(void) sprintf(kname, "%sldc0x%lx", DRV_NAME, ldcp->ldc_id);
800 	ldcp->ksp = vgen_setup_kstats(DRV_NAME, vswp->instance,
801 	    kname, &ldcp->ldc_stats);
802 	if (ldcp->ksp == NULL) {
803 		DERR(vswp, "%s: kstats setup failed", __func__);
804 		goto ldc_attach_fail;
805 	}
806 
807 	/* link it into the list of channels for this port */
808 	WRITE_ENTER(&ldcl->lockrw);
809 	ldcp->ldc_next = ldcl->head;
810 	ldcl->head = ldcp;
811 	RW_EXIT(&ldcl->lockrw);
812 
813 	D1(vswp, "%s: exit", __func__);
814 	return (0);
815 
816 ldc_attach_fail:
817 
818 	if (progress & PROG_callback) {
819 		(void) ldc_unreg_callback(ldcp->ldc_handle);
820 		kmem_free(ldcp->ldcmsg, ldcp->msglen);
821 	}
822 
823 	if (progress & PROG_rx_thread) {
824 		if (ldcp->rx_thread != NULL) {
825 			vsw_stop_rx_thread(ldcp);
826 		}
827 		mutex_destroy(&ldcp->rx_thr_lock);
828 		cv_destroy(&ldcp->rx_thr_cv);
829 	}
830 
831 	if (progress & PROG_tx_thread) {
832 		if (ldcp->tx_thread != NULL) {
833 			vsw_stop_tx_thread(ldcp);
834 		}
835 		mutex_destroy(&ldcp->tx_thr_lock);
836 		cv_destroy(&ldcp->tx_thr_cv);
837 	}
838 	if (ldcp->ksp != NULL) {
839 		vgen_destroy_kstats(ldcp->ksp);
840 	}
841 	mutex_destroy(&ldcp->ldc_txlock);
842 	mutex_destroy(&ldcp->ldc_rxlock);
843 	mutex_destroy(&ldcp->ldc_cblock);
844 	mutex_destroy(&ldcp->drain_cv_lock);
845 
846 	cv_destroy(&ldcp->drain_cv);
847 
848 	rw_destroy(&ldcp->lane_in.dlistrw);
849 	rw_destroy(&ldcp->lane_out.dlistrw);
850 
851 	kmem_free(ldcp, sizeof (vsw_ldc_t));
852 
853 	return (1);
854 }
855 
856 /*
857  * Detach a logical domain channel (ldc) belonging to a
858  * particular port.
859  */
860 static void
861 vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id)
862 {
863 	vsw_t 		*vswp = port->p_vswp;
864 	vsw_ldc_t 	*ldcp, *prev_ldcp;
865 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
866 	int 		rv;
867 	int		retries = 0;
868 
869 	prev_ldcp = ldcl->head;
870 	for (; (ldcp = prev_ldcp) != NULL; prev_ldcp = ldcp->ldc_next) {
871 		if (ldcp->ldc_id == ldc_id) {
872 			break;
873 		}
874 	}
875 
876 	/* specified ldc id not found */
877 	ASSERT(ldcp != NULL);
878 
879 	D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id);
880 
881 	/* Stop the receive thread */
882 	if (ldcp->rx_thread != NULL) {
883 		vsw_stop_rx_thread(ldcp);
884 		mutex_destroy(&ldcp->rx_thr_lock);
885 		cv_destroy(&ldcp->rx_thr_cv);
886 	}
887 	kmem_free(ldcp->ldcmsg, ldcp->msglen);
888 
889 	/* Stop the tx thread */
890 	if (ldcp->tx_thread != NULL) {
891 		vsw_stop_tx_thread(ldcp);
892 		mutex_destroy(&ldcp->tx_thr_lock);
893 		cv_destroy(&ldcp->tx_thr_cv);
894 		if (ldcp->tx_mhead != NULL) {
895 			freemsgchain(ldcp->tx_mhead);
896 			ldcp->tx_mhead = ldcp->tx_mtail = NULL;
897 			ldcp->tx_cnt = 0;
898 		}
899 	}
900 
901 	/* Destory kstats */
902 	vgen_destroy_kstats(ldcp->ksp);
903 
904 	/*
905 	 * Before we can close the channel we must release any mapped
906 	 * resources (e.g. drings).
907 	 */
908 	vsw_free_lane_resources(ldcp, INBOUND);
909 	vsw_free_lane_resources(ldcp, OUTBOUND);
910 
911 	/*
912 	 * Close the channel, retry on EAAGIN.
913 	 */
914 	while ((rv = ldc_close(ldcp->ldc_handle)) == EAGAIN) {
915 		if (++retries > vsw_ldc_retries) {
916 			break;
917 		}
918 		drv_usecwait(vsw_ldc_delay);
919 	}
920 	if (rv != 0) {
921 		cmn_err(CE_NOTE,
922 		    "!vsw%d: Error(%d) closing the channel(0x%lx)\n",
923 		    vswp->instance, rv, ldcp->ldc_id);
924 	}
925 
926 	(void) ldc_fini(ldcp->ldc_handle);
927 
928 	ldcp->ldc_status = LDC_INIT;
929 	ldcp->ldc_handle = NULL;
930 	ldcp->ldc_vswp = NULL;
931 
932 
933 	/*
934 	 * Most likely some mblks are still in use and
935 	 * have not been returned to the pool. These mblks are
936 	 * added to the pool that is maintained in the device instance.
937 	 * Another attempt will be made to destroy the pool
938 	 * when the device detaches.
939 	 */
940 	vio_destroy_multipools(&ldcp->vmp, &vswp->rxh);
941 
942 	/* unlink it from the list */
943 	prev_ldcp = ldcp->ldc_next;
944 
945 	mutex_destroy(&ldcp->ldc_txlock);
946 	mutex_destroy(&ldcp->ldc_rxlock);
947 	mutex_destroy(&ldcp->ldc_cblock);
948 	cv_destroy(&ldcp->drain_cv);
949 	mutex_destroy(&ldcp->drain_cv_lock);
950 	mutex_destroy(&ldcp->status_lock);
951 	rw_destroy(&ldcp->lane_in.dlistrw);
952 	rw_destroy(&ldcp->lane_out.dlistrw);
953 
954 	kmem_free(ldcp, sizeof (vsw_ldc_t));
955 }
956 
957 /*
958  * Open and attempt to bring up the channel. Note that channel
959  * can only be brought up if peer has also opened channel.
960  *
961  * Returns 0 if can open and bring up channel, otherwise
962  * returns 1.
963  */
964 static int
965 vsw_ldc_init(vsw_ldc_t *ldcp)
966 {
967 	vsw_t 		*vswp = ldcp->ldc_vswp;
968 	ldc_status_t	istatus = 0;
969 	int		rv;
970 
971 	D1(vswp, "%s: enter", __func__);
972 
973 	LDC_ENTER_LOCK(ldcp);
974 
975 	/* don't start at 0 in case clients don't like that */
976 	ldcp->next_ident = 1;
977 
978 	rv = ldc_open(ldcp->ldc_handle);
979 	if (rv != 0) {
980 		DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)",
981 		    __func__, ldcp->ldc_id, rv);
982 		LDC_EXIT_LOCK(ldcp);
983 		return (1);
984 	}
985 
986 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
987 		DERR(vswp, "%s: unable to get status", __func__);
988 		LDC_EXIT_LOCK(ldcp);
989 		return (1);
990 
991 	} else if (istatus != LDC_OPEN && istatus != LDC_READY) {
992 		DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY",
993 		    __func__, ldcp->ldc_id, istatus);
994 		LDC_EXIT_LOCK(ldcp);
995 		return (1);
996 	}
997 
998 	mutex_enter(&ldcp->status_lock);
999 	ldcp->ldc_status = istatus;
1000 	mutex_exit(&ldcp->status_lock);
1001 
1002 	rv = ldc_up(ldcp->ldc_handle);
1003 	if (rv != 0) {
1004 		/*
1005 		 * Not a fatal error for ldc_up() to fail, as peer
1006 		 * end point may simply not be ready yet.
1007 		 */
1008 		D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__,
1009 		    ldcp->ldc_id, rv);
1010 		LDC_EXIT_LOCK(ldcp);
1011 		return (1);
1012 	}
1013 
1014 	/*
1015 	 * ldc_up() call is non-blocking so need to explicitly
1016 	 * check channel status to see if in fact the channel
1017 	 * is UP.
1018 	 */
1019 	mutex_enter(&ldcp->status_lock);
1020 	if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) {
1021 		DERR(vswp, "%s: unable to get status", __func__);
1022 		mutex_exit(&ldcp->status_lock);
1023 		LDC_EXIT_LOCK(ldcp);
1024 		return (1);
1025 
1026 	}
1027 
1028 	if (ldcp->ldc_status == LDC_UP) {
1029 		D2(vswp, "%s: channel %ld now UP (%ld)", __func__,
1030 		    ldcp->ldc_id, istatus);
1031 		mutex_exit(&ldcp->status_lock);
1032 		LDC_EXIT_LOCK(ldcp);
1033 
1034 		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
1035 		return (0);
1036 	}
1037 
1038 	mutex_exit(&ldcp->status_lock);
1039 	LDC_EXIT_LOCK(ldcp);
1040 
1041 	D1(vswp, "%s: exit", __func__);
1042 	return (0);
1043 }
1044 
1045 /* disable callbacks on the channel */
1046 static void
1047 vsw_ldc_uninit(vsw_ldc_t *ldcp)
1048 {
1049 	vsw_t	*vswp = ldcp->ldc_vswp;
1050 	int	rv;
1051 
1052 	D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id);
1053 
1054 	LDC_ENTER_LOCK(ldcp);
1055 
1056 	rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE);
1057 	if (rv != 0) {
1058 		cmn_err(CE_NOTE, "!vsw_ldc_uninit(%ld): error disabling "
1059 		    "interrupts (rv = %d)\n", ldcp->ldc_id, rv);
1060 	}
1061 
1062 	mutex_enter(&ldcp->status_lock);
1063 	ldcp->ldc_status = LDC_INIT;
1064 	mutex_exit(&ldcp->status_lock);
1065 
1066 	LDC_EXIT_LOCK(ldcp);
1067 
1068 	D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id);
1069 }
1070 
1071 static int
1072 vsw_init_ldcs(vsw_port_t *port)
1073 {
1074 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
1075 	vsw_ldc_t	*ldcp;
1076 
1077 	READ_ENTER(&ldcl->lockrw);
1078 	ldcp =  ldcl->head;
1079 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
1080 		(void) vsw_ldc_init(ldcp);
1081 	}
1082 	RW_EXIT(&ldcl->lockrw);
1083 
1084 	return (0);
1085 }
1086 
1087 static void
1088 vsw_uninit_ldcs(vsw_port_t *port)
1089 {
1090 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
1091 	vsw_ldc_t	*ldcp;
1092 
1093 	D1(NULL, "vsw_uninit_ldcs: enter\n");
1094 
1095 	READ_ENTER(&ldcl->lockrw);
1096 	ldcp =  ldcl->head;
1097 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
1098 		vsw_ldc_uninit(ldcp);
1099 	}
1100 	RW_EXIT(&ldcl->lockrw);
1101 
1102 	D1(NULL, "vsw_uninit_ldcs: exit\n");
1103 }
1104 
1105 /*
1106  * Wait until the callback(s) associated with the ldcs under the specified
1107  * port have completed.
1108  *
1109  * Prior to this function being invoked each channel under this port
1110  * should have been quiesced via ldc_set_cb_mode(DISABLE).
1111  *
1112  * A short explaination of what we are doing below..
1113  *
1114  * The simplest approach would be to have a reference counter in
1115  * the ldc structure which is increment/decremented by the callbacks as
1116  * they use the channel. The drain function could then simply disable any
1117  * further callbacks and do a cv_wait for the ref to hit zero. Unfortunately
1118  * there is a tiny window here - before the callback is able to get the lock
1119  * on the channel it is interrupted and this function gets to execute. It
1120  * sees that the ref count is zero and believes its free to delete the
1121  * associated data structures.
1122  *
1123  * We get around this by taking advantage of the fact that before the ldc
1124  * framework invokes a callback it sets a flag to indicate that there is a
1125  * callback active (or about to become active). If when we attempt to
1126  * unregister a callback when this active flag is set then the unregister
1127  * will fail with EWOULDBLOCK.
1128  *
1129  * If the unregister fails we do a cv_timedwait. We will either be signaled
1130  * by the callback as it is exiting (note we have to wait a short period to
1131  * allow the callback to return fully to the ldc framework and it to clear
1132  * the active flag), or by the timer expiring. In either case we again attempt
1133  * the unregister. We repeat this until we can succesfully unregister the
1134  * callback.
1135  *
1136  * The reason we use a cv_timedwait rather than a simple cv_wait is to catch
1137  * the case where the callback has finished but the ldc framework has not yet
1138  * cleared the active flag. In this case we would never get a cv_signal.
1139  */
1140 static void
1141 vsw_drain_ldcs(vsw_port_t *port)
1142 {
1143 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
1144 	vsw_ldc_t	*ldcp;
1145 	vsw_t		*vswp = port->p_vswp;
1146 
1147 	D1(vswp, "%s: enter", __func__);
1148 
1149 	READ_ENTER(&ldcl->lockrw);
1150 
1151 	ldcp = ldcl->head;
1152 
1153 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
1154 		/*
1155 		 * If we can unregister the channel callback then we
1156 		 * know that there is no callback either running or
1157 		 * scheduled to run for this channel so move on to next
1158 		 * channel in the list.
1159 		 */
1160 		mutex_enter(&ldcp->drain_cv_lock);
1161 
1162 		/* prompt active callbacks to quit */
1163 		ldcp->drain_state = VSW_LDC_DRAINING;
1164 
1165 		if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) {
1166 			D2(vswp, "%s: unreg callback for chan %ld", __func__,
1167 			    ldcp->ldc_id);
1168 			mutex_exit(&ldcp->drain_cv_lock);
1169 			continue;
1170 		} else {
1171 			/*
1172 			 * If we end up here we know that either 1) a callback
1173 			 * is currently executing, 2) is about to start (i.e.
1174 			 * the ldc framework has set the active flag but
1175 			 * has not actually invoked the callback yet, or 3)
1176 			 * has finished and has returned to the ldc framework
1177 			 * but the ldc framework has not yet cleared the
1178 			 * active bit.
1179 			 *
1180 			 * Wait for it to finish.
1181 			 */
1182 			while (ldc_unreg_callback(ldcp->ldc_handle)
1183 			    == EWOULDBLOCK)
1184 				(void) cv_timedwait(&ldcp->drain_cv,
1185 				    &ldcp->drain_cv_lock, lbolt + hz);
1186 
1187 			mutex_exit(&ldcp->drain_cv_lock);
1188 			D2(vswp, "%s: unreg callback for chan %ld after "
1189 			    "timeout", __func__, ldcp->ldc_id);
1190 		}
1191 	}
1192 	RW_EXIT(&ldcl->lockrw);
1193 
1194 	D1(vswp, "%s: exit", __func__);
1195 }
1196 
1197 /*
1198  * Wait until all tasks which reference this port have completed.
1199  *
1200  * Prior to this function being invoked each channel under this port
1201  * should have been quiesced via ldc_set_cb_mode(DISABLE).
1202  */
1203 static void
1204 vsw_drain_port_taskq(vsw_port_t *port)
1205 {
1206 	vsw_t		*vswp = port->p_vswp;
1207 
1208 	D1(vswp, "%s: enter", __func__);
1209 
1210 	/*
1211 	 * Mark the port as in the process of being detached, and
1212 	 * dispatch a marker task to the queue so we know when all
1213 	 * relevant tasks have completed.
1214 	 */
1215 	mutex_enter(&port->state_lock);
1216 	port->state = VSW_PORT_DETACHING;
1217 
1218 	if ((vswp->taskq_p == NULL) ||
1219 	    (ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task,
1220 	    port, DDI_NOSLEEP) != DDI_SUCCESS)) {
1221 		cmn_err(CE_NOTE, "!vsw%d: unable to dispatch marker task",
1222 		    vswp->instance);
1223 		mutex_exit(&port->state_lock);
1224 		return;
1225 	}
1226 
1227 	/*
1228 	 * Wait for the marker task to finish.
1229 	 */
1230 	while (port->state != VSW_PORT_DETACHABLE)
1231 		cv_wait(&port->state_cv, &port->state_lock);
1232 
1233 	mutex_exit(&port->state_lock);
1234 
1235 	D1(vswp, "%s: exit", __func__);
1236 }
1237 
1238 static void
1239 vsw_marker_task(void *arg)
1240 {
1241 	vsw_port_t	*port = arg;
1242 	vsw_t		*vswp = port->p_vswp;
1243 
1244 	D1(vswp, "%s: enter", __func__);
1245 
1246 	mutex_enter(&port->state_lock);
1247 
1248 	/*
1249 	 * No further tasks should be dispatched which reference
1250 	 * this port so ok to mark it as safe to detach.
1251 	 */
1252 	port->state = VSW_PORT_DETACHABLE;
1253 
1254 	cv_signal(&port->state_cv);
1255 
1256 	mutex_exit(&port->state_lock);
1257 
1258 	D1(vswp, "%s: exit", __func__);
1259 }
1260 
1261 vsw_port_t *
1262 vsw_lookup_port(vsw_t *vswp, int p_instance)
1263 {
1264 	vsw_port_list_t *plist = &vswp->plist;
1265 	vsw_port_t	*port;
1266 
1267 	for (port = plist->head; port != NULL; port = port->p_next) {
1268 		if (port->p_instance == p_instance) {
1269 			D2(vswp, "vsw_lookup_port: found p_instance\n");
1270 			return (port);
1271 		}
1272 	}
1273 
1274 	return (NULL);
1275 }
1276 
1277 void
1278 vsw_vlan_unaware_port_reset(vsw_port_t *portp)
1279 {
1280 	vsw_ldc_list_t 	*ldclp;
1281 	vsw_ldc_t	*ldcp;
1282 
1283 	ldclp = &portp->p_ldclist;
1284 
1285 	READ_ENTER(&ldclp->lockrw);
1286 
1287 	/*
1288 	 * NOTE: for now, we will assume we have a single channel.
1289 	 */
1290 	if (ldclp->head == NULL) {
1291 		RW_EXIT(&ldclp->lockrw);
1292 		return;
1293 	}
1294 	ldcp = ldclp->head;
1295 
1296 	mutex_enter(&ldcp->ldc_cblock);
1297 
1298 	/*
1299 	 * If the peer is vlan_unaware(ver < 1.3), reset channel and terminate
1300 	 * the connection. See comments in vsw_set_vnet_proto_ops().
1301 	 */
1302 	if (ldcp->hphase == VSW_MILESTONE4 && VSW_VER_LT(ldcp, 1, 3) &&
1303 	    portp->nvids != 0) {
1304 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1305 	}
1306 
1307 	mutex_exit(&ldcp->ldc_cblock);
1308 
1309 	RW_EXIT(&ldclp->lockrw);
1310 }
1311 
1312 void
1313 vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate)
1314 {
1315 	vsw_ldc_list_t	*ldclp;
1316 	vsw_ldc_t	*ldcp;
1317 
1318 	ldclp = &portp->p_ldclist;
1319 
1320 	READ_ENTER(&ldclp->lockrw);
1321 
1322 	/*
1323 	 * NOTE: for now, we will assume we have a single channel.
1324 	 */
1325 	if (ldclp->head == NULL) {
1326 		RW_EXIT(&ldclp->lockrw);
1327 		return;
1328 	}
1329 	ldcp = ldclp->head;
1330 
1331 	mutex_enter(&ldcp->ldc_cblock);
1332 
1333 	/*
1334 	 * If the peer is HybridIO capable (ver >= 1.3), reset channel
1335 	 * to trigger re-negotiation, which inturn trigger HybridIO
1336 	 * setup/cleanup.
1337 	 */
1338 	if ((ldcp->hphase == VSW_MILESTONE4) &&
1339 	    (portp->p_hio_capable == B_TRUE)) {
1340 		if (immediate == B_TRUE) {
1341 			(void) ldc_down(ldcp->ldc_handle);
1342 		} else {
1343 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1344 		}
1345 	}
1346 
1347 	mutex_exit(&ldcp->ldc_cblock);
1348 
1349 	RW_EXIT(&ldclp->lockrw);
1350 }
1351 
1352 void
1353 vsw_port_reset(vsw_port_t *portp)
1354 {
1355 	vsw_ldc_list_t 	*ldclp;
1356 	vsw_ldc_t	*ldcp;
1357 
1358 	ldclp = &portp->p_ldclist;
1359 
1360 	READ_ENTER(&ldclp->lockrw);
1361 
1362 	/*
1363 	 * NOTE: for now, we will assume we have a single channel.
1364 	 */
1365 	if (ldclp->head == NULL) {
1366 		RW_EXIT(&ldclp->lockrw);
1367 		return;
1368 	}
1369 	ldcp = ldclp->head;
1370 
1371 	mutex_enter(&ldcp->ldc_cblock);
1372 
1373 	/*
1374 	 * reset channel and terminate the connection.
1375 	 */
1376 	vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1377 
1378 	mutex_exit(&ldcp->ldc_cblock);
1379 
1380 	RW_EXIT(&ldclp->lockrw);
1381 }
1382 
1383 void
1384 vsw_reset_ports(vsw_t *vswp)
1385 {
1386 	vsw_port_list_t	*plist = &vswp->plist;
1387 	vsw_port_t	*portp;
1388 
1389 	READ_ENTER(&plist->lockrw);
1390 	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
1391 		if ((portp->p_hio_capable) && (portp->p_hio_enabled)) {
1392 			vsw_hio_stop_port(portp);
1393 		}
1394 		vsw_port_reset(portp);
1395 	}
1396 	RW_EXIT(&plist->lockrw);
1397 }
1398 
1399 static void
1400 vsw_send_physlink_msg(vsw_ldc_t *ldcp, link_state_t plink_state)
1401 {
1402 	vnet_physlink_msg_t	msg;
1403 	vnet_physlink_msg_t	*msgp = &msg;
1404 	uint32_t		physlink_info = 0;
1405 
1406 	if (plink_state == LINK_STATE_UP) {
1407 		physlink_info |= VNET_PHYSLINK_STATE_UP;
1408 	} else {
1409 		physlink_info |= VNET_PHYSLINK_STATE_DOWN;
1410 	}
1411 
1412 	msgp->tag.vio_msgtype = VIO_TYPE_CTRL;
1413 	msgp->tag.vio_subtype = VIO_SUBTYPE_INFO;
1414 	msgp->tag.vio_subtype_env = VNET_PHYSLINK_INFO;
1415 	msgp->tag.vio_sid = ldcp->local_session;
1416 	msgp->physlink_info = physlink_info;
1417 
1418 	(void) vsw_send_msg(ldcp, msgp, sizeof (msg), B_TRUE);
1419 }
1420 
1421 static void
1422 vsw_port_physlink_update(vsw_port_t *portp)
1423 {
1424 	vsw_ldc_list_t 	*ldclp;
1425 	vsw_ldc_t	*ldcp;
1426 	vsw_t		*vswp;
1427 
1428 	vswp = portp->p_vswp;
1429 	ldclp = &portp->p_ldclist;
1430 
1431 	READ_ENTER(&ldclp->lockrw);
1432 
1433 	/*
1434 	 * NOTE: for now, we will assume we have a single channel.
1435 	 */
1436 	if (ldclp->head == NULL) {
1437 		RW_EXIT(&ldclp->lockrw);
1438 		return;
1439 	}
1440 	ldcp = ldclp->head;
1441 
1442 	mutex_enter(&ldcp->ldc_cblock);
1443 
1444 	/*
1445 	 * If handshake has completed successfully and if the vnet device
1446 	 * has negotiated to get physical link state updates, send a message
1447 	 * with the current state.
1448 	 */
1449 	if (ldcp->hphase == VSW_MILESTONE4 && ldcp->pls_negotiated == B_TRUE) {
1450 		vsw_send_physlink_msg(ldcp, vswp->phys_link_state);
1451 	}
1452 
1453 	mutex_exit(&ldcp->ldc_cblock);
1454 
1455 	RW_EXIT(&ldclp->lockrw);
1456 }
1457 
1458 void
1459 vsw_physlink_update_ports(vsw_t *vswp)
1460 {
1461 	vsw_port_list_t	*plist = &vswp->plist;
1462 	vsw_port_t	*portp;
1463 
1464 	READ_ENTER(&plist->lockrw);
1465 	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
1466 		vsw_port_physlink_update(portp);
1467 	}
1468 	RW_EXIT(&plist->lockrw);
1469 }
1470 
1471 /*
1472  * Search for and remove the specified port from the port
1473  * list. Returns 0 if able to locate and remove port, otherwise
1474  * returns 1.
1475  */
1476 static int
1477 vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port)
1478 {
1479 	vsw_port_list_t *plist = &vswp->plist;
1480 	vsw_port_t	*curr_p, *prev_p;
1481 
1482 	if (plist->head == NULL)
1483 		return (1);
1484 
1485 	curr_p = prev_p = plist->head;
1486 
1487 	while (curr_p != NULL) {
1488 		if (curr_p == port) {
1489 			if (prev_p == curr_p) {
1490 				plist->head = curr_p->p_next;
1491 			} else {
1492 				prev_p->p_next = curr_p->p_next;
1493 			}
1494 			plist->num_ports--;
1495 			break;
1496 		} else {
1497 			prev_p = curr_p;
1498 			curr_p = curr_p->p_next;
1499 		}
1500 	}
1501 	return (0);
1502 }
1503 
1504 /*
1505  * Interrupt handler for ldc messages.
1506  */
1507 static uint_t
1508 vsw_ldc_cb(uint64_t event, caddr_t arg)
1509 {
1510 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
1511 	vsw_t 		*vswp = ldcp->ldc_vswp;
1512 
1513 	D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
1514 
1515 	mutex_enter(&ldcp->ldc_cblock);
1516 	ldcp->ldc_stats.callbacks++;
1517 
1518 	mutex_enter(&ldcp->status_lock);
1519 	if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) {
1520 		mutex_exit(&ldcp->status_lock);
1521 		mutex_exit(&ldcp->ldc_cblock);
1522 		return (LDC_SUCCESS);
1523 	}
1524 	mutex_exit(&ldcp->status_lock);
1525 
1526 	if (event & LDC_EVT_UP) {
1527 		/*
1528 		 * Channel has come up.
1529 		 */
1530 		D2(vswp, "%s: id(%ld) event(%llx) UP: status(%ld)",
1531 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1532 
1533 		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
1534 
1535 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
1536 	}
1537 
1538 	if (event & LDC_EVT_READ) {
1539 		/*
1540 		 * Data available for reading.
1541 		 */
1542 		D2(vswp, "%s: id(ld) event(%llx) data READ",
1543 		    __func__, ldcp->ldc_id, event);
1544 
1545 		if (ldcp->rx_thread != NULL) {
1546 			/*
1547 			 * If the receive thread is enabled, then
1548 			 * wakeup the receive thread to process the
1549 			 * LDC messages.
1550 			 */
1551 			mutex_exit(&ldcp->ldc_cblock);
1552 			mutex_enter(&ldcp->rx_thr_lock);
1553 			if (!(ldcp->rx_thr_flags & VSW_WTHR_DATARCVD)) {
1554 				ldcp->rx_thr_flags |= VSW_WTHR_DATARCVD;
1555 				cv_signal(&ldcp->rx_thr_cv);
1556 			}
1557 			mutex_exit(&ldcp->rx_thr_lock);
1558 			mutex_enter(&ldcp->ldc_cblock);
1559 		} else {
1560 			vsw_process_pkt(ldcp);
1561 		}
1562 
1563 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
1564 
1565 		goto vsw_cb_exit;
1566 	}
1567 
1568 	if (event & (LDC_EVT_DOWN | LDC_EVT_RESET)) {
1569 		D2(vswp, "%s: id(%ld) event (%lx) DOWN/RESET: status(%ld)",
1570 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1571 
1572 		vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
1573 	}
1574 
1575 	/*
1576 	 * Catch either LDC_EVT_WRITE which we don't support or any
1577 	 * unknown event.
1578 	 */
1579 	if (event &
1580 	    ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ)) {
1581 		DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)",
1582 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1583 	}
1584 
1585 vsw_cb_exit:
1586 	mutex_exit(&ldcp->ldc_cblock);
1587 
1588 	/*
1589 	 * Let the drain function know we are finishing if it
1590 	 * is waiting.
1591 	 */
1592 	mutex_enter(&ldcp->drain_cv_lock);
1593 	if (ldcp->drain_state == VSW_LDC_DRAINING)
1594 		cv_signal(&ldcp->drain_cv);
1595 	mutex_exit(&ldcp->drain_cv_lock);
1596 
1597 	return (LDC_SUCCESS);
1598 }
1599 
1600 /*
1601  * Reinitialise data structures associated with the channel.
1602  */
1603 static void
1604 vsw_ldc_reinit(vsw_ldc_t *ldcp)
1605 {
1606 	vsw_t		*vswp = ldcp->ldc_vswp;
1607 	vsw_port_t	*port;
1608 	vsw_ldc_list_t	*ldcl;
1609 
1610 	D1(vswp, "%s: enter", __func__);
1611 
1612 	/* free receive mblk pools for the channel */
1613 	vio_destroy_multipools(&ldcp->vmp, &vswp->rxh);
1614 
1615 	port = ldcp->ldc_port;
1616 	ldcl = &port->p_ldclist;
1617 
1618 	READ_ENTER(&ldcl->lockrw);
1619 
1620 	D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__,
1621 	    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
1622 
1623 	vsw_free_lane_resources(ldcp, INBOUND);
1624 	vsw_free_lane_resources(ldcp, OUTBOUND);
1625 	RW_EXIT(&ldcl->lockrw);
1626 
1627 	ldcp->lane_in.lstate = 0;
1628 	ldcp->lane_out.lstate = 0;
1629 
1630 	/* Remove the fdb entry for this port/mac address */
1631 	vsw_fdbe_del(vswp, &(port->p_macaddr));
1632 
1633 	/* remove the port from vlans it has been assigned to */
1634 	vsw_vlan_remove_ids(port, VSW_VNETPORT);
1635 
1636 	/*
1637 	 * Remove parent port from any multicast groups
1638 	 * it may have registered with. Client must resend
1639 	 * multicast add command after handshake completes.
1640 	 */
1641 	vsw_del_mcst_port(port);
1642 
1643 	ldcp->peer_session = 0;
1644 	ldcp->session_status = 0;
1645 	ldcp->hcnt = 0;
1646 	ldcp->hphase = VSW_MILESTONE0;
1647 
1648 	vsw_reset_vnet_proto_ops(ldcp);
1649 
1650 	D1(vswp, "%s: exit", __func__);
1651 }
1652 
1653 /*
1654  * Process a connection event.
1655  *
1656  * Note - care must be taken to ensure that this function is
1657  * not called with the dlistrw lock held.
1658  */
1659 static void
1660 vsw_process_conn_evt(vsw_ldc_t *ldcp, uint16_t evt)
1661 {
1662 	vsw_t		*vswp = ldcp->ldc_vswp;
1663 	vsw_conn_evt_t	*conn = NULL;
1664 
1665 	D1(vswp, "%s: enter", __func__);
1666 
1667 	/*
1668 	 * Check if either a reset or restart event is pending
1669 	 * or in progress. If so just return.
1670 	 *
1671 	 * A VSW_CONN_RESET event originates either with a LDC_RESET_EVT
1672 	 * being received by the callback handler, or a ECONNRESET error
1673 	 * code being returned from a ldc_read() or ldc_write() call.
1674 	 *
1675 	 * A VSW_CONN_RESTART event occurs when some error checking code
1676 	 * decides that there is a problem with data from the channel,
1677 	 * and that the handshake should be restarted.
1678 	 */
1679 	if (((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART)) &&
1680 	    (ldstub((uint8_t *)&ldcp->reset_active)))
1681 		return;
1682 
1683 	/*
1684 	 * If it is an LDC_UP event we first check the recorded
1685 	 * state of the channel. If this is UP then we know that
1686 	 * the channel moving to the UP state has already been dealt
1687 	 * with and don't need to dispatch a  new task.
1688 	 *
1689 	 * The reason for this check is that when we do a ldc_up(),
1690 	 * depending on the state of the peer, we may or may not get
1691 	 * a LDC_UP event. As we can't depend on getting a LDC_UP evt
1692 	 * every time we do ldc_up() we explicitly check the channel
1693 	 * status to see has it come up (ldc_up() is asynch and will
1694 	 * complete at some undefined time), and take the appropriate
1695 	 * action.
1696 	 *
1697 	 * The flip side of this is that we may get a LDC_UP event
1698 	 * when we have already seen that the channel is up and have
1699 	 * dealt with that.
1700 	 */
1701 	mutex_enter(&ldcp->status_lock);
1702 	if (evt == VSW_CONN_UP) {
1703 		if ((ldcp->ldc_status == LDC_UP) || (ldcp->reset_active != 0)) {
1704 			mutex_exit(&ldcp->status_lock);
1705 			return;
1706 		}
1707 	}
1708 	mutex_exit(&ldcp->status_lock);
1709 
1710 	/*
1711 	 * The transaction group id allows us to identify and discard
1712 	 * any tasks which are still pending on the taskq and refer
1713 	 * to the handshake session we are about to restart or reset.
1714 	 * These stale messages no longer have any real meaning.
1715 	 */
1716 	(void) atomic_inc_32(&ldcp->hss_id);
1717 
1718 	ASSERT(vswp->taskq_p != NULL);
1719 
1720 	if ((conn = kmem_zalloc(sizeof (vsw_conn_evt_t), KM_NOSLEEP)) == NULL) {
1721 		cmn_err(CE_WARN, "!vsw%d: unable to allocate memory for"
1722 		    " connection event", vswp->instance);
1723 		goto err_exit;
1724 	}
1725 
1726 	conn->evt = evt;
1727 	conn->ldcp = ldcp;
1728 
1729 	if (ddi_taskq_dispatch(vswp->taskq_p, vsw_conn_task, conn,
1730 	    DDI_NOSLEEP) != DDI_SUCCESS) {
1731 		cmn_err(CE_WARN, "!vsw%d: Can't dispatch connection task",
1732 		    vswp->instance);
1733 
1734 		kmem_free(conn, sizeof (vsw_conn_evt_t));
1735 		goto err_exit;
1736 	}
1737 
1738 	D1(vswp, "%s: exit", __func__);
1739 	return;
1740 
1741 err_exit:
1742 	/*
1743 	 * Have mostly likely failed due to memory shortage. Clear the flag so
1744 	 * that future requests will at least be attempted and will hopefully
1745 	 * succeed.
1746 	 */
1747 	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
1748 		ldcp->reset_active = 0;
1749 }
1750 
1751 /*
1752  * Deal with events relating to a connection. Invoked from a taskq.
1753  */
1754 static void
1755 vsw_conn_task(void *arg)
1756 {
1757 	vsw_conn_evt_t	*conn = (vsw_conn_evt_t *)arg;
1758 	vsw_ldc_t	*ldcp = NULL;
1759 	vsw_port_t	*portp;
1760 	vsw_t		*vswp = NULL;
1761 	uint16_t	evt;
1762 	ldc_status_t	curr_status;
1763 
1764 	ldcp = conn->ldcp;
1765 	evt = conn->evt;
1766 	vswp = ldcp->ldc_vswp;
1767 	portp = ldcp->ldc_port;
1768 
1769 	D1(vswp, "%s: enter", __func__);
1770 
1771 	/* can safely free now have copied out data */
1772 	kmem_free(conn, sizeof (vsw_conn_evt_t));
1773 
1774 	mutex_enter(&ldcp->status_lock);
1775 	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
1776 		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
1777 		    "channel %ld", vswp->instance, ldcp->ldc_id);
1778 		mutex_exit(&ldcp->status_lock);
1779 		return;
1780 	}
1781 
1782 	/*
1783 	 * If we wish to restart the handshake on this channel, then if
1784 	 * the channel is UP we bring it DOWN to flush the underlying
1785 	 * ldc queue.
1786 	 */
1787 	if ((evt == VSW_CONN_RESTART) && (curr_status == LDC_UP))
1788 		(void) ldc_down(ldcp->ldc_handle);
1789 
1790 	if ((portp->p_hio_capable) && (portp->p_hio_enabled)) {
1791 		vsw_hio_stop(vswp, ldcp);
1792 	}
1793 
1794 	/*
1795 	 * re-init all the associated data structures.
1796 	 */
1797 	vsw_ldc_reinit(ldcp);
1798 
1799 	/*
1800 	 * Bring the channel back up (note it does no harm to
1801 	 * do this even if the channel is already UP, Just
1802 	 * becomes effectively a no-op).
1803 	 */
1804 	(void) ldc_up(ldcp->ldc_handle);
1805 
1806 	/*
1807 	 * Check if channel is now UP. This will only happen if
1808 	 * peer has also done a ldc_up().
1809 	 */
1810 	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
1811 		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
1812 		    "channel %ld", vswp->instance, ldcp->ldc_id);
1813 		mutex_exit(&ldcp->status_lock);
1814 		return;
1815 	}
1816 
1817 	ldcp->ldc_status = curr_status;
1818 
1819 	/* channel UP so restart handshake by sending version info */
1820 	if (curr_status == LDC_UP) {
1821 		if (ldcp->hcnt++ > vsw_num_handshakes) {
1822 			cmn_err(CE_WARN, "!vsw%d: exceeded number of permitted"
1823 			    " handshake attempts (%d) on channel %ld",
1824 			    vswp->instance, ldcp->hcnt, ldcp->ldc_id);
1825 			mutex_exit(&ldcp->status_lock);
1826 			return;
1827 		}
1828 
1829 		if (vsw_obp_ver_proto_workaround == B_FALSE &&
1830 		    (ddi_taskq_dispatch(vswp->taskq_p, vsw_send_ver, ldcp,
1831 		    DDI_NOSLEEP) != DDI_SUCCESS)) {
1832 			cmn_err(CE_WARN, "!vsw%d: Can't dispatch version task",
1833 			    vswp->instance);
1834 
1835 			/*
1836 			 * Don't count as valid restart attempt if couldn't
1837 			 * send version msg.
1838 			 */
1839 			if (ldcp->hcnt > 0)
1840 				ldcp->hcnt--;
1841 		}
1842 	}
1843 
1844 	/*
1845 	 * Mark that the process is complete by clearing the flag.
1846 	 *
1847 	 * Note is it possible that the taskq dispatch above may have failed,
1848 	 * most likely due to memory shortage. We still clear the flag so
1849 	 * future attempts will at least be attempted and will hopefully
1850 	 * succeed.
1851 	 */
1852 	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
1853 		ldcp->reset_active = 0;
1854 
1855 	mutex_exit(&ldcp->status_lock);
1856 
1857 	D1(vswp, "%s: exit", __func__);
1858 }
1859 
1860 /*
1861  * returns 0 if legal for event signified by flag to have
1862  * occured at the time it did. Otherwise returns 1.
1863  */
1864 int
1865 vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag)
1866 {
1867 	vsw_t		*vswp = ldcp->ldc_vswp;
1868 	uint64_t	state;
1869 	uint64_t	phase;
1870 
1871 	if (dir == INBOUND)
1872 		state = ldcp->lane_in.lstate;
1873 	else
1874 		state = ldcp->lane_out.lstate;
1875 
1876 	phase = ldcp->hphase;
1877 
1878 	switch (flag) {
1879 	case VSW_VER_INFO_RECV:
1880 		if (phase > VSW_MILESTONE0) {
1881 			DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV"
1882 			    " when in state %d\n", ldcp->ldc_id, phase);
1883 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1884 			return (1);
1885 		}
1886 		break;
1887 
1888 	case VSW_VER_ACK_RECV:
1889 	case VSW_VER_NACK_RECV:
1890 		if (!(state & VSW_VER_INFO_SENT)) {
1891 			DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK or "
1892 			    "VER_NACK when in state %d\n", ldcp->ldc_id, phase);
1893 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1894 			return (1);
1895 		} else
1896 			state &= ~VSW_VER_INFO_SENT;
1897 		break;
1898 
1899 	case VSW_ATTR_INFO_RECV:
1900 		if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) {
1901 			DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV"
1902 			    " when in state %d\n", ldcp->ldc_id, phase);
1903 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1904 			return (1);
1905 		}
1906 		break;
1907 
1908 	case VSW_ATTR_ACK_RECV:
1909 	case VSW_ATTR_NACK_RECV:
1910 		if (!(state & VSW_ATTR_INFO_SENT)) {
1911 			DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK"
1912 			    " or ATTR_NACK when in state %d\n",
1913 			    ldcp->ldc_id, phase);
1914 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1915 			return (1);
1916 		} else
1917 			state &= ~VSW_ATTR_INFO_SENT;
1918 		break;
1919 
1920 	case VSW_DRING_INFO_RECV:
1921 		if (phase < VSW_MILESTONE1) {
1922 			DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV"
1923 			    " when in state %d\n", ldcp->ldc_id, phase);
1924 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1925 			return (1);
1926 		}
1927 		break;
1928 
1929 	case VSW_DRING_ACK_RECV:
1930 	case VSW_DRING_NACK_RECV:
1931 		if (!(state & VSW_DRING_INFO_SENT)) {
1932 			DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK "
1933 			    " or DRING_NACK when in state %d\n",
1934 			    ldcp->ldc_id, phase);
1935 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1936 			return (1);
1937 		} else
1938 			state &= ~VSW_DRING_INFO_SENT;
1939 		break;
1940 
1941 	case VSW_RDX_INFO_RECV:
1942 		if (phase < VSW_MILESTONE3) {
1943 			DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV"
1944 			    " when in state %d\n", ldcp->ldc_id, phase);
1945 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1946 			return (1);
1947 		}
1948 		break;
1949 
1950 	case VSW_RDX_ACK_RECV:
1951 	case VSW_RDX_NACK_RECV:
1952 		if (!(state & VSW_RDX_INFO_SENT)) {
1953 			DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK or "
1954 			    "RDX_NACK when in state %d\n", ldcp->ldc_id, phase);
1955 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1956 			return (1);
1957 		} else
1958 			state &= ~VSW_RDX_INFO_SENT;
1959 		break;
1960 
1961 	case VSW_MCST_INFO_RECV:
1962 		if (phase < VSW_MILESTONE3) {
1963 			DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV"
1964 			    " when in state %d\n", ldcp->ldc_id, phase);
1965 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1966 			return (1);
1967 		}
1968 		break;
1969 
1970 	default:
1971 		DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)",
1972 		    ldcp->ldc_id, flag);
1973 		return (1);
1974 	}
1975 
1976 	if (dir == INBOUND)
1977 		ldcp->lane_in.lstate = state;
1978 	else
1979 		ldcp->lane_out.lstate = state;
1980 
1981 	D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id);
1982 
1983 	return (0);
1984 }
1985 
1986 void
1987 vsw_next_milestone(vsw_ldc_t *ldcp)
1988 {
1989 	vsw_t		*vswp = ldcp->ldc_vswp;
1990 	vsw_port_t	*portp = ldcp->ldc_port;
1991 
1992 	D1(vswp, "%s (chan %lld): enter (phase %ld)", __func__,
1993 	    ldcp->ldc_id, ldcp->hphase);
1994 
1995 	DUMP_FLAGS(ldcp->lane_in.lstate);
1996 	DUMP_FLAGS(ldcp->lane_out.lstate);
1997 
1998 	switch (ldcp->hphase) {
1999 
2000 	case VSW_MILESTONE0:
2001 		/*
2002 		 * If we haven't started to handshake with our peer,
2003 		 * start to do so now.
2004 		 */
2005 		if (ldcp->lane_out.lstate == 0) {
2006 			D2(vswp, "%s: (chan %lld) starting handshake "
2007 			    "with peer", __func__, ldcp->ldc_id);
2008 			vsw_process_conn_evt(ldcp, VSW_CONN_UP);
2009 		}
2010 
2011 		/*
2012 		 * Only way to pass this milestone is to have successfully
2013 		 * negotiated version info.
2014 		 */
2015 		if ((ldcp->lane_in.lstate & VSW_VER_ACK_SENT) &&
2016 		    (ldcp->lane_out.lstate & VSW_VER_ACK_RECV)) {
2017 
2018 			D2(vswp, "%s: (chan %lld) leaving milestone 0",
2019 			    __func__, ldcp->ldc_id);
2020 
2021 			vsw_set_vnet_proto_ops(ldcp);
2022 
2023 			/*
2024 			 * Next milestone is passed when attribute
2025 			 * information has been successfully exchanged.
2026 			 */
2027 			ldcp->hphase = VSW_MILESTONE1;
2028 			vsw_send_attr(ldcp);
2029 
2030 		}
2031 		break;
2032 
2033 	case VSW_MILESTONE1:
2034 		/*
2035 		 * Only way to pass this milestone is to have successfully
2036 		 * negotiated attribute information.
2037 		 */
2038 		if (ldcp->lane_in.lstate & VSW_ATTR_ACK_SENT) {
2039 
2040 			ldcp->hphase = VSW_MILESTONE2;
2041 
2042 			/*
2043 			 * If the peer device has said it wishes to
2044 			 * use descriptor rings then we send it our ring
2045 			 * info, otherwise we just set up a private ring
2046 			 * which we use an internal buffer
2047 			 */
2048 			if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
2049 			    (ldcp->lane_in.xfer_mode & VIO_DRING_MODE_V1_2)) ||
2050 			    (VSW_VER_LT(ldcp, 1, 2) &&
2051 			    (ldcp->lane_in.xfer_mode ==
2052 			    VIO_DRING_MODE_V1_0))) {
2053 				vsw_send_dring_info(ldcp);
2054 			}
2055 		}
2056 		break;
2057 
2058 	case VSW_MILESTONE2:
2059 		/*
2060 		 * If peer has indicated in its attribute message that
2061 		 * it wishes to use descriptor rings then the only way
2062 		 * to pass this milestone is for us to have received
2063 		 * valid dring info.
2064 		 *
2065 		 * If peer is not using descriptor rings then just fall
2066 		 * through.
2067 		 */
2068 		if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
2069 		    (ldcp->lane_in.xfer_mode & VIO_DRING_MODE_V1_2)) ||
2070 		    (VSW_VER_LT(ldcp, 1, 2) &&
2071 		    (ldcp->lane_in.xfer_mode ==
2072 		    VIO_DRING_MODE_V1_0))) {
2073 			if (!(ldcp->lane_in.lstate & VSW_DRING_ACK_SENT))
2074 				break;
2075 		}
2076 
2077 		D2(vswp, "%s: (chan %lld) leaving milestone 2",
2078 		    __func__, ldcp->ldc_id);
2079 
2080 		ldcp->hphase = VSW_MILESTONE3;
2081 		vsw_send_rdx(ldcp);
2082 		break;
2083 
2084 	case VSW_MILESTONE3:
2085 		/*
2086 		 * Pass this milestone when all parameters have been
2087 		 * successfully exchanged and RDX sent in both directions.
2088 		 *
2089 		 * Mark outbound lane as available to transmit data.
2090 		 */
2091 		if ((ldcp->lane_out.lstate & VSW_RDX_ACK_SENT) &&
2092 		    (ldcp->lane_in.lstate & VSW_RDX_ACK_RECV)) {
2093 
2094 			D2(vswp, "%s: (chan %lld) leaving milestone 3",
2095 			    __func__, ldcp->ldc_id);
2096 			D2(vswp, "%s: ** handshake complete (0x%llx : "
2097 			    "0x%llx) **", __func__, ldcp->lane_in.lstate,
2098 			    ldcp->lane_out.lstate);
2099 			ldcp->lane_out.lstate |= VSW_LANE_ACTIVE;
2100 			ldcp->hphase = VSW_MILESTONE4;
2101 			ldcp->hcnt = 0;
2102 			DISPLAY_STATE();
2103 			/* Start HIO if enabled and capable */
2104 			if ((portp->p_hio_enabled) && (portp->p_hio_capable)) {
2105 				D2(vswp, "%s: start HybridIO setup", __func__);
2106 				vsw_hio_start(vswp, ldcp);
2107 			}
2108 
2109 			if (ldcp->pls_negotiated == B_TRUE) {
2110 				/*
2111 				 * The vnet device has negotiated to get phys
2112 				 * link updates. Now that the handshake with
2113 				 * the vnet device is complete, send an initial
2114 				 * update with the current physical link state.
2115 				 */
2116 				vsw_send_physlink_msg(ldcp,
2117 				    vswp->phys_link_state);
2118 			}
2119 
2120 		} else {
2121 			D2(vswp, "%s: still in milestone 3 (0x%llx : 0x%llx)",
2122 			    __func__, ldcp->lane_in.lstate,
2123 			    ldcp->lane_out.lstate);
2124 		}
2125 		break;
2126 
2127 	case VSW_MILESTONE4:
2128 		D2(vswp, "%s: (chan %lld) in milestone 4", __func__,
2129 		    ldcp->ldc_id);
2130 		break;
2131 
2132 	default:
2133 		DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__,
2134 		    ldcp->ldc_id, ldcp->hphase);
2135 	}
2136 
2137 	D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id,
2138 	    ldcp->hphase);
2139 }
2140 
2141 /*
2142  * Check if major version is supported.
2143  *
2144  * Returns 0 if finds supported major number, and if necessary
2145  * adjusts the minor field.
2146  *
2147  * Returns 1 if can't match major number exactly. Sets mjor/minor
2148  * to next lowest support values, or to zero if no other values possible.
2149  */
2150 static int
2151 vsw_supported_version(vio_ver_msg_t *vp)
2152 {
2153 	int	i;
2154 
2155 	D1(NULL, "vsw_supported_version: enter");
2156 
2157 	for (i = 0; i < VSW_NUM_VER; i++) {
2158 		if (vsw_versions[i].ver_major == vp->ver_major) {
2159 			/*
2160 			 * Matching or lower major version found. Update
2161 			 * minor number if necessary.
2162 			 */
2163 			if (vp->ver_minor > vsw_versions[i].ver_minor) {
2164 				D2(NULL, "%s: adjusting minor value from %d "
2165 				    "to %d", __func__, vp->ver_minor,
2166 				    vsw_versions[i].ver_minor);
2167 				vp->ver_minor = vsw_versions[i].ver_minor;
2168 			}
2169 
2170 			return (0);
2171 		}
2172 
2173 		/*
2174 		 * If the message contains a higher major version number, set
2175 		 * the message's major/minor versions to the current values
2176 		 * and return false, so this message will get resent with
2177 		 * these values.
2178 		 */
2179 		if (vsw_versions[i].ver_major < vp->ver_major) {
2180 			D2(NULL, "%s: adjusting major and minor "
2181 			    "values to %d, %d\n",
2182 			    __func__, vsw_versions[i].ver_major,
2183 			    vsw_versions[i].ver_minor);
2184 			vp->ver_major = vsw_versions[i].ver_major;
2185 			vp->ver_minor = vsw_versions[i].ver_minor;
2186 			return (1);
2187 		}
2188 	}
2189 
2190 	/* No match was possible, zero out fields */
2191 	vp->ver_major = 0;
2192 	vp->ver_minor = 0;
2193 
2194 	D1(NULL, "vsw_supported_version: exit");
2195 
2196 	return (1);
2197 }
2198 
2199 /*
2200  * Set vnet-protocol-version dependent functions based on version.
2201  */
2202 static void
2203 vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp)
2204 {
2205 	vsw_t	*vswp = ldcp->ldc_vswp;
2206 	lane_t	*lp = &ldcp->lane_out;
2207 
2208 	if (VSW_VER_GTEQ(ldcp, 1, 4)) {
2209 		/*
2210 		 * If the version negotiated with peer is >= 1.4(Jumbo Frame
2211 		 * Support), set the mtu in our attributes to max_frame_size.
2212 		 */
2213 		lp->mtu = vswp->max_frame_size;
2214 	} else if (VSW_VER_EQ(ldcp, 1, 3)) {
2215 		/*
2216 		 * If the version negotiated with peer is == 1.3 (Vlan Tag
2217 		 * Support) set the attr.mtu to ETHERMAX + VLAN_TAGSZ.
2218 		 */
2219 		lp->mtu = ETHERMAX + VLAN_TAGSZ;
2220 	} else {
2221 		vsw_port_t	*portp = ldcp->ldc_port;
2222 		/*
2223 		 * Pre-1.3 peers expect max frame size of ETHERMAX.
2224 		 * We can negotiate that size with those peers provided only
2225 		 * pvid is defined for our peer and there are no vids. Then we
2226 		 * can send/recv only untagged frames of max size ETHERMAX.
2227 		 * Note that pvid of the peer can be different, as vsw has to
2228 		 * serve the vnet in that vlan even if itself is not assigned
2229 		 * to that vlan.
2230 		 */
2231 		if (portp->nvids == 0) {
2232 			lp->mtu = ETHERMAX;
2233 		}
2234 	}
2235 
2236 	if (VSW_VER_GTEQ(ldcp, 1, 2)) {
2237 		/* Versions >= 1.2 */
2238 
2239 		if (VSW_PRI_ETH_DEFINED(vswp)) {
2240 			/*
2241 			 * enable priority routines and pkt mode only if
2242 			 * at least one pri-eth-type is specified in MD.
2243 			 */
2244 			ldcp->tx = vsw_ldctx_pri;
2245 			ldcp->rx_pktdata = vsw_process_pkt_data;
2246 
2247 			/* set xfer mode for vsw_send_attr() */
2248 			lp->xfer_mode = VIO_PKT_MODE | VIO_DRING_MODE_V1_2;
2249 		} else {
2250 			/* no priority eth types defined in MD */
2251 
2252 			ldcp->tx = vsw_ldctx;
2253 			ldcp->rx_pktdata = vsw_process_pkt_data_nop;
2254 
2255 			/* set xfer mode for vsw_send_attr() */
2256 			lp->xfer_mode = VIO_DRING_MODE_V1_2;
2257 		}
2258 
2259 	} else {
2260 		/* Versions prior to 1.2  */
2261 
2262 		vsw_reset_vnet_proto_ops(ldcp);
2263 	}
2264 }
2265 
2266 /*
2267  * Reset vnet-protocol-version dependent functions to v1.0.
2268  */
2269 static void
2270 vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp)
2271 {
2272 	lane_t	*lp = &ldcp->lane_out;
2273 
2274 	ldcp->tx = vsw_ldctx;
2275 	ldcp->rx_pktdata = vsw_process_pkt_data_nop;
2276 
2277 	/* set xfer mode for vsw_send_attr() */
2278 	lp->xfer_mode = VIO_DRING_MODE_V1_0;
2279 }
2280 
2281 /*
2282  * Main routine for processing messages received over LDC.
2283  */
2284 static void
2285 vsw_process_pkt(void *arg)
2286 {
2287 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
2288 	vsw_t 		*vswp = ldcp->ldc_vswp;
2289 	size_t		msglen;
2290 	vio_msg_tag_t	*tagp;
2291 	uint64_t	*ldcmsg;
2292 	int 		rv = 0;
2293 
2294 
2295 	D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
2296 
2297 	ASSERT(MUTEX_HELD(&ldcp->ldc_cblock));
2298 
2299 	ldcmsg = ldcp->ldcmsg;
2300 	/*
2301 	 * If channel is up read messages until channel is empty.
2302 	 */
2303 	do {
2304 		msglen = ldcp->msglen;
2305 		rv = ldc_read(ldcp->ldc_handle, (caddr_t)ldcmsg, &msglen);
2306 
2307 		if (rv != 0) {
2308 			DERR(vswp, "%s :ldc_read err id(%lld) rv(%d) len(%d)\n",
2309 			    __func__, ldcp->ldc_id, rv, msglen);
2310 		}
2311 
2312 		/* channel has been reset */
2313 		if (rv == ECONNRESET) {
2314 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
2315 			break;
2316 		}
2317 
2318 		if (msglen == 0) {
2319 			D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__,
2320 			    ldcp->ldc_id);
2321 			break;
2322 		}
2323 
2324 		D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__,
2325 		    ldcp->ldc_id, msglen);
2326 
2327 		/*
2328 		 * Figure out what sort of packet we have gotten by
2329 		 * examining the msg tag, and then switch it appropriately.
2330 		 */
2331 		tagp = (vio_msg_tag_t *)ldcmsg;
2332 
2333 		switch (tagp->vio_msgtype) {
2334 		case VIO_TYPE_CTRL:
2335 			vsw_dispatch_ctrl_task(ldcp, ldcmsg, tagp);
2336 			break;
2337 		case VIO_TYPE_DATA:
2338 			vsw_process_data_pkt(ldcp, ldcmsg, tagp, msglen);
2339 			break;
2340 		case VIO_TYPE_ERR:
2341 			vsw_process_err_pkt(ldcp, ldcmsg, tagp);
2342 			break;
2343 		default:
2344 			DERR(vswp, "%s: Unknown tag(%lx) ", __func__,
2345 			    "id(%lx)\n", tagp->vio_msgtype, ldcp->ldc_id);
2346 			break;
2347 		}
2348 	} while (msglen);
2349 
2350 	D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id);
2351 }
2352 
2353 /*
2354  * Dispatch a task to process a VIO control message.
2355  */
2356 static void
2357 vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t *tagp)
2358 {
2359 	vsw_ctrl_task_t		*ctaskp = NULL;
2360 	vsw_port_t		*port = ldcp->ldc_port;
2361 	vsw_t			*vswp = port->p_vswp;
2362 
2363 	D1(vswp, "%s: enter", __func__);
2364 
2365 	/*
2366 	 * We need to handle RDX ACK messages in-band as once they
2367 	 * are exchanged it is possible that we will get an
2368 	 * immediate (legitimate) data packet.
2369 	 */
2370 	if ((tagp->vio_subtype_env == VIO_RDX) &&
2371 	    (tagp->vio_subtype == VIO_SUBTYPE_ACK)) {
2372 
2373 		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_ACK_RECV))
2374 			return;
2375 
2376 		ldcp->lane_in.lstate |= VSW_RDX_ACK_RECV;
2377 		D2(vswp, "%s (%ld) handling RDX_ACK in place "
2378 		    "(ostate 0x%llx : hphase %d)", __func__,
2379 		    ldcp->ldc_id, ldcp->lane_in.lstate, ldcp->hphase);
2380 		vsw_next_milestone(ldcp);
2381 		return;
2382 	}
2383 
2384 	ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP);
2385 
2386 	if (ctaskp == NULL) {
2387 		DERR(vswp, "%s: unable to alloc space for ctrl msg", __func__);
2388 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2389 		return;
2390 	}
2391 
2392 	ctaskp->ldcp = ldcp;
2393 	bcopy((def_msg_t *)cpkt, &ctaskp->pktp, sizeof (def_msg_t));
2394 	ctaskp->hss_id = ldcp->hss_id;
2395 
2396 	/*
2397 	 * Dispatch task to processing taskq if port is not in
2398 	 * the process of being detached.
2399 	 */
2400 	mutex_enter(&port->state_lock);
2401 	if (port->state == VSW_PORT_INIT) {
2402 		if ((vswp->taskq_p == NULL) ||
2403 		    (ddi_taskq_dispatch(vswp->taskq_p, vsw_process_ctrl_pkt,
2404 		    ctaskp, DDI_NOSLEEP) != DDI_SUCCESS)) {
2405 			mutex_exit(&port->state_lock);
2406 			DERR(vswp, "%s: unable to dispatch task to taskq",
2407 			    __func__);
2408 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2409 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2410 			return;
2411 		}
2412 	} else {
2413 		kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2414 		DWARN(vswp, "%s: port %d detaching, not dispatching "
2415 		    "task", __func__, port->p_instance);
2416 	}
2417 
2418 	mutex_exit(&port->state_lock);
2419 
2420 	D2(vswp, "%s: dispatched task to taskq for chan %d", __func__,
2421 	    ldcp->ldc_id);
2422 	D1(vswp, "%s: exit", __func__);
2423 }
2424 
2425 /*
2426  * Process a VIO ctrl message. Invoked from taskq.
2427  */
2428 static void
2429 vsw_process_ctrl_pkt(void *arg)
2430 {
2431 	vsw_ctrl_task_t	*ctaskp = (vsw_ctrl_task_t *)arg;
2432 	vsw_ldc_t	*ldcp = ctaskp->ldcp;
2433 	vsw_t 		*vswp = ldcp->ldc_vswp;
2434 	vio_msg_tag_t	tag;
2435 	uint16_t	env;
2436 
2437 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2438 
2439 	bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t));
2440 	env = tag.vio_subtype_env;
2441 
2442 	/* stale pkt check */
2443 	if (ctaskp->hss_id < ldcp->hss_id) {
2444 		DWARN(vswp, "%s: discarding stale packet belonging to earlier"
2445 		    " (%ld) handshake session", __func__, ctaskp->hss_id);
2446 		kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2447 		return;
2448 	}
2449 
2450 	/* session id check */
2451 	if (ldcp->session_status & VSW_PEER_SESSION) {
2452 		if (ldcp->peer_session != tag.vio_sid) {
2453 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
2454 			    __func__, ldcp->ldc_id, tag.vio_sid);
2455 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2456 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2457 			return;
2458 		}
2459 	}
2460 
2461 	/*
2462 	 * Switch on vio_subtype envelope, then let lower routines
2463 	 * decide if its an INFO, ACK or NACK packet.
2464 	 */
2465 	switch (env) {
2466 	case VIO_VER_INFO:
2467 		vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp);
2468 		break;
2469 	case VIO_DRING_REG:
2470 		vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp);
2471 		break;
2472 	case VIO_DRING_UNREG:
2473 		vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp);
2474 		break;
2475 	case VIO_ATTR_INFO:
2476 		vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp);
2477 		break;
2478 	case VNET_MCAST_INFO:
2479 		vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp);
2480 		break;
2481 	case VIO_RDX:
2482 		vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp);
2483 		break;
2484 	case VIO_DDS_INFO:
2485 		vsw_process_dds_msg(vswp, ldcp, &ctaskp->pktp);
2486 		break;
2487 
2488 	case VNET_PHYSLINK_INFO:
2489 		vsw_process_physlink_msg(ldcp, &ctaskp->pktp);
2490 		break;
2491 	default:
2492 		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env);
2493 	}
2494 
2495 	kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2496 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
2497 }
2498 
2499 /*
2500  * Version negotiation. We can end up here either because our peer
2501  * has responded to a handshake message we have sent it, or our peer
2502  * has initiated a handshake with us. If its the former then can only
2503  * be ACK or NACK, if its the later can only be INFO.
2504  *
2505  * If its an ACK we move to the next stage of the handshake, namely
2506  * attribute exchange. If its a NACK we see if we can specify another
2507  * version, if we can't we stop.
2508  *
2509  * If it is an INFO we reset all params associated with communication
2510  * in that direction over this channel (remember connection is
2511  * essentially 2 independent simplex channels).
2512  */
2513 void
2514 vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt)
2515 {
2516 	vio_ver_msg_t	*ver_pkt;
2517 	vsw_t 		*vswp = ldcp->ldc_vswp;
2518 
2519 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2520 
2521 	/*
2522 	 * We know this is a ctrl/version packet so
2523 	 * cast it into the correct structure.
2524 	 */
2525 	ver_pkt = (vio_ver_msg_t *)pkt;
2526 
2527 	switch (ver_pkt->tag.vio_subtype) {
2528 	case VIO_SUBTYPE_INFO:
2529 		D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n");
2530 
2531 		/*
2532 		 * Record the session id, which we will use from now
2533 		 * until we see another VER_INFO msg. Even then the
2534 		 * session id in most cases will be unchanged, execpt
2535 		 * if channel was reset.
2536 		 */
2537 		if ((ldcp->session_status & VSW_PEER_SESSION) &&
2538 		    (ldcp->peer_session != ver_pkt->tag.vio_sid)) {
2539 			DERR(vswp, "%s: updating session id for chan %lld "
2540 			    "from %llx to %llx", __func__, ldcp->ldc_id,
2541 			    ldcp->peer_session, ver_pkt->tag.vio_sid);
2542 		}
2543 
2544 		ldcp->peer_session = ver_pkt->tag.vio_sid;
2545 		ldcp->session_status |= VSW_PEER_SESSION;
2546 
2547 		/* Legal message at this time ? */
2548 		if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV))
2549 			return;
2550 
2551 		/*
2552 		 * First check the device class. Currently only expect
2553 		 * to be talking to a network device. In the future may
2554 		 * also talk to another switch.
2555 		 */
2556 		if (ver_pkt->dev_class != VDEV_NETWORK) {
2557 			DERR(vswp, "%s: illegal device class %d", __func__,
2558 			    ver_pkt->dev_class);
2559 
2560 			ver_pkt->tag.vio_sid = ldcp->local_session;
2561 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2562 
2563 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2564 
2565 			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2566 			    sizeof (vio_ver_msg_t), B_TRUE);
2567 
2568 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
2569 			vsw_next_milestone(ldcp);
2570 			return;
2571 		} else {
2572 			ldcp->dev_class = ver_pkt->dev_class;
2573 		}
2574 
2575 		/*
2576 		 * Now check the version.
2577 		 */
2578 		if (vsw_supported_version(ver_pkt) == 0) {
2579 			/*
2580 			 * Support this major version and possibly
2581 			 * adjusted minor version.
2582 			 */
2583 
2584 			D2(vswp, "%s: accepted ver %d:%d", __func__,
2585 			    ver_pkt->ver_major, ver_pkt->ver_minor);
2586 
2587 			/* Store accepted values */
2588 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
2589 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
2590 
2591 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2592 
2593 			ldcp->lane_in.lstate |= VSW_VER_ACK_SENT;
2594 
2595 			if (vsw_obp_ver_proto_workaround == B_TRUE) {
2596 				/*
2597 				 * Send a version info message
2598 				 * using the accepted version that
2599 				 * we are about to ack. Also note that
2600 				 * we send our ver info before we ack.
2601 				 * Otherwise, as soon as receiving the
2602 				 * ack, obp sends attr info msg, which
2603 				 * breaks vsw_check_flag() invoked
2604 				 * from vsw_process_ctrl_attr_pkt();
2605 				 * as we also need VSW_VER_ACK_RECV to
2606 				 * be set in lane_out.lstate, before
2607 				 * we can receive attr info.
2608 				 */
2609 				vsw_send_ver(ldcp);
2610 			}
2611 		} else {
2612 			/*
2613 			 * NACK back with the next lower major/minor
2614 			 * pairing we support (if don't suuport any more
2615 			 * versions then they will be set to zero.
2616 			 */
2617 
2618 			D2(vswp, "%s: replying with ver %d:%d", __func__,
2619 			    ver_pkt->ver_major, ver_pkt->ver_minor);
2620 
2621 			/* Store updated values */
2622 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
2623 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
2624 
2625 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2626 
2627 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
2628 		}
2629 
2630 		DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2631 		ver_pkt->tag.vio_sid = ldcp->local_session;
2632 		(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2633 		    sizeof (vio_ver_msg_t), B_TRUE);
2634 
2635 		vsw_next_milestone(ldcp);
2636 		break;
2637 
2638 	case VIO_SUBTYPE_ACK:
2639 		D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__);
2640 
2641 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV))
2642 			return;
2643 
2644 		/* Store updated values */
2645 		ldcp->lane_out.ver_major = ver_pkt->ver_major;
2646 		ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
2647 
2648 		ldcp->lane_out.lstate |= VSW_VER_ACK_RECV;
2649 		vsw_next_milestone(ldcp);
2650 
2651 		break;
2652 
2653 	case VIO_SUBTYPE_NACK:
2654 		D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__);
2655 
2656 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV))
2657 			return;
2658 
2659 		/*
2660 		 * If our peer sent us a NACK with the ver fields set to
2661 		 * zero then there is nothing more we can do. Otherwise see
2662 		 * if we support either the version suggested, or a lesser
2663 		 * one.
2664 		 */
2665 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
2666 			DERR(vswp, "%s: peer unable to negotiate any "
2667 			    "further.", __func__);
2668 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
2669 			vsw_next_milestone(ldcp);
2670 			return;
2671 		}
2672 
2673 		/*
2674 		 * Check to see if we support this major version or
2675 		 * a lower one. If we don't then maj/min will be set
2676 		 * to zero.
2677 		 */
2678 		(void) vsw_supported_version(ver_pkt);
2679 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
2680 			/* Nothing more we can do */
2681 			DERR(vswp, "%s: version negotiation failed.\n",
2682 			    __func__);
2683 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
2684 			vsw_next_milestone(ldcp);
2685 		} else {
2686 			/* found a supported major version */
2687 			ldcp->lane_out.ver_major = ver_pkt->ver_major;
2688 			ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
2689 
2690 			D2(vswp, "%s: resending with updated values (%x, %x)",
2691 			    __func__, ver_pkt->ver_major, ver_pkt->ver_minor);
2692 
2693 			ldcp->lane_out.lstate |= VSW_VER_INFO_SENT;
2694 			ver_pkt->tag.vio_sid = ldcp->local_session;
2695 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
2696 
2697 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2698 
2699 			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2700 			    sizeof (vio_ver_msg_t), B_TRUE);
2701 
2702 			vsw_next_milestone(ldcp);
2703 
2704 		}
2705 		break;
2706 
2707 	default:
2708 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
2709 		    ver_pkt->tag.vio_subtype);
2710 	}
2711 
2712 	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
2713 }
2714 
2715 /*
2716  * Process an attribute packet. We can end up here either because our peer
2717  * has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or our
2718  * peer has sent us an attribute INFO message
2719  *
2720  * If its an ACK we then move to the next stage of the handshake which
2721  * is to send our descriptor ring info to our peer. If its a NACK then
2722  * there is nothing more we can (currently) do.
2723  *
2724  * If we get a valid/acceptable INFO packet (and we have already negotiated
2725  * a version) we ACK back and set channel state to ATTR_RECV, otherwise we
2726  * NACK back and reset channel state to INACTIV.
2727  *
2728  * FUTURE: in time we will probably negotiate over attributes, but for
2729  * the moment unacceptable attributes are regarded as a fatal error.
2730  *
2731  */
2732 void
2733 vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt)
2734 {
2735 	vnet_attr_msg_t		*attr_pkt;
2736 	vsw_t			*vswp = ldcp->ldc_vswp;
2737 	vsw_port_t		*port = ldcp->ldc_port;
2738 	uint64_t		macaddr = 0;
2739 	lane_t			*lane_out = &ldcp->lane_out;
2740 	lane_t			*lane_in = &ldcp->lane_in;
2741 	uint32_t		mtu;
2742 	boolean_t		ack = B_TRUE;
2743 	int			i;
2744 
2745 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
2746 
2747 	/*
2748 	 * We know this is a ctrl/attr packet so
2749 	 * cast it into the correct structure.
2750 	 */
2751 	attr_pkt = (vnet_attr_msg_t *)pkt;
2752 
2753 	switch (attr_pkt->tag.vio_subtype) {
2754 	case VIO_SUBTYPE_INFO:
2755 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2756 
2757 		if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV))
2758 			return;
2759 
2760 		/*
2761 		 * If the attributes are unacceptable then we NACK back.
2762 		 */
2763 		if (vsw_check_attr(attr_pkt, ldcp)) {
2764 			ack = B_FALSE;
2765 
2766 			DERR(vswp, "%s (chan %d): invalid attributes",
2767 			    __func__, ldcp->ldc_id);
2768 
2769 		} else {
2770 
2771 			if (VSW_VER_GTEQ(ldcp, 1, 4)) {
2772 				/*
2773 				 * Versions >= 1.4:
2774 				 * The mtu is negotiated down to the
2775 				 * minimum of our mtu and peer's mtu.
2776 				 */
2777 				mtu = MIN(attr_pkt->mtu, vswp->max_frame_size);
2778 
2779 				/*
2780 				 * If we have received an ack for the attr info
2781 				 * that we sent, then check if the mtu computed
2782 				 * above matches the mtu that the peer had ack'd
2783 				 * (saved in local hparams). If they don't
2784 				 * match, we fail the handshake.
2785 				 */
2786 				if (lane_out->lstate & VSW_ATTR_ACK_RECV) {
2787 					if (mtu != lane_out->mtu) {
2788 						/* send NACK */
2789 						ack = B_FALSE;
2790 					}
2791 				} else {
2792 					/*
2793 					 * Save the mtu computed above in our
2794 					 * attr parameters, so it gets sent in
2795 					 * the attr info from us to the peer.
2796 					 */
2797 					lane_out->mtu = mtu;
2798 				}
2799 			}
2800 
2801 		}
2802 
2803 		if (ack == B_FALSE) {
2804 
2805 			vsw_free_lane_resources(ldcp, INBOUND);
2806 
2807 			attr_pkt->tag.vio_sid = ldcp->local_session;
2808 			attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2809 
2810 			DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
2811 			ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT;
2812 			(void) vsw_send_msg(ldcp, (void *)attr_pkt,
2813 			    sizeof (vnet_attr_msg_t), B_TRUE);
2814 
2815 			vsw_next_milestone(ldcp);
2816 			return;
2817 		}
2818 
2819 		/*
2820 		 * Otherwise store attributes for this lane and update
2821 		 * lane state.
2822 		 */
2823 		lane_in->mtu = attr_pkt->mtu;
2824 		lane_in->addr = attr_pkt->addr;
2825 		lane_in->addr_type = attr_pkt->addr_type;
2826 		lane_in->xfer_mode = attr_pkt->xfer_mode;
2827 		lane_in->ack_freq = attr_pkt->ack_freq;
2828 		lane_in->physlink_update = attr_pkt->physlink_update;
2829 
2830 		/*
2831 		 * Check if the client has requested physlink state updates.
2832 		 * If there is a physical device bound to this vswitch (L2
2833 		 * mode), set the ack bits to indicate it is supported.
2834 		 * Otherwise, set the nack bits.
2835 		 */
2836 		if (VSW_VER_GTEQ(ldcp, 1, 5)) {	/* Protocol ver >= 1.5 */
2837 
2838 			/* Does the vnet need phys link state updates ? */
2839 			if ((lane_in->physlink_update &
2840 			    PHYSLINK_UPDATE_STATE_MASK) ==
2841 			    PHYSLINK_UPDATE_STATE) {
2842 
2843 				if (vswp->smode & VSW_LAYER2) {
2844 					/* is a net-dev assigned to us ? */
2845 					attr_pkt->physlink_update =
2846 					    PHYSLINK_UPDATE_STATE_ACK;
2847 					ldcp->pls_negotiated = B_TRUE;
2848 				} else {
2849 					/* not in L2 mode */
2850 					attr_pkt->physlink_update =
2851 					    PHYSLINK_UPDATE_STATE_NACK;
2852 					ldcp->pls_negotiated = B_FALSE;
2853 				}
2854 
2855 			} else {
2856 				attr_pkt->physlink_update =
2857 				    PHYSLINK_UPDATE_NONE;
2858 				ldcp->pls_negotiated = B_FALSE;
2859 			}
2860 
2861 		} else {
2862 			/*
2863 			 * physlink_update bits are ignored
2864 			 * if set by clients < v1.5 protocol.
2865 			 */
2866 			attr_pkt->physlink_update = PHYSLINK_UPDATE_NONE;
2867 			ldcp->pls_negotiated = B_FALSE;
2868 		}
2869 
2870 		if (VSW_VER_GTEQ(ldcp, 1, 4)) {
2871 			/* save the MIN mtu in the msg to be replied */
2872 			attr_pkt->mtu = mtu;
2873 		}
2874 
2875 		macaddr = lane_in->addr;
2876 		for (i = ETHERADDRL - 1; i >= 0; i--) {
2877 			port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF;
2878 			macaddr >>= 8;
2879 		}
2880 
2881 		/* create the fdb entry for this port/mac address */
2882 		vsw_fdbe_add(vswp, port);
2883 
2884 		/* add the port to the specified vlans */
2885 		vsw_vlan_add_ids(port, VSW_VNETPORT);
2886 
2887 		/* setup device specifc xmit routines */
2888 		mutex_enter(&port->tx_lock);
2889 		if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
2890 		    (lane_in->xfer_mode & VIO_DRING_MODE_V1_2)) ||
2891 		    (VSW_VER_LT(ldcp, 1, 2) &&
2892 		    (lane_in->xfer_mode == VIO_DRING_MODE_V1_0))) {
2893 			D2(vswp, "%s: mode = VIO_DRING_MODE", __func__);
2894 			port->transmit = vsw_dringsend;
2895 		} else if (lane_in->xfer_mode == VIO_DESC_MODE) {
2896 			D2(vswp, "%s: mode = VIO_DESC_MODE", __func__);
2897 			vsw_create_privring(ldcp);
2898 			port->transmit = vsw_descrsend;
2899 			lane_out->xfer_mode = VIO_DESC_MODE;
2900 		}
2901 
2902 		/*
2903 		 * HybridIO is supported only vnet, not by OBP.
2904 		 * So, set hio_capable to true only when in DRING mode.
2905 		 */
2906 		if (VSW_VER_GTEQ(ldcp, 1, 3) &&
2907 		    (lane_in->xfer_mode != VIO_DESC_MODE)) {
2908 			(void) atomic_swap_32(&port->p_hio_capable, B_TRUE);
2909 		} else {
2910 			(void) atomic_swap_32(&port->p_hio_capable, B_FALSE);
2911 		}
2912 
2913 		mutex_exit(&port->tx_lock);
2914 
2915 		attr_pkt->tag.vio_sid = ldcp->local_session;
2916 		attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2917 
2918 		DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
2919 
2920 		lane_in->lstate |= VSW_ATTR_ACK_SENT;
2921 
2922 		(void) vsw_send_msg(ldcp, (void *)attr_pkt,
2923 		    sizeof (vnet_attr_msg_t), B_TRUE);
2924 
2925 		vsw_next_milestone(ldcp);
2926 		break;
2927 
2928 	case VIO_SUBTYPE_ACK:
2929 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
2930 
2931 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV))
2932 			return;
2933 
2934 		if (VSW_VER_GTEQ(ldcp, 1, 4)) {
2935 			/*
2936 			 * Versions >= 1.4:
2937 			 * The ack msg sent by the peer contains the minimum of
2938 			 * our mtu (that we had sent in our attr info) and the
2939 			 * peer's mtu.
2940 			 *
2941 			 * If we have sent an ack for the attr info msg from
2942 			 * the peer, check if the mtu that was computed then
2943 			 * (saved in lane_out params) matches the mtu that the
2944 			 * peer has ack'd. If they don't match, we fail the
2945 			 * handshake.
2946 			 */
2947 			if (lane_in->lstate & VSW_ATTR_ACK_SENT) {
2948 				if (lane_out->mtu != attr_pkt->mtu) {
2949 					return;
2950 				}
2951 			} else {
2952 				/*
2953 				 * If the mtu ack'd by the peer is > our mtu
2954 				 * fail handshake. Otherwise, save the mtu, so
2955 				 * we can validate it when we receive attr info
2956 				 * from our peer.
2957 				 */
2958 				if (attr_pkt->mtu > lane_out->mtu) {
2959 					return;
2960 				}
2961 				if (attr_pkt->mtu <= lane_out->mtu) {
2962 					lane_out->mtu = attr_pkt->mtu;
2963 				}
2964 			}
2965 		}
2966 
2967 		lane_out->lstate |= VSW_ATTR_ACK_RECV;
2968 		vsw_next_milestone(ldcp);
2969 		break;
2970 
2971 	case VIO_SUBTYPE_NACK:
2972 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
2973 
2974 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV))
2975 			return;
2976 
2977 		lane_out->lstate |= VSW_ATTR_NACK_RECV;
2978 		vsw_next_milestone(ldcp);
2979 		break;
2980 
2981 	default:
2982 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
2983 		    attr_pkt->tag.vio_subtype);
2984 	}
2985 
2986 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
2987 }
2988 
2989 /*
2990  * Process a dring info packet. We can end up here either because our peer
2991  * has ACK/NACK'ed back to an earlier DRING msg we had sent it, or our
2992  * peer has sent us a dring INFO message.
2993  *
2994  * If we get a valid/acceptable INFO packet (and we have already negotiated
2995  * a version) we ACK back and update the lane state, otherwise we NACK back.
2996  *
2997  * FUTURE: nothing to stop client from sending us info on multiple dring's
2998  * but for the moment we will just use the first one we are given.
2999  *
3000  */
3001 void
3002 vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt)
3003 {
3004 	vio_dring_reg_msg_t	*dring_pkt;
3005 	vsw_t			*vswp = ldcp->ldc_vswp;
3006 	ldc_mem_info_t		minfo;
3007 	dring_info_t		*dp, *dbp;
3008 	int			dring_found = 0;
3009 
3010 	/*
3011 	 * We know this is a ctrl/dring packet so
3012 	 * cast it into the correct structure.
3013 	 */
3014 	dring_pkt = (vio_dring_reg_msg_t *)pkt;
3015 
3016 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
3017 
3018 	switch (dring_pkt->tag.vio_subtype) {
3019 	case VIO_SUBTYPE_INFO:
3020 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3021 
3022 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
3023 			return;
3024 
3025 		/*
3026 		 * If the dring params are unacceptable then we NACK back.
3027 		 */
3028 		if (vsw_check_dring_info(dring_pkt)) {
3029 
3030 			DERR(vswp, "%s (%lld): invalid dring info",
3031 			    __func__, ldcp->ldc_id);
3032 
3033 			vsw_free_lane_resources(ldcp, INBOUND);
3034 
3035 			dring_pkt->tag.vio_sid = ldcp->local_session;
3036 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
3037 
3038 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
3039 
3040 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
3041 
3042 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
3043 			    sizeof (vio_dring_reg_msg_t), B_TRUE);
3044 
3045 			vsw_next_milestone(ldcp);
3046 			return;
3047 		}
3048 
3049 		/*
3050 		 * Otherwise, attempt to map in the dring using the
3051 		 * cookie. If that succeeds we send back a unique dring
3052 		 * identifier that the sending side will use in future
3053 		 * to refer to this descriptor ring.
3054 		 */
3055 		dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
3056 
3057 		dp->num_descriptors = dring_pkt->num_descriptors;
3058 		dp->descriptor_size = dring_pkt->descriptor_size;
3059 		dp->options = dring_pkt->options;
3060 		dp->ncookies = dring_pkt->ncookies;
3061 
3062 		/*
3063 		 * Note: should only get one cookie. Enforced in
3064 		 * the ldc layer.
3065 		 */
3066 		bcopy(&dring_pkt->cookie[0], &dp->cookie[0],
3067 		    sizeof (ldc_mem_cookie_t));
3068 
3069 		D2(vswp, "%s: num_desc %ld : desc_size %ld", __func__,
3070 		    dp->num_descriptors, dp->descriptor_size);
3071 		D2(vswp, "%s: options 0x%lx: ncookies %ld", __func__,
3072 		    dp->options, dp->ncookies);
3073 
3074 		if ((ldc_mem_dring_map(ldcp->ldc_handle, &dp->cookie[0],
3075 		    dp->ncookies, dp->num_descriptors, dp->descriptor_size,
3076 		    LDC_DIRECT_MAP, &(dp->handle))) != 0) {
3077 
3078 			DERR(vswp, "%s: dring_map failed\n", __func__);
3079 
3080 			kmem_free(dp, sizeof (dring_info_t));
3081 			vsw_free_lane_resources(ldcp, INBOUND);
3082 
3083 			dring_pkt->tag.vio_sid = ldcp->local_session;
3084 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
3085 
3086 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
3087 
3088 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
3089 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
3090 			    sizeof (vio_dring_reg_msg_t), B_TRUE);
3091 
3092 			vsw_next_milestone(ldcp);
3093 			return;
3094 		}
3095 
3096 		if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
3097 
3098 			DERR(vswp, "%s: dring_addr failed\n", __func__);
3099 
3100 			kmem_free(dp, sizeof (dring_info_t));
3101 			vsw_free_lane_resources(ldcp, INBOUND);
3102 
3103 			dring_pkt->tag.vio_sid = ldcp->local_session;
3104 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
3105 
3106 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
3107 
3108 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
3109 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
3110 			    sizeof (vio_dring_reg_msg_t), B_TRUE);
3111 
3112 			vsw_next_milestone(ldcp);
3113 			return;
3114 		} else {
3115 			/* store the address of the pub part of ring */
3116 			dp->pub_addr = minfo.vaddr;
3117 
3118 			/* cache the dring mtype */
3119 			dp->dring_mtype = minfo.mtype;
3120 		}
3121 
3122 		/* no private section as we are importing */
3123 		dp->priv_addr = NULL;
3124 
3125 		/*
3126 		 * Using simple mono increasing int for ident at
3127 		 * the moment.
3128 		 */
3129 		dp->ident = ldcp->next_ident;
3130 		ldcp->next_ident++;
3131 
3132 		dp->end_idx = 0;
3133 		dp->next = NULL;
3134 
3135 		/*
3136 		 * Link it onto the end of the list of drings
3137 		 * for this lane.
3138 		 */
3139 		if (ldcp->lane_in.dringp == NULL) {
3140 			D2(vswp, "%s: adding first INBOUND dring", __func__);
3141 			ldcp->lane_in.dringp = dp;
3142 		} else {
3143 			dbp = ldcp->lane_in.dringp;
3144 
3145 			while (dbp->next != NULL)
3146 				dbp = dbp->next;
3147 
3148 			dbp->next = dp;
3149 		}
3150 
3151 		/* acknowledge it */
3152 		dring_pkt->tag.vio_sid = ldcp->local_session;
3153 		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3154 		dring_pkt->dring_ident = dp->ident;
3155 
3156 		(void) vsw_send_msg(ldcp, (void *)dring_pkt,
3157 		    sizeof (vio_dring_reg_msg_t), B_TRUE);
3158 
3159 		ldcp->lane_in.lstate |= VSW_DRING_ACK_SENT;
3160 		vsw_next_milestone(ldcp);
3161 		break;
3162 
3163 	case VIO_SUBTYPE_ACK:
3164 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3165 
3166 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV))
3167 			return;
3168 
3169 		/*
3170 		 * Peer is acknowledging our dring info and will have
3171 		 * sent us a dring identifier which we will use to
3172 		 * refer to this ring w.r.t. our peer.
3173 		 */
3174 		dp = ldcp->lane_out.dringp;
3175 		if (dp != NULL) {
3176 			/*
3177 			 * Find the ring this ident should be associated
3178 			 * with.
3179 			 */
3180 			if (vsw_dring_match(dp, dring_pkt)) {
3181 				dring_found = 1;
3182 
3183 			} else while (dp != NULL) {
3184 				if (vsw_dring_match(dp, dring_pkt)) {
3185 					dring_found = 1;
3186 					break;
3187 				}
3188 				dp = dp->next;
3189 			}
3190 
3191 			if (dring_found == 0) {
3192 				DERR(NULL, "%s: unrecognised ring cookie",
3193 				    __func__);
3194 				vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3195 				return;
3196 			}
3197 
3198 		} else {
3199 			DERR(vswp, "%s: DRING ACK received but no drings "
3200 			    "allocated", __func__);
3201 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3202 			return;
3203 		}
3204 
3205 		/* store ident */
3206 		dp->ident = dring_pkt->dring_ident;
3207 		ldcp->lane_out.lstate |= VSW_DRING_ACK_RECV;
3208 		vsw_next_milestone(ldcp);
3209 		break;
3210 
3211 	case VIO_SUBTYPE_NACK:
3212 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3213 
3214 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV))
3215 			return;
3216 
3217 		ldcp->lane_out.lstate |= VSW_DRING_NACK_RECV;
3218 		vsw_next_milestone(ldcp);
3219 		break;
3220 
3221 	default:
3222 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3223 		    dring_pkt->tag.vio_subtype);
3224 	}
3225 
3226 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
3227 }
3228 
3229 /*
3230  * Process a request from peer to unregister a dring.
3231  *
3232  * For the moment we just restart the handshake if our
3233  * peer endpoint attempts to unregister a dring.
3234  */
3235 void
3236 vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt)
3237 {
3238 	vsw_t			*vswp = ldcp->ldc_vswp;
3239 	vio_dring_unreg_msg_t	*dring_pkt;
3240 
3241 	/*
3242 	 * We know this is a ctrl/dring packet so
3243 	 * cast it into the correct structure.
3244 	 */
3245 	dring_pkt = (vio_dring_unreg_msg_t *)pkt;
3246 
3247 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3248 
3249 	switch (dring_pkt->tag.vio_subtype) {
3250 	case VIO_SUBTYPE_INFO:
3251 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3252 
3253 		DWARN(vswp, "%s: restarting handshake..", __func__);
3254 		break;
3255 
3256 	case VIO_SUBTYPE_ACK:
3257 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3258 
3259 		DWARN(vswp, "%s: restarting handshake..", __func__);
3260 		break;
3261 
3262 	case VIO_SUBTYPE_NACK:
3263 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3264 
3265 		DWARN(vswp, "%s: restarting handshake..", __func__);
3266 		break;
3267 
3268 	default:
3269 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3270 		    dring_pkt->tag.vio_subtype);
3271 	}
3272 
3273 	vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3274 
3275 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3276 }
3277 
/*
 * Turn a multicast control message around as a NACK: mark it as
 * VIO_SUBTYPE_NACK, stamp it with our session id and send it back.
 * Wrapped in do/while (0) so it behaves as a single statement, and
 * each argument is parenthesized so any expression may be passed.
 */
#define	SND_MCST_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		(void) vsw_send_msg((ldcp), (void *)(pkt), \
		    sizeof (vnet_mcast_msg_t), B_TRUE); \
	} while (0)
3283 
3284 /*
3285  * Process a multicast request from a vnet.
3286  *
3287  * Vnet's specify a multicast address that they are interested in. This
3288  * address is used as a key into the hash table which forms the multicast
3289  * forwarding database (mFDB).
3290  *
3291  * The table keys are the multicast addresses, while the table entries
3292  * are pointers to lists of ports which wish to receive packets for the
3293  * specified multicast address.
3294  *
3295  * When a multicast packet is being switched we use the address as a key
3296  * into the hash table, and then walk the appropriate port list forwarding
3297  * the pkt to each port in turn.
3298  *
3299  * If a vnet is no longer interested in a particular multicast grouping
3300  * we simply find the correct location in the hash table and then delete
3301  * the relevant port from the port list.
3302  *
3303  * To deal with the case whereby a port is being deleted without first
3304  * removing itself from the lists in the hash table, we maintain a list
3305  * of multicast addresses the port has registered an interest in, within
3306  * the port structure itself. We then simply walk that list of addresses
3307  * using them as keys into the hash table and remove the port from the
3308  * appropriate lists.
3309  */
3310 static void
3311 vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt)
3312 {
3313 	vnet_mcast_msg_t	*mcst_pkt;
3314 	vsw_port_t		*port = ldcp->ldc_port;
3315 	vsw_t			*vswp = ldcp->ldc_vswp;
3316 	int			i;
3317 
3318 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3319 
3320 	/*
3321 	 * We know this is a ctrl/mcast packet so
3322 	 * cast it into the correct structure.
3323 	 */
3324 	mcst_pkt = (vnet_mcast_msg_t *)pkt;
3325 
3326 	switch (mcst_pkt->tag.vio_subtype) {
3327 	case VIO_SUBTYPE_INFO:
3328 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3329 
3330 		/*
3331 		 * Check if in correct state to receive a multicast
3332 		 * message (i.e. handshake complete). If not reset
3333 		 * the handshake.
3334 		 */
3335 		if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV))
3336 			return;
3337 
3338 		/*
3339 		 * Before attempting to add or remove address check
3340 		 * that they are valid multicast addresses.
3341 		 * If not, then NACK back.
3342 		 */
3343 		for (i = 0; i < mcst_pkt->count; i++) {
3344 			if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) {
3345 				DERR(vswp, "%s: invalid multicast address",
3346 				    __func__);
3347 				SND_MCST_NACK(ldcp, mcst_pkt);
3348 				return;
3349 			}
3350 		}
3351 
3352 		/*
3353 		 * Now add/remove the addresses. If this fails we
3354 		 * NACK back.
3355 		 */
3356 		if (vsw_add_rem_mcst(mcst_pkt, port) != 0) {
3357 			SND_MCST_NACK(ldcp, mcst_pkt);
3358 			return;
3359 		}
3360 
3361 		mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3362 		mcst_pkt->tag.vio_sid = ldcp->local_session;
3363 
3364 		DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt);
3365 
3366 		(void) vsw_send_msg(ldcp, (void *)mcst_pkt,
3367 		    sizeof (vnet_mcast_msg_t), B_TRUE);
3368 		break;
3369 
3370 	case VIO_SUBTYPE_ACK:
3371 		DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3372 
3373 		/*
3374 		 * We shouldn't ever get a multicast ACK message as
3375 		 * at the moment we never request multicast addresses
3376 		 * to be set on some other device. This may change in
3377 		 * the future if we have cascading switches.
3378 		 */
3379 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV))
3380 			return;
3381 
3382 				/* Do nothing */
3383 		break;
3384 
3385 	case VIO_SUBTYPE_NACK:
3386 		DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3387 
3388 		/*
3389 		 * We shouldn't get a multicast NACK packet for the
3390 		 * same reasons as we shouldn't get a ACK packet.
3391 		 */
3392 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV))
3393 			return;
3394 
3395 				/* Do nothing */
3396 		break;
3397 
3398 	default:
3399 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
3400 		    mcst_pkt->tag.vio_subtype);
3401 	}
3402 
3403 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3404 }
3405 
3406 static void
3407 vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt)
3408 {
3409 	vio_rdx_msg_t	*rdx_pkt;
3410 	vsw_t		*vswp = ldcp->ldc_vswp;
3411 
3412 	/*
3413 	 * We know this is a ctrl/rdx packet so
3414 	 * cast it into the correct structure.
3415 	 */
3416 	rdx_pkt = (vio_rdx_msg_t *)pkt;
3417 
3418 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
3419 
3420 	switch (rdx_pkt->tag.vio_subtype) {
3421 	case VIO_SUBTYPE_INFO:
3422 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3423 
3424 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_INFO_RECV))
3425 			return;
3426 
3427 		rdx_pkt->tag.vio_sid = ldcp->local_session;
3428 		rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3429 
3430 		DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt);
3431 
3432 		ldcp->lane_out.lstate |= VSW_RDX_ACK_SENT;
3433 
3434 		(void) vsw_send_msg(ldcp, (void *)rdx_pkt,
3435 		    sizeof (vio_rdx_msg_t), B_TRUE);
3436 
3437 		vsw_next_milestone(ldcp);
3438 		break;
3439 
3440 	case VIO_SUBTYPE_ACK:
3441 		/*
3442 		 * Should be handled in-band by callback handler.
3443 		 */
3444 		DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__);
3445 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3446 		break;
3447 
3448 	case VIO_SUBTYPE_NACK:
3449 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3450 
3451 		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_NACK_RECV))
3452 			return;
3453 
3454 		ldcp->lane_in.lstate |= VSW_RDX_NACK_RECV;
3455 		vsw_next_milestone(ldcp);
3456 		break;
3457 
3458 	default:
3459 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3460 		    rdx_pkt->tag.vio_subtype);
3461 	}
3462 
3463 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3464 }
3465 
3466 static void
3467 vsw_process_physlink_msg(vsw_ldc_t *ldcp, void *pkt)
3468 {
3469 	vnet_physlink_msg_t	*msgp;
3470 	vsw_t			*vswp = ldcp->ldc_vswp;
3471 
3472 	msgp = (vnet_physlink_msg_t *)pkt;
3473 
3474 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
3475 
3476 	switch (msgp->tag.vio_subtype) {
3477 	case VIO_SUBTYPE_INFO:
3478 
3479 		/* vsw shouldn't recv physlink info */
3480 		DWARN(vswp, "%s: Unexpected VIO_SUBTYPE_INFO", __func__);
3481 		break;
3482 
3483 	case VIO_SUBTYPE_ACK:
3484 
3485 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3486 		break;
3487 
3488 	case VIO_SUBTYPE_NACK:
3489 
3490 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3491 		break;
3492 
3493 	default:
3494 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3495 		    msgp->tag.vio_subtype);
3496 	}
3497 
3498 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3499 }
3500 
3501 static void
3502 vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t *tagp,
3503 	uint32_t msglen)
3504 {
3505 	uint16_t	env = tagp->vio_subtype_env;
3506 	vsw_t		*vswp = ldcp->ldc_vswp;
3507 
3508 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3509 
3510 	/* session id check */
3511 	if (ldcp->session_status & VSW_PEER_SESSION) {
3512 		if (ldcp->peer_session != tagp->vio_sid) {
3513 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
3514 			    __func__, ldcp->ldc_id, tagp->vio_sid);
3515 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3516 			return;
3517 		}
3518 	}
3519 
3520 	/*
3521 	 * It is an error for us to be getting data packets
3522 	 * before the handshake has completed.
3523 	 */
3524 	if (ldcp->hphase != VSW_MILESTONE4) {
3525 		DERR(vswp, "%s: got data packet before handshake complete "
3526 		    "hphase %d (%x: %x)", __func__, ldcp->hphase,
3527 		    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
3528 		DUMP_FLAGS(ldcp->lane_in.lstate);
3529 		DUMP_FLAGS(ldcp->lane_out.lstate);
3530 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3531 		return;
3532 	}
3533 
3534 	/*
3535 	 * To reduce the locking contention, release the
3536 	 * ldc_cblock here and re-acquire it once we are done
3537 	 * receiving packets.
3538 	 */
3539 	mutex_exit(&ldcp->ldc_cblock);
3540 	mutex_enter(&ldcp->ldc_rxlock);
3541 
3542 	/*
3543 	 * Switch on vio_subtype envelope, then let lower routines
3544 	 * decide if its an INFO, ACK or NACK packet.
3545 	 */
3546 	if (env == VIO_DRING_DATA) {
3547 		vsw_process_data_dring_pkt(ldcp, dpkt);
3548 	} else if (env == VIO_PKT_DATA) {
3549 		ldcp->rx_pktdata(ldcp, dpkt, msglen);
3550 	} else if (env == VIO_DESC_DATA) {
3551 		vsw_process_data_ibnd_pkt(ldcp, dpkt);
3552 	} else {
3553 		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env);
3554 	}
3555 
3556 	mutex_exit(&ldcp->ldc_rxlock);
3557 	mutex_enter(&ldcp->ldc_cblock);
3558 
3559 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3560 }
3561 
/*
 * Turn a dring data message around as a NACK: mark it as
 * VIO_SUBTYPE_NACK, stamp it with our session id and send it back.
 * Wrapped in do/while (0) so it behaves as a single statement, and
 * each argument is parenthesized so any expression may be passed.
 */
#define	SND_DRING_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		(void) vsw_send_msg((ldcp), (void *)(pkt), \
		    sizeof (vio_dring_msg_t), B_TRUE); \
	} while (0)
3567 
3568 static void
3569 vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt)
3570 {
3571 	vio_dring_msg_t		*dring_pkt;
3572 	vnet_public_desc_t	desc, *pub_addr = NULL;
3573 	vsw_private_desc_t	*priv_addr = NULL;
3574 	dring_info_t		*dp = NULL;
3575 	vsw_t			*vswp = ldcp->ldc_vswp;
3576 	mblk_t			*mp = NULL;
3577 	mblk_t			*bp = NULL;
3578 	mblk_t			*bpt = NULL;
3579 	size_t			nbytes = 0;
3580 	uint64_t		chain = 0;
3581 	uint64_t		len;
3582 	uint32_t		pos, start;
3583 	uint32_t		range_start, range_end;
3584 	int32_t			end, num, cnt = 0;
3585 	int			i, rv, rng_rv = 0, msg_rv = 0;
3586 	boolean_t		prev_desc_ack = B_FALSE;
3587 	int			read_attempts = 0;
3588 	struct ether_header	*ehp;
3589 	lane_t			*lp = &ldcp->lane_out;
3590 
3591 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3592 
3593 	/*
3594 	 * We know this is a data/dring packet so
3595 	 * cast it into the correct structure.
3596 	 */
3597 	dring_pkt = (vio_dring_msg_t *)dpkt;
3598 
3599 	/*
3600 	 * Switch on the vio_subtype. If its INFO then we need to
3601 	 * process the data. If its an ACK we need to make sure
3602 	 * it makes sense (i.e did we send an earlier data/info),
3603 	 * and if its a NACK then we maybe attempt a retry.
3604 	 */
3605 	switch (dring_pkt->tag.vio_subtype) {
3606 	case VIO_SUBTYPE_INFO:
3607 		D2(vswp, "%s(%lld): VIO_SUBTYPE_INFO", __func__, ldcp->ldc_id);
3608 
3609 		READ_ENTER(&ldcp->lane_in.dlistrw);
3610 		if ((dp = vsw_ident2dring(&ldcp->lane_in,
3611 		    dring_pkt->dring_ident)) == NULL) {
3612 			RW_EXIT(&ldcp->lane_in.dlistrw);
3613 
3614 			DERR(vswp, "%s(%lld): unable to find dring from "
3615 			    "ident 0x%llx", __func__, ldcp->ldc_id,
3616 			    dring_pkt->dring_ident);
3617 
3618 			SND_DRING_NACK(ldcp, dring_pkt);
3619 			return;
3620 		}
3621 
3622 		start = pos = dring_pkt->start_idx;
3623 		end = dring_pkt->end_idx;
3624 		len = dp->num_descriptors;
3625 
3626 		range_start = range_end = pos;
3627 
3628 		D2(vswp, "%s(%lld): start index %ld : end %ld\n",
3629 		    __func__, ldcp->ldc_id, start, end);
3630 
3631 		if (end == -1) {
3632 			num = -1;
3633 		} else if (end >= 0) {
3634 			num = end >= pos ? end - pos + 1: (len - pos + 1) + end;
3635 
3636 			/* basic sanity check */
3637 			if (end > len) {
3638 				RW_EXIT(&ldcp->lane_in.dlistrw);
3639 				DERR(vswp, "%s(%lld): endpoint %lld outside "
3640 				    "ring length %lld", __func__,
3641 				    ldcp->ldc_id, end, len);
3642 
3643 				SND_DRING_NACK(ldcp, dring_pkt);
3644 				return;
3645 			}
3646 		} else {
3647 			RW_EXIT(&ldcp->lane_in.dlistrw);
3648 			DERR(vswp, "%s(%lld): invalid endpoint %lld",
3649 			    __func__, ldcp->ldc_id, end);
3650 			SND_DRING_NACK(ldcp, dring_pkt);
3651 			return;
3652 		}
3653 
3654 		while (cnt != num) {
3655 vsw_recheck_desc:
3656 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + pos;
3657 
3658 			if ((rng_rv = vnet_dring_entry_copy(pub_addr,
3659 			    &desc, dp->dring_mtype, dp->handle,
3660 			    pos, pos)) != 0) {
3661 				DERR(vswp, "%s(%lld): unable to copy "
3662 				    "descriptor at pos %d: err %d",
3663 				    __func__, pos, ldcp->ldc_id, rng_rv);
3664 				ldcp->ldc_stats.ierrors++;
3665 				break;
3666 			}
3667 
3668 			/*
3669 			 * When given a bounded range of descriptors
3670 			 * to process, its an error to hit a descriptor
3671 			 * which is not ready. In the non-bounded case
3672 			 * (end_idx == -1) this simply indicates we have
3673 			 * reached the end of the current active range.
3674 			 */
3675 			if (desc.hdr.dstate != VIO_DESC_READY) {
3676 				/* unbound - no error */
3677 				if (end == -1) {
3678 					if (read_attempts == vsw_read_attempts)
3679 						break;
3680 
3681 					delay(drv_usectohz(vsw_desc_delay));
3682 					read_attempts++;
3683 					goto vsw_recheck_desc;
3684 				}
3685 
3686 				/* bounded - error - so NACK back */
3687 				RW_EXIT(&ldcp->lane_in.dlistrw);
3688 				DERR(vswp, "%s(%lld): descriptor not READY "
3689 				    "(%d)", __func__, ldcp->ldc_id,
3690 				    desc.hdr.dstate);
3691 				SND_DRING_NACK(ldcp, dring_pkt);
3692 				return;
3693 			}
3694 
3695 			DTRACE_PROBE1(read_attempts, int, read_attempts);
3696 
3697 			range_end = pos;
3698 
3699 			/*
3700 			 * If we ACK'd the previous descriptor then now
3701 			 * record the new range start position for later
3702 			 * ACK's.
3703 			 */
3704 			if (prev_desc_ack) {
3705 				range_start = pos;
3706 
3707 				D2(vswp, "%s(%lld): updating range start to be "
3708 				    "%d", __func__, ldcp->ldc_id, range_start);
3709 
3710 				prev_desc_ack = B_FALSE;
3711 			}
3712 
3713 			D2(vswp, "%s(%lld): processing desc %lld at pos"
3714 			    " 0x%llx : dstate 0x%lx : datalen 0x%lx",
3715 			    __func__, ldcp->ldc_id, pos, &desc,
3716 			    desc.hdr.dstate, desc.nbytes);
3717 
3718 			if ((desc.nbytes < ETHERMIN) ||
3719 			    (desc.nbytes > lp->mtu)) {
3720 				/* invalid size; drop the packet */
3721 				ldcp->ldc_stats.ierrors++;
3722 				goto vsw_process_desc_done;
3723 			}
3724 
3725 			/*
3726 			 * Ensure that we ask ldc for an aligned
3727 			 * number of bytes. Data is padded to align on 8
3728 			 * byte boundary, desc.nbytes is actual data length,
3729 			 * i.e. minus that padding.
3730 			 */
3731 			nbytes = (desc.nbytes + VNET_IPALIGN + 7) & ~7;
3732 			if (nbytes > ldcp->max_rxpool_size) {
3733 				mp = allocb(desc.nbytes + VNET_IPALIGN + 8,
3734 				    BPRI_MED);
3735 			} else {
3736 				mp = vio_multipool_allocb(&ldcp->vmp, nbytes);
3737 				if (mp == NULL) {
3738 					ldcp->ldc_stats.rx_vio_allocb_fail++;
3739 					/*
3740 					 * No free receive buffers available,
3741 					 * so fallback onto allocb(9F). Make
3742 					 * sure that we get a data buffer which
3743 					 * is a multiple of 8 as this is
3744 					 * required by ldc_mem_copy.
3745 					 */
3746 					DTRACE_PROBE(allocb);
3747 					mp = allocb(desc.nbytes +
3748 					    VNET_IPALIGN + 8, BPRI_MED);
3749 				}
3750 			}
3751 			if (mp == NULL) {
3752 				DERR(vswp, "%s(%ld): allocb failed",
3753 				    __func__, ldcp->ldc_id);
3754 				rng_rv = vnet_dring_entry_set_dstate(pub_addr,
3755 				    dp->dring_mtype, dp->handle, pos, pos,
3756 				    VIO_DESC_DONE);
3757 				ldcp->ldc_stats.ierrors++;
3758 				ldcp->ldc_stats.rx_allocb_fail++;
3759 				break;
3760 			}
3761 
3762 			rv = ldc_mem_copy(ldcp->ldc_handle,
3763 			    (caddr_t)mp->b_rptr, 0, &nbytes,
3764 			    desc.memcookie, desc.ncookies, LDC_COPY_IN);
3765 			if (rv != 0) {
3766 				DERR(vswp, "%s(%d): unable to copy in data "
3767 				    "from %d cookies in desc %d (rv %d)",
3768 				    __func__, ldcp->ldc_id, desc.ncookies,
3769 				    pos, rv);
3770 				freemsg(mp);
3771 
3772 				rng_rv = vnet_dring_entry_set_dstate(pub_addr,
3773 				    dp->dring_mtype, dp->handle, pos, pos,
3774 				    VIO_DESC_DONE);
3775 				ldcp->ldc_stats.ierrors++;
3776 				break;
3777 			} else {
3778 				D2(vswp, "%s(%d): copied in %ld bytes"
3779 				    " using %d cookies", __func__,
3780 				    ldcp->ldc_id, nbytes, desc.ncookies);
3781 			}
3782 
3783 			/* adjust the read pointer to skip over the padding */
3784 			mp->b_rptr += VNET_IPALIGN;
3785 
3786 			/* point to the actual end of data */
3787 			mp->b_wptr = mp->b_rptr + desc.nbytes;
3788 
3789 			/* update statistics */
3790 			ehp = (struct ether_header *)mp->b_rptr;
3791 			if (IS_BROADCAST(ehp))
3792 				ldcp->ldc_stats.brdcstrcv++;
3793 			else if (IS_MULTICAST(ehp))
3794 				ldcp->ldc_stats.multircv++;
3795 
3796 			ldcp->ldc_stats.ipackets++;
3797 			ldcp->ldc_stats.rbytes += desc.nbytes;
3798 
3799 			/*
3800 			 * IPALIGN space can be used for VLAN_TAG
3801 			 */
3802 			(void) vsw_vlan_frame_pretag(ldcp->ldc_port,
3803 			    VSW_VNETPORT, mp);
3804 
3805 			/* build a chain of received packets */
3806 			if (bp == NULL) {
3807 				/* first pkt */
3808 				bp = mp;
3809 				bp->b_next = bp->b_prev = NULL;
3810 				bpt = bp;
3811 				chain = 1;
3812 			} else {
3813 				mp->b_next = mp->b_prev = NULL;
3814 				bpt->b_next = mp;
3815 				bpt = mp;
3816 				chain++;
3817 			}
3818 
3819 vsw_process_desc_done:
3820 			/* mark we are finished with this descriptor */
3821 			if ((rng_rv = vnet_dring_entry_set_dstate(pub_addr,
3822 			    dp->dring_mtype, dp->handle, pos, pos,
3823 			    VIO_DESC_DONE)) != 0) {
3824 				DERR(vswp, "%s(%lld): unable to update "
3825 				    "dstate at pos %d: err %d",
3826 				    __func__, pos, ldcp->ldc_id, rng_rv);
3827 				ldcp->ldc_stats.ierrors++;
3828 				break;
3829 			}
3830 
3831 			/*
3832 			 * Send an ACK back to peer if requested.
3833 			 */
3834 			if (desc.hdr.ack) {
3835 				dring_pkt->start_idx = range_start;
3836 				dring_pkt->end_idx = range_end;
3837 
3838 				DERR(vswp, "%s(%lld): processed %d %d, ACK"
3839 				    " requested", __func__, ldcp->ldc_id,
3840 				    dring_pkt->start_idx, dring_pkt->end_idx);
3841 
3842 				dring_pkt->dring_process_state = VIO_DP_ACTIVE;
3843 				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3844 				dring_pkt->tag.vio_sid = ldcp->local_session;
3845 
3846 				msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
3847 				    sizeof (vio_dring_msg_t), B_FALSE);
3848 
3849 				/*
3850 				 * Check if ACK was successfully sent. If not
3851 				 * we break and deal with that below.
3852 				 */
3853 				if (msg_rv != 0)
3854 					break;
3855 
3856 				prev_desc_ack = B_TRUE;
3857 				range_start = pos;
3858 			}
3859 
3860 			/* next descriptor */
3861 			pos = (pos + 1) % len;
3862 			cnt++;
3863 
3864 			/*
3865 			 * Break out of loop here and stop processing to
3866 			 * allow some other network device (or disk) to
3867 			 * get access to the cpu.
3868 			 */
3869 			if (chain > vsw_chain_len) {
3870 				D3(vswp, "%s(%lld): switching chain of %d "
3871 				    "msgs", __func__, ldcp->ldc_id, chain);
3872 				break;
3873 			}
3874 		}
3875 		RW_EXIT(&ldcp->lane_in.dlistrw);
3876 
3877 		/* send the chain of packets to be switched */
3878 		if (bp != NULL) {
3879 			DTRACE_PROBE1(vsw_rcv_msgs, int, chain);
3880 			D3(vswp, "%s(%lld): switching chain of %d msgs",
3881 			    __func__, ldcp->ldc_id, chain);
3882 			vswp->vsw_switch_frame(vswp, bp, VSW_VNETPORT,
3883 			    ldcp->ldc_port, NULL);
3884 		}
3885 
3886 		/*
3887 		 * If when we encountered an error when attempting to
3888 		 * access an imported dring, initiate a connection reset.
3889 		 */
3890 		if (rng_rv != 0) {
3891 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3892 			break;
3893 		}
3894 
3895 		/*
3896 		 * If when we attempted to send the ACK we found that the
3897 		 * channel had been reset then now handle this. We deal with
3898 		 * it here as we cannot reset the channel while holding the
3899 		 * dlistrw lock, and we don't want to acquire/release it
3900 		 * continuously in the above loop, as a channel reset should
3901 		 * be a rare event.
3902 		 */
3903 		if (msg_rv == ECONNRESET) {
3904 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
3905 			break;
3906 		}
3907 
3908 		DTRACE_PROBE1(msg_cnt, int, cnt);
3909 
3910 		/*
3911 		 * We are now finished so ACK back with the state
3912 		 * set to STOPPING so our peer knows we are finished
3913 		 */
3914 		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3915 		dring_pkt->tag.vio_sid = ldcp->local_session;
3916 
3917 		dring_pkt->dring_process_state = VIO_DP_STOPPED;
3918 
3919 		DTRACE_PROBE(stop_process_sent);
3920 
3921 		/*
3922 		 * We have not processed any more descriptors beyond
3923 		 * the last one we ACK'd.
3924 		 */
3925 		if (prev_desc_ack)
3926 			range_start = range_end;
3927 
3928 		dring_pkt->start_idx = range_start;
3929 		dring_pkt->end_idx = range_end;
3930 
3931 		D2(vswp, "%s(%lld) processed : %d : %d, now stopping",
3932 		    __func__, ldcp->ldc_id, dring_pkt->start_idx,
3933 		    dring_pkt->end_idx);
3934 
3935 		(void) vsw_send_msg(ldcp, (void *)dring_pkt,
3936 		    sizeof (vio_dring_msg_t), B_TRUE);
3937 		break;
3938 
3939 	case VIO_SUBTYPE_ACK:
3940 		D2(vswp, "%s(%lld): VIO_SUBTYPE_ACK", __func__, ldcp->ldc_id);
3941 		/*
3942 		 * Verify that the relevant descriptors are all
3943 		 * marked as DONE
3944 		 */
3945 		READ_ENTER(&ldcp->lane_out.dlistrw);
3946 		if ((dp = vsw_ident2dring(&ldcp->lane_out,
3947 		    dring_pkt->dring_ident)) == NULL) {
3948 			RW_EXIT(&ldcp->lane_out.dlistrw);
3949 			DERR(vswp, "%s: unknown ident in ACK", __func__);
3950 			return;
3951 		}
3952 
3953 		start = end = 0;
3954 		start = dring_pkt->start_idx;
3955 		end = dring_pkt->end_idx;
3956 		len = dp->num_descriptors;
3957 
3958 
3959 		mutex_enter(&dp->dlock);
3960 		dp->last_ack_recv = end;
3961 		ldcp->ldc_stats.dring_data_acks++;
3962 		mutex_exit(&dp->dlock);
3963 
3964 		(void) vsw_reclaim_dring(dp, start);
3965 
3966 		/*
3967 		 * If our peer is stopping processing descriptors then
3968 		 * we check to make sure it has processed all the descriptors
3969 		 * we have updated. If not then we send it a new message
3970 		 * to prompt it to restart.
3971 		 */
3972 		if (dring_pkt->dring_process_state == VIO_DP_STOPPED) {
3973 			DTRACE_PROBE(stop_process_recv);
3974 			D2(vswp, "%s(%lld): got stopping msg : %d : %d",
3975 			    __func__, ldcp->ldc_id, dring_pkt->start_idx,
3976 			    dring_pkt->end_idx);
3977 
3978 			/*
3979 			 * Check next descriptor in public section of ring.
3980 			 * If its marked as READY then we need to prompt our
3981 			 * peer to start processing the ring again.
3982 			 */
3983 			i = (end + 1) % len;
3984 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
3985 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
3986 
3987 			/*
3988 			 * Hold the restart lock across all of this to
3989 			 * make sure that its not possible for us to
3990 			 * decide that a msg needs to be sent in the future
3991 			 * but the sending code having already checked is
3992 			 * about to exit.
3993 			 */
3994 			mutex_enter(&dp->restart_lock);
3995 			ldcp->ldc_stats.dring_stopped_acks++;
3996 			mutex_enter(&priv_addr->dstate_lock);
3997 			if (pub_addr->hdr.dstate == VIO_DESC_READY) {
3998 
3999 				mutex_exit(&priv_addr->dstate_lock);
4000 
4001 				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
4002 				dring_pkt->tag.vio_sid = ldcp->local_session;
4003 
4004 				dring_pkt->start_idx = (end + 1) % len;
4005 				dring_pkt->end_idx = -1;
4006 
4007 				D2(vswp, "%s(%lld) : sending restart msg:"
4008 				    " %d : %d", __func__, ldcp->ldc_id,
4009 				    dring_pkt->start_idx, dring_pkt->end_idx);
4010 
4011 				msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
4012 				    sizeof (vio_dring_msg_t), B_FALSE);
4013 				ldcp->ldc_stats.dring_data_msgs++;
4014 
4015 			} else {
4016 				mutex_exit(&priv_addr->dstate_lock);
4017 				dp->restart_reqd = B_TRUE;
4018 			}
4019 			mutex_exit(&dp->restart_lock);
4020 		}
4021 		RW_EXIT(&ldcp->lane_out.dlistrw);
4022 
4023 		/* only do channel reset after dropping dlistrw lock */
4024 		if (msg_rv == ECONNRESET)
4025 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
4026 
4027 		break;
4028 
4029 	case VIO_SUBTYPE_NACK:
4030 		DWARN(vswp, "%s(%lld): VIO_SUBTYPE_NACK",
4031 		    __func__, ldcp->ldc_id);
4032 		/*
4033 		 * Something is badly wrong if we are getting NACK's
4034 		 * for our data pkts. So reset the channel.
4035 		 */
4036 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4037 
4038 		break;
4039 
4040 	default:
4041 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
4042 		    ldcp->ldc_id, dring_pkt->tag.vio_subtype);
4043 	}
4044 
4045 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
4046 }
4047 
4048 /*
4049  * dummy pkt data handler function for vnet protocol version 1.0
4050  */
4051 static void
4052 vsw_process_pkt_data_nop(void *arg1, void *arg2, uint32_t msglen)
4053 {
4054 	_NOTE(ARGUNUSED(arg1, arg2, msglen))
4055 }
4056 
4057 /*
4058  * This function handles raw pkt data messages received over the channel.
4059  * Currently, only priority-eth-type frames are received through this mechanism.
4060  * In this case, the frame(data) is present within the message itself which
4061  * is copied into an mblk before switching it.
4062  */
4063 static void
4064 vsw_process_pkt_data(void *arg1, void *arg2, uint32_t msglen)
4065 {
4066 	vsw_ldc_t		*ldcp = (vsw_ldc_t *)arg1;
4067 	vio_raw_data_msg_t	*dpkt = (vio_raw_data_msg_t *)arg2;
4068 	uint32_t		size;
4069 	mblk_t			*mp;
4070 	vsw_t			*vswp = ldcp->ldc_vswp;
4071 	vgen_stats_t		*statsp = &ldcp->ldc_stats;
4072 	lane_t			*lp = &ldcp->lane_out;
4073 
4074 	size = msglen - VIO_PKT_DATA_HDRSIZE;
4075 	if (size < ETHERMIN || size > lp->mtu) {
4076 		(void) atomic_inc_32(&statsp->rx_pri_fail);
4077 		DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__,
4078 		    ldcp->ldc_id, size);
4079 		return;
4080 	}
4081 
4082 	mp = vio_multipool_allocb(&ldcp->vmp, size + VLAN_TAGSZ);
4083 	if (mp == NULL) {
4084 		mp = allocb(size + VLAN_TAGSZ, BPRI_MED);
4085 		if (mp == NULL) {
4086 			(void) atomic_inc_32(&statsp->rx_pri_fail);
4087 			DWARN(vswp, "%s(%lld) allocb failure, "
4088 			    "unable to process priority frame\n", __func__,
4089 			    ldcp->ldc_id);
4090 			return;
4091 		}
4092 	}
4093 
4094 	/* skip over the extra space for vlan tag */
4095 	mp->b_rptr += VLAN_TAGSZ;
4096 
4097 	/* copy the frame from the payload of raw data msg into the mblk */
4098 	bcopy(dpkt->data, mp->b_rptr, size);
4099 	mp->b_wptr = mp->b_rptr + size;
4100 
4101 	/* update stats */
4102 	(void) atomic_inc_64(&statsp->rx_pri_packets);
4103 	(void) atomic_add_64(&statsp->rx_pri_bytes, size);
4104 
4105 	/*
4106 	 * VLAN_TAGSZ of extra space has been pre-alloc'd if tag is needed.
4107 	 */
4108 	(void) vsw_vlan_frame_pretag(ldcp->ldc_port, VSW_VNETPORT, mp);
4109 
4110 	/* switch the frame to destination */
4111 	vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT, ldcp->ldc_port, NULL);
4112 }
4113 
4114 /*
4115  * Process an in-band descriptor message (most likely from
4116  * OBP).
4117  */
4118 static void
4119 vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt)
4120 {
4121 	vnet_ibnd_desc_t	*ibnd_desc;
4122 	dring_info_t		*dp = NULL;
4123 	vsw_private_desc_t	*priv_addr = NULL;
4124 	vsw_t			*vswp = ldcp->ldc_vswp;
4125 	mblk_t			*mp = NULL;
4126 	size_t			nbytes = 0;
4127 	size_t			off = 0;
4128 	uint64_t		idx = 0;
4129 	uint32_t		num = 1, len, datalen = 0;
4130 	uint64_t		ncookies = 0;
4131 	int			i, rv;
4132 	int			j = 0;
4133 
4134 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4135 
4136 	ibnd_desc = (vnet_ibnd_desc_t *)pkt;
4137 
4138 	switch (ibnd_desc->hdr.tag.vio_subtype) {
4139 	case VIO_SUBTYPE_INFO:
4140 		D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
4141 
4142 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
4143 			return;
4144 
4145 		/*
4146 		 * Data is padded to align on a 8 byte boundary,
4147 		 * nbytes is actual data length, i.e. minus that
4148 		 * padding.
4149 		 */
4150 		datalen = ibnd_desc->nbytes;
4151 
4152 		D2(vswp, "%s(%lld): processing inband desc : "
4153 		    ": datalen 0x%lx", __func__, ldcp->ldc_id, datalen);
4154 
4155 		ncookies = ibnd_desc->ncookies;
4156 
4157 		/*
4158 		 * allocb(9F) returns an aligned data block. We
4159 		 * need to ensure that we ask ldc for an aligned
4160 		 * number of bytes also.
4161 		 */
4162 		nbytes = datalen;
4163 		if (nbytes & 0x7) {
4164 			off = 8 - (nbytes & 0x7);
4165 			nbytes += off;
4166 		}
4167 
4168 		/* alloc extra space for VLAN_TAG */
4169 		mp = allocb(datalen + 8, BPRI_MED);
4170 		if (mp == NULL) {
4171 			DERR(vswp, "%s(%lld): allocb failed",
4172 			    __func__, ldcp->ldc_id);
4173 			ldcp->ldc_stats.rx_allocb_fail++;
4174 			return;
4175 		}
4176 
4177 		/* skip over the extra space for VLAN_TAG */
4178 		mp->b_rptr += 8;
4179 
4180 		rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr,
4181 		    0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies,
4182 		    LDC_COPY_IN);
4183 
4184 		if (rv != 0) {
4185 			DERR(vswp, "%s(%d): unable to copy in data from "
4186 			    "%d cookie(s)", __func__, ldcp->ldc_id, ncookies);
4187 			freemsg(mp);
4188 			ldcp->ldc_stats.ierrors++;
4189 			return;
4190 		}
4191 
4192 		D2(vswp, "%s(%d): copied in %ld bytes using %d cookies",
4193 		    __func__, ldcp->ldc_id, nbytes, ncookies);
4194 
4195 		/* point to the actual end of data */
4196 		mp->b_wptr = mp->b_rptr + datalen;
4197 		ldcp->ldc_stats.ipackets++;
4198 		ldcp->ldc_stats.rbytes += datalen;
4199 
4200 		/*
4201 		 * We ACK back every in-band descriptor message we process
4202 		 */
4203 		ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK;
4204 		ibnd_desc->hdr.tag.vio_sid = ldcp->local_session;
4205 		(void) vsw_send_msg(ldcp, (void *)ibnd_desc,
4206 		    sizeof (vnet_ibnd_desc_t), B_TRUE);
4207 
4208 		/*
4209 		 * there is extra space alloc'd for VLAN_TAG
4210 		 */
4211 		(void) vsw_vlan_frame_pretag(ldcp->ldc_port, VSW_VNETPORT, mp);
4212 
4213 		/* send the packet to be switched */
4214 		vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT,
4215 		    ldcp->ldc_port, NULL);
4216 
4217 		break;
4218 
4219 	case VIO_SUBTYPE_ACK:
4220 		D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
4221 
4222 		/* Verify the ACK is valid */
4223 		idx = ibnd_desc->hdr.desc_handle;
4224 
4225 		if (idx >= vsw_ntxds) {
4226 			cmn_err(CE_WARN, "!vsw%d: corrupted ACK received "
4227 			    "(idx %ld)", vswp->instance, idx);
4228 			return;
4229 		}
4230 
4231 		if ((dp = ldcp->lane_out.dringp) == NULL) {
4232 			DERR(vswp, "%s: no dring found", __func__);
4233 			return;
4234 		}
4235 
4236 		len = dp->num_descriptors;
4237 		/*
4238 		 * If the descriptor we are being ACK'ed for is not the
4239 		 * one we expected, then pkts were lost somwhere, either
4240 		 * when we tried to send a msg, or a previous ACK msg from
4241 		 * our peer. In either case we now reclaim the descriptors
4242 		 * in the range from the last ACK we received up to the
4243 		 * current ACK.
4244 		 */
4245 		if (idx != dp->last_ack_recv) {
4246 			DWARN(vswp, "%s: dropped pkts detected, (%ld, %ld)",
4247 			    __func__, dp->last_ack_recv, idx);
4248 			num = idx >= dp->last_ack_recv ?
4249 			    idx - dp->last_ack_recv + 1:
4250 			    (len - dp->last_ack_recv + 1) + idx;
4251 		}
4252 
4253 		/*
4254 		 * When we sent the in-band message to our peer we
4255 		 * marked the copy in our private ring as READY. We now
4256 		 * check that the descriptor we are being ACK'ed for is in
4257 		 * fact READY, i.e. it is one we have shared with our peer.
4258 		 *
4259 		 * If its not we flag an error, but still reset the descr
4260 		 * back to FREE.
4261 		 */
4262 		for (i = dp->last_ack_recv; j < num; i = (i + 1) % len, j++) {
4263 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
4264 			mutex_enter(&priv_addr->dstate_lock);
4265 			if (priv_addr->dstate != VIO_DESC_READY) {
4266 				DERR(vswp, "%s: (%ld) desc at index %ld not "
4267 				    "READY (0x%lx)", __func__,
4268 				    ldcp->ldc_id, idx, priv_addr->dstate);
4269 				DERR(vswp, "%s: bound %d: ncookies %ld : "
4270 				    "datalen %ld", __func__,
4271 				    priv_addr->bound, priv_addr->ncookies,
4272 				    priv_addr->datalen);
4273 			}
4274 			D2(vswp, "%s: (%lld) freeing descp at %lld", __func__,
4275 			    ldcp->ldc_id, idx);
4276 			/* release resources associated with sent msg */
4277 			priv_addr->datalen = 0;
4278 			priv_addr->dstate = VIO_DESC_FREE;
4279 			mutex_exit(&priv_addr->dstate_lock);
4280 		}
4281 		/* update to next expected value */
4282 		dp->last_ack_recv = (idx + 1) % dp->num_descriptors;
4283 
4284 		break;
4285 
4286 	case VIO_SUBTYPE_NACK:
4287 		DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
4288 
4289 		/*
4290 		 * We should only get a NACK if our peer doesn't like
4291 		 * something about a message we have sent it. If this
4292 		 * happens we just release the resources associated with
4293 		 * the message. (We are relying on higher layers to decide
4294 		 * whether or not to resend.
4295 		 */
4296 
4297 		/* limit check */
4298 		idx = ibnd_desc->hdr.desc_handle;
4299 
4300 		if (idx >= vsw_ntxds) {
4301 			DERR(vswp, "%s: corrupted NACK received (idx %lld)",
4302 			    __func__, idx);
4303 			return;
4304 		}
4305 
4306 		if ((dp = ldcp->lane_out.dringp) == NULL) {
4307 			DERR(vswp, "%s: no dring found", __func__);
4308 			return;
4309 		}
4310 
4311 		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
4312 
4313 		/* move to correct location in ring */
4314 		priv_addr += idx;
4315 
4316 		/* release resources associated with sent msg */
4317 		mutex_enter(&priv_addr->dstate_lock);
4318 		priv_addr->datalen = 0;
4319 		priv_addr->dstate = VIO_DESC_FREE;
4320 		mutex_exit(&priv_addr->dstate_lock);
4321 
4322 		break;
4323 
4324 	default:
4325 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
4326 		    ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype);
4327 	}
4328 
4329 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
4330 }
4331 
4332 static void
4333 vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t *tagp)
4334 {
4335 	_NOTE(ARGUNUSED(epkt))
4336 
4337 	vsw_t		*vswp = ldcp->ldc_vswp;
4338 	uint16_t	env = tagp->vio_subtype_env;
4339 
4340 	D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
4341 
4342 	/*
4343 	 * Error vio_subtypes have yet to be defined. So for
4344 	 * the moment we can't do anything.
4345 	 */
4346 	D2(vswp, "%s: (%x) vio_subtype env", __func__, env);
4347 
4348 	D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
4349 }
4350 
4351 /* transmit the packet over the given port */
4352 int
4353 vsw_portsend(vsw_port_t *port, mblk_t *mp)
4354 {
4355 	vsw_ldc_list_t 	*ldcl = &port->p_ldclist;
4356 	vsw_ldc_t 	*ldcp;
4357 	mblk_t		*mpt;
4358 	int		count;
4359 	int		status = 0;
4360 
4361 	READ_ENTER(&ldcl->lockrw);
4362 	/*
4363 	 * Note for now, we have a single channel.
4364 	 */
4365 	ldcp = ldcl->head;
4366 	if (ldcp == NULL) {
4367 		DERR(port->p_vswp, "vsw_portsend: no ldc: dropping packet\n");
4368 		freemsgchain(mp);
4369 		RW_EXIT(&ldcl->lockrw);
4370 		return (1);
4371 	}
4372 
4373 	count = vsw_vlan_frame_untag(port, VSW_VNETPORT, &mp, &mpt);
4374 
4375 	if (count != 0) {
4376 		status = ldcp->tx(ldcp, mp, mpt, count);
4377 	}
4378 
4379 	RW_EXIT(&ldcl->lockrw);
4380 	return (status);
4381 }
4382 
4383 /*
4384  * Break up frames into 2 seperate chains: normal and
4385  * priority, based on the frame type. The number of
4386  * priority frames is also counted and returned.
4387  *
4388  * Params:
4389  * 	vswp:	pointer to the instance of vsw
4390  *	np:	head of packet chain to be broken
4391  *	npt:	tail of packet chain to be broken
4392  *
4393  * Returns:
4394  *	np:	head of normal data packets
4395  *	npt:	tail of normal data packets
4396  *	hp:	head of high priority packets
4397  *	hpt:	tail of high priority packets
4398  */
4399 static uint32_t
4400 vsw_get_pri_packets(vsw_t *vswp, mblk_t **np, mblk_t **npt,
4401 	mblk_t **hp, mblk_t **hpt)
4402 {
4403 	mblk_t			*tmp = NULL;
4404 	mblk_t			*smp = NULL;
4405 	mblk_t			*hmp = NULL;	/* high prio pkts head */
4406 	mblk_t			*hmpt = NULL;	/* high prio pkts tail */
4407 	mblk_t			*nmp = NULL;	/* normal pkts head */
4408 	mblk_t			*nmpt = NULL;	/* normal pkts tail */
4409 	uint32_t		count = 0;
4410 	int			i;
4411 	struct ether_header	*ehp;
4412 	uint32_t		num_types;
4413 	uint16_t		*types;
4414 
4415 	tmp = *np;
4416 	while (tmp != NULL) {
4417 
4418 		smp = tmp;
4419 		tmp = tmp->b_next;
4420 		smp->b_next = NULL;
4421 		smp->b_prev = NULL;
4422 
4423 		ehp = (struct ether_header *)smp->b_rptr;
4424 		num_types = vswp->pri_num_types;
4425 		types = vswp->pri_types;
4426 		for (i = 0; i < num_types; i++) {
4427 			if (ehp->ether_type == types[i]) {
4428 				/* high priority frame */
4429 
4430 				if (hmp != NULL) {
4431 					hmpt->b_next = smp;
4432 					hmpt = smp;
4433 				} else {
4434 					hmp = hmpt = smp;
4435 				}
4436 				count++;
4437 				break;
4438 			}
4439 		}
4440 		if (i == num_types) {
4441 			/* normal data frame */
4442 
4443 			if (nmp != NULL) {
4444 				nmpt->b_next = smp;
4445 				nmpt = smp;
4446 			} else {
4447 				nmp = nmpt = smp;
4448 			}
4449 		}
4450 	}
4451 
4452 	*hp = hmp;
4453 	*hpt = hmpt;
4454 	*np = nmp;
4455 	*npt = nmpt;
4456 
4457 	return (count);
4458 }
4459 
4460 /*
4461  * Wrapper function to transmit normal and/or priority frames over the channel.
4462  */
4463 static int
4464 vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count)
4465 {
4466 	vsw_ldc_t 		*ldcp = (vsw_ldc_t *)arg;
4467 	mblk_t			*tmp;
4468 	mblk_t			*smp;
4469 	mblk_t			*hmp;	/* high prio pkts head */
4470 	mblk_t			*hmpt;	/* high prio pkts tail */
4471 	mblk_t			*nmp;	/* normal pkts head */
4472 	mblk_t			*nmpt;	/* normal pkts tail */
4473 	uint32_t		n = 0;
4474 	vsw_t			*vswp = ldcp->ldc_vswp;
4475 
4476 	ASSERT(VSW_PRI_ETH_DEFINED(vswp));
4477 	ASSERT(count != 0);
4478 
4479 	nmp = mp;
4480 	nmpt = mpt;
4481 
4482 	/* gather any priority frames from the chain of packets */
4483 	n = vsw_get_pri_packets(vswp, &nmp, &nmpt, &hmp, &hmpt);
4484 
4485 	/* transmit priority frames */
4486 	tmp = hmp;
4487 	while (tmp != NULL) {
4488 		smp = tmp;
4489 		tmp = tmp->b_next;
4490 		smp->b_next = NULL;
4491 		vsw_ldcsend_pkt(ldcp, smp);
4492 	}
4493 
4494 	count -= n;
4495 
4496 	if (count == 0) {
4497 		/* no normal data frames to process */
4498 		return (0);
4499 	}
4500 
4501 	return (vsw_ldctx(ldcp, nmp, nmpt, count));
4502 }
4503 
4504 /*
4505  * Wrapper function to transmit normal frames over the channel.
4506  */
4507 static int
4508 vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count)
4509 {
4510 	vsw_ldc_t 	*ldcp = (vsw_ldc_t *)arg;
4511 	mblk_t		*tmp = NULL;
4512 
4513 	ASSERT(count != 0);
4514 	/*
4515 	 * If the TX thread is enabled, then queue the
4516 	 * ordinary frames and signal the tx thread.
4517 	 */
4518 	if (ldcp->tx_thread != NULL) {
4519 
4520 		mutex_enter(&ldcp->tx_thr_lock);
4521 
4522 		if ((ldcp->tx_cnt + count) >= vsw_max_tx_qcount) {
4523 			/*
4524 			 * If we reached queue limit,
4525 			 * do not queue new packets,
4526 			 * drop them.
4527 			 */
4528 			ldcp->ldc_stats.tx_qfull += count;
4529 			mutex_exit(&ldcp->tx_thr_lock);
4530 			freemsgchain(mp);
4531 			goto exit;
4532 		}
4533 		if (ldcp->tx_mhead == NULL) {
4534 			ldcp->tx_mhead = mp;
4535 			ldcp->tx_mtail = mpt;
4536 			cv_signal(&ldcp->tx_thr_cv);
4537 		} else {
4538 			ldcp->tx_mtail->b_next = mp;
4539 			ldcp->tx_mtail = mpt;
4540 		}
4541 		ldcp->tx_cnt += count;
4542 		mutex_exit(&ldcp->tx_thr_lock);
4543 	} else {
4544 		while (mp != NULL) {
4545 			tmp = mp->b_next;
4546 			mp->b_next = mp->b_prev = NULL;
4547 			(void) vsw_ldcsend(ldcp, mp, 1);
4548 			mp = tmp;
4549 		}
4550 	}
4551 
4552 exit:
4553 	return (0);
4554 }
4555 
4556 /*
4557  * This function transmits the frame in the payload of a raw data
4558  * (VIO_PKT_DATA) message. Thus, it provides an Out-Of-Band path to
4559  * send special frames with high priorities, without going through
4560  * the normal data path which uses descriptor ring mechanism.
4561  */
4562 static void
4563 vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp)
4564 {
4565 	vio_raw_data_msg_t	*pkt;
4566 	mblk_t			*bp;
4567 	mblk_t			*nmp = NULL;
4568 	caddr_t			dst;
4569 	uint32_t		mblksz;
4570 	uint32_t		size;
4571 	uint32_t		nbytes;
4572 	int			rv;
4573 	vsw_t			*vswp = ldcp->ldc_vswp;
4574 	vgen_stats_t		*statsp = &ldcp->ldc_stats;
4575 
4576 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
4577 	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
4578 		(void) atomic_inc_32(&statsp->tx_pri_fail);
4579 		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
4580 		    "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
4581 		    ldcp->lane_out.lstate);
4582 		goto send_pkt_exit;
4583 	}
4584 
4585 	size = msgsize(mp);
4586 
4587 	/* frame size bigger than available payload len of raw data msg ? */
4588 	if (size > (size_t)(ldcp->msglen - VIO_PKT_DATA_HDRSIZE)) {
4589 		(void) atomic_inc_32(&statsp->tx_pri_fail);
4590 		DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__,
4591 		    ldcp->ldc_id, size);
4592 		goto send_pkt_exit;
4593 	}
4594 
4595 	if (size < ETHERMIN)
4596 		size = ETHERMIN;
4597 
4598 	/* alloc space for a raw data message */
4599 	nmp = vio_allocb(vswp->pri_tx_vmp);
4600 	if (nmp == NULL) {
4601 		(void) atomic_inc_32(&statsp->tx_pri_fail);
4602 		DWARN(vswp, "vio_allocb failed\n");
4603 		goto send_pkt_exit;
4604 	}
4605 	pkt = (vio_raw_data_msg_t *)nmp->b_rptr;
4606 
4607 	/* copy frame into the payload of raw data message */
4608 	dst = (caddr_t)pkt->data;
4609 	for (bp = mp; bp != NULL; bp = bp->b_cont) {
4610 		mblksz = MBLKL(bp);
4611 		bcopy(bp->b_rptr, dst, mblksz);
4612 		dst += mblksz;
4613 	}
4614 
4615 	/* setup the raw data msg */
4616 	pkt->tag.vio_msgtype = VIO_TYPE_DATA;
4617 	pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
4618 	pkt->tag.vio_subtype_env = VIO_PKT_DATA;
4619 	pkt->tag.vio_sid = ldcp->local_session;
4620 	nbytes = VIO_PKT_DATA_HDRSIZE + size;
4621 
4622 	/* send the msg over ldc */
4623 	rv = vsw_send_msg(ldcp, (void *)pkt, nbytes, B_TRUE);
4624 	if (rv != 0) {
4625 		(void) atomic_inc_32(&statsp->tx_pri_fail);
4626 		DWARN(vswp, "%s(%lld) Error sending priority frame\n", __func__,
4627 		    ldcp->ldc_id);
4628 		goto send_pkt_exit;
4629 	}
4630 
4631 	/* update stats */
4632 	(void) atomic_inc_64(&statsp->tx_pri_packets);
4633 	(void) atomic_add_64(&statsp->tx_pri_packets, size);
4634 
4635 send_pkt_exit:
4636 	if (nmp != NULL)
4637 		freemsg(nmp);
4638 	freemsg(mp);
4639 }
4640 
4641 /*
4642  * Transmit the packet over the given LDC channel.
4643  *
4644  * The 'retries' argument indicates how many times a packet
4645  * is retried before it is dropped. Note, the retry is done
4646  * only for a resource related failure, for all other failures
4647  * the packet is dropped immediately.
4648  */
4649 static int
4650 vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries)
4651 {
4652 	int i;
4653 	int rc;
4654 	int status = 0;
4655 	vsw_port_t *port = ldcp->ldc_port;
4656 	dring_info_t *dp = NULL;
4657 
4658 
4659 	for (i = 0; i < retries; ) {
4660 		/*
4661 		 * Send the message out using the appropriate
4662 		 * transmit function which will free mblock when it
4663 		 * is finished with it.
4664 		 */
4665 		mutex_enter(&port->tx_lock);
4666 		if (port->transmit != NULL) {
4667 			status = (*port->transmit)(ldcp, mp);
4668 		}
4669 		if (status == LDC_TX_SUCCESS) {
4670 			mutex_exit(&port->tx_lock);
4671 			break;
4672 		}
4673 		i++;	/* increment the counter here */
4674 
4675 		/* If its the last retry, then update the oerror */
4676 		if ((i == retries) && (status == LDC_TX_NORESOURCES)) {
4677 			ldcp->ldc_stats.oerrors++;
4678 		}
4679 		mutex_exit(&port->tx_lock);
4680 
4681 		if (status != LDC_TX_NORESOURCES) {
4682 			/*
4683 			 * No retrying required for errors un-related
4684 			 * to resources.
4685 			 */
4686 			break;
4687 		}
4688 		READ_ENTER(&ldcp->lane_out.dlistrw);
4689 		if (((dp = ldcp->lane_out.dringp) != NULL) &&
4690 		    ((VSW_VER_GTEQ(ldcp, 1, 2) &&
4691 		    (ldcp->lane_out.xfer_mode & VIO_DRING_MODE_V1_2)) ||
4692 		    ((VSW_VER_LT(ldcp, 1, 2) &&
4693 		    (ldcp->lane_out.xfer_mode == VIO_DRING_MODE_V1_0))))) {
4694 			rc = vsw_reclaim_dring(dp, dp->end_idx);
4695 		} else {
4696 			/*
4697 			 * If there is no dring or the xfer_mode is
4698 			 * set to DESC_MODE(ie., OBP), then simply break here.
4699 			 */
4700 			RW_EXIT(&ldcp->lane_out.dlistrw);
4701 			break;
4702 		}
4703 		RW_EXIT(&ldcp->lane_out.dlistrw);
4704 
4705 		/*
4706 		 * Delay only if none were reclaimed
4707 		 * and its not the last retry.
4708 		 */
4709 		if ((rc == 0) && (i < retries)) {
4710 			delay(drv_usectohz(vsw_ldc_tx_delay));
4711 		}
4712 	}
4713 	freemsg(mp);
4714 	return (status);
4715 }
4716 
/*
 * Send packet out via descriptor ring to a logical device.
 *
 * The frame is copied into the data buffer of a free private descriptor
 * of the outbound lane's dring and the corresponding public descriptor is
 * marked READY. A dring data message is sent to the peer only if the
 * ring's restart_reqd flag is set; the flag is cleared when that is done.
 *
 * Params:
 *	ldcp:	channel to send the packet over
 *	mp:	the packet; it is only read here, not freed
 *
 * Returns:
 *	LDC_TX_SUCCESS		the packet was placed in the ring
 *	LDC_TX_NORESOURCES	no free descriptor was available
 *	LDC_TX_FAILURE		the lane/channel is not up, there is no
 *				dring, or the packet exceeds the lane's mtu
 */
static int
vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp)
{
	vio_dring_msg_t		dring_pkt;
	dring_info_t		*dp = NULL;
	vsw_private_desc_t	*priv_desc = NULL;
	vnet_public_desc_t	*pub = NULL;
	vsw_t			*vswp = ldcp->ldc_vswp;
	mblk_t			*bp;
	size_t			n, size;
	caddr_t			bufp;
	int			idx;
	int			status = LDC_TX_SUCCESS;
	struct ether_header	*ehp = (struct ether_header *)mp->b_rptr;
	lane_t			*lp = &ldcp->lane_out;

	D1(vswp, "%s(%lld): enter\n", __func__, ldcp->ldc_id);

	/* TODO: make test a macro */
	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
		    "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
		    ldcp->lane_out.lstate);
		ldcp->ldc_stats.oerrors++;
		return (LDC_TX_FAILURE);
	}

	/*
	 * Note - using first ring only, this may change
	 * in the future.
	 *
	 * dlistrw is held as reader from here on; every path out of
	 * the function below drops it again.
	 */
	READ_ENTER(&ldcp->lane_out.dlistrw);
	if ((dp = ldcp->lane_out.dringp) == NULL) {
		RW_EXIT(&ldcp->lane_out.dlistrw);
		DERR(vswp, "%s(%lld): no dring for outbound lane on"
		    " channel %d", __func__, ldcp->ldc_id, ldcp->ldc_id);
		ldcp->ldc_stats.oerrors++;
		return (LDC_TX_FAILURE);
	}

	/* the frame must not be larger than the mtu of the lane */
	size = msgsize(mp);
	if (size > (size_t)lp->mtu) {
		RW_EXIT(&ldcp->lane_out.dlistrw);
		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
		    ldcp->ldc_id, size);
		ldcp->ldc_stats.oerrors++;
		return (LDC_TX_FAILURE);
	}

	/*
	 * Find a free descriptor
	 *
	 * Note: for the moment we are assuming that we will only
	 * have one dring going from the switch to each of its
	 * peers. This may change in the future.
	 */
	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
		D2(vswp, "%s(%lld): no descriptor available for ring "
		    "at 0x%llx", __func__, ldcp->ldc_id, dp);

		/* nothing more we can do */
		status = LDC_TX_NORESOURCES;
		ldcp->ldc_stats.tx_no_desc++;
		goto vsw_dringsend_free_exit;
	} else {
		D2(vswp, "%s(%lld): free private descriptor found at pos %ld "
		    "addr 0x%llx\n", __func__, ldcp->ldc_id, idx, priv_desc);
	}

	/*
	 * copy data into the descriptor; the frame is placed
	 * VNET_IPALIGN bytes into the descriptor's data buffer
	 */
	bufp = priv_desc->datap;
	bufp += VNET_IPALIGN;
	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
		n = MBLKL(bp);
		bcopy(bp->b_rptr, bufp, n);
		bufp += n;
	}

	/* a frame shorter than ETHERMIN is sent as ETHERMIN bytes */
	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;

	pub = priv_desc->descp;
	pub->nbytes = priv_desc->datalen;

	/* update statistics */
	if (IS_BROADCAST(ehp))
		ldcp->ldc_stats.brdcstxmt++;
	else if (IS_MULTICAST(ehp))
		ldcp->ldc_stats.multixmt++;
	ldcp->ldc_stats.opackets++;
	ldcp->ldc_stats.obytes += priv_desc->datalen;

	/* mark the public descriptor READY, now that it is filled in */
	mutex_enter(&priv_desc->dstate_lock);
	pub->hdr.dstate = VIO_DESC_READY;
	mutex_exit(&priv_desc->dstate_lock);

	/*
	 * Determine whether or not we need to send a message to our
	 * peer prompting them to read our newly updated descriptor(s).
	 */
	mutex_enter(&dp->restart_lock);
	if (dp->restart_reqd) {
		dp->restart_reqd = B_FALSE;
		ldcp->ldc_stats.dring_data_msgs++;
		mutex_exit(&dp->restart_lock);

		/*
		 * Send a vio_dring_msg to peer to prompt them to read
		 * the updated descriptor ring.
		 */
		dring_pkt.tag.vio_msgtype = VIO_TYPE_DATA;
		dring_pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
		dring_pkt.tag.vio_subtype_env = VIO_DRING_DATA;
		dring_pkt.tag.vio_sid = ldcp->local_session;

		/* Note - for now using first ring */
		dring_pkt.dring_ident = dp->ident;

		/*
		 * If last_ack_recv is -1 then we know we've not
		 * received any ack's yet, so this must be the first
		 * msg sent, so set the start to the beginning of the ring.
		 */
		mutex_enter(&dp->dlock);
		if (dp->last_ack_recv == -1) {
			dring_pkt.start_idx = 0;
		} else {
			dring_pkt.start_idx =
			    (dp->last_ack_recv + 1) % dp->num_descriptors;
		}
		dring_pkt.end_idx = -1;
		mutex_exit(&dp->dlock);

		D3(vswp, "%s(%lld): dring 0x%llx : ident 0x%llx\n", __func__,
		    ldcp->ldc_id, dp, dring_pkt.dring_ident);
		D3(vswp, "%s(%lld): start %lld : end %lld :\n",
		    __func__, ldcp->ldc_id, dring_pkt.start_idx,
		    dring_pkt.end_idx);

		/* dlistrw is dropped before the message is sent */
		RW_EXIT(&ldcp->lane_out.dlistrw);

		(void) vsw_send_msg(ldcp, (void *)&dring_pkt,
		    sizeof (vio_dring_msg_t), B_TRUE);

		return (status);

	} else {
		mutex_exit(&dp->restart_lock);
		D2(vswp, "%s(%lld): updating descp %d", __func__,
		    ldcp->ldc_id, idx);
	}

vsw_dringsend_free_exit:

	RW_EXIT(&ldcp->lane_out.dlistrw);

	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
	return (status);
}
4879 
4880 /*
4881  * Send an in-band descriptor message over ldc.
4882  */
4883 static int
4884 vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp)
4885 {
4886 	vsw_t			*vswp = ldcp->ldc_vswp;
4887 	vnet_ibnd_desc_t	ibnd_msg;
4888 	vsw_private_desc_t	*priv_desc = NULL;
4889 	dring_info_t		*dp = NULL;
4890 	size_t			n, size = 0;
4891 	caddr_t			bufp;
4892 	mblk_t			*bp;
4893 	int			idx, i;
4894 	int			status = LDC_TX_SUCCESS;
4895 	static int		warn_msg = 1;
4896 	lane_t			*lp = &ldcp->lane_out;
4897 
4898 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4899 
4900 	ASSERT(mp != NULL);
4901 
4902 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
4903 	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
4904 		DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt",
4905 		    __func__, ldcp->ldc_id, ldcp->ldc_status,
4906 		    ldcp->lane_out.lstate);
4907 		ldcp->ldc_stats.oerrors++;
4908 		return (LDC_TX_FAILURE);
4909 	}
4910 
4911 	/*
4912 	 * only expect single dring to exist, which we use
4913 	 * as an internal buffer, rather than a transfer channel.
4914 	 */
4915 	READ_ENTER(&ldcp->lane_out.dlistrw);
4916 	if ((dp = ldcp->lane_out.dringp) == NULL) {
4917 		DERR(vswp, "%s(%lld): no dring for outbound lane",
4918 		    __func__, ldcp->ldc_id);
4919 		DERR(vswp, "%s(%lld) status(%d) state (0x%llx)", __func__,
4920 		    ldcp->ldc_id, ldcp->ldc_status, ldcp->lane_out.lstate);
4921 		RW_EXIT(&ldcp->lane_out.dlistrw);
4922 		ldcp->ldc_stats.oerrors++;
4923 		return (LDC_TX_FAILURE);
4924 	}
4925 
4926 	size = msgsize(mp);
4927 	if (size > (size_t)lp->mtu) {
4928 		RW_EXIT(&ldcp->lane_out.dlistrw);
4929 		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
4930 		    ldcp->ldc_id, size);
4931 		ldcp->ldc_stats.oerrors++;
4932 		return (LDC_TX_FAILURE);
4933 	}
4934 
4935 	/*
4936 	 * Find a free descriptor in our buffer ring
4937 	 */
4938 	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
4939 		RW_EXIT(&ldcp->lane_out.dlistrw);
4940 		if (warn_msg) {
4941 			DERR(vswp, "%s(%lld): no descriptor available for ring "
4942 			    "at 0x%llx", __func__, ldcp->ldc_id, dp);
4943 			warn_msg = 0;
4944 		}
4945 
4946 		/* nothing more we can do */
4947 		status = LDC_TX_NORESOURCES;
4948 		goto vsw_descrsend_free_exit;
4949 	} else {
4950 		D2(vswp, "%s(%lld): free private descriptor found at pos "
4951 		    "%ld addr 0x%x\n", __func__, ldcp->ldc_id, idx, priv_desc);
4952 		warn_msg = 1;
4953 	}
4954 
4955 	/* copy data into the descriptor */
4956 	bufp = priv_desc->datap;
4957 	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
4958 		n = MBLKL(bp);
4959 		bcopy(bp->b_rptr, bufp, n);
4960 		bufp += n;
4961 	}
4962 
4963 	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
4964 
4965 	/* create and send the in-band descp msg */
4966 	ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA;
4967 	ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO;
4968 	ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA;
4969 	ibnd_msg.hdr.tag.vio_sid = ldcp->local_session;
4970 
4971 	/*
4972 	 * Copy the mem cookies describing the data from the
4973 	 * private region of the descriptor ring into the inband
4974 	 * descriptor.
4975 	 */
4976 	for (i = 0; i < priv_desc->ncookies; i++) {
4977 		bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i],
4978 		    sizeof (ldc_mem_cookie_t));
4979 	}
4980 
4981 	ibnd_msg.hdr.desc_handle = idx;
4982 	ibnd_msg.ncookies = priv_desc->ncookies;
4983 	ibnd_msg.nbytes = size;
4984 
4985 	ldcp->ldc_stats.opackets++;
4986 	ldcp->ldc_stats.obytes += size;
4987 
4988 	RW_EXIT(&ldcp->lane_out.dlistrw);
4989 
4990 	(void) vsw_send_msg(ldcp, (void *)&ibnd_msg,
4991 	    sizeof (vnet_ibnd_desc_t), B_TRUE);
4992 
4993 vsw_descrsend_free_exit:
4994 
4995 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4996 	return (status);
4997 }
4998 
4999 static void
5000 vsw_send_ver(void *arg)
5001 {
5002 	vsw_ldc_t	*ldcp = (vsw_ldc_t *)arg;
5003 	vsw_t		*vswp = ldcp->ldc_vswp;
5004 	lane_t		*lp = &ldcp->lane_out;
5005 	vio_ver_msg_t	ver_msg;
5006 
5007 	D1(vswp, "%s enter", __func__);
5008 
5009 	ver_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
5010 	ver_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
5011 	ver_msg.tag.vio_subtype_env = VIO_VER_INFO;
5012 	ver_msg.tag.vio_sid = ldcp->local_session;
5013 
5014 	if (vsw_obp_ver_proto_workaround == B_FALSE) {
5015 		ver_msg.ver_major = vsw_versions[0].ver_major;
5016 		ver_msg.ver_minor = vsw_versions[0].ver_minor;
5017 	} else {
5018 		/* use the major,minor that we've ack'd */
5019 		lane_t	*lpi = &ldcp->lane_in;
5020 		ver_msg.ver_major = lpi->ver_major;
5021 		ver_msg.ver_minor = lpi->ver_minor;
5022 	}
5023 	ver_msg.dev_class = VDEV_NETWORK_SWITCH;
5024 
5025 	lp->lstate |= VSW_VER_INFO_SENT;
5026 	lp->ver_major = ver_msg.ver_major;
5027 	lp->ver_minor = ver_msg.ver_minor;
5028 
5029 	DUMP_TAG(ver_msg.tag);
5030 
5031 	(void) vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t), B_TRUE);
5032 
5033 	D1(vswp, "%s (%d): exit", __func__, ldcp->ldc_id);
5034 }
5035 
5036 static void
5037 vsw_send_attr(vsw_ldc_t *ldcp)
5038 {
5039 	vsw_t			*vswp = ldcp->ldc_vswp;
5040 	lane_t			*lp = &ldcp->lane_out;
5041 	vnet_attr_msg_t		attr_msg;
5042 
5043 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
5044 
5045 	/*
5046 	 * Subtype is set to INFO by default
5047 	 */
5048 	attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
5049 	attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
5050 	attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO;
5051 	attr_msg.tag.vio_sid = ldcp->local_session;
5052 
5053 	/* payload copied from default settings for lane */
5054 	attr_msg.mtu = lp->mtu;
5055 	attr_msg.addr_type = lp->addr_type;
5056 	attr_msg.xfer_mode = lp->xfer_mode;
5057 	attr_msg.ack_freq = lp->xfer_mode;
5058 
5059 	READ_ENTER(&vswp->if_lockrw);
5060 	attr_msg.addr = vnet_macaddr_strtoul((vswp->if_addr).ether_addr_octet);
5061 	RW_EXIT(&vswp->if_lockrw);
5062 
5063 	ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT;
5064 
5065 	DUMP_TAG(attr_msg.tag);
5066 
5067 	(void) vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t), B_TRUE);
5068 
5069 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
5070 }
5071 
5072 /*
5073  * Create dring info msg (which also results in the creation of
5074  * a dring).
5075  */
5076 static vio_dring_reg_msg_t *
5077 vsw_create_dring_info_pkt(vsw_ldc_t *ldcp)
5078 {
5079 	vio_dring_reg_msg_t	*mp;
5080 	dring_info_t		*dp;
5081 	vsw_t			*vswp = ldcp->ldc_vswp;
5082 	int			rv;
5083 
5084 	D1(vswp, "vsw_create_dring_info_pkt enter\n");
5085 
5086 	/*
5087 	 * If we can't create a dring, obviously no point sending
5088 	 * a message.
5089 	 */
5090 	if ((dp = vsw_create_dring(ldcp)) == NULL)
5091 		return (NULL);
5092 
5093 	/* Allocate pools of receive mblks */
5094 	rv = vsw_init_multipools(ldcp, vswp);
5095 	if (rv) {
5096 		/*
5097 		 * We do not return failure if receive mblk pools can't be
5098 		 * allocated, instead allocb(9F) will be used to dynamically
5099 		 * allocate buffers during receive.
5100 		 */
5101 		DWARN(vswp, "%s: unable to create free mblk pools for"
5102 		    " channel %ld (rv %d)", __func__, ldcp->ldc_id, rv);
5103 	}
5104 
5105 	mp = kmem_zalloc(sizeof (vio_dring_reg_msg_t), KM_SLEEP);
5106 
5107 	mp->tag.vio_msgtype = VIO_TYPE_CTRL;
5108 	mp->tag.vio_subtype = VIO_SUBTYPE_INFO;
5109 	mp->tag.vio_subtype_env = VIO_DRING_REG;
5110 	mp->tag.vio_sid = ldcp->local_session;
5111 
5112 	/* payload */
5113 	mp->num_descriptors = dp->num_descriptors;
5114 	mp->descriptor_size = dp->descriptor_size;
5115 	mp->options = dp->options;
5116 	mp->ncookies = dp->ncookies;
5117 	bcopy(&dp->cookie[0], &mp->cookie[0], sizeof (ldc_mem_cookie_t));
5118 
5119 	mp->dring_ident = 0;
5120 
5121 	D1(vswp, "vsw_create_dring_info_pkt exit\n");
5122 
5123 	return (mp);
5124 }
5125 
5126 static void
5127 vsw_send_dring_info(vsw_ldc_t *ldcp)
5128 {
5129 	vio_dring_reg_msg_t	*dring_msg;
5130 	vsw_t			*vswp = ldcp->ldc_vswp;
5131 
5132 	D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id);
5133 
5134 	dring_msg = vsw_create_dring_info_pkt(ldcp);
5135 	if (dring_msg == NULL) {
5136 		cmn_err(CE_WARN, "!vsw%d: %s: error creating msg",
5137 		    vswp->instance, __func__);
5138 		return;
5139 	}
5140 
5141 	ldcp->lane_out.lstate |= VSW_DRING_INFO_SENT;
5142 
5143 	DUMP_TAG_PTR((vio_msg_tag_t *)dring_msg);
5144 
5145 	(void) vsw_send_msg(ldcp, dring_msg,
5146 	    sizeof (vio_dring_reg_msg_t), B_TRUE);
5147 
5148 	kmem_free(dring_msg, sizeof (vio_dring_reg_msg_t));
5149 
5150 	D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id);
5151 }
5152 
5153 static void
5154 vsw_send_rdx(vsw_ldc_t *ldcp)
5155 {
5156 	vsw_t		*vswp = ldcp->ldc_vswp;
5157 	vio_rdx_msg_t	rdx_msg;
5158 
5159 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
5160 
5161 	rdx_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
5162 	rdx_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
5163 	rdx_msg.tag.vio_subtype_env = VIO_RDX;
5164 	rdx_msg.tag.vio_sid = ldcp->local_session;
5165 
5166 	ldcp->lane_in.lstate |= VSW_RDX_INFO_SENT;
5167 
5168 	DUMP_TAG(rdx_msg.tag);
5169 
5170 	(void) vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t), B_TRUE);
5171 
5172 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
5173 }
5174 
5175 /*
5176  * Generic routine to send message out over ldc channel.
5177  *
5178  * It is possible that when we attempt to write over the ldc channel
5179  * that we get notified that it has been reset. Depending on the value
5180  * of the handle_reset flag we either handle that event here or simply
5181  * notify the caller that the channel was reset.
5182  */
5183 int
5184 vsw_send_msg(vsw_ldc_t *ldcp, void *msgp, int size, boolean_t handle_reset)
5185 {
5186 	int			rv;
5187 	size_t			msglen = size;
5188 	vio_msg_tag_t		*tag = (vio_msg_tag_t *)msgp;
5189 	vsw_t			*vswp = ldcp->ldc_vswp;
5190 	vio_dring_msg_t		*dmsg;
5191 	vio_raw_data_msg_t	*rmsg;
5192 	vnet_ibnd_desc_t	*imsg;
5193 	boolean_t		data_msg = B_FALSE;
5194 
5195 	D1(vswp, "vsw_send_msg (%lld) enter : sending %d bytes",
5196 	    ldcp->ldc_id, size);
5197 
5198 	D2(vswp, "send_msg: type 0x%llx", tag->vio_msgtype);
5199 	D2(vswp, "send_msg: stype 0x%llx", tag->vio_subtype);
5200 	D2(vswp, "send_msg: senv 0x%llx", tag->vio_subtype_env);
5201 
5202 	mutex_enter(&ldcp->ldc_txlock);
5203 
5204 	if (tag->vio_subtype == VIO_SUBTYPE_INFO) {
5205 		if (tag->vio_subtype_env == VIO_DRING_DATA) {
5206 			dmsg = (vio_dring_msg_t *)tag;
5207 			dmsg->seq_num = ldcp->lane_out.seq_num;
5208 			data_msg = B_TRUE;
5209 		} else if (tag->vio_subtype_env == VIO_PKT_DATA) {
5210 			rmsg = (vio_raw_data_msg_t *)tag;
5211 			rmsg->seq_num = ldcp->lane_out.seq_num;
5212 			data_msg = B_TRUE;
5213 		} else if (tag->vio_subtype_env == VIO_DESC_DATA) {
5214 			imsg = (vnet_ibnd_desc_t *)tag;
5215 			imsg->hdr.seq_num = ldcp->lane_out.seq_num;
5216 			data_msg = B_TRUE;
5217 		}
5218 	}
5219 
5220 	do {
5221 		msglen = size;
5222 		rv = ldc_write(ldcp->ldc_handle, (caddr_t)msgp, &msglen);
5223 	} while (rv == EWOULDBLOCK && --vsw_wretries > 0);
5224 
5225 	if (rv == 0 && data_msg == B_TRUE) {
5226 		ldcp->lane_out.seq_num++;
5227 	}
5228 
5229 	if ((rv != 0) || (msglen != size)) {
5230 		DERR(vswp, "vsw_send_msg:ldc_write failed: chan(%lld) rv(%d) "
5231 		    "size (%d) msglen(%d)\n", ldcp->ldc_id, rv, size, msglen);
5232 		ldcp->ldc_stats.oerrors++;
5233 	}
5234 
5235 	mutex_exit(&ldcp->ldc_txlock);
5236 
5237 	/*
5238 	 * If channel has been reset we either handle it here or
5239 	 * simply report back that it has been reset and let caller
5240 	 * decide what to do.
5241 	 */
5242 	if (rv == ECONNRESET) {
5243 		DWARN(vswp, "%s (%lld) channel reset", __func__, ldcp->ldc_id);
5244 
5245 		/*
5246 		 * N.B - must never be holding the dlistrw lock when
5247 		 * we do a reset of the channel.
5248 		 */
5249 		if (handle_reset) {
5250 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
5251 		}
5252 	}
5253 
5254 	return (rv);
5255 }
5256 
5257 /*
5258  * Remove the specified address from the list of address maintained
5259  * in this port node.
5260  */
5261 mcst_addr_t *
5262 vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr)
5263 {
5264 	vsw_t		*vswp = NULL;
5265 	vsw_port_t	*port = NULL;
5266 	mcst_addr_t	*prev_p = NULL;
5267 	mcst_addr_t	*curr_p = NULL;
5268 
5269 	D1(NULL, "%s: enter : devtype %d : addr 0x%llx",
5270 	    __func__, devtype, addr);
5271 
5272 	if (devtype == VSW_VNETPORT) {
5273 		port = (vsw_port_t *)arg;
5274 		mutex_enter(&port->mca_lock);
5275 		prev_p = curr_p = port->mcap;
5276 	} else {
5277 		vswp = (vsw_t *)arg;
5278 		mutex_enter(&vswp->mca_lock);
5279 		prev_p = curr_p = vswp->mcap;
5280 	}
5281 
5282 	while (curr_p != NULL) {
5283 		if (curr_p->addr == addr) {
5284 			D2(NULL, "%s: address found", __func__);
5285 			/* match found */
5286 			if (prev_p == curr_p) {
5287 				/* list head */
5288 				if (devtype == VSW_VNETPORT)
5289 					port->mcap = curr_p->nextp;
5290 				else
5291 					vswp->mcap = curr_p->nextp;
5292 			} else {
5293 				prev_p->nextp = curr_p->nextp;
5294 			}
5295 			break;
5296 		} else {
5297 			prev_p = curr_p;
5298 			curr_p = curr_p->nextp;
5299 		}
5300 	}
5301 
5302 	if (devtype == VSW_VNETPORT)
5303 		mutex_exit(&port->mca_lock);
5304 	else
5305 		mutex_exit(&vswp->mca_lock);
5306 
5307 	D1(NULL, "%s: exit", __func__);
5308 
5309 	return (curr_p);
5310 }
5311 
5312 /*
5313  * Creates a descriptor ring (dring) and links it into the
5314  * link of outbound drings for this channel.
5315  *
5316  * Returns NULL if creation failed.
5317  */
5318 static dring_info_t *
5319 vsw_create_dring(vsw_ldc_t *ldcp)
5320 {
5321 	vsw_private_desc_t	*priv_addr = NULL;
5322 	vsw_t			*vswp = ldcp->ldc_vswp;
5323 	ldc_mem_info_t		minfo;
5324 	dring_info_t		*dp, *tp;
5325 	int			i;
5326 
5327 	dp = (dring_info_t *)kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
5328 
5329 	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
5330 
5331 	/* create public section of ring */
5332 	if ((ldc_mem_dring_create(vsw_ntxds,
5333 	    VSW_PUB_SIZE, &dp->handle)) != 0) {
5334 
5335 		DERR(vswp, "vsw_create_dring(%lld): ldc dring create "
5336 		    "failed", ldcp->ldc_id);
5337 		goto create_fail_exit;
5338 	}
5339 
5340 	ASSERT(dp->handle != NULL);
5341 
5342 	/*
5343 	 * Get the base address of the public section of the ring.
5344 	 */
5345 	if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
5346 		DERR(vswp, "vsw_create_dring(%lld): dring info failed\n",
5347 		    ldcp->ldc_id);
5348 		goto dring_fail_exit;
5349 	} else {
5350 		ASSERT(minfo.vaddr != 0);
5351 		dp->pub_addr = minfo.vaddr;
5352 	}
5353 
5354 	dp->num_descriptors = vsw_ntxds;
5355 	dp->descriptor_size = VSW_PUB_SIZE;
5356 	dp->options = VIO_TX_DRING;
5357 	dp->ncookies = 1;	/* guaranteed by ldc */
5358 
5359 	/*
5360 	 * create private portion of ring
5361 	 */
5362 	dp->priv_addr = (vsw_private_desc_t *)kmem_zalloc(
5363 	    (sizeof (vsw_private_desc_t) * vsw_ntxds), KM_SLEEP);
5364 
5365 	if (vsw_setup_ring(ldcp, dp)) {
5366 		DERR(vswp, "%s: unable to setup ring", __func__);
5367 		goto dring_fail_exit;
5368 	}
5369 
5370 	/* haven't used any descriptors yet */
5371 	dp->end_idx = 0;
5372 	dp->last_ack_recv = -1;
5373 
5374 	/* bind dring to the channel */
5375 	if ((ldc_mem_dring_bind(ldcp->ldc_handle, dp->handle,
5376 	    LDC_DIRECT_MAP | LDC_SHADOW_MAP, LDC_MEM_RW,
5377 	    &dp->cookie[0], &dp->ncookies)) != 0) {
5378 		DERR(vswp, "vsw_create_dring: unable to bind to channel "
5379 		    "%lld", ldcp->ldc_id);
5380 		goto dring_fail_exit;
5381 	}
5382 
5383 	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
5384 	dp->restart_reqd = B_TRUE;
5385 
5386 	/*
5387 	 * Only ever create rings for outgoing lane. Link it onto
5388 	 * end of list.
5389 	 */
5390 	WRITE_ENTER(&ldcp->lane_out.dlistrw);
5391 	if (ldcp->lane_out.dringp == NULL) {
5392 		D2(vswp, "vsw_create_dring: adding first outbound ring");
5393 		ldcp->lane_out.dringp = dp;
5394 	} else {
5395 		tp = ldcp->lane_out.dringp;
5396 		while (tp->next != NULL)
5397 			tp = tp->next;
5398 
5399 		tp->next = dp;
5400 	}
5401 	RW_EXIT(&ldcp->lane_out.dlistrw);
5402 
5403 	return (dp);
5404 
5405 dring_fail_exit:
5406 	(void) ldc_mem_dring_destroy(dp->handle);
5407 
5408 create_fail_exit:
5409 	if (dp->priv_addr != NULL) {
5410 		priv_addr = dp->priv_addr;
5411 		for (i = 0; i < vsw_ntxds; i++) {
5412 			if (priv_addr->memhandle != NULL)
5413 				(void) ldc_mem_free_handle(
5414 				    priv_addr->memhandle);
5415 			priv_addr++;
5416 		}
5417 		kmem_free(dp->priv_addr,
5418 		    (sizeof (vsw_private_desc_t) * vsw_ntxds));
5419 	}
5420 	mutex_destroy(&dp->dlock);
5421 
5422 	kmem_free(dp, sizeof (dring_info_t));
5423 	return (NULL);
5424 }
5425 
5426 /*
5427  * Create a ring consisting of just a private portion and link
5428  * it into the list of rings for the outbound lane.
5429  *
5430  * These type of rings are used primarily for temporary data
5431  * storage (i.e. as data buffers).
5432  */
5433 void
5434 vsw_create_privring(vsw_ldc_t *ldcp)
5435 {
5436 	dring_info_t		*dp, *tp;
5437 	vsw_t			*vswp = ldcp->ldc_vswp;
5438 
5439 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
5440 
5441 	dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
5442 
5443 	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
5444 
5445 	/* no public section */
5446 	dp->pub_addr = NULL;
5447 
5448 	dp->priv_addr = kmem_zalloc(
5449 	    (sizeof (vsw_private_desc_t) * vsw_ntxds), KM_SLEEP);
5450 
5451 	dp->num_descriptors = vsw_ntxds;
5452 
5453 	if (vsw_setup_ring(ldcp, dp)) {
5454 		DERR(vswp, "%s: setup of ring failed", __func__);
5455 		kmem_free(dp->priv_addr,
5456 		    (sizeof (vsw_private_desc_t) * vsw_ntxds));
5457 		mutex_destroy(&dp->dlock);
5458 		kmem_free(dp, sizeof (dring_info_t));
5459 		return;
5460 	}
5461 
5462 	/* haven't used any descriptors yet */
5463 	dp->end_idx = 0;
5464 
5465 	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
5466 	dp->restart_reqd = B_TRUE;
5467 
5468 	/*
5469 	 * Only ever create rings for outgoing lane. Link it onto
5470 	 * end of list.
5471 	 */
5472 	WRITE_ENTER(&ldcp->lane_out.dlistrw);
5473 	if (ldcp->lane_out.dringp == NULL) {
5474 		D2(vswp, "%s: adding first outbound privring", __func__);
5475 		ldcp->lane_out.dringp = dp;
5476 	} else {
5477 		tp = ldcp->lane_out.dringp;
5478 		while (tp->next != NULL)
5479 			tp = tp->next;
5480 
5481 		tp->next = dp;
5482 	}
5483 	RW_EXIT(&ldcp->lane_out.dlistrw);
5484 
5485 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
5486 }
5487 
5488 /*
5489  * Setup the descriptors in the dring. Returns 0 on success, 1 on
5490  * failure.
5491  */
5492 int
5493 vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp)
5494 {
5495 	vnet_public_desc_t	*pub_addr = NULL;
5496 	vsw_private_desc_t	*priv_addr = NULL;
5497 	vsw_t			*vswp = ldcp->ldc_vswp;
5498 	uint64_t		*tmpp;
5499 	uint64_t		offset = 0;
5500 	uint32_t		ncookies = 0;
5501 	static char		*name = "vsw_setup_ring";
5502 	int			i, j, nc, rv;
5503 	size_t			data_sz;
5504 	void			*data_addr;
5505 
5506 	priv_addr = dp->priv_addr;
5507 	pub_addr = dp->pub_addr;
5508 
5509 	/* public section may be null but private should never be */
5510 	ASSERT(priv_addr != NULL);
5511 
5512 	/*
5513 	 * Allocate the region of memory which will be used to hold
5514 	 * the data the descriptors will refer to.
5515 	 */
5516 	data_sz = vswp->max_frame_size + VNET_IPALIGN + VNET_LDCALIGN;
5517 
5518 	/*
5519 	 * In order to ensure that the number of ldc cookies per descriptor is
5520 	 * limited to be within the default MAX_COOKIES (2), we take the steps
5521 	 * outlined below:
5522 	 *
5523 	 * Align the entire data buffer area to 8K and carve out per descriptor
5524 	 * data buffers starting from this 8K aligned base address.
5525 	 *
5526 	 * We round up the mtu specified to be a multiple of 2K or 4K.
5527 	 * For sizes up to 12K we round up the size to the next 2K.
5528 	 * For sizes > 12K we round up to the next 4K (otherwise sizes such as
5529 	 * 14K could end up needing 3 cookies, with the buffer spread across
5530 	 * 3 8K pages:  8K+6K, 2K+8K+2K, 6K+8K, ...).
5531 	 */
5532 	if (data_sz <= VNET_12K) {
5533 		data_sz = VNET_ROUNDUP_2K(data_sz);
5534 	} else {
5535 		data_sz = VNET_ROUNDUP_4K(data_sz);
5536 	}
5537 
5538 	dp->desc_data_sz = data_sz;
5539 
5540 	/* allocate extra 8K bytes for alignment */
5541 	dp->data_sz = (vsw_ntxds * data_sz) + VNET_8K;
5542 	data_addr = kmem_alloc(dp->data_sz, KM_SLEEP);
5543 	dp->data_addr = data_addr;
5544 
5545 	D2(vswp, "%s: allocated %lld bytes at 0x%llx\n", name,
5546 	    dp->data_sz, dp->data_addr);
5547 
5548 	/* align the starting address of the data area to 8K */
5549 	data_addr = (void *)VNET_ROUNDUP_8K((uintptr_t)data_addr);
5550 
5551 	tmpp = (uint64_t *)data_addr;
5552 	offset = dp->desc_data_sz/sizeof (tmpp);
5553 
5554 	/*
5555 	 * Initialise some of the private and public (if they exist)
5556 	 * descriptor fields.
5557 	 */
5558 	for (i = 0; i < vsw_ntxds; i++) {
5559 		mutex_init(&priv_addr->dstate_lock, NULL, MUTEX_DRIVER, NULL);
5560 
5561 		if ((ldc_mem_alloc_handle(ldcp->ldc_handle,
5562 		    &priv_addr->memhandle)) != 0) {
5563 			DERR(vswp, "%s: alloc mem handle failed", name);
5564 			goto setup_ring_cleanup;
5565 		}
5566 
5567 		priv_addr->datap = (void *)tmpp;
5568 
5569 		rv = ldc_mem_bind_handle(priv_addr->memhandle,
5570 		    (caddr_t)priv_addr->datap, dp->desc_data_sz,
5571 		    LDC_SHADOW_MAP, LDC_MEM_R|LDC_MEM_W,
5572 		    &(priv_addr->memcookie[0]), &ncookies);
5573 		if (rv != 0) {
5574 			DERR(vswp, "%s(%lld): ldc_mem_bind_handle failed "
5575 			    "(rv %d)", name, ldcp->ldc_id, rv);
5576 			goto setup_ring_cleanup;
5577 		}
5578 		priv_addr->bound = 1;
5579 
5580 		D2(vswp, "%s: %d: memcookie 0 : addr 0x%llx : size 0x%llx",
5581 		    name, i, priv_addr->memcookie[0].addr,
5582 		    priv_addr->memcookie[0].size);
5583 
5584 		if (ncookies >= (uint32_t)(VSW_MAX_COOKIES + 1)) {
5585 			DERR(vswp, "%s(%lld) ldc_mem_bind_handle returned "
5586 			    "invalid num of cookies (%d) for size 0x%llx",
5587 			    name, ldcp->ldc_id, ncookies, VSW_RING_EL_DATA_SZ);
5588 
5589 			goto setup_ring_cleanup;
5590 		} else {
5591 			for (j = 1; j < ncookies; j++) {
5592 				rv = ldc_mem_nextcookie(priv_addr->memhandle,
5593 				    &(priv_addr->memcookie[j]));
5594 				if (rv != 0) {
5595 					DERR(vswp, "%s: ldc_mem_nextcookie "
5596 					    "failed rv (%d)", name, rv);
5597 					goto setup_ring_cleanup;
5598 				}
5599 				D3(vswp, "%s: memcookie %d : addr 0x%llx : "
5600 				    "size 0x%llx", name, j,
5601 				    priv_addr->memcookie[j].addr,
5602 				    priv_addr->memcookie[j].size);
5603 			}
5604 
5605 		}
5606 		priv_addr->ncookies = ncookies;
5607 		priv_addr->dstate = VIO_DESC_FREE;
5608 
5609 		if (pub_addr != NULL) {
5610 
5611 			/* link pub and private sides */
5612 			priv_addr->descp = pub_addr;
5613 
5614 			pub_addr->ncookies = priv_addr->ncookies;
5615 
5616 			for (nc = 0; nc < pub_addr->ncookies; nc++) {
5617 				bcopy(&priv_addr->memcookie[nc],
5618 				    &pub_addr->memcookie[nc],
5619 				    sizeof (ldc_mem_cookie_t));
5620 			}
5621 
5622 			pub_addr->hdr.dstate = VIO_DESC_FREE;
5623 			pub_addr++;
5624 		}
5625 
5626 		/*
5627 		 * move to next element in the dring and the next
5628 		 * position in the data buffer.
5629 		 */
5630 		priv_addr++;
5631 		tmpp += offset;
5632 	}
5633 
5634 	return (0);
5635 
5636 setup_ring_cleanup:
5637 	priv_addr = dp->priv_addr;
5638 
5639 	for (j = 0; j < i; j++) {
5640 		(void) ldc_mem_unbind_handle(priv_addr->memhandle);
5641 		(void) ldc_mem_free_handle(priv_addr->memhandle);
5642 
5643 		mutex_destroy(&priv_addr->dstate_lock);
5644 
5645 		priv_addr++;
5646 	}
5647 	kmem_free(dp->data_addr, dp->data_sz);
5648 
5649 	return (1);
5650 }
5651 
5652 /*
5653  * Searches the private section of a ring for a free descriptor,
5654  * starting at the location of the last free descriptor found
5655  * previously.
5656  *
5657  * Returns 0 if free descriptor is available, and updates state
5658  * of private descriptor to VIO_DESC_READY,  otherwise returns 1.
5659  *
5660  * FUTURE: might need to return contiguous range of descriptors
5661  * as dring info msg assumes all will be contiguous.
5662  */
5663 static int
5664 vsw_dring_find_free_desc(dring_info_t *dringp,
5665 		vsw_private_desc_t **priv_p, int *idx)
5666 {
5667 	vsw_private_desc_t	*addr = NULL;
5668 	int			num = vsw_ntxds;
5669 	int			ret = 1;
5670 
5671 	D1(NULL, "%s enter\n", __func__);
5672 
5673 	ASSERT(dringp->priv_addr != NULL);
5674 
5675 	D2(NULL, "%s: searching ring, dringp 0x%llx : start pos %lld",
5676 	    __func__, dringp, dringp->end_idx);
5677 
5678 	addr = (vsw_private_desc_t *)dringp->priv_addr + dringp->end_idx;
5679 
5680 	mutex_enter(&addr->dstate_lock);
5681 	if (addr->dstate == VIO_DESC_FREE) {
5682 		addr->dstate = VIO_DESC_READY;
5683 		*priv_p = addr;
5684 		*idx = dringp->end_idx;
5685 		dringp->end_idx = (dringp->end_idx + 1) % num;
5686 		ret = 0;
5687 
5688 	}
5689 	mutex_exit(&addr->dstate_lock);
5690 
5691 	/* ring full */
5692 	if (ret == 1) {
5693 		D2(NULL, "%s: no desp free: started at %d", __func__,
5694 		    dringp->end_idx);
5695 	}
5696 
5697 	D1(NULL, "%s: exit\n", __func__);
5698 
5699 	return (ret);
5700 }
5701 
5702 /*
5703  * Map from a dring identifier to the ring itself. Returns
5704  * pointer to ring or NULL if no match found.
5705  *
5706  * Should be called with dlistrw rwlock held as reader.
5707  */
5708 static dring_info_t *
5709 vsw_ident2dring(lane_t *lane, uint64_t ident)
5710 {
5711 	dring_info_t	*dp = NULL;
5712 
5713 	if ((dp = lane->dringp) == NULL) {
5714 		return (NULL);
5715 	} else {
5716 		if (dp->ident == ident)
5717 			return (dp);
5718 
5719 		while (dp != NULL) {
5720 			if (dp->ident == ident)
5721 				break;
5722 			dp = dp->next;
5723 		}
5724 	}
5725 
5726 	return (dp);
5727 }
5728 
5729 /*
5730  * Set the default lane attributes. These are copied into
5731  * the attr msg we send to our peer. If they are not acceptable
5732  * then (currently) the handshake ends.
5733  */
5734 static void
5735 vsw_set_lane_attr(vsw_t *vswp, lane_t *lp)
5736 {
5737 	bzero(lp, sizeof (lane_t));
5738 
5739 	READ_ENTER(&vswp->if_lockrw);
5740 	ether_copy(&(vswp->if_addr), &(lp->addr));
5741 	RW_EXIT(&vswp->if_lockrw);
5742 
5743 	lp->mtu = vswp->max_frame_size;
5744 	lp->addr_type = ADDR_TYPE_MAC;
5745 	lp->xfer_mode = VIO_DRING_MODE_V1_0;
5746 	lp->ack_freq = 0;	/* for shared mode */
5747 	lp->seq_num = VNET_ISS;
5748 }
5749 
5750 /*
5751  * Verify that the attributes are acceptable.
5752  *
5753  * FUTURE: If some attributes are not acceptable, change them
5754  * our desired values.
5755  */
5756 static int
5757 vsw_check_attr(vnet_attr_msg_t *pkt, vsw_ldc_t *ldcp)
5758 {
5759 	int			ret = 0;
5760 	struct ether_addr	ea;
5761 	vsw_port_t		*port = ldcp->ldc_port;
5762 	lane_t			*lp = &ldcp->lane_out;
5763 
5764 	D1(NULL, "vsw_check_attr enter\n");
5765 
5766 	if ((pkt->xfer_mode != VIO_DESC_MODE) &&
5767 	    (pkt->xfer_mode != lp->xfer_mode)) {
5768 		D2(NULL, "vsw_check_attr: unknown mode %x\n", pkt->xfer_mode);
5769 		ret = 1;
5770 	}
5771 
5772 	/* Only support MAC addresses at moment. */
5773 	if ((pkt->addr_type != ADDR_TYPE_MAC) || (pkt->addr == 0)) {
5774 		D2(NULL, "vsw_check_attr: invalid addr_type %x, "
5775 		    "or address 0x%llx\n", pkt->addr_type, pkt->addr);
5776 		ret = 1;
5777 	}
5778 
5779 	/*
5780 	 * MAC address supplied by device should match that stored
5781 	 * in the vsw-port OBP node. Need to decide what to do if they
5782 	 * don't match, for the moment just warn but don't fail.
5783 	 */
5784 	vnet_macaddr_ultostr(pkt->addr, ea.ether_addr_octet);
5785 	if (ether_cmp(&ea, &port->p_macaddr) != 0) {
5786 		DERR(NULL, "vsw_check_attr: device supplied address "
5787 		    "0x%llx doesn't match node address 0x%llx\n",
5788 		    pkt->addr, port->p_macaddr);
5789 	}
5790 
5791 	/*
5792 	 * Ack freq only makes sense in pkt mode, in shared
5793 	 * mode the ring descriptors say whether or not to
5794 	 * send back an ACK.
5795 	 */
5796 	if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
5797 	    (pkt->xfer_mode & VIO_DRING_MODE_V1_2)) ||
5798 	    (VSW_VER_LT(ldcp, 1, 2) &&
5799 	    (pkt->xfer_mode == VIO_DRING_MODE_V1_0))) {
5800 		if (pkt->ack_freq > 0) {
5801 			D2(NULL, "vsw_check_attr: non zero ack freq "
5802 			    " in SHM mode\n");
5803 			ret = 1;
5804 		}
5805 	}
5806 
5807 	if (VSW_VER_LT(ldcp, 1, 4)) {
5808 		/* versions < 1.4, mtu must match */
5809 		if (pkt->mtu != lp->mtu) {
5810 			D2(NULL, "vsw_check_attr: invalid MTU (0x%llx)\n",
5811 			    pkt->mtu);
5812 			ret = 1;
5813 		}
5814 	} else {
5815 		/* Ver >= 1.4, validate mtu of the peer is at least ETHERMAX */
5816 		if (pkt->mtu < ETHERMAX) {
5817 			ret = 1;
5818 		}
5819 	}
5820 
5821 	D1(NULL, "vsw_check_attr exit\n");
5822 
5823 	return (ret);
5824 }
5825 
5826 /*
5827  * Returns 1 if there is a problem, 0 otherwise.
5828  */
5829 static int
5830 vsw_check_dring_info(vio_dring_reg_msg_t *pkt)
5831 {
5832 	_NOTE(ARGUNUSED(pkt))
5833 
5834 	int	ret = 0;
5835 
5836 	D1(NULL, "vsw_check_dring_info enter\n");
5837 
5838 	if ((pkt->num_descriptors == 0) ||
5839 	    (pkt->descriptor_size == 0) ||
5840 	    (pkt->ncookies != 1)) {
5841 		DERR(NULL, "vsw_check_dring_info: invalid dring msg");
5842 		ret = 1;
5843 	}
5844 
5845 	D1(NULL, "vsw_check_dring_info exit\n");
5846 
5847 	return (ret);
5848 }
5849 
5850 /*
5851  * Returns 1 if two memory cookies match. Otherwise returns 0.
5852  */
5853 static int
5854 vsw_mem_cookie_match(ldc_mem_cookie_t *m1, ldc_mem_cookie_t *m2)
5855 {
5856 	if ((m1->addr != m2->addr) ||
5857 	    (m2->size != m2->size)) {
5858 		return (0);
5859 	} else {
5860 		return (1);
5861 	}
5862 }
5863 
5864 /*
5865  * Returns 1 if ring described in reg message matches that
5866  * described by dring_info structure. Otherwise returns 0.
5867  */
5868 static int
5869 vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg)
5870 {
5871 	if ((msg->descriptor_size != dp->descriptor_size) ||
5872 	    (msg->num_descriptors != dp->num_descriptors) ||
5873 	    (msg->ncookies != dp->ncookies) ||
5874 	    !(vsw_mem_cookie_match(&msg->cookie[0], &dp->cookie[0]))) {
5875 		return (0);
5876 	} else {
5877 		return (1);
5878 	}
5879 
5880 }
5881 
5882 /*
5883  * Reset and free all the resources associated with
5884  * the channel.
5885  */
5886 static void
5887 vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir)
5888 {
5889 	dring_info_t		*dp, *dpp;
5890 	lane_t			*lp = NULL;
5891 
5892 	ASSERT(ldcp != NULL);
5893 
5894 	D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id);
5895 
5896 	if (dir == INBOUND) {
5897 		D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane"
5898 		    " of channel %lld", __func__, ldcp->ldc_id);
5899 		lp = &ldcp->lane_in;
5900 	} else {
5901 		D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane"
5902 		    " of channel %lld", __func__, ldcp->ldc_id);
5903 		lp = &ldcp->lane_out;
5904 	}
5905 
5906 	lp->lstate = VSW_LANE_INACTIV;
5907 	lp->seq_num = VNET_ISS;
5908 
5909 	if (lp->dringp) {
5910 		if (dir == INBOUND) {
5911 			WRITE_ENTER(&lp->dlistrw);
5912 			dp = lp->dringp;
5913 			while (dp != NULL) {
5914 				dpp = dp->next;
5915 				if (dp->handle != NULL)
5916 					(void) ldc_mem_dring_unmap(dp->handle);
5917 				kmem_free(dp, sizeof (dring_info_t));
5918 				dp = dpp;
5919 			}
5920 			RW_EXIT(&lp->dlistrw);
5921 		} else {
5922 			/*
5923 			 * unbind, destroy exported dring, free dring struct
5924 			 */
5925 			WRITE_ENTER(&lp->dlistrw);
5926 			dp = lp->dringp;
5927 			vsw_free_ring(dp);
5928 			RW_EXIT(&lp->dlistrw);
5929 		}
5930 		lp->dringp = NULL;
5931 	}
5932 
5933 	D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id);
5934 }
5935 
5936 /*
5937  * Free ring and all associated resources.
5938  *
5939  * Should be called with dlistrw rwlock held as writer.
5940  */
5941 static void
5942 vsw_free_ring(dring_info_t *dp)
5943 {
5944 	vsw_private_desc_t	*paddr = NULL;
5945 	dring_info_t		*dpp;
5946 	int			i;
5947 
5948 	while (dp != NULL) {
5949 		mutex_enter(&dp->dlock);
5950 		dpp = dp->next;
5951 		if (dp->priv_addr != NULL) {
5952 			/*
5953 			 * First unbind and free the memory handles
5954 			 * stored in each descriptor within the ring.
5955 			 */
5956 			for (i = 0; i < vsw_ntxds; i++) {
5957 				paddr = (vsw_private_desc_t *)
5958 				    dp->priv_addr + i;
5959 				if (paddr->memhandle != NULL) {
5960 					if (paddr->bound == 1) {
5961 						if (ldc_mem_unbind_handle(
5962 						    paddr->memhandle) != 0) {
5963 							DERR(NULL, "error "
5964 							"unbinding handle for "
5965 							"ring 0x%llx at pos %d",
5966 							    dp, i);
5967 							continue;
5968 						}
5969 						paddr->bound = 0;
5970 					}
5971 
5972 					if (ldc_mem_free_handle(
5973 					    paddr->memhandle) != 0) {
5974 						DERR(NULL, "error freeing "
5975 						    "handle for ring 0x%llx "
5976 						    "at pos %d", dp, i);
5977 						continue;
5978 					}
5979 					paddr->memhandle = NULL;
5980 				}
5981 				mutex_destroy(&paddr->dstate_lock);
5982 			}
5983 			kmem_free(dp->priv_addr,
5984 			    (sizeof (vsw_private_desc_t) * vsw_ntxds));
5985 		}
5986 
5987 		/*
5988 		 * Now unbind and destroy the ring itself.
5989 		 */
5990 		if (dp->handle != NULL) {
5991 			(void) ldc_mem_dring_unbind(dp->handle);
5992 			(void) ldc_mem_dring_destroy(dp->handle);
5993 		}
5994 
5995 		if (dp->data_addr != NULL) {
5996 			kmem_free(dp->data_addr, dp->data_sz);
5997 		}
5998 
5999 		mutex_exit(&dp->dlock);
6000 		mutex_destroy(&dp->dlock);
6001 		mutex_destroy(&dp->restart_lock);
6002 		kmem_free(dp, sizeof (dring_info_t));
6003 
6004 		dp = dpp;
6005 	}
6006 }
6007 
/*
 * vsw_ldc_rx_worker -- A per LDC worker thread to receive data.
 * This thread is woken up by the LDC interrupt handler to process
 * LDC packets and receive data.
 *
 * arg is the vsw_ldc_t of the channel being served. The thread loops
 * until VSW_WTHR_STOP is set in rx_thr_flags: it sleeps on rx_thr_cv
 * until either VSW_WTHR_DATARCVD or VSW_WTHR_STOP is set, and for each
 * data notification calls vsw_process_pkt() with ldc_cblock held.
 * On the way out it clears the stop flag and rx_thread, then exits.
 */
static void
vsw_ldc_rx_worker(void *arg)
{
	callb_cpr_t	cprinfo;
	vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
	vsw_t *vswp = ldcp->ldc_vswp;

	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
	/* register with the CPR framework, using rx_thr_lock as its lock */
	CALLB_CPR_INIT(&cprinfo, &ldcp->rx_thr_lock, callb_generic_cpr,
	    "vsw_rx_thread");
	mutex_enter(&ldcp->rx_thr_lock);
	while (!(ldcp->rx_thr_flags & VSW_WTHR_STOP)) {

		/* the wait below is bracketed as a CPR safe region */
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		/*
		 * Wait until the data is received or a stop
		 * request is received.
		 */
		while (!(ldcp->rx_thr_flags &
		    (VSW_WTHR_DATARCVD | VSW_WTHR_STOP))) {
			cv_wait(&ldcp->rx_thr_cv, &ldcp->rx_thr_lock);
		}
		CALLB_CPR_SAFE_END(&cprinfo, &ldcp->rx_thr_lock)

		/*
		 * First process the stop request.
		 */
		if (ldcp->rx_thr_flags & VSW_WTHR_STOP) {
			D2(vswp, "%s(%lld):Rx thread stopped\n",
			    __func__, ldcp->ldc_id);
			break;
		}
		/*
		 * Consume the data notification, then drop rx_thr_lock
		 * while the packets are processed; processing is done
		 * with ldc_cblock held instead.
		 */
		ldcp->rx_thr_flags &= ~VSW_WTHR_DATARCVD;
		mutex_exit(&ldcp->rx_thr_lock);
		D1(vswp, "%s(%lld):calling vsw_process_pkt\n",
		    __func__, ldcp->ldc_id);
		mutex_enter(&ldcp->ldc_cblock);
		vsw_process_pkt(ldcp);
		mutex_exit(&ldcp->ldc_cblock);
		/* re-take rx_thr_lock before the flags are examined again */
		mutex_enter(&ldcp->rx_thr_lock);
	}

	/*
	 * Update the run status and wakeup the thread that
	 * has sent the stop request.
	 *
	 * NOTE(review): rx_thr_lock is still held at this point and is
	 * not dropped explicitly; presumably CALLB_CPR_EXIT releases
	 * it -- confirm against the callb(9F) interfaces.
	 */
	ldcp->rx_thr_flags &= ~VSW_WTHR_STOP;
	ldcp->rx_thread = NULL;
	CALLB_CPR_EXIT(&cprinfo);
	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
	thread_exit();
}
6065 
6066 /* vsw_stop_rx_thread -- Co-ordinate with receive thread to stop it */
6067 static void
6068 vsw_stop_rx_thread(vsw_ldc_t *ldcp)
6069 {
6070 	kt_did_t	tid = 0;
6071 	vsw_t		*vswp = ldcp->ldc_vswp;
6072 
6073 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
6074 	/*
6075 	 * Send a stop request by setting the stop flag and
6076 	 * wait until the receive thread stops.
6077 	 */
6078 	mutex_enter(&ldcp->rx_thr_lock);
6079 	if (ldcp->rx_thread != NULL) {
6080 		tid = ldcp->rx_thread->t_did;
6081 		ldcp->rx_thr_flags |= VSW_WTHR_STOP;
6082 		cv_signal(&ldcp->rx_thr_cv);
6083 	}
6084 	mutex_exit(&ldcp->rx_thr_lock);
6085 
6086 	if (tid != 0) {
6087 		thread_join(tid);
6088 	}
6089 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
6090 }
6091 
/*
 * vsw_ldc_tx_worker -- A per LDC worker thread to transmit data.
 * This thread is woken up by the vsw_portsend to transmit
 * packets.
 *
 * arg is the vsw_ldc_t of the channel being served. The thread loops
 * until VSW_WTHR_STOP is set in tx_thr_flags: it sleeps on tx_thr_cv
 * until the queue headed by tx_mhead is non-empty or a stop is
 * requested, then takes the whole queue and passes each mblk to
 * vsw_ldcsend(). On the way out it clears the stop flag and tx_thread,
 * then exits.
 */
static void
vsw_ldc_tx_worker(void *arg)
{
	callb_cpr_t	cprinfo;
	vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
	vsw_t *vswp = ldcp->ldc_vswp;
	mblk_t *mp;
	mblk_t *tmp;

	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
	/* register with the CPR framework, using tx_thr_lock as its lock */
	CALLB_CPR_INIT(&cprinfo, &ldcp->tx_thr_lock, callb_generic_cpr,
	    "vnet_tx_thread");
	mutex_enter(&ldcp->tx_thr_lock);
	while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP)) {

		/* the wait below is bracketed as a CPR safe region */
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		/*
		 * Wait until there are packets queued for transmit
		 * or a stop request is received.
		 */
		while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP) &&
		    (ldcp->tx_mhead == NULL)) {
			cv_wait(&ldcp->tx_thr_cv, &ldcp->tx_thr_lock);
		}
		CALLB_CPR_SAFE_END(&cprinfo, &ldcp->tx_thr_lock)

		/*
		 * First process the stop request.
		 */
		if (ldcp->tx_thr_flags & VSW_WTHR_STOP) {
			D2(vswp, "%s(%lld):tx thread stopped\n",
			    __func__, ldcp->ldc_id);
			break;
		}
		/*
		 * Detach the whole pending chain and reset the queue
		 * while tx_thr_lock is held, then drop the lock for the
		 * duration of the sends.
		 */
		mp = ldcp->tx_mhead;
		ldcp->tx_mhead = ldcp->tx_mtail = NULL;
		ldcp->tx_cnt = 0;
		mutex_exit(&ldcp->tx_thr_lock);
		D2(vswp, "%s(%lld):calling vsw_ldcsend\n",
		    __func__, ldcp->ldc_id);
		/* send the packets one by one, unlinking each from the chain */
		while (mp != NULL) {
			tmp = mp->b_next;
			mp->b_next = mp->b_prev = NULL;
			(void) vsw_ldcsend(ldcp, mp, vsw_ldc_tx_retries);
			mp = tmp;
		}
		/* re-take tx_thr_lock before the flags are examined again */
		mutex_enter(&ldcp->tx_thr_lock);
	}

	/*
	 * Update the run status and wakeup the thread that
	 * has sent the stop request.
	 *
	 * NOTE(review): tx_thr_lock is still held at this point and is
	 * not dropped explicitly; presumably CALLB_CPR_EXIT releases
	 * it -- confirm against the callb(9F) interfaces.
	 */
	ldcp->tx_thr_flags &= ~VSW_WTHR_STOP;
	ldcp->tx_thread = NULL;
	CALLB_CPR_EXIT(&cprinfo);
	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
	thread_exit();
}
6156 
6157 /* vsw_stop_tx_thread -- Co-ordinate with receive thread to stop it */
6158 static void
6159 vsw_stop_tx_thread(vsw_ldc_t *ldcp)
6160 {
6161 	kt_did_t	tid = 0;
6162 	vsw_t		*vswp = ldcp->ldc_vswp;
6163 
6164 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
6165 	/*
6166 	 * Send a stop request by setting the stop flag and
6167 	 * wait until the receive thread stops.
6168 	 */
6169 	mutex_enter(&ldcp->tx_thr_lock);
6170 	if (ldcp->tx_thread != NULL) {
6171 		tid = ldcp->tx_thread->t_did;
6172 		ldcp->tx_thr_flags |= VSW_WTHR_STOP;
6173 		cv_signal(&ldcp->tx_thr_cv);
6174 	}
6175 	mutex_exit(&ldcp->tx_thr_lock);
6176 
6177 	if (tid != 0) {
6178 		thread_join(tid);
6179 	}
6180 
6181 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
6182 }
6183 
6184 /* vsw_reclaim_dring -- reclaim descriptors */
6185 static int
6186 vsw_reclaim_dring(dring_info_t *dp, int start)
6187 {
6188 	int i, j, len;
6189 	vsw_private_desc_t *priv_addr;
6190 	vnet_public_desc_t *pub_addr;
6191 
6192 	pub_addr = (vnet_public_desc_t *)dp->pub_addr;
6193 	priv_addr = (vsw_private_desc_t *)dp->priv_addr;
6194 	len = dp->num_descriptors;
6195 
6196 	D2(NULL, "%s: start index %ld\n", __func__, start);
6197 
6198 	j = 0;
6199 	for (i = start; j < len; i = (i + 1) % len, j++) {
6200 		pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
6201 		priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
6202 
6203 		mutex_enter(&priv_addr->dstate_lock);
6204 		if (pub_addr->hdr.dstate != VIO_DESC_DONE) {
6205 			mutex_exit(&priv_addr->dstate_lock);
6206 			break;
6207 		}
6208 		pub_addr->hdr.dstate = VIO_DESC_FREE;
6209 		priv_addr->dstate = VIO_DESC_FREE;
6210 		/* clear all the fields */
6211 		priv_addr->datalen = 0;
6212 		pub_addr->hdr.ack = 0;
6213 		mutex_exit(&priv_addr->dstate_lock);
6214 
6215 		D3(NULL, "claiming descp:%d pub state:0x%llx priv state 0x%llx",
6216 		    i, pub_addr->hdr.dstate, priv_addr->dstate);
6217 	}
6218 	return (j);
6219 }
6220 
6221 /*
6222  * Debugging routines
6223  */
6224 static void
6225 display_state(void)
6226 {
6227 	vsw_t		*vswp;
6228 	vsw_port_list_t	*plist;
6229 	vsw_port_t 	*port;
6230 	vsw_ldc_list_t	*ldcl;
6231 	vsw_ldc_t 	*ldcp;
6232 	extern vsw_t 	*vsw_head;
6233 
6234 	cmn_err(CE_NOTE, "***** system state *****");
6235 
6236 	for (vswp = vsw_head; vswp; vswp = vswp->next) {
6237 		plist = &vswp->plist;
6238 		READ_ENTER(&plist->lockrw);
6239 		cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n",
6240 		    vswp->instance, plist->num_ports);
6241 
6242 		for (port = plist->head; port != NULL; port = port->p_next) {
6243 			ldcl = &port->p_ldclist;
6244 			cmn_err(CE_CONT, "port %d : %d ldcs attached\n",
6245 			    port->p_instance, port->num_ldcs);
6246 			READ_ENTER(&ldcl->lockrw);
6247 			ldcp = ldcl->head;
6248 			for (; ldcp != NULL; ldcp = ldcp->ldc_next) {
6249 				cmn_err(CE_CONT, "chan %lu : dev %d : "
6250 				    "status %d : phase %u\n",
6251 				    ldcp->ldc_id, ldcp->dev_class,
6252 				    ldcp->ldc_status, ldcp->hphase);
6253 				cmn_err(CE_CONT, "chan %lu : lsession %lu : "
6254 				    "psession %lu\n", ldcp->ldc_id,
6255 				    ldcp->local_session, ldcp->peer_session);
6256 
6257 				cmn_err(CE_CONT, "Inbound lane:\n");
6258 				display_lane(&ldcp->lane_in);
6259 				cmn_err(CE_CONT, "Outbound lane:\n");
6260 				display_lane(&ldcp->lane_out);
6261 			}
6262 			RW_EXIT(&ldcl->lockrw);
6263 		}
6264 		RW_EXIT(&plist->lockrw);
6265 	}
6266 	cmn_err(CE_NOTE, "***** system state *****");
6267 }
6268 
6269 static void
6270 display_lane(lane_t *lp)
6271 {
6272 	dring_info_t	*drp;
6273 
6274 	cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n",
6275 	    lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu);
6276 	cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n",
6277 	    lp->addr_type, lp->addr, lp->xfer_mode);
6278 	cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp);
6279 
6280 	cmn_err(CE_CONT, "Dring info:\n");
6281 	for (drp = lp->dringp; drp != NULL; drp = drp->next) {
6282 		cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n",
6283 		    drp->num_descriptors, drp->descriptor_size);
6284 		cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->handle);
6285 		cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n",
6286 		    (uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr);
6287 		cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n",
6288 		    drp->ident, drp->end_idx);
6289 		display_ring(drp);
6290 	}
6291 }
6292 
6293 static void
6294 display_ring(dring_info_t *dringp)
6295 {
6296 	uint64_t		i;
6297 	uint64_t		priv_count = 0;
6298 	uint64_t		pub_count = 0;
6299 	vnet_public_desc_t	*pub_addr = NULL;
6300 	vsw_private_desc_t	*priv_addr = NULL;
6301 
6302 	for (i = 0; i < vsw_ntxds; i++) {
6303 		if (dringp->pub_addr != NULL) {
6304 			pub_addr = (vnet_public_desc_t *)dringp->pub_addr + i;
6305 
6306 			if (pub_addr->hdr.dstate == VIO_DESC_FREE)
6307 				pub_count++;
6308 		}
6309 
6310 		if (dringp->priv_addr != NULL) {
6311 			priv_addr = (vsw_private_desc_t *)dringp->priv_addr + i;
6312 
6313 			if (priv_addr->dstate == VIO_DESC_FREE)
6314 				priv_count++;
6315 		}
6316 	}
6317 	cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n",
6318 	    i, priv_count, pub_count);
6319 }
6320 
6321 static void
6322 dump_flags(uint64_t state)
6323 {
6324 	int	i;
6325 
6326 	typedef struct flag_name {
6327 		int	flag_val;
6328 		char	*flag_name;
6329 	} flag_name_t;
6330 
6331 	flag_name_t	flags[] = {
6332 		VSW_VER_INFO_SENT, "VSW_VER_INFO_SENT",
6333 		VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV",
6334 		VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV",
6335 		VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT",
6336 		VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV",
6337 		VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT",
6338 		VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT",
6339 		VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV",
6340 		VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT",
6341 		VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV",
6342 		VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT",
6343 		VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV",
6344 		VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT",
6345 		VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV",
6346 		VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT",
6347 		VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV",
6348 		VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT",
6349 		VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV",
6350 		VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT",
6351 		VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV",
6352 		VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT",
6353 		VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV",
6354 		VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT",
6355 		VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV",
6356 		VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT",
6357 		VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV",
6358 		VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT",
6359 		VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV",
6360 		VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT",
6361 		VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV",
6362 		VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE"};
6363 
6364 	DERR(NULL, "DUMP_FLAGS: %llx\n", state);
6365 	for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) {
6366 		if (state & flags[i].flag_val)
6367 			DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name);
6368 	}
6369 }
6370