xref: /titanic_52/usr/src/uts/sun4v/io/vsw_ldc.c (revision 85bcc4e57d6d451b2647973b01b8ab11c489351a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/errno.h>
29 #include <sys/debug.h>
30 #include <sys/time.h>
31 #include <sys/sysmacros.h>
32 #include <sys/systm.h>
33 #include <sys/user.h>
34 #include <sys/stropts.h>
35 #include <sys/stream.h>
36 #include <sys/strlog.h>
37 #include <sys/strsubr.h>
38 #include <sys/cmn_err.h>
39 #include <sys/cpu.h>
40 #include <sys/kmem.h>
41 #include <sys/conf.h>
42 #include <sys/ddi.h>
43 #include <sys/sunddi.h>
44 #include <sys/ksynch.h>
45 #include <sys/stat.h>
46 #include <sys/kstat.h>
47 #include <sys/vtrace.h>
48 #include <sys/strsun.h>
49 #include <sys/dlpi.h>
50 #include <sys/ethernet.h>
51 #include <net/if.h>
52 #include <sys/varargs.h>
53 #include <sys/machsystm.h>
54 #include <sys/modctl.h>
55 #include <sys/modhash.h>
56 #include <sys/mac.h>
57 #include <sys/mac_ether.h>
58 #include <sys/taskq.h>
59 #include <sys/note.h>
60 #include <sys/mach_descrip.h>
61 #include <sys/mdeg.h>
62 #include <sys/ldc.h>
63 #include <sys/vsw_fdb.h>
64 #include <sys/vsw.h>
65 #include <sys/vio_mailbox.h>
66 #include <sys/vnet_mailbox.h>
67 #include <sys/vnet_common.h>
68 #include <sys/vio_util.h>
69 #include <sys/sdt.h>
70 #include <sys/atomic.h>
71 #include <sys/callb.h>
72 #include <sys/vlan.h>
73 
74 /* Port add/deletion/etc routines */
75 static	void vsw_port_delete(vsw_port_t *port);
76 static	int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id);
77 static	void vsw_ldc_detach(vsw_ldc_t *ldcp);
78 static	int vsw_ldc_init(vsw_ldc_t *ldcp);
79 static	void vsw_ldc_uninit(vsw_ldc_t *ldcp);
80 static	void vsw_ldc_drain(vsw_ldc_t *ldcp);
81 static	void vsw_drain_port_taskq(vsw_port_t *port);
82 static	void vsw_marker_task(void *);
83 static	int vsw_plist_del_node(vsw_t *, vsw_port_t *port);
84 void vsw_detach_ports(vsw_t *vswp);
85 int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
86 mcst_addr_t *vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr);
87 int vsw_port_detach(vsw_t *vswp, int p_instance);
88 int vsw_portsend(vsw_port_t *port, mblk_t *mp);
89 int vsw_port_attach(vsw_port_t *portp);
90 vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);
91 void vsw_vlan_unaware_port_reset(vsw_port_t *portp);
92 void vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate);
93 void vsw_reset_ports(vsw_t *vswp);
94 void vsw_port_reset(vsw_port_t *portp);
95 void vsw_physlink_update_ports(vsw_t *vswp);
96 static	void vsw_port_physlink_update(vsw_port_t *portp);
97 
98 /* Interrupt routines */
99 static	uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg);
100 
101 /* Handshake routines */
102 static	void vsw_ldc_reinit(vsw_ldc_t *);
103 static	void vsw_conn_task(void *);
104 static	int vsw_check_flag(vsw_ldc_t *, int, uint64_t);
105 static	void vsw_next_milestone(vsw_ldc_t *);
106 static	int vsw_supported_version(vio_ver_msg_t *);
107 static	void vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp);
108 static	void vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp);
109 void vsw_process_conn_evt(vsw_ldc_t *, uint16_t);
110 
111 /* Data processing routines */
112 void vsw_process_pkt(void *);
113 static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t *, int);
114 static void vsw_process_ctrl_pkt(void *);
115 static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *);
116 static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *);
117 static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *);
118 static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *);
119 static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *);
120 static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *);
121 static void vsw_process_physlink_msg(vsw_ldc_t *, void *);
122 static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *,
123 	uint32_t);
124 static void vsw_process_pkt_data_nop(void *, void *, uint32_t);
125 static void vsw_process_pkt_data(void *, void *, uint32_t);
126 static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *);
127 static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *);
128 static void vsw_process_evt_read(vsw_ldc_t *ldcp);
129 static void vsw_ldc_rcv(vsw_ldc_t *ldcp);
130 
131 /* Switching/data transmit routines */
132 static	int vsw_descrsend(vsw_ldc_t *, mblk_t *);
133 static void vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp);
134 static int vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries);
135 static int vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count);
136 static int vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count);
137 
138 /* Packet creation routines */
139 static void vsw_send_ver(void *);
140 static void vsw_send_attr(vsw_ldc_t *);
141 static void vsw_send_dring_info(vsw_ldc_t *);
142 static void vsw_send_rdx(vsw_ldc_t *);
143 static void vsw_send_physlink_msg(vsw_ldc_t *ldcp, link_state_t plink_state);
144 
145 /* Dring routines */
146 static void vsw_create_privring(vsw_ldc_t *);
147 static dring_info_t *vsw_map_dring(vsw_ldc_t *ldcp, void *pkt);
148 static void vsw_unmap_dring(vsw_ldc_t *ldcp);
149 static void vsw_destroy_dring(vsw_ldc_t *ldcp);
150 static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t);
151 static int vsw_map_data(vsw_ldc_t *ldcp, dring_info_t *dp, void *pkt);
152 static void vsw_set_lane_attr(vsw_t *, lane_t *);
153 dring_info_t *vsw_map_dring_cmn(vsw_ldc_t *ldcp,
154     vio_dring_reg_msg_t *dring_pkt);
155 
156 /* tx/msg/rcv thread routines */
157 static void vsw_stop_tx_thread(vsw_ldc_t *ldcp);
158 static void vsw_ldc_tx_worker(void *arg);
159 
160 /* Misc support routines */
161 static void vsw_save_lmacaddr(vsw_t *vswp, uint64_t macaddr);
162 static int vsw_get_same_dest_list(struct ether_header *ehp,
163     mblk_t **rhead, mblk_t **rtail, mblk_t **mpp);
164 static mblk_t *vsw_dupmsgchain(mblk_t *mp);
165 
166 /* Debugging routines */
167 static void dump_flags(uint64_t);
168 static void display_state(void);
169 static void display_lane(lane_t *);
170 static void display_ring(dring_info_t *);
171 
172 /*
173  * Functions imported from other files.
174  */
175 extern int vsw_set_hw(vsw_t *, vsw_port_t *, int);
176 extern void vsw_unset_hw(vsw_t *, vsw_port_t *, int);
177 extern int vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port);
178 extern void vsw_del_mcst_port(vsw_port_t *port);
179 extern int vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
180 extern int vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
181 extern void vsw_fdbe_add(vsw_t *vswp, void *port);
182 extern void vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr);
183 extern void vsw_create_vlans(void *arg, int type);
184 extern void vsw_destroy_vlans(void *arg, int type);
185 extern void vsw_vlan_add_ids(void *arg, int type);
186 extern void vsw_vlan_remove_ids(void *arg, int type);
187 extern boolean_t vsw_frame_lookup_vid(void *arg, int caller,
188 	struct ether_header *ehp, uint16_t *vidp);
189 extern mblk_t *vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp);
190 extern uint32_t vsw_vlan_frame_untag(void *arg, int type, mblk_t **np,
191 	mblk_t **npt);
192 extern boolean_t vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid);
193 extern void vsw_hio_start(vsw_t *vswp, vsw_ldc_t *ldcp);
194 extern void vsw_hio_stop(vsw_t *vswp, vsw_ldc_t *ldcp);
195 extern void vsw_process_dds_msg(vsw_t *vswp, vsw_ldc_t *ldcp, void *msg);
196 extern void vsw_hio_stop_port(vsw_port_t *portp);
197 extern void vsw_publish_macaddr(vsw_t *vswp, vsw_port_t *portp);
198 extern int vsw_mac_client_init(vsw_t *vswp, vsw_port_t *port, int type);
199 extern void vsw_mac_client_cleanup(vsw_t *vswp, vsw_port_t *port, int type);
200 extern void vsw_destroy_rxpools(void *arg);
201 extern void vsw_stop_msg_thread(vsw_ldc_t *ldcp);
202 extern int vsw_send_msg(vsw_ldc_t *, void *, int, boolean_t);
203 extern int vsw_dringsend(vsw_ldc_t *, mblk_t *);
204 extern int vsw_reclaim_dring(dring_info_t *dp, int start);
205 extern int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **,
206     int *);
207 extern vio_dring_reg_msg_t *vsw_create_tx_dring_info(vsw_ldc_t *);
208 extern int vsw_setup_tx_dring(vsw_ldc_t *ldcp, dring_info_t *dp);
209 extern void vsw_destroy_tx_dring(vsw_ldc_t *ldcp);
210 extern dring_info_t *vsw_map_rx_dring(vsw_ldc_t *ldcp, void *pkt);
211 extern void vsw_unmap_rx_dring(vsw_ldc_t *ldcp);
212 extern void vsw_ldc_msg_worker(void *arg);
213 extern void vsw_process_dringdata(void *, void *);
214 extern vio_dring_reg_msg_t *vsw_create_rx_dring_info(vsw_ldc_t *);
215 extern void vsw_destroy_rx_dring(vsw_ldc_t *ldcp);
216 extern dring_info_t *vsw_map_tx_dring(vsw_ldc_t *ldcp, void *pkt);
217 extern void vsw_unmap_tx_dring(vsw_ldc_t *ldcp);
218 extern void vsw_ldc_rcv_worker(void *arg);
219 extern void vsw_stop_rcv_thread(vsw_ldc_t *ldcp);
220 extern int vsw_dringsend_shm(vsw_ldc_t *, mblk_t *);
221 extern void vsw_process_dringdata_shm(void *, void *);
222 
223 /*
224  * Tunables used in this file.
225  */
226 extern int vsw_num_handshakes;
227 extern int vsw_ldc_tx_delay;
228 extern int vsw_ldc_tx_retries;
229 extern int vsw_ldc_retries;
230 extern int vsw_ldc_delay;
231 extern boolean_t vsw_ldc_rxthr_enabled;
232 extern boolean_t vsw_ldc_txthr_enabled;
233 extern uint32_t vsw_num_descriptors;
234 extern uint8_t  vsw_dring_mode;
235 extern uint32_t vsw_max_tx_qcount;
236 extern boolean_t vsw_obp_ver_proto_workaround;
237 extern uint32_t vsw_publish_macaddr_count;
238 
/*
 * Acquire / release all three per-channel locks in one go.
 *
 * The locks are always taken in the order cblock -> rxlock -> txlock and
 * dropped in the reverse order. Each macro expands to exactly one
 * statement (do/while(0) idiom), so it can safely be used as the body of
 * an unbraced if/else; the caller supplies the trailing semicolon.
 */
#define	LDC_ENTER_LOCK(ldcp)	\
	do {	\
		mutex_enter(&((ldcp)->ldc_cblock));	\
		mutex_enter(&((ldcp)->ldc_rxlock));	\
		mutex_enter(&((ldcp)->ldc_txlock));	\
	_NOTE(CONSTCOND) } while (0)
#define	LDC_EXIT_LOCK(ldcp)	\
	do {	\
		mutex_exit(&((ldcp)->ldc_txlock));	\
		mutex_exit(&((ldcp)->ldc_rxlock));	\
		mutex_exit(&((ldcp)->ldc_cblock));	\
	_NOTE(CONSTCOND) } while (0)
247 
/* True when the outbound lane's negotiated version is exactly major.minor */
#define	VSW_VER_EQ(ldcp, major, minor)	\
	(!((ldcp)->lane_out.ver_major != (major) ||	\
	    (ldcp)->lane_out.ver_minor != (minor)))
251 
/*
 * True when the outbound lane's negotiated version is strictly lower than
 * major.minor: the minor numbers only matter when the majors are equal.
 */
#define	VSW_VER_LT(ldcp, major, minor)	\
	((ldcp)->lane_out.ver_major == (major) ?	\
	    ((ldcp)->lane_out.ver_minor < (minor)) :	\
	    ((ldcp)->lane_out.ver_major < (major)))
256 
/*
 * True when the outbound lane's negotiated version is major.minor or
 * higher: the minor numbers only matter when the majors are equal.
 */
#define	VSW_VER_GTEQ(ldcp, major, minor)	\
	((ldcp)->lane_out.ver_major == (major) ?	\
	    ((ldcp)->lane_out.ver_minor >= (minor)) :	\
	    ((ldcp)->lane_out.ver_major > (major)))
261 
/*
 * True when the outbound lane's negotiated version is major.minor or
 * lower: the minor numbers only matter when the majors are equal.
 */
#define	VSW_VER_LTEQ(ldcp, major, minor)	\
	((ldcp)->lane_out.ver_major == (major) ?	\
	    ((ldcp)->lane_out.ver_minor <= (minor)) :	\
	    ((ldcp)->lane_out.ver_major < (major)))
266 
/*
 * VIO Protocol Version Info:
 *
 * The version specified below represents the version of protocol currently
 * supported in the driver. It means the driver can negotiate with peers with
 * versions <= this version. Here is a summary of the feature(s) that are
 * supported at each version of the protocol:
 *
 * 1.0			Basic VIO protocol.
 * 1.1			vDisk protocol update (no virtual network update).
 * 1.2			Support for priority frames (priority-ether-types).
 * 1.3			VLAN and HybridIO support.
 * 1.4			Jumbo Frame support.
 * 1.5			Link State Notification support with optional support
 *			for Physical Link information.
 * 1.6			Support for RxDringData mode.
 *
 * The table currently holds a single entry, {1, 6} (presumably
 * {major, minor}; confirm against the definition of ver_sup_t).
 */
static	ver_sup_t	vsw_versions[] = { {1, 6} };
285 
/*
 * For the moment the state dump routines have their own
 * private flag: set DUMP_STATE to a non-zero value at compile time to
 * turn the DUMP_ and DISPLAY_ macros below into real debug output. With
 * DUMP_STATE set to 0 (the default) they all expand to nothing.
 */
#define	DUMP_STATE	0

#if DUMP_STATE

/*
 * Print the msgtype, subtype and subtype_env fields of a tag structure
 * (passed by value) through D1().
 *
 * NOTE(review): this expands to a bare brace block, so "DUMP_TAG(t);"
 * inside an unbraced if/else would not parse as intended - confirm the
 * use sites before enabling DUMP_STATE. The "%llx" conversions assume
 * 64-bit arguments; verify against the field types of the tag structure.
 */
#define	DUMP_TAG(tag) \
{			\
	D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype); \
	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype);	\
	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env);	\
}

/* Same as DUMP_TAG(), but takes a pointer to the tag structure */
#define	DUMP_TAG_PTR(tag) \
{			\
	D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype); \
	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype);	\
	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env);	\
}

/*
 * Thin wrappers around dump_flags() and display_state().
 *
 * NOTE(review): DUMP_FLAGS already carries its own trailing semicolon
 * whereas DISPLAY_STATE does not.
 */
#define	DUMP_FLAGS(flags) dump_flags(flags);
#define	DISPLAY_STATE()	display_state()

#else

/* Debug dumping disabled: every macro compiles away to nothing */
#define	DUMP_TAG(tag)
#define	DUMP_TAG_PTR(tag)
#define	DUMP_FLAGS(state)
#define	DISPLAY_STATE()

#endif	/* DUMP_STATE */
319 
320 /*
321  * Attach the specified port.
322  *
323  * Returns 0 on success, 1 on failure.
324  */
325 int
326 vsw_port_attach(vsw_port_t *port)
327 {
328 	vsw_t			*vswp = port->p_vswp;
329 	vsw_port_list_t		*plist = &vswp->plist;
330 	vsw_port_t		*p, **pp;
331 	int			nids = port->num_ldcs;
332 	uint64_t		*ldcids;
333 	int			rv;
334 
335 	D1(vswp, "%s: enter : port %d", __func__, port->p_instance);
336 
337 	/* port already exists? */
338 	READ_ENTER(&plist->lockrw);
339 	for (p = plist->head; p != NULL; p = p->p_next) {
340 		if (p->p_instance == port->p_instance) {
341 			DWARN(vswp, "%s: port instance %d already attached",
342 			    __func__, p->p_instance);
343 			RW_EXIT(&plist->lockrw);
344 			return (1);
345 		}
346 	}
347 	RW_EXIT(&plist->lockrw);
348 
349 	mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL);
350 	mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL);
351 	rw_init(&port->maccl_rwlock, NULL, RW_DRIVER, NULL);
352 
353 	mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL);
354 	cv_init(&port->state_cv, NULL, CV_DRIVER, NULL);
355 	port->state = VSW_PORT_INIT;
356 
357 	D2(vswp, "%s: %d nids", __func__, nids);
358 	ldcids = port->ldc_ids;
359 	D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[0]);
360 	if (vsw_ldc_attach(port, (uint64_t)ldcids[0]) != 0) {
361 		DERR(vswp, "%s: ldc_attach failed", __func__);
362 		goto exit_error;
363 	}
364 
365 	if (vswp->switching_setup_done == B_TRUE) {
366 		/*
367 		 * If the underlying network device has been setup,
368 		 * then open a mac client and porgram the mac address
369 		 * for this port.
370 		 */
371 		rv = vsw_mac_client_init(vswp, port, VSW_VNETPORT);
372 		if (rv != 0) {
373 			goto exit_error;
374 		}
375 	}
376 
377 	/* create the fdb entry for this port/mac address */
378 	vsw_fdbe_add(vswp, port);
379 
380 	vsw_create_vlans(port, VSW_VNETPORT);
381 
382 	WRITE_ENTER(&plist->lockrw);
383 
384 	/* link it into the list of ports for this vsw instance */
385 	pp = (vsw_port_t **)(&plist->head);
386 	port->p_next = *pp;
387 	*pp = port;
388 	plist->num_ports++;
389 
390 	RW_EXIT(&plist->lockrw);
391 
392 	/*
393 	 * Initialise the port and any ldc's under it.
394 	 */
395 	(void) vsw_ldc_init(port->ldcp);
396 
397 	/* announce macaddr of vnet to the physical switch */
398 	if (vsw_publish_macaddr_count != 0) {	/* enabled */
399 		vsw_publish_macaddr(vswp, port);
400 	}
401 
402 	D1(vswp, "%s: exit", __func__);
403 	return (0);
404 
405 exit_error:
406 
407 	cv_destroy(&port->state_cv);
408 	mutex_destroy(&port->state_lock);
409 
410 	rw_destroy(&port->maccl_rwlock);
411 	mutex_destroy(&port->tx_lock);
412 	mutex_destroy(&port->mca_lock);
413 	kmem_free(port, sizeof (vsw_port_t));
414 	return (1);
415 }
416 
417 /*
418  * Detach the specified port.
419  *
420  * Returns 0 on success, 1 on failure.
421  */
422 int
423 vsw_port_detach(vsw_t *vswp, int p_instance)
424 {
425 	vsw_port_t	*port = NULL;
426 	vsw_port_list_t	*plist = &vswp->plist;
427 
428 	D1(vswp, "%s: enter: port id %d", __func__, p_instance);
429 
430 	WRITE_ENTER(&plist->lockrw);
431 
432 	if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) {
433 		RW_EXIT(&plist->lockrw);
434 		return (1);
435 	}
436 
437 	if (vsw_plist_del_node(vswp, port)) {
438 		RW_EXIT(&plist->lockrw);
439 		return (1);
440 	}
441 
442 	/* cleanup any HybridIO for this port */
443 	vsw_hio_stop_port(port);
444 
445 	/*
446 	 * No longer need to hold writer lock on port list now
447 	 * that we have unlinked the target port from the list.
448 	 */
449 	RW_EXIT(&plist->lockrw);
450 
451 	/* Cleanup and close the mac client */
452 	vsw_mac_client_cleanup(vswp, port, VSW_VNETPORT);
453 
454 	/* Remove the fdb entry for this port/mac address */
455 	vsw_fdbe_del(vswp, &(port->p_macaddr));
456 	vsw_destroy_vlans(port, VSW_VNETPORT);
457 
458 	/* Remove any multicast addresses.. */
459 	vsw_del_mcst_port(port);
460 
461 	vsw_port_delete(port);
462 
463 	D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance);
464 	return (0);
465 }
466 
467 /*
468  * Detach all active ports.
469  */
470 void
471 vsw_detach_ports(vsw_t *vswp)
472 {
473 	vsw_port_list_t 	*plist = &vswp->plist;
474 	vsw_port_t		*port = NULL;
475 
476 	D1(vswp, "%s: enter", __func__);
477 
478 	WRITE_ENTER(&plist->lockrw);
479 
480 	while ((port = plist->head) != NULL) {
481 		(void) vsw_plist_del_node(vswp, port);
482 
483 		/* cleanup any HybridIO for this port */
484 		vsw_hio_stop_port(port);
485 
486 		/* Cleanup and close the mac client */
487 		vsw_mac_client_cleanup(vswp, port, VSW_VNETPORT);
488 
489 		/* Remove the fdb entry for this port/mac address */
490 		vsw_fdbe_del(vswp, &(port->p_macaddr));
491 		vsw_destroy_vlans(port, VSW_VNETPORT);
492 
493 		/* Remove any multicast addresses.. */
494 		vsw_del_mcst_port(port);
495 
496 		/*
497 		 * No longer need to hold the lock on the port list
498 		 * now that we have unlinked the target port from the
499 		 * list.
500 		 */
501 		RW_EXIT(&plist->lockrw);
502 		vsw_port_delete(port);
503 		WRITE_ENTER(&plist->lockrw);
504 	}
505 	RW_EXIT(&plist->lockrw);
506 
507 	D1(vswp, "%s: exit", __func__);
508 }
509 
510 /*
511  * Delete the specified port.
512  */
513 static void
514 vsw_port_delete(vsw_port_t *port)
515 {
516 	vsw_t			*vswp = port->p_vswp;
517 
518 	D1(vswp, "%s: enter : port id %d", __func__, port->p_instance);
519 
520 	vsw_ldc_uninit(port->ldcp);
521 
522 	/*
523 	 * Wait for any pending ctrl msg tasks which reference this
524 	 * port to finish.
525 	 */
526 	vsw_drain_port_taskq(port);
527 
528 	/*
529 	 * Wait for any active callbacks to finish
530 	 */
531 	vsw_ldc_drain(port->ldcp);
532 
533 	vsw_ldc_detach(port->ldcp);
534 
535 	rw_destroy(&port->maccl_rwlock);
536 	mutex_destroy(&port->mca_lock);
537 	mutex_destroy(&port->tx_lock);
538 
539 	cv_destroy(&port->state_cv);
540 	mutex_destroy(&port->state_lock);
541 
542 	if (port->num_ldcs != 0) {
543 		kmem_free(port->ldc_ids, port->num_ldcs * sizeof (uint64_t));
544 		port->num_ldcs = 0;
545 	}
546 
547 	if (port->nvids != 0) {
548 		kmem_free(port->vids, sizeof (vsw_vlanid_t) * port->nvids);
549 	}
550 
551 	kmem_free(port, sizeof (vsw_port_t));
552 
553 	D1(vswp, "%s: exit", __func__);
554 }
555 
556 /*
557  * Attach a logical domain channel (ldc) under a specified port.
558  *
559  * Returns 0 on success, 1 on failure.
560  */
561 static int
562 vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id)
563 {
564 	vsw_t 		*vswp = port->p_vswp;
565 	vsw_ldc_t 	*ldcp = NULL;
566 	ldc_attr_t 	attr;
567 	ldc_status_t	istatus;
568 	int 		status = DDI_FAILURE;
569 	char		kname[MAXNAMELEN];
570 	enum		{ PROG_init = 0x0,
571 			    PROG_callback = 0x1,
572 			    PROG_tx_thread = 0x2}
573 			progress;
574 
575 	progress = PROG_init;
576 
577 	D1(vswp, "%s: enter", __func__);
578 
579 	ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP);
580 	if (ldcp == NULL) {
581 		DERR(vswp, "%s: kmem_zalloc failed", __func__);
582 		return (1);
583 	}
584 	ldcp->ldc_id = ldc_id;
585 
586 	mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL);
587 	mutex_init(&ldcp->ldc_rxlock, NULL, MUTEX_DRIVER, NULL);
588 	mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL);
589 	ldcp->msg_thr_flags = 0;
590 	mutex_init(&ldcp->msg_thr_lock, NULL, MUTEX_DRIVER, NULL);
591 	cv_init(&ldcp->msg_thr_cv, NULL, CV_DRIVER, NULL);
592 	ldcp->rcv_thr_flags = 0;
593 	mutex_init(&ldcp->rcv_thr_lock, NULL, MUTEX_DRIVER, NULL);
594 	cv_init(&ldcp->rcv_thr_cv, NULL, CV_DRIVER, NULL);
595 	mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL);
596 	cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL);
597 
598 	/* required for handshake with peer */
599 	ldcp->local_session = (uint64_t)ddi_get_lbolt();
600 	ldcp->peer_session = 0;
601 	ldcp->session_status = 0;
602 	ldcp->hss_id = 1;	/* Initial handshake session id */
603 	ldcp->hphase = VSW_MILESTONE0;
604 
605 	(void) atomic_swap_32(&port->p_hio_capable, B_FALSE);
606 
607 	/* only set for outbound lane, inbound set by peer */
608 	vsw_set_lane_attr(vswp, &ldcp->lane_out);
609 
610 	attr.devclass = LDC_DEV_NT_SVC;
611 	attr.instance = ddi_get_instance(vswp->dip);
612 	attr.mode = LDC_MODE_UNRELIABLE;
613 	attr.mtu = VSW_LDC_MTU;
614 	status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle);
615 	if (status != 0) {
616 		DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)",
617 		    __func__, ldc_id, status);
618 		goto ldc_attach_fail;
619 	}
620 
621 	if (vsw_ldc_txthr_enabled) {
622 		ldcp->tx_thr_flags = 0;
623 		ldcp->tx_mhead = ldcp->tx_mtail = NULL;
624 
625 		mutex_init(&ldcp->tx_thr_lock, NULL, MUTEX_DRIVER, NULL);
626 		cv_init(&ldcp->tx_thr_cv, NULL, CV_DRIVER, NULL);
627 		ldcp->tx_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
628 		    vsw_ldc_tx_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
629 
630 		progress |= PROG_tx_thread;
631 		if (ldcp->tx_thread == NULL) {
632 			DWARN(vswp, "%s(%lld): Failed to create worker thread",
633 			    __func__, ldc_id);
634 			goto ldc_attach_fail;
635 		}
636 	}
637 
638 	status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp);
639 	if (status != 0) {
640 		DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)",
641 		    __func__, ldc_id, status);
642 		(void) ldc_fini(ldcp->ldc_handle);
643 		goto ldc_attach_fail;
644 	}
645 	/*
646 	 * allocate a message for ldc_read()s, big enough to hold ctrl and
647 	 * data msgs, including raw data msgs used to recv priority frames.
648 	 */
649 	ldcp->msglen = VIO_PKT_DATA_HDRSIZE + vswp->max_frame_size;
650 	ldcp->ldcmsg = kmem_alloc(ldcp->msglen, KM_SLEEP);
651 
652 	progress |= PROG_callback;
653 
654 	mutex_init(&ldcp->status_lock, NULL, MUTEX_DRIVER, NULL);
655 
656 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
657 		DERR(vswp, "%s: ldc_status failed", __func__);
658 		mutex_destroy(&ldcp->status_lock);
659 		goto ldc_attach_fail;
660 	}
661 
662 	ldcp->ldc_status = istatus;
663 	ldcp->ldc_port = port;
664 	ldcp->ldc_vswp = vswp;
665 
666 	vsw_reset_vnet_proto_ops(ldcp);
667 
668 	(void) sprintf(kname, "%sldc0x%lx", DRV_NAME, ldcp->ldc_id);
669 	ldcp->ksp = vgen_setup_kstats(DRV_NAME, vswp->instance,
670 	    kname, &ldcp->ldc_stats);
671 	if (ldcp->ksp == NULL) {
672 		DERR(vswp, "%s: kstats setup failed", __func__);
673 		goto ldc_attach_fail;
674 	}
675 
676 	/* link it into this port */
677 	port->ldcp = ldcp;
678 
679 	D1(vswp, "%s: exit", __func__);
680 	return (0);
681 
682 ldc_attach_fail:
683 
684 	if (progress & PROG_callback) {
685 		(void) ldc_unreg_callback(ldcp->ldc_handle);
686 		kmem_free(ldcp->ldcmsg, ldcp->msglen);
687 	}
688 
689 	if (progress & PROG_tx_thread) {
690 		if (ldcp->tx_thread != NULL) {
691 			vsw_stop_tx_thread(ldcp);
692 		}
693 		mutex_destroy(&ldcp->tx_thr_lock);
694 		cv_destroy(&ldcp->tx_thr_cv);
695 	}
696 	if (ldcp->ksp != NULL) {
697 		vgen_destroy_kstats(ldcp->ksp);
698 	}
699 	mutex_destroy(&ldcp->msg_thr_lock);
700 	mutex_destroy(&ldcp->rcv_thr_lock);
701 	mutex_destroy(&ldcp->ldc_txlock);
702 	mutex_destroy(&ldcp->ldc_rxlock);
703 	mutex_destroy(&ldcp->ldc_cblock);
704 	mutex_destroy(&ldcp->drain_cv_lock);
705 	cv_destroy(&ldcp->msg_thr_cv);
706 	cv_destroy(&ldcp->rcv_thr_cv);
707 	cv_destroy(&ldcp->drain_cv);
708 
709 	kmem_free(ldcp, sizeof (vsw_ldc_t));
710 
711 	return (1);
712 }
713 
714 /*
715  * Detach a logical domain channel (ldc) belonging to a
716  * particular port.
717  */
718 static void
719 vsw_ldc_detach(vsw_ldc_t *ldcp)
720 {
721 	int 		rv;
722 	vsw_t 		*vswp = ldcp->ldc_port->p_vswp;
723 	int		retries = 0;
724 
725 	D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id);
726 
727 	/* Stop msg/rcv thread */
728 	if (ldcp->rcv_thread != NULL) {
729 		vsw_stop_rcv_thread(ldcp);
730 	} else if (ldcp->msg_thread != NULL) {
731 		vsw_stop_msg_thread(ldcp);
732 	}
733 	kmem_free(ldcp->ldcmsg, ldcp->msglen);
734 
735 	/* Stop the tx thread */
736 	if (ldcp->tx_thread != NULL) {
737 		vsw_stop_tx_thread(ldcp);
738 		mutex_destroy(&ldcp->tx_thr_lock);
739 		cv_destroy(&ldcp->tx_thr_cv);
740 		if (ldcp->tx_mhead != NULL) {
741 			freemsgchain(ldcp->tx_mhead);
742 			ldcp->tx_mhead = ldcp->tx_mtail = NULL;
743 			ldcp->tx_cnt = 0;
744 		}
745 	}
746 
747 	/* Destory kstats */
748 	vgen_destroy_kstats(ldcp->ksp);
749 
750 	/*
751 	 * Before we can close the channel we must release any mapped
752 	 * resources (e.g. drings).
753 	 */
754 	vsw_free_lane_resources(ldcp, INBOUND);
755 	vsw_free_lane_resources(ldcp, OUTBOUND);
756 
757 	/*
758 	 * Close the channel, retry on EAAGIN.
759 	 */
760 	while ((rv = ldc_close(ldcp->ldc_handle)) == EAGAIN) {
761 		if (++retries > vsw_ldc_retries) {
762 			break;
763 		}
764 		drv_usecwait(vsw_ldc_delay);
765 	}
766 	if (rv != 0) {
767 		cmn_err(CE_NOTE,
768 		    "!vsw%d: Error(%d) closing the channel(0x%lx)\n",
769 		    vswp->instance, rv, ldcp->ldc_id);
770 	}
771 
772 	(void) ldc_fini(ldcp->ldc_handle);
773 
774 	ldcp->ldc_status = LDC_INIT;
775 	ldcp->ldc_handle = NULL;
776 	ldcp->ldc_vswp = NULL;
777 
778 	mutex_destroy(&ldcp->msg_thr_lock);
779 	mutex_destroy(&ldcp->rcv_thr_lock);
780 	mutex_destroy(&ldcp->ldc_txlock);
781 	mutex_destroy(&ldcp->ldc_rxlock);
782 	mutex_destroy(&ldcp->ldc_cblock);
783 	mutex_destroy(&ldcp->drain_cv_lock);
784 	mutex_destroy(&ldcp->status_lock);
785 	cv_destroy(&ldcp->msg_thr_cv);
786 	cv_destroy(&ldcp->rcv_thr_cv);
787 	cv_destroy(&ldcp->drain_cv);
788 
789 	kmem_free(ldcp, sizeof (vsw_ldc_t));
790 }
791 
792 /*
793  * Open and attempt to bring up the channel. Note that channel
794  * can only be brought up if peer has also opened channel.
795  *
796  * Returns 0 if can open and bring up channel, otherwise
797  * returns 1.
798  */
799 static int
800 vsw_ldc_init(vsw_ldc_t *ldcp)
801 {
802 	vsw_t 		*vswp = ldcp->ldc_vswp;
803 	ldc_status_t	istatus = 0;
804 	int		rv;
805 
806 	D1(vswp, "%s: enter", __func__);
807 
808 	LDC_ENTER_LOCK(ldcp);
809 
810 	/* don't start at 0 in case clients don't like that */
811 	ldcp->next_ident = 1;
812 
813 	rv = ldc_open(ldcp->ldc_handle);
814 	if (rv != 0) {
815 		DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)",
816 		    __func__, ldcp->ldc_id, rv);
817 		LDC_EXIT_LOCK(ldcp);
818 		return (1);
819 	}
820 
821 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
822 		DERR(vswp, "%s: unable to get status", __func__);
823 		LDC_EXIT_LOCK(ldcp);
824 		return (1);
825 
826 	} else if (istatus != LDC_OPEN && istatus != LDC_READY) {
827 		DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY",
828 		    __func__, ldcp->ldc_id, istatus);
829 		LDC_EXIT_LOCK(ldcp);
830 		return (1);
831 	}
832 
833 	mutex_enter(&ldcp->status_lock);
834 	ldcp->ldc_status = istatus;
835 	mutex_exit(&ldcp->status_lock);
836 
837 	rv = ldc_up(ldcp->ldc_handle);
838 	if (rv != 0) {
839 		/*
840 		 * Not a fatal error for ldc_up() to fail, as peer
841 		 * end point may simply not be ready yet.
842 		 */
843 		D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__,
844 		    ldcp->ldc_id, rv);
845 		LDC_EXIT_LOCK(ldcp);
846 		return (1);
847 	}
848 
849 	/*
850 	 * ldc_up() call is non-blocking so need to explicitly
851 	 * check channel status to see if in fact the channel
852 	 * is UP.
853 	 */
854 	mutex_enter(&ldcp->status_lock);
855 	if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) {
856 		DERR(vswp, "%s: unable to get status", __func__);
857 		mutex_exit(&ldcp->status_lock);
858 		LDC_EXIT_LOCK(ldcp);
859 		return (1);
860 
861 	}
862 
863 	if (ldcp->ldc_status == LDC_UP) {
864 		D2(vswp, "%s: channel %ld now UP (%ld)", __func__,
865 		    ldcp->ldc_id, istatus);
866 		mutex_exit(&ldcp->status_lock);
867 		LDC_EXIT_LOCK(ldcp);
868 
869 		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
870 		return (0);
871 	}
872 
873 	mutex_exit(&ldcp->status_lock);
874 	LDC_EXIT_LOCK(ldcp);
875 
876 	D1(vswp, "%s: exit", __func__);
877 	return (0);
878 }
879 
880 /* disable callbacks on the channel */
881 static void
882 vsw_ldc_uninit(vsw_ldc_t *ldcp)
883 {
884 	vsw_t	*vswp = ldcp->ldc_vswp;
885 	int	rv;
886 
887 	D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id);
888 
889 	LDC_ENTER_LOCK(ldcp);
890 
891 	rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE);
892 	if (rv != 0) {
893 		cmn_err(CE_NOTE, "!vsw_ldc_uninit(%ld): error disabling "
894 		    "interrupts (rv = %d)\n", ldcp->ldc_id, rv);
895 	}
896 
897 	mutex_enter(&ldcp->status_lock);
898 	ldcp->ldc_status = LDC_INIT;
899 	mutex_exit(&ldcp->status_lock);
900 
901 	LDC_EXIT_LOCK(ldcp);
902 
903 	D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id);
904 }
905 
906 /*
907  * Wait until the callback(s) associated with the ldcs under the specified
908  * port have completed.
909  *
910  * Prior to this function being invoked each channel under this port
911  * should have been quiesced via ldc_set_cb_mode(DISABLE).
912  *
913  * A short explaination of what we are doing below..
914  *
915  * The simplest approach would be to have a reference counter in
916  * the ldc structure which is increment/decremented by the callbacks as
917  * they use the channel. The drain function could then simply disable any
918  * further callbacks and do a cv_wait for the ref to hit zero. Unfortunately
919  * there is a tiny window here - before the callback is able to get the lock
920  * on the channel it is interrupted and this function gets to execute. It
921  * sees that the ref count is zero and believes its free to delete the
922  * associated data structures.
923  *
924  * We get around this by taking advantage of the fact that before the ldc
925  * framework invokes a callback it sets a flag to indicate that there is a
926  * callback active (or about to become active). If when we attempt to
927  * unregister a callback when this active flag is set then the unregister
928  * will fail with EWOULDBLOCK.
929  *
930  * If the unregister fails we do a cv_timedwait. We will either be signaled
931  * by the callback as it is exiting (note we have to wait a short period to
932  * allow the callback to return fully to the ldc framework and it to clear
933  * the active flag), or by the timer expiring. In either case we again attempt
934  * the unregister. We repeat this until we can succesfully unregister the
935  * callback.
936  *
937  * The reason we use a cv_timedwait rather than a simple cv_wait is to catch
938  * the case where the callback has finished but the ldc framework has not yet
939  * cleared the active flag. In this case we would never get a cv_signal.
940  */
941 static void
942 vsw_ldc_drain(vsw_ldc_t *ldcp)
943 {
944 	vsw_t	*vswp = ldcp->ldc_port->p_vswp;
945 
946 	D1(vswp, "%s: enter", __func__);
947 
948 	/*
949 	 * If we can unregister the channel callback then we
950 	 * know that there is no callback either running or
951 	 * scheduled to run for this channel so move on to next
952 	 * channel in the list.
953 	 */
954 	mutex_enter(&ldcp->drain_cv_lock);
955 
956 	/* prompt active callbacks to quit */
957 	ldcp->drain_state = VSW_LDC_DRAINING;
958 
959 	if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) {
960 		D2(vswp, "%s: unreg callback for chan %ld", __func__,
961 		    ldcp->ldc_id);
962 		mutex_exit(&ldcp->drain_cv_lock);
963 	} else {
964 		/*
965 		 * If we end up here we know that either 1) a callback
966 		 * is currently executing, 2) is about to start (i.e.
967 		 * the ldc framework has set the active flag but
968 		 * has not actually invoked the callback yet, or 3)
969 		 * has finished and has returned to the ldc framework
970 		 * but the ldc framework has not yet cleared the
971 		 * active bit.
972 		 *
973 		 * Wait for it to finish.
974 		 */
975 		while (ldc_unreg_callback(ldcp->ldc_handle) == EWOULDBLOCK) {
976 			(void) cv_timedwait(&ldcp->drain_cv,
977 			    &ldcp->drain_cv_lock, ddi_get_lbolt() + hz);
978 		}
979 
980 		mutex_exit(&ldcp->drain_cv_lock);
981 		D2(vswp, "%s: unreg callback for chan %ld after "
982 		    "timeout", __func__, ldcp->ldc_id);
983 	}
984 
985 	D1(vswp, "%s: exit", __func__);
986 }
987 
988 /*
989  * Wait until all tasks which reference this port have completed.
990  *
991  * Prior to this function being invoked each channel under this port
992  * should have been quiesced via ldc_set_cb_mode(DISABLE).
993  */
994 static void
995 vsw_drain_port_taskq(vsw_port_t *port)
996 {
997 	vsw_t		*vswp = port->p_vswp;
998 
999 	D1(vswp, "%s: enter", __func__);
1000 
1001 	/*
1002 	 * Mark the port as in the process of being detached, and
1003 	 * dispatch a marker task to the queue so we know when all
1004 	 * relevant tasks have completed.
1005 	 */
1006 	mutex_enter(&port->state_lock);
1007 	port->state = VSW_PORT_DETACHING;
1008 
1009 	if ((vswp->taskq_p == NULL) ||
1010 	    (ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task,
1011 	    port, DDI_NOSLEEP) != DDI_SUCCESS)) {
1012 		cmn_err(CE_NOTE, "!vsw%d: unable to dispatch marker task",
1013 		    vswp->instance);
1014 		mutex_exit(&port->state_lock);
1015 		return;
1016 	}
1017 
1018 	/*
1019 	 * Wait for the marker task to finish.
1020 	 */
1021 	while (port->state != VSW_PORT_DETACHABLE)
1022 		cv_wait(&port->state_cv, &port->state_lock);
1023 
1024 	mutex_exit(&port->state_lock);
1025 
1026 	D1(vswp, "%s: exit", __func__);
1027 }
1028 
1029 static void
1030 vsw_marker_task(void *arg)
1031 {
1032 	vsw_port_t	*port = arg;
1033 	vsw_t		*vswp = port->p_vswp;
1034 
1035 	D1(vswp, "%s: enter", __func__);
1036 
1037 	mutex_enter(&port->state_lock);
1038 
1039 	/*
1040 	 * No further tasks should be dispatched which reference
1041 	 * this port so ok to mark it as safe to detach.
1042 	 */
1043 	port->state = VSW_PORT_DETACHABLE;
1044 
1045 	cv_signal(&port->state_cv);
1046 
1047 	mutex_exit(&port->state_lock);
1048 
1049 	D1(vswp, "%s: exit", __func__);
1050 }
1051 
1052 vsw_port_t *
1053 vsw_lookup_port(vsw_t *vswp, int p_instance)
1054 {
1055 	vsw_port_list_t *plist = &vswp->plist;
1056 	vsw_port_t	*port;
1057 
1058 	for (port = plist->head; port != NULL; port = port->p_next) {
1059 		if (port->p_instance == p_instance) {
1060 			D2(vswp, "vsw_lookup_port: found p_instance\n");
1061 			return (port);
1062 		}
1063 	}
1064 
1065 	return (NULL);
1066 }
1067 
1068 void
1069 vsw_vlan_unaware_port_reset(vsw_port_t *portp)
1070 {
1071 	vsw_ldc_t	*ldcp = portp->ldcp;
1072 
1073 	mutex_enter(&ldcp->ldc_cblock);
1074 
1075 	/*
1076 	 * If the peer is vlan_unaware(ver < 1.3), reset channel and terminate
1077 	 * the connection. See comments in vsw_set_vnet_proto_ops().
1078 	 */
1079 	if (ldcp->hphase == VSW_MILESTONE4 && VSW_VER_LT(ldcp, 1, 3) &&
1080 	    portp->nvids != 0) {
1081 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1082 	}
1083 
1084 	mutex_exit(&ldcp->ldc_cblock);
1085 }
1086 
1087 void
1088 vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate)
1089 {
1090 	vsw_ldc_t	*ldcp = portp->ldcp;
1091 
1092 	mutex_enter(&ldcp->ldc_cblock);
1093 
1094 	/*
1095 	 * If the peer is HybridIO capable (ver >= 1.3), reset channel
1096 	 * to trigger re-negotiation, which inturn trigger HybridIO
1097 	 * setup/cleanup.
1098 	 */
1099 	if ((ldcp->hphase == VSW_MILESTONE4) &&
1100 	    (portp->p_hio_capable == B_TRUE)) {
1101 		if (immediate == B_TRUE) {
1102 			(void) ldc_down(ldcp->ldc_handle);
1103 		} else {
1104 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1105 		}
1106 	}
1107 
1108 	mutex_exit(&ldcp->ldc_cblock);
1109 }
1110 
1111 void
1112 vsw_port_reset(vsw_port_t *portp)
1113 {
1114 	vsw_ldc_t	*ldcp = portp->ldcp;
1115 
1116 	mutex_enter(&ldcp->ldc_cblock);
1117 
1118 	/*
1119 	 * reset channel and terminate the connection.
1120 	 */
1121 	vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1122 
1123 	mutex_exit(&ldcp->ldc_cblock);
1124 }
1125 
1126 void
1127 vsw_reset_ports(vsw_t *vswp)
1128 {
1129 	vsw_port_list_t	*plist = &vswp->plist;
1130 	vsw_port_t	*portp;
1131 
1132 	READ_ENTER(&plist->lockrw);
1133 	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
1134 		if ((portp->p_hio_capable) && (portp->p_hio_enabled)) {
1135 			vsw_hio_stop_port(portp);
1136 		}
1137 		vsw_port_reset(portp);
1138 	}
1139 	RW_EXIT(&plist->lockrw);
1140 }
1141 
1142 static void
1143 vsw_send_physlink_msg(vsw_ldc_t *ldcp, link_state_t plink_state)
1144 {
1145 	vnet_physlink_msg_t	msg;
1146 	vnet_physlink_msg_t	*msgp = &msg;
1147 	uint32_t		physlink_info = 0;
1148 
1149 	if (plink_state == LINK_STATE_UP) {
1150 		physlink_info |= VNET_PHYSLINK_STATE_UP;
1151 	} else {
1152 		physlink_info |= VNET_PHYSLINK_STATE_DOWN;
1153 	}
1154 
1155 	msgp->tag.vio_msgtype = VIO_TYPE_CTRL;
1156 	msgp->tag.vio_subtype = VIO_SUBTYPE_INFO;
1157 	msgp->tag.vio_subtype_env = VNET_PHYSLINK_INFO;
1158 	msgp->tag.vio_sid = ldcp->local_session;
1159 	msgp->physlink_info = physlink_info;
1160 
1161 	(void) vsw_send_msg(ldcp, msgp, sizeof (msg), B_TRUE);
1162 }
1163 
1164 static void
1165 vsw_port_physlink_update(vsw_port_t *portp)
1166 {
1167 	vsw_ldc_t	*ldcp;
1168 	vsw_t		*vswp;
1169 
1170 	vswp = portp->p_vswp;
1171 	ldcp = portp->ldcp;
1172 
1173 	mutex_enter(&ldcp->ldc_cblock);
1174 
1175 	/*
1176 	 * If handshake has completed successfully and if the vnet device
1177 	 * has negotiated to get physical link state updates, send a message
1178 	 * with the current state.
1179 	 */
1180 	if (ldcp->hphase == VSW_MILESTONE4 && ldcp->pls_negotiated == B_TRUE) {
1181 		vsw_send_physlink_msg(ldcp, vswp->phys_link_state);
1182 	}
1183 
1184 	mutex_exit(&ldcp->ldc_cblock);
1185 }
1186 
1187 void
1188 vsw_physlink_update_ports(vsw_t *vswp)
1189 {
1190 	vsw_port_list_t	*plist = &vswp->plist;
1191 	vsw_port_t	*portp;
1192 
1193 	READ_ENTER(&plist->lockrw);
1194 	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
1195 		vsw_port_physlink_update(portp);
1196 	}
1197 	RW_EXIT(&plist->lockrw);
1198 }
1199 
1200 /*
1201  * Search for and remove the specified port from the port
1202  * list. Returns 0 if able to locate and remove port, otherwise
1203  * returns 1.
1204  */
1205 static int
1206 vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port)
1207 {
1208 	vsw_port_list_t *plist = &vswp->plist;
1209 	vsw_port_t	*curr_p, *prev_p;
1210 
1211 	if (plist->head == NULL)
1212 		return (1);
1213 
1214 	curr_p = prev_p = plist->head;
1215 
1216 	while (curr_p != NULL) {
1217 		if (curr_p == port) {
1218 			if (prev_p == curr_p) {
1219 				plist->head = curr_p->p_next;
1220 			} else {
1221 				prev_p->p_next = curr_p->p_next;
1222 			}
1223 			plist->num_ports--;
1224 			break;
1225 		} else {
1226 			prev_p = curr_p;
1227 			curr_p = curr_p->p_next;
1228 		}
1229 	}
1230 	return (0);
1231 }
1232 
1233 /*
1234  * Interrupt handler for ldc messages.
1235  */
1236 static uint_t
1237 vsw_ldc_cb(uint64_t event, caddr_t arg)
1238 {
1239 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
1240 	vsw_t 		*vswp = ldcp->ldc_vswp;
1241 
1242 	D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
1243 
1244 	mutex_enter(&ldcp->ldc_cblock);
1245 	ldcp->ldc_stats.callbacks++;
1246 
1247 	mutex_enter(&ldcp->status_lock);
1248 	if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) {
1249 		mutex_exit(&ldcp->status_lock);
1250 		mutex_exit(&ldcp->ldc_cblock);
1251 		return (LDC_SUCCESS);
1252 	}
1253 	mutex_exit(&ldcp->status_lock);
1254 
1255 	if (event & LDC_EVT_UP) {
1256 		/*
1257 		 * Channel has come up.
1258 		 */
1259 		D2(vswp, "%s: id(%ld) event(%llx) UP: status(%ld)",
1260 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1261 
1262 		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
1263 
1264 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
1265 	}
1266 
1267 	if (event & LDC_EVT_READ) {
1268 		/*
1269 		 * Data available for reading.
1270 		 */
1271 		D2(vswp, "%s: id(ld) event(%llx) data READ",
1272 		    __func__, ldcp->ldc_id, event);
1273 
1274 		vsw_process_evt_read(ldcp);
1275 
1276 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
1277 
1278 		goto vsw_cb_exit;
1279 	}
1280 
1281 	if (event & (LDC_EVT_DOWN | LDC_EVT_RESET)) {
1282 		D2(vswp, "%s: id(%ld) event (%lx) DOWN/RESET: status(%ld)",
1283 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1284 
1285 		vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
1286 	}
1287 
1288 	/*
1289 	 * Catch either LDC_EVT_WRITE which we don't support or any
1290 	 * unknown event.
1291 	 */
1292 	if (event &
1293 	    ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ)) {
1294 		DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)",
1295 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1296 	}
1297 
1298 vsw_cb_exit:
1299 	mutex_exit(&ldcp->ldc_cblock);
1300 
1301 	/*
1302 	 * Let the drain function know we are finishing if it
1303 	 * is waiting.
1304 	 */
1305 	mutex_enter(&ldcp->drain_cv_lock);
1306 	if (ldcp->drain_state == VSW_LDC_DRAINING)
1307 		cv_signal(&ldcp->drain_cv);
1308 	mutex_exit(&ldcp->drain_cv_lock);
1309 
1310 	return (LDC_SUCCESS);
1311 }
1312 
1313 /*
1314  * Reinitialise data structures associated with the channel.
1315  */
1316 static void
1317 vsw_ldc_reinit(vsw_ldc_t *ldcp)
1318 {
1319 	vsw_t		*vswp = ldcp->ldc_vswp;
1320 	vsw_port_t	*port;
1321 
1322 	D1(vswp, "%s: enter", __func__);
1323 
1324 	port = ldcp->ldc_port;
1325 
1326 	D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__,
1327 	    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
1328 
1329 	vsw_free_lane_resources(ldcp, INBOUND);
1330 	vsw_free_lane_resources(ldcp, OUTBOUND);
1331 
1332 	ldcp->lane_in.lstate = 0;
1333 	ldcp->lane_out.lstate = 0;
1334 
1335 	/* Remove the fdb entry for this port/mac address */
1336 	vsw_fdbe_del(vswp, &(port->p_macaddr));
1337 
1338 	/* remove the port from vlans it has been assigned to */
1339 	vsw_vlan_remove_ids(port, VSW_VNETPORT);
1340 
1341 	/*
1342 	 * Remove parent port from any multicast groups
1343 	 * it may have registered with. Client must resend
1344 	 * multicast add command after handshake completes.
1345 	 */
1346 	vsw_del_mcst_port(port);
1347 
1348 	ldcp->peer_session = 0;
1349 	ldcp->session_status = 0;
1350 	ldcp->hcnt = 0;
1351 	ldcp->hphase = VSW_MILESTONE0;
1352 
1353 	vsw_reset_vnet_proto_ops(ldcp);
1354 
1355 	D1(vswp, "%s: exit", __func__);
1356 }
1357 
1358 /*
1359  * Process a connection event.
1360  */
1361 void
1362 vsw_process_conn_evt(vsw_ldc_t *ldcp, uint16_t evt)
1363 {
1364 	vsw_t		*vswp = ldcp->ldc_vswp;
1365 	vsw_conn_evt_t	*conn = NULL;
1366 
1367 	D1(vswp, "%s: enter", __func__);
1368 
1369 	/*
1370 	 * Check if either a reset or restart event is pending
1371 	 * or in progress. If so just return.
1372 	 *
1373 	 * A VSW_CONN_RESET event originates either with a LDC_RESET_EVT
1374 	 * being received by the callback handler, or a ECONNRESET error
1375 	 * code being returned from a ldc_read() or ldc_write() call.
1376 	 *
1377 	 * A VSW_CONN_RESTART event occurs when some error checking code
1378 	 * decides that there is a problem with data from the channel,
1379 	 * and that the handshake should be restarted.
1380 	 */
1381 	if (((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART)) &&
1382 	    (ldstub((uint8_t *)&ldcp->reset_active)))
1383 		return;
1384 
1385 	/*
1386 	 * If it is an LDC_UP event we first check the recorded
1387 	 * state of the channel. If this is UP then we know that
1388 	 * the channel moving to the UP state has already been dealt
1389 	 * with and don't need to dispatch a  new task.
1390 	 *
1391 	 * The reason for this check is that when we do a ldc_up(),
1392 	 * depending on the state of the peer, we may or may not get
1393 	 * a LDC_UP event. As we can't depend on getting a LDC_UP evt
1394 	 * every time we do ldc_up() we explicitly check the channel
1395 	 * status to see has it come up (ldc_up() is asynch and will
1396 	 * complete at some undefined time), and take the appropriate
1397 	 * action.
1398 	 *
1399 	 * The flip side of this is that we may get a LDC_UP event
1400 	 * when we have already seen that the channel is up and have
1401 	 * dealt with that.
1402 	 */
1403 	mutex_enter(&ldcp->status_lock);
1404 	if (evt == VSW_CONN_UP) {
1405 		if ((ldcp->ldc_status == LDC_UP) || (ldcp->reset_active != 0)) {
1406 			mutex_exit(&ldcp->status_lock);
1407 			return;
1408 		}
1409 	}
1410 	mutex_exit(&ldcp->status_lock);
1411 
1412 	/*
1413 	 * The transaction group id allows us to identify and discard
1414 	 * any tasks which are still pending on the taskq and refer
1415 	 * to the handshake session we are about to restart or reset.
1416 	 * These stale messages no longer have any real meaning.
1417 	 */
1418 	(void) atomic_inc_32(&ldcp->hss_id);
1419 
1420 	ASSERT(vswp->taskq_p != NULL);
1421 
1422 	if ((conn = kmem_zalloc(sizeof (vsw_conn_evt_t), KM_NOSLEEP)) == NULL) {
1423 		cmn_err(CE_WARN, "!vsw%d: unable to allocate memory for"
1424 		    " connection event", vswp->instance);
1425 		goto err_exit;
1426 	}
1427 
1428 	conn->evt = evt;
1429 	conn->ldcp = ldcp;
1430 
1431 	if (ddi_taskq_dispatch(vswp->taskq_p, vsw_conn_task, conn,
1432 	    DDI_NOSLEEP) != DDI_SUCCESS) {
1433 		cmn_err(CE_WARN, "!vsw%d: Can't dispatch connection task",
1434 		    vswp->instance);
1435 
1436 		kmem_free(conn, sizeof (vsw_conn_evt_t));
1437 		goto err_exit;
1438 	}
1439 
1440 	D1(vswp, "%s: exit", __func__);
1441 	return;
1442 
1443 err_exit:
1444 	/*
1445 	 * Have mostly likely failed due to memory shortage. Clear the flag so
1446 	 * that future requests will at least be attempted and will hopefully
1447 	 * succeed.
1448 	 */
1449 	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
1450 		ldcp->reset_active = 0;
1451 }
1452 
/*
 * Deal with events relating to a connection. Invoked from a taskq.
 *
 * arg is the vsw_conn_evt_t built by vsw_process_conn_evt(); it is freed
 * here once the channel and event type have been copied out.
 *
 * For every event type the sequence is the same: stop the worker thread
 * (if any), optionally take the channel down (RESTART on an UP channel),
 * stop HybridIO if active on the port, re-initialise the channel state,
 * bring the channel up again and, if it is then UP, queue a task to send
 * our version info to restart the handshake.
 *
 * status_lock is held from the first ldc_status() call until return.
 *
 * NOTE(review): the three early-return paths below (either ldc_status()
 * failure, or the handshake attempt limit being exceeded) return without
 * clearing reset_active. For a RESET/RESTART event the flag therefore
 * stays set, and vsw_process_conn_evt() will then ignore subsequent
 * events on this channel. This looks deliberate for the attempt-limit
 * case; confirm for the ldc_status() failures.
 */
static void
vsw_conn_task(void *arg)
{
	vsw_conn_evt_t	*conn = (vsw_conn_evt_t *)arg;
	vsw_ldc_t	*ldcp = NULL;
	vsw_port_t	*portp;
	vsw_t		*vswp = NULL;
	uint16_t	evt;
	ldc_status_t	curr_status;

	ldcp = conn->ldcp;
	evt = conn->evt;
	vswp = ldcp->ldc_vswp;
	portp = ldcp->ldc_port;

	D1(vswp, "%s: enter", __func__);

	/* can safely free now that we have copied out the data */
	kmem_free(conn, sizeof (vsw_conn_evt_t));

	/* Stop whichever worker thread this channel is using, if any. */
	if (ldcp->rcv_thread != NULL) {
		vsw_stop_rcv_thread(ldcp);
	} else if (ldcp->msg_thread != NULL) {
		vsw_stop_msg_thread(ldcp);
	}

	mutex_enter(&ldcp->status_lock);
	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
		    "channel %ld", vswp->instance, ldcp->ldc_id);
		mutex_exit(&ldcp->status_lock);
		return;
	}

	/*
	 * If we wish to restart the handshake on this channel, then if
	 * the channel is UP we bring it DOWN to flush the underlying
	 * ldc queue.
	 */
	if ((evt == VSW_CONN_RESTART) && (curr_status == LDC_UP))
		(void) ldc_down(ldcp->ldc_handle);

	if ((portp->p_hio_capable) && (portp->p_hio_enabled)) {
		vsw_hio_stop(vswp, ldcp);
	}

	/*
	 * re-init all the associated data structures.
	 */
	vsw_ldc_reinit(ldcp);

	/*
	 * Bring the channel back up (note it does no harm to
	 * do this even if the channel is already UP, Just
	 * becomes effectively a no-op).
	 */
	(void) ldc_up(ldcp->ldc_handle);

	/*
	 * Check if channel is now UP. This will only happen if
	 * peer has also done a ldc_up().
	 */
	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
		    "channel %ld", vswp->instance, ldcp->ldc_id);
		mutex_exit(&ldcp->status_lock);
		return;
	}

	/* Record the status we just observed (protected by status_lock). */
	ldcp->ldc_status = curr_status;

	/* channel UP so restart handshake by sending version info */
	if (curr_status == LDC_UP) {
		/*
		 * hcnt is incremented on every attempt; it is reset to
		 * zero by vsw_ldc_reinit() and on handshake completion.
		 */
		if (ldcp->hcnt++ > vsw_num_handshakes) {
			cmn_err(CE_WARN, "!vsw%d: exceeded number of permitted"
			    " handshake attempts (%d) on channel %ld",
			    vswp->instance, ldcp->hcnt, ldcp->ldc_id);
			mutex_exit(&ldcp->status_lock);
			return;
		}

		/*
		 * The version message is sent from a separate task, and
		 * only when the vsw_obp_ver_proto_workaround tunable is
		 * B_FALSE.
		 */
		if (vsw_obp_ver_proto_workaround == B_FALSE &&
		    (ddi_taskq_dispatch(vswp->taskq_p, vsw_send_ver, ldcp,
		    DDI_NOSLEEP) != DDI_SUCCESS)) {
			cmn_err(CE_WARN, "!vsw%d: Can't dispatch version task",
			    vswp->instance);

			/*
			 * Don't count as valid restart attempt if couldn't
			 * send version msg.
			 */
			if (ldcp->hcnt > 0)
				ldcp->hcnt--;
		}
	}

	/*
	 * Mark that the process is complete by clearing the flag.
	 *
	 * Note it is possible that the taskq dispatch above may have failed,
	 * most likely due to memory shortage. We still clear the flag so
	 * future attempts will at least be attempted and will hopefully
	 * succeed.
	 */
	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
		ldcp->reset_active = 0;

	mutex_exit(&ldcp->status_lock);

	D1(vswp, "%s: exit", __func__);
}
1567 
1568 /*
1569  * returns 0 if legal for event signified by flag to have
1570  * occured at the time it did. Otherwise returns 1.
1571  */
1572 int
1573 vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag)
1574 {
1575 	vsw_t		*vswp = ldcp->ldc_vswp;
1576 	uint64_t	state;
1577 	uint64_t	phase;
1578 
1579 	if (dir == INBOUND)
1580 		state = ldcp->lane_in.lstate;
1581 	else
1582 		state = ldcp->lane_out.lstate;
1583 
1584 	phase = ldcp->hphase;
1585 
1586 	switch (flag) {
1587 	case VSW_VER_INFO_RECV:
1588 		if (phase > VSW_MILESTONE0) {
1589 			DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV"
1590 			    " when in state %d\n", ldcp->ldc_id, phase);
1591 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1592 			return (1);
1593 		}
1594 		break;
1595 
1596 	case VSW_VER_ACK_RECV:
1597 	case VSW_VER_NACK_RECV:
1598 		if (!(state & VSW_VER_INFO_SENT)) {
1599 			DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK or "
1600 			    "VER_NACK when in state %d\n", ldcp->ldc_id, phase);
1601 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1602 			return (1);
1603 		} else
1604 			state &= ~VSW_VER_INFO_SENT;
1605 		break;
1606 
1607 	case VSW_ATTR_INFO_RECV:
1608 		if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) {
1609 			DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV"
1610 			    " when in state %d\n", ldcp->ldc_id, phase);
1611 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1612 			return (1);
1613 		}
1614 		break;
1615 
1616 	case VSW_ATTR_ACK_RECV:
1617 	case VSW_ATTR_NACK_RECV:
1618 		if (!(state & VSW_ATTR_INFO_SENT)) {
1619 			DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK"
1620 			    " or ATTR_NACK when in state %d\n",
1621 			    ldcp->ldc_id, phase);
1622 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1623 			return (1);
1624 		} else
1625 			state &= ~VSW_ATTR_INFO_SENT;
1626 		break;
1627 
1628 	case VSW_DRING_INFO_RECV:
1629 		if (phase < VSW_MILESTONE1) {
1630 			DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV"
1631 			    " when in state %d\n", ldcp->ldc_id, phase);
1632 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1633 			return (1);
1634 		}
1635 		break;
1636 
1637 	case VSW_DRING_ACK_RECV:
1638 	case VSW_DRING_NACK_RECV:
1639 		if (!(state & VSW_DRING_INFO_SENT)) {
1640 			DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK "
1641 			    " or DRING_NACK when in state %d\n",
1642 			    ldcp->ldc_id, phase);
1643 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1644 			return (1);
1645 		} else
1646 			state &= ~VSW_DRING_INFO_SENT;
1647 		break;
1648 
1649 	case VSW_RDX_INFO_RECV:
1650 		if (phase < VSW_MILESTONE3) {
1651 			DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV"
1652 			    " when in state %d\n", ldcp->ldc_id, phase);
1653 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1654 			return (1);
1655 		}
1656 		break;
1657 
1658 	case VSW_RDX_ACK_RECV:
1659 	case VSW_RDX_NACK_RECV:
1660 		if (!(state & VSW_RDX_INFO_SENT)) {
1661 			DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK or "
1662 			    "RDX_NACK when in state %d\n", ldcp->ldc_id, phase);
1663 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1664 			return (1);
1665 		} else
1666 			state &= ~VSW_RDX_INFO_SENT;
1667 		break;
1668 
1669 	case VSW_MCST_INFO_RECV:
1670 		if (phase < VSW_MILESTONE3) {
1671 			DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV"
1672 			    " when in state %d\n", ldcp->ldc_id, phase);
1673 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1674 			return (1);
1675 		}
1676 		break;
1677 
1678 	default:
1679 		DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)",
1680 		    ldcp->ldc_id, flag);
1681 		return (1);
1682 	}
1683 
1684 	if (dir == INBOUND)
1685 		ldcp->lane_in.lstate = state;
1686 	else
1687 		ldcp->lane_out.lstate = state;
1688 
1689 	D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id);
1690 
1691 	return (0);
1692 }
1693 
/*
 * Advance the handshake state machine for a channel.
 *
 * Looks at the current handshake phase (ldcp->hphase) and at the state
 * flags of the inbound and outbound lanes, and moves to the next
 * milestone if the conditions for leaving the current one are met,
 * sending the next handshake message as it does so:
 *
 *   MILESTONE0 -> 1	version negotiated; attributes are sent.
 *   MILESTONE1 -> 2	attributes exchanged; dring info is sent if the
 *			peer uses descriptor rings, otherwise we fall
 *			straight through to the MILESTONE2 checks.
 *   MILESTONE2 -> 3	dring info acked (when applicable); RDX is sent.
 *   MILESTONE3 -> 4	RDX exchanged; a lane is marked active, the
 *			handshake attempt count is cleared, HybridIO is
 *			started if enabled and capable, and an initial
 *			physical link update is sent if negotiated.
 *
 * If the conditions are not met the phase is left unchanged.
 */
void
vsw_next_milestone(vsw_ldc_t *ldcp)
{
	vsw_t		*vswp = ldcp->ldc_vswp;
	vsw_port_t	*portp = ldcp->ldc_port;
	lane_t		*lane_out = &ldcp->lane_out;
	lane_t		*lane_in = &ldcp->lane_in;

	D1(vswp, "%s (chan %lld): enter (phase %ld)", __func__,
	    ldcp->ldc_id, ldcp->hphase);

	DUMP_FLAGS(lane_in->lstate);
	DUMP_FLAGS(lane_out->lstate);

	switch (ldcp->hphase) {

	case VSW_MILESTONE0:
		/*
		 * If we haven't started to handshake with our peer,
		 * start to do so now.
		 */
		if (lane_out->lstate == 0) {
			D2(vswp, "%s: (chan %lld) starting handshake "
			    "with peer", __func__, ldcp->ldc_id);
			vsw_process_conn_evt(ldcp, VSW_CONN_UP);
		}

		/*
		 * Only way to pass this milestone is to have successfully
		 * negotiated version info.
		 */
		if ((lane_in->lstate & VSW_VER_ACK_SENT) &&
		    (lane_out->lstate & VSW_VER_ACK_RECV)) {

			D2(vswp, "%s: (chan %lld) leaving milestone 0",
			    __func__, ldcp->ldc_id);

			/* pick version dependent settings before attr xchg */
			vsw_set_vnet_proto_ops(ldcp);

			/*
			 * Next milestone is passed when attribute
			 * information has been successfully exchanged.
			 */
			ldcp->hphase = VSW_MILESTONE1;
			vsw_send_attr(ldcp);

		}
		break;

	case VSW_MILESTONE1:
		/*
		 * Only way to pass this milestone is to have successfully
		 * negotiated attribute information, in both directions.
		 */
		if (!((lane_in->lstate & VSW_ATTR_ACK_SENT) &&
		    (lane_out->lstate & VSW_ATTR_ACK_RECV))) {
			break;
		}

		ldcp->hphase = VSW_MILESTONE2;

		/*
		 * If the peer device has said it wishes to
		 * use descriptor rings then we send it our ring
		 * info, otherwise we just set up a private ring
		 * which we use an internal buffer
		 */
		if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
		    (lane_in->xfer_mode & VIO_DRING_MODE_V1_2)) ||
		    (VSW_VER_LT(ldcp, 1, 2) &&
		    (lane_in->xfer_mode == VIO_DRING_MODE_V1_0))) {
			vsw_send_dring_info(ldcp);
			break;
		}

		/*
		 * The peer doesn't operate in dring mode; we
		 * can simply fallthru to the RDX phase from
		 * here.
		 */
		/*FALLTHRU*/

	case VSW_MILESTONE2:
		/*
		 * If peer has indicated in its attribute message that
		 * it wishes to use descriptor rings then the only way
		 * to pass this milestone is for us to have received
		 * valid dring info.
		 *
		 * If peer is not using descriptor rings then just fall
		 * through.
		 */
		if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
		    (lane_in->xfer_mode & VIO_DRING_MODE_V1_2)) ||
		    (VSW_VER_LT(ldcp, 1, 2) &&
		    (lane_in->xfer_mode ==
		    VIO_DRING_MODE_V1_0))) {
			if (!(lane_in->lstate & VSW_DRING_ACK_SENT))
				break;
		}

		D2(vswp, "%s: (chan %lld) leaving milestone 2",
		    __func__, ldcp->ldc_id);

		ldcp->hphase = VSW_MILESTONE3;
		vsw_send_rdx(ldcp);
		break;

	case VSW_MILESTONE3:
		/*
		 * Pass this milestone when all parameters have been
		 * successfully exchanged and RDX sent in both directions.
		 *
		 * Mark the relevant lane as available to transmit data. In
		 * RxDringData mode, lane_in is associated with transmit and
		 * lane_out is associated with receive. It is the reverse in
		 * TxDring mode.
		 */
		if ((lane_out->lstate & VSW_RDX_ACK_SENT) &&
		    (lane_in->lstate & VSW_RDX_ACK_RECV)) {

			D2(vswp, "%s: (chan %lld) leaving milestone 3",
			    __func__, ldcp->ldc_id);
			D2(vswp, "%s: ** handshake complete (0x%llx : "
			    "0x%llx) **", __func__, lane_in->lstate,
			    lane_out->lstate);
			if (lane_out->dring_mode == VIO_RX_DRING_DATA) {
				lane_in->lstate |= VSW_LANE_ACTIVE;
			} else {
				lane_out->lstate |= VSW_LANE_ACTIVE;
			}
			ldcp->hphase = VSW_MILESTONE4;
			/* handshake succeeded; reset the attempt counter */
			ldcp->hcnt = 0;
			DISPLAY_STATE();
			/* Start HIO if enabled and capable */
			if ((portp->p_hio_enabled) && (portp->p_hio_capable)) {
				D2(vswp, "%s: start HybridIO setup", __func__);
				vsw_hio_start(vswp, ldcp);
			}

			if (ldcp->pls_negotiated == B_TRUE) {
				/*
				 * The vnet device has negotiated to get phys
				 * link updates. Now that the handshake with
				 * the vnet device is complete, send an initial
				 * update with the current physical link state.
				 */
				vsw_send_physlink_msg(ldcp,
				    vswp->phys_link_state);
			}

		} else {
			D2(vswp, "%s: still in milestone 3 (0x%llx : 0x%llx)",
			    __func__, lane_in->lstate,
			    lane_out->lstate);
		}
		break;

	case VSW_MILESTONE4:
		/* handshake already complete; nothing further to do */
		D2(vswp, "%s: (chan %lld) in milestone 4", __func__,
		    ldcp->ldc_id);
		break;

	default:
		DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__,
		    ldcp->ldc_id, ldcp->hphase);
	}

	D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id,
	    ldcp->hphase);
}
1865 
1866 /*
1867  * Check if major version is supported.
1868  *
1869  * Returns 0 if finds supported major number, and if necessary
1870  * adjusts the minor field.
1871  *
1872  * Returns 1 if can't match major number exactly. Sets mjor/minor
1873  * to next lowest support values, or to zero if no other values possible.
1874  */
1875 static int
1876 vsw_supported_version(vio_ver_msg_t *vp)
1877 {
1878 	int	i;
1879 
1880 	D1(NULL, "vsw_supported_version: enter");
1881 
1882 	for (i = 0; i < VSW_NUM_VER; i++) {
1883 		if (vsw_versions[i].ver_major == vp->ver_major) {
1884 			/*
1885 			 * Matching or lower major version found. Update
1886 			 * minor number if necessary.
1887 			 */
1888 			if (vp->ver_minor > vsw_versions[i].ver_minor) {
1889 				D2(NULL, "%s: adjusting minor value from %d "
1890 				    "to %d", __func__, vp->ver_minor,
1891 				    vsw_versions[i].ver_minor);
1892 				vp->ver_minor = vsw_versions[i].ver_minor;
1893 			}
1894 
1895 			return (0);
1896 		}
1897 
1898 		/*
1899 		 * If the message contains a higher major version number, set
1900 		 * the message's major/minor versions to the current values
1901 		 * and return false, so this message will get resent with
1902 		 * these values.
1903 		 */
1904 		if (vsw_versions[i].ver_major < vp->ver_major) {
1905 			D2(NULL, "%s: adjusting major and minor "
1906 			    "values to %d, %d\n",
1907 			    __func__, vsw_versions[i].ver_major,
1908 			    vsw_versions[i].ver_minor);
1909 			vp->ver_major = vsw_versions[i].ver_major;
1910 			vp->ver_minor = vsw_versions[i].ver_minor;
1911 			return (1);
1912 		}
1913 	}
1914 
1915 	/* No match was possible, zero out fields */
1916 	vp->ver_major = 0;
1917 	vp->ver_minor = 0;
1918 
1919 	D1(NULL, "vsw_supported_version: exit");
1920 
1921 	return (1);
1922 }
1923 
1924 /*
1925  * Set vnet-protocol-version dependent functions based on version.
1926  */
1927 static void
1928 vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp)
1929 {
1930 	vsw_t	*vswp = ldcp->ldc_vswp;
1931 	lane_t	*lp = &ldcp->lane_out;
1932 
1933 	/*
1934 	 * Setup the appropriate dring data processing routine and any
1935 	 * associated thread based on the version.
1936 	 *
1937 	 * In versions < 1.6, we support only TxDring mode. In this mode, the
1938 	 * msg worker thread processes all types of VIO msgs (ctrl and data).
1939 	 *
1940 	 * In versions >= 1.6, we also support RxDringData mode. In this mode,
1941 	 * the rcv worker thread processes dring data messages (msgtype:
1942 	 * VIO_TYPE_DATA, subtype: VIO_SUBTYPE_INFO, env: VIO_DRING_DATA). The
1943 	 * rest of the data messages (including acks) and ctrl messages are
1944 	 * handled directly by the callback (intr) thread.
1945 	 *
1946 	 * However, for versions >= 1.6, we could still fallback to TxDring
1947 	 * mode. This could happen if RxDringData mode has been disabled (see
1948 	 * vsw_dring_mode) on this guest or on the peer guest. This info is
1949 	 * determined as part of attr exchange phase of handshake. Hence, we
1950 	 * setup these pointers for v1.6 after attr msg phase completes during
1951 	 * handshake.
1952 	 */
1953 	if (VSW_VER_GTEQ(ldcp, 1, 6)) {
1954 		/*
1955 		 * Set data dring mode for vsw_send_attr(). We setup msg worker
1956 		 * thread in TxDring mode or rcv worker thread in RxDringData
1957 		 * mode when attr phase of handshake completes.
1958 		 */
1959 		if (vsw_dring_mode == VIO_RX_DRING_DATA) {
1960 			lp->dring_mode = (VIO_RX_DRING_DATA | VIO_TX_DRING);
1961 		} else {
1962 			lp->dring_mode = VIO_TX_DRING;
1963 		}
1964 	} else {
1965 		lp->dring_mode = VIO_TX_DRING;
1966 	}
1967 
1968 	/*
1969 	 * Setup the MTU for attribute negotiation based on the version.
1970 	 */
1971 	if (VSW_VER_GTEQ(ldcp, 1, 4)) {
1972 		/*
1973 		 * If the version negotiated with peer is >= 1.4(Jumbo Frame
1974 		 * Support), set the mtu in our attributes to max_frame_size.
1975 		 */
1976 		lp->mtu = vswp->max_frame_size;
1977 	} else if (VSW_VER_EQ(ldcp, 1, 3)) {
1978 		/*
1979 		 * If the version negotiated with peer is == 1.3 (Vlan Tag
1980 		 * Support) set the attr.mtu to ETHERMAX + VLAN_TAGSZ.
1981 		 */
1982 		lp->mtu = ETHERMAX + VLAN_TAGSZ;
1983 	} else {
1984 		vsw_port_t	*portp = ldcp->ldc_port;
1985 		/*
1986 		 * Pre-1.3 peers expect max frame size of ETHERMAX.
1987 		 * We can negotiate that size with those peers provided only
1988 		 * pvid is defined for our peer and there are no vids. Then we
1989 		 * can send/recv only untagged frames of max size ETHERMAX.
1990 		 * Note that pvid of the peer can be different, as vsw has to
1991 		 * serve the vnet in that vlan even if itself is not assigned
1992 		 * to that vlan.
1993 		 */
1994 		if (portp->nvids == 0) {
1995 			lp->mtu = ETHERMAX;
1996 		}
1997 	}
1998 
1999 	/*
2000 	 * Setup version dependent data processing functions.
2001 	 */
2002 	if (VSW_VER_GTEQ(ldcp, 1, 2)) {
2003 		/* Versions >= 1.2 */
2004 
2005 		if (VSW_PRI_ETH_DEFINED(vswp)) {
2006 			/*
2007 			 * enable priority routines and pkt mode only if
2008 			 * at least one pri-eth-type is specified in MD.
2009 			 */
2010 			ldcp->tx = vsw_ldctx_pri;
2011 			ldcp->rx_pktdata = vsw_process_pkt_data;
2012 
2013 			/* set xfer mode for vsw_send_attr() */
2014 			lp->xfer_mode = VIO_PKT_MODE | VIO_DRING_MODE_V1_2;
2015 		} else {
2016 			/* no priority eth types defined in MD */
2017 
2018 			ldcp->tx = vsw_ldctx;
2019 			ldcp->rx_pktdata = vsw_process_pkt_data_nop;
2020 
2021 			/* set xfer mode for vsw_send_attr() */
2022 			lp->xfer_mode = VIO_DRING_MODE_V1_2;
2023 		}
2024 
2025 	} else {
2026 		/* Versions prior to 1.2  */
2027 
2028 		vsw_reset_vnet_proto_ops(ldcp);
2029 	}
2030 }
2031 
2032 /*
2033  * Reset vnet-protocol-version dependent functions to v1.0.
2034  */
2035 static void
2036 vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp)
2037 {
2038 	lane_t	*lp = &ldcp->lane_out;
2039 
2040 	ldcp->tx = vsw_ldctx;
2041 	ldcp->rx_pktdata = vsw_process_pkt_data_nop;
2042 
2043 	/* set xfer mode for vsw_send_attr() */
2044 	lp->xfer_mode = VIO_DRING_MODE_V1_0;
2045 }
2046 
2047 static void
2048 vsw_process_evt_read(vsw_ldc_t *ldcp)
2049 {
2050 	if (ldcp->msg_thread != NULL) {
2051 		/*
2052 		 * TxDring mode; wakeup message worker
2053 		 * thread to process the VIO messages.
2054 		 */
2055 		mutex_exit(&ldcp->ldc_cblock);
2056 		mutex_enter(&ldcp->msg_thr_lock);
2057 		if (!(ldcp->msg_thr_flags & VSW_WTHR_DATARCVD)) {
2058 			ldcp->msg_thr_flags |= VSW_WTHR_DATARCVD;
2059 			cv_signal(&ldcp->msg_thr_cv);
2060 		}
2061 		mutex_exit(&ldcp->msg_thr_lock);
2062 		mutex_enter(&ldcp->ldc_cblock);
2063 	} else {
2064 		/*
2065 		 * We invoke vsw_process_pkt() in the context of the LDC
2066 		 * callback (vsw_ldc_cb()) during handshake, until the dring
2067 		 * mode is negotiated. After the dring mode is negotiated, the
2068 		 * msgs are processed by the msg worker thread (above case) if
2069 		 * the dring mode is TxDring. Otherwise (in RxDringData mode)
2070 		 * we continue to process the msgs directly in the callback
2071 		 * context.
2072 		 */
2073 		vsw_process_pkt(ldcp);
2074 	}
2075 }
2076 
2077 /*
2078  * Main routine for processing messages received over LDC.
2079  */
2080 void
2081 vsw_process_pkt(void *arg)
2082 {
2083 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
2084 	vsw_t 		*vswp = ldcp->ldc_vswp;
2085 	size_t		msglen;
2086 	vio_msg_tag_t	*tagp;
2087 	uint64_t	*ldcmsg;
2088 	int 		rv = 0;
2089 
2090 
2091 	D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
2092 
2093 	ASSERT(MUTEX_HELD(&ldcp->ldc_cblock));
2094 
2095 	ldcmsg = ldcp->ldcmsg;
2096 	/*
2097 	 * If channel is up read messages until channel is empty.
2098 	 */
2099 	do {
2100 		msglen = ldcp->msglen;
2101 		rv = ldc_read(ldcp->ldc_handle, (caddr_t)ldcmsg, &msglen);
2102 
2103 		if (rv != 0) {
2104 			DERR(vswp, "%s :ldc_read err id(%lld) rv(%d) len(%d)\n",
2105 			    __func__, ldcp->ldc_id, rv, msglen);
2106 		}
2107 
2108 		/* channel has been reset */
2109 		if (rv == ECONNRESET) {
2110 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
2111 			break;
2112 		}
2113 
2114 		if (msglen == 0) {
2115 			D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__,
2116 			    ldcp->ldc_id);
2117 			break;
2118 		}
2119 
2120 		D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__,
2121 		    ldcp->ldc_id, msglen);
2122 
2123 		/*
2124 		 * Figure out what sort of packet we have gotten by
2125 		 * examining the msg tag, and then switch it appropriately.
2126 		 */
2127 		tagp = (vio_msg_tag_t *)ldcmsg;
2128 
2129 		switch (tagp->vio_msgtype) {
2130 		case VIO_TYPE_CTRL:
2131 			vsw_dispatch_ctrl_task(ldcp, ldcmsg, tagp, msglen);
2132 			break;
2133 		case VIO_TYPE_DATA:
2134 			vsw_process_data_pkt(ldcp, ldcmsg, tagp, msglen);
2135 			break;
2136 		case VIO_TYPE_ERR:
2137 			vsw_process_err_pkt(ldcp, ldcmsg, tagp);
2138 			break;
2139 		default:
2140 			DERR(vswp, "%s: Unknown tag(%lx) ", __func__,
2141 			    "id(%lx)\n", tagp->vio_msgtype, ldcp->ldc_id);
2142 			break;
2143 		}
2144 	} while (msglen);
2145 
2146 	D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id);
2147 }
2148 
2149 /*
2150  * Dispatch a task to process a VIO control message.
2151  */
2152 static void
2153 vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t *tagp,
2154 	int msglen)
2155 {
2156 	vsw_ctrl_task_t		*ctaskp = NULL;
2157 	vsw_port_t		*port = ldcp->ldc_port;
2158 	vsw_t			*vswp = port->p_vswp;
2159 
2160 	D1(vswp, "%s: enter", __func__);
2161 
2162 	/*
2163 	 * We need to handle RDX ACK messages in-band as once they
2164 	 * are exchanged it is possible that we will get an
2165 	 * immediate (legitimate) data packet.
2166 	 */
2167 	if ((tagp->vio_subtype_env == VIO_RDX) &&
2168 	    (tagp->vio_subtype == VIO_SUBTYPE_ACK)) {
2169 
2170 		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_ACK_RECV))
2171 			return;
2172 
2173 		ldcp->lane_in.lstate |= VSW_RDX_ACK_RECV;
2174 		D2(vswp, "%s (%ld) handling RDX_ACK in place "
2175 		    "(ostate 0x%llx : hphase %d)", __func__,
2176 		    ldcp->ldc_id, ldcp->lane_in.lstate, ldcp->hphase);
2177 		vsw_next_milestone(ldcp);
2178 		return;
2179 	}
2180 
2181 	ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP);
2182 
2183 	if (ctaskp == NULL) {
2184 		DERR(vswp, "%s: unable to alloc space for ctrl msg", __func__);
2185 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2186 		return;
2187 	}
2188 
2189 	ctaskp->ldcp = ldcp;
2190 	bcopy((def_msg_t *)cpkt, &ctaskp->pktp, msglen);
2191 	ctaskp->hss_id = ldcp->hss_id;
2192 
2193 	/*
2194 	 * Dispatch task to processing taskq if port is not in
2195 	 * the process of being detached.
2196 	 */
2197 	mutex_enter(&port->state_lock);
2198 	if (port->state == VSW_PORT_INIT) {
2199 		if ((vswp->taskq_p == NULL) ||
2200 		    (ddi_taskq_dispatch(vswp->taskq_p, vsw_process_ctrl_pkt,
2201 		    ctaskp, DDI_NOSLEEP) != DDI_SUCCESS)) {
2202 			mutex_exit(&port->state_lock);
2203 			DERR(vswp, "%s: unable to dispatch task to taskq",
2204 			    __func__);
2205 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2206 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2207 			return;
2208 		}
2209 	} else {
2210 		kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2211 		DWARN(vswp, "%s: port %d detaching, not dispatching "
2212 		    "task", __func__, port->p_instance);
2213 	}
2214 
2215 	mutex_exit(&port->state_lock);
2216 
2217 	D2(vswp, "%s: dispatched task to taskq for chan %d", __func__,
2218 	    ldcp->ldc_id);
2219 	D1(vswp, "%s: exit", __func__);
2220 }
2221 
2222 /*
2223  * Process a VIO ctrl message. Invoked from taskq.
2224  */
2225 static void
2226 vsw_process_ctrl_pkt(void *arg)
2227 {
2228 	vsw_ctrl_task_t	*ctaskp = (vsw_ctrl_task_t *)arg;
2229 	vsw_ldc_t	*ldcp = ctaskp->ldcp;
2230 	vsw_t 		*vswp = ldcp->ldc_vswp;
2231 	vio_msg_tag_t	tag;
2232 	uint16_t	env;
2233 
2234 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2235 
2236 	bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t));
2237 	env = tag.vio_subtype_env;
2238 
2239 	/* stale pkt check */
2240 	if (ctaskp->hss_id < ldcp->hss_id) {
2241 		DWARN(vswp, "%s: discarding stale packet belonging to earlier"
2242 		    " (%ld) handshake session", __func__, ctaskp->hss_id);
2243 		kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2244 		return;
2245 	}
2246 
2247 	/* session id check */
2248 	if (ldcp->session_status & VSW_PEER_SESSION) {
2249 		if (ldcp->peer_session != tag.vio_sid) {
2250 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
2251 			    __func__, ldcp->ldc_id, tag.vio_sid);
2252 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2253 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2254 			return;
2255 		}
2256 	}
2257 
2258 	/*
2259 	 * Switch on vio_subtype envelope, then let lower routines
2260 	 * decide if its an INFO, ACK or NACK packet.
2261 	 */
2262 	switch (env) {
2263 	case VIO_VER_INFO:
2264 		vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp);
2265 		break;
2266 	case VIO_DRING_REG:
2267 		vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp);
2268 		break;
2269 	case VIO_DRING_UNREG:
2270 		vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp);
2271 		break;
2272 	case VIO_ATTR_INFO:
2273 		vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp);
2274 		break;
2275 	case VNET_MCAST_INFO:
2276 		vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp);
2277 		break;
2278 	case VIO_RDX:
2279 		vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp);
2280 		break;
2281 	case VIO_DDS_INFO:
2282 		vsw_process_dds_msg(vswp, ldcp, &ctaskp->pktp);
2283 		break;
2284 
2285 	case VNET_PHYSLINK_INFO:
2286 		vsw_process_physlink_msg(ldcp, &ctaskp->pktp);
2287 		break;
2288 	default:
2289 		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env);
2290 	}
2291 
2292 	kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2293 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
2294 }
2295 
2296 /*
2297  * Version negotiation. We can end up here either because our peer
2298  * has responded to a handshake message we have sent it, or our peer
2299  * has initiated a handshake with us. If its the former then can only
2300  * be ACK or NACK, if its the later can only be INFO.
2301  *
2302  * If its an ACK we move to the next stage of the handshake, namely
2303  * attribute exchange. If its a NACK we see if we can specify another
2304  * version, if we can't we stop.
2305  *
2306  * If it is an INFO we reset all params associated with communication
2307  * in that direction over this channel (remember connection is
2308  * essentially 2 independent simplex channels).
2309  */
2310 void
2311 vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt)
2312 {
2313 	vio_ver_msg_t	*ver_pkt;
2314 	vsw_t 		*vswp = ldcp->ldc_vswp;
2315 
2316 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2317 
2318 	/*
2319 	 * We know this is a ctrl/version packet so
2320 	 * cast it into the correct structure.
2321 	 */
2322 	ver_pkt = (vio_ver_msg_t *)pkt;
2323 
2324 	switch (ver_pkt->tag.vio_subtype) {
2325 	case VIO_SUBTYPE_INFO:
2326 		D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n");
2327 
2328 		/*
2329 		 * Record the session id, which we will use from now
2330 		 * until we see another VER_INFO msg. Even then the
2331 		 * session id in most cases will be unchanged, execpt
2332 		 * if channel was reset.
2333 		 */
2334 		if ((ldcp->session_status & VSW_PEER_SESSION) &&
2335 		    (ldcp->peer_session != ver_pkt->tag.vio_sid)) {
2336 			DERR(vswp, "%s: updating session id for chan %lld "
2337 			    "from %llx to %llx", __func__, ldcp->ldc_id,
2338 			    ldcp->peer_session, ver_pkt->tag.vio_sid);
2339 		}
2340 
2341 		ldcp->peer_session = ver_pkt->tag.vio_sid;
2342 		ldcp->session_status |= VSW_PEER_SESSION;
2343 
2344 		/* Legal message at this time ? */
2345 		if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV))
2346 			return;
2347 
2348 		/*
2349 		 * First check the device class. Currently only expect
2350 		 * to be talking to a network device. In the future may
2351 		 * also talk to another switch.
2352 		 */
2353 		if (ver_pkt->dev_class != VDEV_NETWORK) {
2354 			DERR(vswp, "%s: illegal device class %d", __func__,
2355 			    ver_pkt->dev_class);
2356 
2357 			ver_pkt->tag.vio_sid = ldcp->local_session;
2358 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2359 
2360 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2361 
2362 			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2363 			    sizeof (vio_ver_msg_t), B_TRUE);
2364 
2365 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
2366 			vsw_next_milestone(ldcp);
2367 			return;
2368 		} else {
2369 			ldcp->dev_class = ver_pkt->dev_class;
2370 		}
2371 
2372 		/*
2373 		 * Now check the version.
2374 		 */
2375 		if (vsw_supported_version(ver_pkt) == 0) {
2376 			/*
2377 			 * Support this major version and possibly
2378 			 * adjusted minor version.
2379 			 */
2380 
2381 			D2(vswp, "%s: accepted ver %d:%d", __func__,
2382 			    ver_pkt->ver_major, ver_pkt->ver_minor);
2383 
2384 			/* Store accepted values */
2385 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
2386 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
2387 
2388 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2389 
2390 			ldcp->lane_in.lstate |= VSW_VER_ACK_SENT;
2391 
2392 			if (vsw_obp_ver_proto_workaround == B_TRUE) {
2393 				/*
2394 				 * Send a version info message
2395 				 * using the accepted version that
2396 				 * we are about to ack. Also note that
2397 				 * we send our ver info before we ack.
2398 				 * Otherwise, as soon as receiving the
2399 				 * ack, obp sends attr info msg, which
2400 				 * breaks vsw_check_flag() invoked
2401 				 * from vsw_process_ctrl_attr_pkt();
2402 				 * as we also need VSW_VER_ACK_RECV to
2403 				 * be set in lane_out.lstate, before
2404 				 * we can receive attr info.
2405 				 */
2406 				vsw_send_ver(ldcp);
2407 			}
2408 		} else {
2409 			/*
2410 			 * NACK back with the next lower major/minor
2411 			 * pairing we support (if don't suuport any more
2412 			 * versions then they will be set to zero.
2413 			 */
2414 
2415 			D2(vswp, "%s: replying with ver %d:%d", __func__,
2416 			    ver_pkt->ver_major, ver_pkt->ver_minor);
2417 
2418 			/* Store updated values */
2419 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
2420 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
2421 
2422 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2423 
2424 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
2425 		}
2426 
2427 		DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2428 		ver_pkt->tag.vio_sid = ldcp->local_session;
2429 		(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2430 		    sizeof (vio_ver_msg_t), B_TRUE);
2431 
2432 		vsw_next_milestone(ldcp);
2433 		break;
2434 
2435 	case VIO_SUBTYPE_ACK:
2436 		D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__);
2437 
2438 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV))
2439 			return;
2440 
2441 		/* Store updated values */
2442 		ldcp->lane_out.ver_major = ver_pkt->ver_major;
2443 		ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
2444 
2445 		ldcp->lane_out.lstate |= VSW_VER_ACK_RECV;
2446 		vsw_next_milestone(ldcp);
2447 
2448 		break;
2449 
2450 	case VIO_SUBTYPE_NACK:
2451 		D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__);
2452 
2453 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV))
2454 			return;
2455 
2456 		/*
2457 		 * If our peer sent us a NACK with the ver fields set to
2458 		 * zero then there is nothing more we can do. Otherwise see
2459 		 * if we support either the version suggested, or a lesser
2460 		 * one.
2461 		 */
2462 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
2463 			DERR(vswp, "%s: peer unable to negotiate any "
2464 			    "further.", __func__);
2465 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
2466 			vsw_next_milestone(ldcp);
2467 			return;
2468 		}
2469 
2470 		/*
2471 		 * Check to see if we support this major version or
2472 		 * a lower one. If we don't then maj/min will be set
2473 		 * to zero.
2474 		 */
2475 		(void) vsw_supported_version(ver_pkt);
2476 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
2477 			/* Nothing more we can do */
2478 			DERR(vswp, "%s: version negotiation failed.\n",
2479 			    __func__);
2480 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
2481 			vsw_next_milestone(ldcp);
2482 		} else {
2483 			/* found a supported major version */
2484 			ldcp->lane_out.ver_major = ver_pkt->ver_major;
2485 			ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
2486 
2487 			D2(vswp, "%s: resending with updated values (%x, %x)",
2488 			    __func__, ver_pkt->ver_major, ver_pkt->ver_minor);
2489 
2490 			ldcp->lane_out.lstate |= VSW_VER_INFO_SENT;
2491 			ver_pkt->tag.vio_sid = ldcp->local_session;
2492 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
2493 
2494 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2495 
2496 			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2497 			    sizeof (vio_ver_msg_t), B_TRUE);
2498 
2499 			vsw_next_milestone(ldcp);
2500 
2501 		}
2502 		break;
2503 
2504 	default:
2505 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
2506 		    ver_pkt->tag.vio_subtype);
2507 	}
2508 
2509 	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
2510 }
2511 
2512 static int
2513 vsw_process_attr_info(vsw_ldc_t *ldcp, vnet_attr_msg_t *msg)
2514 {
2515 	vsw_t			*vswp = ldcp->ldc_vswp;
2516 	vsw_port_t		*port = ldcp->ldc_port;
2517 	struct ether_addr	ea;
2518 	uint64_t		macaddr = 0;
2519 	lane_t			*lane_out = &ldcp->lane_out;
2520 	lane_t			*lane_in = &ldcp->lane_in;
2521 	uint32_t		mtu;
2522 	int			i;
2523 	uint8_t			dring_mode;
2524 
2525 	D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2526 
2527 	if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV)) {
2528 		return (1);
2529 	}
2530 
2531 	if ((msg->xfer_mode != VIO_DESC_MODE) &&
2532 	    (msg->xfer_mode != lane_out->xfer_mode)) {
2533 		D2(NULL, "%s: unknown mode %x\n", __func__, msg->xfer_mode);
2534 		return (1);
2535 	}
2536 
2537 	/* Only support MAC addresses at moment. */
2538 	if ((msg->addr_type != ADDR_TYPE_MAC) || (msg->addr == 0)) {
2539 		D2(NULL, "%s: invalid addr_type %x, or address 0x%llx\n",
2540 		    __func__, msg->addr_type, msg->addr);
2541 		return (1);
2542 	}
2543 
2544 	/*
2545 	 * MAC address supplied by device should match that stored
2546 	 * in the vsw-port OBP node. Need to decide what to do if they
2547 	 * don't match, for the moment just warn but don't fail.
2548 	 */
2549 	vnet_macaddr_ultostr(msg->addr, ea.ether_addr_octet);
2550 	if (ether_cmp(&ea, &port->p_macaddr) != 0) {
2551 		DERR(NULL, "%s: device supplied address "
2552 		    "0x%llx doesn't match node address 0x%llx\n",
2553 		    __func__, msg->addr, port->p_macaddr);
2554 	}
2555 
2556 	/*
2557 	 * Ack freq only makes sense in pkt mode, in shared
2558 	 * mode the ring descriptors say whether or not to
2559 	 * send back an ACK.
2560 	 */
2561 	if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
2562 	    (msg->xfer_mode & VIO_DRING_MODE_V1_2)) ||
2563 	    (VSW_VER_LT(ldcp, 1, 2) &&
2564 	    (msg->xfer_mode == VIO_DRING_MODE_V1_0))) {
2565 		if (msg->ack_freq > 0) {
2566 			D2(NULL, "%s: non zero ack freq in SHM mode\n",
2567 			    __func__);
2568 			return (1);
2569 		}
2570 	}
2571 
2572 	/*
2573 	 * Process dring mode attribute.
2574 	 */
2575 	if (VSW_VER_GTEQ(ldcp, 1, 6)) {
2576 		/*
2577 		 * Versions >= 1.6:
2578 		 * Though we are operating in v1.6 mode, it is possible that
2579 		 * RxDringData mode has been disabled either on this guest or
2580 		 * on the peer guest. If so, we revert to pre v1.6 behavior of
2581 		 * TxDring mode. But this must be agreed upon in both
2582 		 * directions of attr exchange. We first determine the mode
2583 		 * that can be negotiated.
2584 		 */
2585 		if ((msg->options & VIO_RX_DRING_DATA) != 0 &&
2586 		    vsw_dring_mode == VIO_RX_DRING_DATA) {
2587 			/*
2588 			 * The peer is capable of handling RxDringData AND we
2589 			 * are also capable of it; we enable RxDringData mode
2590 			 * on this channel.
2591 			 */
2592 			dring_mode = VIO_RX_DRING_DATA;
2593 		} else if ((msg->options & VIO_TX_DRING) != 0) {
2594 			/*
2595 			 * If the peer is capable of TxDring mode, we
2596 			 * negotiate TxDring mode on this channel.
2597 			 */
2598 			dring_mode = VIO_TX_DRING;
2599 		} else {
2600 			/*
2601 			 * We support only VIO_TX_DRING and VIO_RX_DRING_DATA
2602 			 * modes. We don't support VIO_RX_DRING mode.
2603 			 */
2604 			return (1);
2605 		}
2606 
2607 		/*
2608 		 * If we have received an ack for the attr info that we sent,
2609 		 * then check if the dring mode matches what the peer had ack'd
2610 		 * (saved in lane_out). If they don't match, we fail the
2611 		 * handshake.
2612 		 */
2613 		if (lane_out->lstate & VSW_ATTR_ACK_RECV) {
2614 			if (msg->options != lane_out->dring_mode) {
2615 				/* send NACK */
2616 				return (1);
2617 			}
2618 		} else {
2619 			/*
2620 			 * Save the negotiated dring mode in our attr
2621 			 * parameters, so it gets sent in the attr info from us
2622 			 * to the peer.
2623 			 */
2624 			lane_out->dring_mode = dring_mode;
2625 		}
2626 
2627 		/* save the negotiated dring mode in the msg to be replied */
2628 		msg->options = dring_mode;
2629 	}
2630 
2631 	/*
2632 	 * Process MTU attribute.
2633 	 */
2634 	if (VSW_VER_GTEQ(ldcp, 1, 4)) {
2635 		/*
2636 		 * Versions >= 1.4:
2637 		 * Validate mtu of the peer is at least ETHERMAX. Then, the mtu
2638 		 * is negotiated down to the minimum of our mtu and peer's mtu.
2639 		 */
2640 		if (msg->mtu < ETHERMAX) {
2641 			return (1);
2642 		}
2643 
2644 		mtu = MIN(msg->mtu, vswp->max_frame_size);
2645 
2646 		/*
2647 		 * If we have received an ack for the attr info
2648 		 * that we sent, then check if the mtu computed
2649 		 * above matches the mtu that the peer had ack'd
2650 		 * (saved in local hparams). If they don't
2651 		 * match, we fail the handshake.
2652 		 */
2653 		if (lane_out->lstate & VSW_ATTR_ACK_RECV) {
2654 			if (mtu != lane_out->mtu) {
2655 				/* send NACK */
2656 				return (1);
2657 			}
2658 		} else {
2659 			/*
2660 			 * Save the mtu computed above in our
2661 			 * attr parameters, so it gets sent in
2662 			 * the attr info from us to the peer.
2663 			 */
2664 			lane_out->mtu = mtu;
2665 		}
2666 
2667 		/* save the MIN mtu in the msg to be replied */
2668 		msg->mtu = mtu;
2669 	} else {
2670 		/* Versions < 1.4, mtu must match */
2671 		if (msg->mtu != lane_out->mtu) {
2672 			D2(NULL, "%s: invalid MTU (0x%llx)\n",
2673 			    __func__, msg->mtu);
2674 			return (1);
2675 		}
2676 	}
2677 
2678 	/*
2679 	 * Otherwise store attributes for this lane and update
2680 	 * lane state.
2681 	 */
2682 	lane_in->mtu = msg->mtu;
2683 	lane_in->addr = msg->addr;
2684 	lane_in->addr_type = msg->addr_type;
2685 	lane_in->xfer_mode = msg->xfer_mode;
2686 	lane_in->ack_freq = msg->ack_freq;
2687 	lane_in->physlink_update = msg->physlink_update;
2688 	lane_in->dring_mode = msg->options;
2689 
2690 	/*
2691 	 * Check if the client has requested physlink state updates.
2692 	 * If there is a physical device bound to this vswitch (L2
2693 	 * mode), set the ack bits to indicate it is supported.
2694 	 * Otherwise, set the nack bits.
2695 	 */
2696 	if (VSW_VER_GTEQ(ldcp, 1, 5)) {	/* Protocol ver >= 1.5 */
2697 
2698 		/* Does the vnet need phys link state updates ? */
2699 		if ((lane_in->physlink_update &
2700 		    PHYSLINK_UPDATE_STATE_MASK) ==
2701 		    PHYSLINK_UPDATE_STATE) {
2702 
2703 			if (vswp->smode & VSW_LAYER2) {
2704 				/* is a net-dev assigned to us ? */
2705 				msg->physlink_update =
2706 				    PHYSLINK_UPDATE_STATE_ACK;
2707 				ldcp->pls_negotiated = B_TRUE;
2708 			} else {
2709 				/* not in L2 mode */
2710 				msg->physlink_update =
2711 				    PHYSLINK_UPDATE_STATE_NACK;
2712 				ldcp->pls_negotiated = B_FALSE;
2713 			}
2714 
2715 		} else {
2716 			msg->physlink_update =
2717 			    PHYSLINK_UPDATE_NONE;
2718 			ldcp->pls_negotiated = B_FALSE;
2719 		}
2720 
2721 	} else {
2722 		/*
2723 		 * physlink_update bits are ignored
2724 		 * if set by clients < v1.5 protocol.
2725 		 */
2726 		msg->physlink_update = PHYSLINK_UPDATE_NONE;
2727 		ldcp->pls_negotiated = B_FALSE;
2728 	}
2729 
2730 	macaddr = lane_in->addr;
2731 	for (i = ETHERADDRL - 1; i >= 0; i--) {
2732 		port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF;
2733 		macaddr >>= 8;
2734 	}
2735 
2736 	/* create the fdb entry for this port/mac address */
2737 	vsw_fdbe_add(vswp, port);
2738 
2739 	/* add the port to the specified vlans */
2740 	vsw_vlan_add_ids(port, VSW_VNETPORT);
2741 
2742 	/*
2743 	 * Setup device specific xmit routines. Note this could be changed
2744 	 * further in vsw_send_dring_info() for versions >= 1.6 if operating in
2745 	 * RxDringData mode.
2746 	 */
2747 	mutex_enter(&port->tx_lock);
2748 
2749 	if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
2750 	    (lane_in->xfer_mode & VIO_DRING_MODE_V1_2)) ||
2751 	    (VSW_VER_LT(ldcp, 1, 2) &&
2752 	    (lane_in->xfer_mode == VIO_DRING_MODE_V1_0))) {
2753 		D2(vswp, "%s: mode = VIO_DRING_MODE", __func__);
2754 		port->transmit = vsw_dringsend;
2755 	} else if (lane_in->xfer_mode == VIO_DESC_MODE) {
2756 		D2(vswp, "%s: mode = VIO_DESC_MODE", __func__);
2757 		vsw_create_privring(ldcp);
2758 		port->transmit = vsw_descrsend;
2759 		lane_out->xfer_mode = VIO_DESC_MODE;
2760 	}
2761 
2762 	/*
2763 	 * HybridIO is supported only vnet, not by OBP.
2764 	 * So, set hio_capable to true only when in DRING mode.
2765 	 */
2766 	if (VSW_VER_GTEQ(ldcp, 1, 3) &&
2767 	    (lane_in->xfer_mode != VIO_DESC_MODE)) {
2768 		(void) atomic_swap_32(&port->p_hio_capable, B_TRUE);
2769 	} else {
2770 		(void) atomic_swap_32(&port->p_hio_capable, B_FALSE);
2771 	}
2772 
2773 	mutex_exit(&port->tx_lock);
2774 
2775 	return (0);
2776 }
2777 
2778 static int
2779 vsw_process_attr_ack(vsw_ldc_t *ldcp, vnet_attr_msg_t *msg)
2780 {
2781 	vsw_t	*vswp = ldcp->ldc_vswp;
2782 	lane_t	*lane_out = &ldcp->lane_out;
2783 	lane_t	*lane_in = &ldcp->lane_in;
2784 
2785 	D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
2786 
2787 	if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV)) {
2788 		return (1);
2789 	}
2790 
2791 	/*
2792 	 * Process dring mode attribute.
2793 	 */
2794 	if (VSW_VER_GTEQ(ldcp, 1, 6)) {
2795 		/*
2796 		 * Versions >= 1.6:
2797 		 * The ack msg sent by the peer contains the negotiated dring
2798 		 * mode between our capability (that we had sent in our attr
2799 		 * info) and the peer's capability.
2800 		 */
2801 		if (lane_in->lstate & VSW_ATTR_ACK_SENT) {
2802 			/*
2803 			 * If we have sent an ack for the attr info msg from
2804 			 * the peer, check if the dring mode that was
2805 			 * negotiated then (saved in lane_out) matches the
2806 			 * mode that the peer has ack'd. If they don't match,
2807 			 * we fail the handshake.
2808 			 */
2809 			if (lane_out->dring_mode != msg->options) {
2810 				return (1);
2811 			}
2812 		} else {
2813 			if ((msg->options & lane_out->dring_mode) == 0) {
2814 				/*
2815 				 * Peer ack'd with a mode that we don't
2816 				 * support; we fail the handshake.
2817 				 */
2818 				return (1);
2819 			}
2820 			if ((msg->options & (VIO_TX_DRING|VIO_RX_DRING_DATA))
2821 			    == (VIO_TX_DRING|VIO_RX_DRING_DATA)) {
2822 				/*
2823 				 * Peer must ack with only one negotiated mode.
2824 				 * Otherwise fail handshake.
2825 				 */
2826 				return (1);
2827 			}
2828 
2829 			/*
2830 			 * Save the negotiated mode, so we can validate it when
2831 			 * we receive attr info from the peer.
2832 			 */
2833 			lane_out->dring_mode = msg->options;
2834 		}
2835 	}
2836 
2837 	/*
2838 	 * Process MTU attribute.
2839 	 */
2840 	if (VSW_VER_GTEQ(ldcp, 1, 4)) {
2841 		/*
2842 		 * Versions >= 1.4:
2843 		 * The ack msg sent by the peer contains the minimum of
2844 		 * our mtu (that we had sent in our attr info) and the
2845 		 * peer's mtu.
2846 		 *
2847 		 * If we have sent an ack for the attr info msg from
2848 		 * the peer, check if the mtu that was computed then
2849 		 * (saved in lane_out params) matches the mtu that the
2850 		 * peer has ack'd. If they don't match, we fail the
2851 		 * handshake.
2852 		 */
2853 		if (lane_in->lstate & VSW_ATTR_ACK_SENT) {
2854 			if (lane_out->mtu != msg->mtu) {
2855 				return (1);
2856 			}
2857 		} else {
2858 			/*
2859 			 * If the mtu ack'd by the peer is > our mtu
2860 			 * fail handshake. Otherwise, save the mtu, so
2861 			 * we can validate it when we receive attr info
2862 			 * from our peer.
2863 			 */
2864 			if (msg->mtu <= lane_out->mtu) {
2865 				lane_out->mtu = msg->mtu;
2866 			} else {
2867 				return (1);
2868 			}
2869 		}
2870 	}
2871 
2872 	return (0);
2873 }
2874 
2875 /*
2876  * Process an attribute packet. We can end up here either because our peer
2877  * has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or our
2878  * peer has sent us an attribute INFO message
2879  *
2880  * If its an ACK we then move to the next stage of the handshake which
2881  * is to send our descriptor ring info to our peer. If its a NACK then
2882  * there is nothing more we can (currently) do.
2883  *
2884  * If we get a valid/acceptable INFO packet (and we have already negotiated
2885  * a version) we ACK back and set channel state to ATTR_RECV, otherwise we
2886  * NACK back and reset channel state to INACTIV.
2887  *
2888  * FUTURE: in time we will probably negotiate over attributes, but for
2889  * the moment unacceptable attributes are regarded as a fatal error.
2890  *
2891  */
2892 void
2893 vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt)
2894 {
2895 	vnet_attr_msg_t	*attr_pkt;
2896 	vsw_t		*vswp = ldcp->ldc_vswp;
2897 	lane_t		*lane_out = &ldcp->lane_out;
2898 	lane_t		*lane_in = &ldcp->lane_in;
2899 	int		rv;
2900 
2901 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
2902 
2903 	/*
2904 	 * We know this is a ctrl/attr packet so
2905 	 * cast it into the correct structure.
2906 	 */
2907 	attr_pkt = (vnet_attr_msg_t *)pkt;
2908 
2909 	switch (attr_pkt->tag.vio_subtype) {
2910 	case VIO_SUBTYPE_INFO:
2911 
2912 		rv = vsw_process_attr_info(ldcp, attr_pkt);
2913 		if (rv != 0) {
2914 			vsw_free_lane_resources(ldcp, INBOUND);
2915 			attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2916 			ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT;
2917 		} else {
2918 			attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2919 			lane_in->lstate |= VSW_ATTR_ACK_SENT;
2920 		}
2921 		attr_pkt->tag.vio_sid = ldcp->local_session;
2922 		DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
2923 		(void) vsw_send_msg(ldcp, (void *)attr_pkt,
2924 		    sizeof (vnet_attr_msg_t), B_TRUE);
2925 		vsw_next_milestone(ldcp);
2926 		break;
2927 
2928 	case VIO_SUBTYPE_ACK:
2929 
2930 		rv = vsw_process_attr_ack(ldcp, attr_pkt);
2931 		if (rv != 0) {
2932 			return;
2933 		}
2934 		lane_out->lstate |= VSW_ATTR_ACK_RECV;
2935 		vsw_next_milestone(ldcp);
2936 		break;
2937 
2938 	case VIO_SUBTYPE_NACK:
2939 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
2940 
2941 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV))
2942 			return;
2943 
2944 		lane_out->lstate |= VSW_ATTR_NACK_RECV;
2945 		vsw_next_milestone(ldcp);
2946 		break;
2947 
2948 	default:
2949 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
2950 		    attr_pkt->tag.vio_subtype);
2951 	}
2952 
2953 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
2954 }
2955 
2956 static int
2957 vsw_process_dring_reg_info(vsw_ldc_t *ldcp, vio_msg_tag_t *tagp)
2958 {
2959 	int		rv;
2960 	vsw_t		*vswp = ldcp->ldc_vswp;
2961 	lane_t		*lp = &ldcp->lane_out;
2962 	dring_info_t	*dp = NULL;
2963 
2964 	D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2965 
2966 	rv = vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV);
2967 	if (rv != 0) {
2968 		return (1);
2969 	}
2970 
2971 	if (VSW_VER_GTEQ(ldcp, 1, 6) &&
2972 	    (lp->dring_mode != ((vio_dring_reg_msg_t *)tagp)->options)) {
2973 		/*
2974 		 * The earlier version of Solaris vnet driver doesn't set the
2975 		 * option (VIO_TX_DRING in its case) correctly in its dring reg
2976 		 * message. We workaround that here by doing the check only
2977 		 * for versions >= v1.6.
2978 		 */
2979 		DWARN(vswp, "%s(%lld): Rcvd dring reg option (%d), "
2980 		    "negotiated mode (%d)\n", __func__, ldcp->ldc_id,
2981 		    ((vio_dring_reg_msg_t *)tagp)->options, lp->dring_mode);
2982 		return (1);
2983 	}
2984 
2985 	/*
2986 	 * Map dring exported by the peer.
2987 	 */
2988 	dp = vsw_map_dring(ldcp, (void *)tagp);
2989 	if (dp == NULL) {
2990 		return (1);
2991 	}
2992 
2993 	/*
2994 	 * Map data buffers exported by the peer if we are in RxDringData mode.
2995 	 */
2996 	if (lp->dring_mode == VIO_RX_DRING_DATA) {
2997 		rv = vsw_map_data(ldcp, dp, (void *)tagp);
2998 		if (rv != 0) {
2999 			vsw_unmap_dring(ldcp);
3000 			return (1);
3001 		}
3002 	}
3003 
3004 	return (0);
3005 }
3006 
3007 static int
3008 vsw_process_dring_reg_ack(vsw_ldc_t *ldcp, vio_msg_tag_t *tagp)
3009 {
3010 	vsw_t		*vswp = ldcp->ldc_vswp;
3011 	dring_info_t	*dp;
3012 
3013 	D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3014 
3015 	if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV)) {
3016 		return (1);
3017 	}
3018 
3019 	dp = ldcp->lane_out.dringp;
3020 
3021 	/* save dring_ident acked by peer */
3022 	dp->ident = ((vio_dring_reg_msg_t *)tagp)->dring_ident;
3023 
3024 	return (0);
3025 }
3026 
3027 /*
3028  * Process a dring info packet. We can end up here either because our peer
3029  * has ACK/NACK'ed back to an earlier DRING msg we had sent it, or our
3030  * peer has sent us a dring INFO message.
3031  *
3032  * If we get a valid/acceptable INFO packet (and we have already negotiated
3033  * a version) we ACK back and update the lane state, otherwise we NACK back.
3034  *
3035  * FUTURE: nothing to stop client from sending us info on multiple dring's
3036  * but for the moment we will just use the first one we are given.
3037  *
3038  */
3039 void
3040 vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt)
3041 {
3042 	int		rv;
3043 	int		msgsize;
3044 	dring_info_t	*dp;
3045 	vio_msg_tag_t	*tagp = (vio_msg_tag_t *)pkt;
3046 	vsw_t		*vswp = ldcp->ldc_vswp;
3047 	lane_t		*lane_out = &ldcp->lane_out;
3048 	lane_t		*lane_in = &ldcp->lane_in;
3049 
3050 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
3051 
3052 	switch (tagp->vio_subtype) {
3053 	case VIO_SUBTYPE_INFO:
3054 		rv = vsw_process_dring_reg_info(ldcp, tagp);
3055 		if (rv != 0) {
3056 			vsw_free_lane_resources(ldcp, INBOUND);
3057 			tagp->vio_subtype = VIO_SUBTYPE_NACK;
3058 			lane_in->lstate |= VSW_DRING_NACK_SENT;
3059 		} else {
3060 			tagp->vio_subtype = VIO_SUBTYPE_ACK;
3061 			lane_in->lstate |= VSW_DRING_ACK_SENT;
3062 		}
3063 		tagp->vio_sid = ldcp->local_session;
3064 		DUMP_TAG_PTR(tagp);
3065 		if (lane_out->dring_mode == VIO_RX_DRING_DATA) {
3066 			dp = lane_in->dringp;
3067 			msgsize =
3068 			    VNET_DRING_REG_EXT_MSG_SIZE(dp->data_ncookies);
3069 		} else {
3070 			msgsize = sizeof (vio_dring_reg_msg_t);
3071 		}
3072 		(void) vsw_send_msg(ldcp, (void *)tagp, msgsize, B_TRUE);
3073 		vsw_next_milestone(ldcp);
3074 		break;
3075 
3076 	case VIO_SUBTYPE_ACK:
3077 		rv = vsw_process_dring_reg_ack(ldcp, tagp);
3078 		if (rv != 0) {
3079 			return;
3080 		}
3081 		lane_out->lstate |= VSW_DRING_ACK_RECV;
3082 		vsw_next_milestone(ldcp);
3083 		break;
3084 
3085 	case VIO_SUBTYPE_NACK:
3086 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3087 
3088 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV))
3089 			return;
3090 
3091 		lane_out->lstate |= VSW_DRING_NACK_RECV;
3092 		vsw_next_milestone(ldcp);
3093 		break;
3094 
3095 	default:
3096 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3097 		    tagp->vio_subtype);
3098 	}
3099 
3100 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
3101 }
3102 
3103 /*
3104  * Process a request from peer to unregister a dring.
3105  *
3106  * For the moment we just restart the handshake if our
3107  * peer endpoint attempts to unregister a dring.
3108  */
3109 void
3110 vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt)
3111 {
3112 	vsw_t			*vswp = ldcp->ldc_vswp;
3113 	vio_dring_unreg_msg_t	*dring_pkt;
3114 
3115 	/*
3116 	 * We know this is a ctrl/dring packet so
3117 	 * cast it into the correct structure.
3118 	 */
3119 	dring_pkt = (vio_dring_unreg_msg_t *)pkt;
3120 
3121 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3122 
3123 	switch (dring_pkt->tag.vio_subtype) {
3124 	case VIO_SUBTYPE_INFO:
3125 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3126 
3127 		DWARN(vswp, "%s: restarting handshake..", __func__);
3128 		break;
3129 
3130 	case VIO_SUBTYPE_ACK:
3131 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3132 
3133 		DWARN(vswp, "%s: restarting handshake..", __func__);
3134 		break;
3135 
3136 	case VIO_SUBTYPE_NACK:
3137 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3138 
3139 		DWARN(vswp, "%s: restarting handshake..", __func__);
3140 		break;
3141 
3142 	default:
3143 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3144 		    dring_pkt->tag.vio_subtype);
3145 	}
3146 
3147 	vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3148 
3149 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3150 }
3151 
/*
 * Turn the multicast message 'pkt' into a NACK carrying our local session
 * id and send it back over 'ldcp'. The result of the send is ignored.
 *
 * Wrapped in do { } while (0) with parenthesized arguments so that it
 * behaves as a single statement (safe under an unbraced if/else) and
 * accepts arbitrary pointer expressions. Invoke it with a trailing ';'.
 */
#define	SND_MCST_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		(void) vsw_send_msg((ldcp), (void *)(pkt), \
		    sizeof (vnet_mcast_msg_t), B_TRUE); \
	} while (0)
3157 
3158 /*
3159  * Process a multicast request from a vnet.
3160  *
3161  * Vnet's specify a multicast address that they are interested in. This
3162  * address is used as a key into the hash table which forms the multicast
3163  * forwarding database (mFDB).
3164  *
3165  * The table keys are the multicast addresses, while the table entries
3166  * are pointers to lists of ports which wish to receive packets for the
3167  * specified multicast address.
3168  *
3169  * When a multicast packet is being switched we use the address as a key
3170  * into the hash table, and then walk the appropriate port list forwarding
3171  * the pkt to each port in turn.
3172  *
3173  * If a vnet is no longer interested in a particular multicast grouping
3174  * we simply find the correct location in the hash table and then delete
3175  * the relevant port from the port list.
3176  *
3177  * To deal with the case whereby a port is being deleted without first
3178  * removing itself from the lists in the hash table, we maintain a list
3179  * of multicast addresses the port has registered an interest in, within
3180  * the port structure itself. We then simply walk that list of addresses
3181  * using them as keys into the hash table and remove the port from the
3182  * appropriate lists.
3183  */
3184 static void
3185 vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt)
3186 {
3187 	vnet_mcast_msg_t	*mcst_pkt;
3188 	vsw_port_t		*port = ldcp->ldc_port;
3189 	vsw_t			*vswp = ldcp->ldc_vswp;
3190 	int			i;
3191 
3192 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3193 
3194 	/*
3195 	 * We know this is a ctrl/mcast packet so
3196 	 * cast it into the correct structure.
3197 	 */
3198 	mcst_pkt = (vnet_mcast_msg_t *)pkt;
3199 
3200 	switch (mcst_pkt->tag.vio_subtype) {
3201 	case VIO_SUBTYPE_INFO:
3202 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3203 
3204 		/*
3205 		 * Check if in correct state to receive a multicast
3206 		 * message (i.e. handshake complete). If not reset
3207 		 * the handshake.
3208 		 */
3209 		if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV))
3210 			return;
3211 
3212 		/*
3213 		 * Before attempting to add or remove address check
3214 		 * that they are valid multicast addresses.
3215 		 * If not, then NACK back.
3216 		 */
3217 		for (i = 0; i < mcst_pkt->count; i++) {
3218 			if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) {
3219 				DERR(vswp, "%s: invalid multicast address",
3220 				    __func__);
3221 				SND_MCST_NACK(ldcp, mcst_pkt);
3222 				return;
3223 			}
3224 		}
3225 
3226 		/*
3227 		 * Now add/remove the addresses. If this fails we
3228 		 * NACK back.
3229 		 */
3230 		if (vsw_add_rem_mcst(mcst_pkt, port) != 0) {
3231 			SND_MCST_NACK(ldcp, mcst_pkt);
3232 			return;
3233 		}
3234 
3235 		mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3236 		mcst_pkt->tag.vio_sid = ldcp->local_session;
3237 
3238 		DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt);
3239 
3240 		(void) vsw_send_msg(ldcp, (void *)mcst_pkt,
3241 		    sizeof (vnet_mcast_msg_t), B_TRUE);
3242 		break;
3243 
3244 	case VIO_SUBTYPE_ACK:
3245 		DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3246 
3247 		/*
3248 		 * We shouldn't ever get a multicast ACK message as
3249 		 * at the moment we never request multicast addresses
3250 		 * to be set on some other device. This may change in
3251 		 * the future if we have cascading switches.
3252 		 */
3253 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV))
3254 			return;
3255 
3256 				/* Do nothing */
3257 		break;
3258 
3259 	case VIO_SUBTYPE_NACK:
3260 		DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3261 
3262 		/*
3263 		 * We shouldn't get a multicast NACK packet for the
3264 		 * same reasons as we shouldn't get a ACK packet.
3265 		 */
3266 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV))
3267 			return;
3268 
3269 				/* Do nothing */
3270 		break;
3271 
3272 	default:
3273 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
3274 		    mcst_pkt->tag.vio_subtype);
3275 	}
3276 
3277 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3278 }
3279 
3280 static void
3281 vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt)
3282 {
3283 	vio_rdx_msg_t	*rdx_pkt;
3284 	vsw_t		*vswp = ldcp->ldc_vswp;
3285 
3286 	/*
3287 	 * We know this is a ctrl/rdx packet so
3288 	 * cast it into the correct structure.
3289 	 */
3290 	rdx_pkt = (vio_rdx_msg_t *)pkt;
3291 
3292 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
3293 
3294 	switch (rdx_pkt->tag.vio_subtype) {
3295 	case VIO_SUBTYPE_INFO:
3296 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3297 
3298 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_INFO_RECV))
3299 			return;
3300 
3301 		rdx_pkt->tag.vio_sid = ldcp->local_session;
3302 		rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3303 
3304 		DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt);
3305 
3306 		ldcp->lane_out.lstate |= VSW_RDX_ACK_SENT;
3307 
3308 		(void) vsw_send_msg(ldcp, (void *)rdx_pkt,
3309 		    sizeof (vio_rdx_msg_t), B_TRUE);
3310 
3311 		vsw_next_milestone(ldcp);
3312 		break;
3313 
3314 	case VIO_SUBTYPE_ACK:
3315 		/*
3316 		 * Should be handled in-band by callback handler.
3317 		 */
3318 		DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__);
3319 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3320 		break;
3321 
3322 	case VIO_SUBTYPE_NACK:
3323 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3324 
3325 		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_NACK_RECV))
3326 			return;
3327 
3328 		ldcp->lane_in.lstate |= VSW_RDX_NACK_RECV;
3329 		vsw_next_milestone(ldcp);
3330 		break;
3331 
3332 	default:
3333 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3334 		    rdx_pkt->tag.vio_subtype);
3335 	}
3336 
3337 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3338 }
3339 
3340 static void
3341 vsw_process_physlink_msg(vsw_ldc_t *ldcp, void *pkt)
3342 {
3343 	vnet_physlink_msg_t	*msgp;
3344 	vsw_t			*vswp = ldcp->ldc_vswp;
3345 
3346 	msgp = (vnet_physlink_msg_t *)pkt;
3347 
3348 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
3349 
3350 	switch (msgp->tag.vio_subtype) {
3351 	case VIO_SUBTYPE_INFO:
3352 
3353 		/* vsw shouldn't recv physlink info */
3354 		DWARN(vswp, "%s: Unexpected VIO_SUBTYPE_INFO", __func__);
3355 		break;
3356 
3357 	case VIO_SUBTYPE_ACK:
3358 
3359 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3360 		break;
3361 
3362 	case VIO_SUBTYPE_NACK:
3363 
3364 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3365 		break;
3366 
3367 	default:
3368 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3369 		    msgp->tag.vio_subtype);
3370 	}
3371 
3372 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3373 }
3374 
3375 static void
3376 vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t *tagp,
3377 	uint32_t msglen)
3378 {
3379 	uint16_t	env = tagp->vio_subtype_env;
3380 	vsw_t		*vswp = ldcp->ldc_vswp;
3381 	lane_t		*lp = &ldcp->lane_out;
3382 	uint8_t		dring_mode = lp->dring_mode;
3383 
3384 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3385 
3386 	/* session id check */
3387 	if (ldcp->session_status & VSW_PEER_SESSION) {
3388 		if (ldcp->peer_session != tagp->vio_sid) {
3389 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
3390 			    __func__, ldcp->ldc_id, tagp->vio_sid);
3391 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3392 			return;
3393 		}
3394 	}
3395 
3396 	/*
3397 	 * It is an error for us to be getting data packets
3398 	 * before the handshake has completed.
3399 	 */
3400 	if (ldcp->hphase != VSW_MILESTONE4) {
3401 		DERR(vswp, "%s: got data packet before handshake complete "
3402 		    "hphase %d (%x: %x)", __func__, ldcp->hphase,
3403 		    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
3404 		DUMP_FLAGS(ldcp->lane_in.lstate);
3405 		DUMP_FLAGS(ldcp->lane_out.lstate);
3406 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3407 		return;
3408 	}
3409 	if (dring_mode == VIO_TX_DRING) {
3410 		/*
3411 		 * To reduce the locking contention, release the ldc_cblock
3412 		 * here and re-acquire it once we are done receiving packets.
3413 		 * We do this only in TxDring mode to allow further callbaks to
3414 		 * continue while the msg worker thread processes the messages.
3415 		 * In RxDringData mode, we process the messages in the callback
3416 		 * itself and wake up rcv worker thread to process only data
3417 		 * info messages.
3418 		 */
3419 		mutex_exit(&ldcp->ldc_cblock);
3420 		mutex_enter(&ldcp->ldc_rxlock);
3421 	}
3422 
3423 	/*
3424 	 * Switch on vio_subtype envelope, then let lower routines
3425 	 * decide if its an INFO, ACK or NACK packet.
3426 	 */
3427 	if (env == VIO_DRING_DATA) {
3428 		ldcp->rx_dringdata(ldcp, dpkt);
3429 	} else if (env == VIO_PKT_DATA) {
3430 		ldcp->rx_pktdata(ldcp, dpkt, msglen);
3431 	} else if (env == VIO_DESC_DATA) {
3432 		vsw_process_data_ibnd_pkt(ldcp, dpkt);
3433 	} else {
3434 		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n",
3435 		    __func__, env);
3436 	}
3437 
3438 	if (dring_mode == VIO_TX_DRING) {
3439 		mutex_exit(&ldcp->ldc_rxlock);
3440 		mutex_enter(&ldcp->ldc_cblock);
3441 	}
3442 
3443 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3444 }
3445 
/*
 * Dummy pkt data handler function for vnet protocol version 1.0.
 *
 * Takes the same arguments as vsw_process_pkt_data() but ignores all of
 * them and does nothing. The _NOTE(ARGUNUSED()) annotation marks the
 * parameters as intentionally unused.
 */
static void
vsw_process_pkt_data_nop(void *arg1, void *arg2, uint32_t msglen)
{
	_NOTE(ARGUNUSED(arg1, arg2, msglen))
}
3454 
3455 /*
3456  * This function handles raw pkt data messages received over the channel.
3457  * Currently, only priority-eth-type frames are received through this mechanism.
3458  * In this case, the frame(data) is present within the message itself which
3459  * is copied into an mblk before switching it.
3460  */
3461 static void
3462 vsw_process_pkt_data(void *arg1, void *arg2, uint32_t msglen)
3463 {
3464 	vsw_ldc_t		*ldcp = (vsw_ldc_t *)arg1;
3465 	vio_raw_data_msg_t	*dpkt = (vio_raw_data_msg_t *)arg2;
3466 	uint32_t		size;
3467 	mblk_t			*mp;
3468 	vio_mblk_t		*vmp;
3469 	vsw_t			*vswp = ldcp->ldc_vswp;
3470 	vgen_stats_t		*statsp = &ldcp->ldc_stats;
3471 	lane_t			*lp = &ldcp->lane_out;
3472 
3473 	size = msglen - VIO_PKT_DATA_HDRSIZE;
3474 	if (size < ETHERMIN || size > lp->mtu) {
3475 		(void) atomic_inc_32(&statsp->rx_pri_fail);
3476 		DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__,
3477 		    ldcp->ldc_id, size);
3478 		return;
3479 	}
3480 
3481 	vmp = vio_multipool_allocb(&ldcp->vmp, size + VLAN_TAGSZ);
3482 	if (vmp == NULL) {
3483 		mp = allocb(size + VLAN_TAGSZ, BPRI_MED);
3484 		if (mp == NULL) {
3485 			(void) atomic_inc_32(&statsp->rx_pri_fail);
3486 			DWARN(vswp, "%s(%lld) allocb failure, "
3487 			    "unable to process priority frame\n", __func__,
3488 			    ldcp->ldc_id);
3489 			return;
3490 		}
3491 	} else {
3492 		mp = vmp->mp;
3493 	}
3494 
3495 	/* skip over the extra space for vlan tag */
3496 	mp->b_rptr += VLAN_TAGSZ;
3497 
3498 	/* copy the frame from the payload of raw data msg into the mblk */
3499 	bcopy(dpkt->data, mp->b_rptr, size);
3500 	mp->b_wptr = mp->b_rptr + size;
3501 
3502 	if (vmp != NULL) {
3503 		vmp->state = VIO_MBLK_HAS_DATA;
3504 	}
3505 
3506 	/* update stats */
3507 	(void) atomic_inc_64(&statsp->rx_pri_packets);
3508 	(void) atomic_add_64(&statsp->rx_pri_bytes, size);
3509 
3510 	/*
3511 	 * VLAN_TAGSZ of extra space has been pre-alloc'd if tag is needed.
3512 	 */
3513 	(void) vsw_vlan_frame_pretag(ldcp->ldc_port, VSW_VNETPORT, mp);
3514 
3515 	/* switch the frame to destination */
3516 	vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT, ldcp->ldc_port, NULL);
3517 }
3518 
3519 /*
3520  * Process an in-band descriptor message (most likely from
3521  * OBP).
3522  */
3523 static void
3524 vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt)
3525 {
3526 	vnet_ibnd_desc_t	*ibnd_desc;
3527 	dring_info_t		*dp = NULL;
3528 	vsw_private_desc_t	*priv_addr = NULL;
3529 	vsw_t			*vswp = ldcp->ldc_vswp;
3530 	mblk_t			*mp = NULL;
3531 	size_t			nbytes = 0;
3532 	size_t			off = 0;
3533 	uint64_t		idx = 0;
3534 	uint32_t		num = 1, len, datalen = 0;
3535 	uint64_t		ncookies = 0;
3536 	int			i, rv;
3537 	int			j = 0;
3538 
3539 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3540 
3541 	ibnd_desc = (vnet_ibnd_desc_t *)pkt;
3542 
3543 	switch (ibnd_desc->hdr.tag.vio_subtype) {
3544 	case VIO_SUBTYPE_INFO:
3545 		D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3546 
3547 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
3548 			return;
3549 
3550 		/*
3551 		 * Data is padded to align on a 8 byte boundary,
3552 		 * nbytes is actual data length, i.e. minus that
3553 		 * padding.
3554 		 */
3555 		datalen = ibnd_desc->nbytes;
3556 
3557 		D2(vswp, "%s(%lld): processing inband desc : "
3558 		    ": datalen 0x%lx", __func__, ldcp->ldc_id, datalen);
3559 
3560 		ncookies = ibnd_desc->ncookies;
3561 
3562 		/*
3563 		 * allocb(9F) returns an aligned data block. We
3564 		 * need to ensure that we ask ldc for an aligned
3565 		 * number of bytes also.
3566 		 */
3567 		nbytes = datalen;
3568 		if (nbytes & 0x7) {
3569 			off = 8 - (nbytes & 0x7);
3570 			nbytes += off;
3571 		}
3572 
3573 		/* alloc extra space for VLAN_TAG */
3574 		mp = allocb(datalen + 8, BPRI_MED);
3575 		if (mp == NULL) {
3576 			DERR(vswp, "%s(%lld): allocb failed",
3577 			    __func__, ldcp->ldc_id);
3578 			ldcp->ldc_stats.rx_allocb_fail++;
3579 			return;
3580 		}
3581 
3582 		/* skip over the extra space for VLAN_TAG */
3583 		mp->b_rptr += 8;
3584 
3585 		rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr,
3586 		    0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies,
3587 		    LDC_COPY_IN);
3588 
3589 		if (rv != 0) {
3590 			DERR(vswp, "%s(%d): unable to copy in data from "
3591 			    "%d cookie(s)", __func__, ldcp->ldc_id, ncookies);
3592 			freemsg(mp);
3593 			ldcp->ldc_stats.ierrors++;
3594 			return;
3595 		}
3596 
3597 		D2(vswp, "%s(%d): copied in %ld bytes using %d cookies",
3598 		    __func__, ldcp->ldc_id, nbytes, ncookies);
3599 
3600 		/* point to the actual end of data */
3601 		mp->b_wptr = mp->b_rptr + datalen;
3602 		ldcp->ldc_stats.ipackets++;
3603 		ldcp->ldc_stats.rbytes += datalen;
3604 
3605 		/*
3606 		 * We ACK back every in-band descriptor message we process
3607 		 */
3608 		ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK;
3609 		ibnd_desc->hdr.tag.vio_sid = ldcp->local_session;
3610 		(void) vsw_send_msg(ldcp, (void *)ibnd_desc,
3611 		    sizeof (vnet_ibnd_desc_t), B_TRUE);
3612 
3613 		/*
3614 		 * there is extra space alloc'd for VLAN_TAG
3615 		 */
3616 		(void) vsw_vlan_frame_pretag(ldcp->ldc_port, VSW_VNETPORT, mp);
3617 
3618 		/* send the packet to be switched */
3619 		vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT,
3620 		    ldcp->ldc_port, NULL);
3621 
3622 		break;
3623 
3624 	case VIO_SUBTYPE_ACK:
3625 		D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3626 
3627 		/* Verify the ACK is valid */
3628 		idx = ibnd_desc->hdr.desc_handle;
3629 
3630 		if (idx >= vsw_num_descriptors) {
3631 			cmn_err(CE_WARN, "!vsw%d: corrupted ACK received "
3632 			    "(idx %ld)", vswp->instance, idx);
3633 			return;
3634 		}
3635 
3636 		if ((dp = ldcp->lane_out.dringp) == NULL) {
3637 			DERR(vswp, "%s: no dring found", __func__);
3638 			return;
3639 		}
3640 
3641 		len = dp->num_descriptors;
3642 		/*
3643 		 * If the descriptor we are being ACK'ed for is not the
3644 		 * one we expected, then pkts were lost somwhere, either
3645 		 * when we tried to send a msg, or a previous ACK msg from
3646 		 * our peer. In either case we now reclaim the descriptors
3647 		 * in the range from the last ACK we received up to the
3648 		 * current ACK.
3649 		 */
3650 		if (idx != dp->last_ack_recv) {
3651 			DWARN(vswp, "%s: dropped pkts detected, (%ld, %ld)",
3652 			    __func__, dp->last_ack_recv, idx);
3653 			num = idx >= dp->last_ack_recv ?
3654 			    idx - dp->last_ack_recv + 1:
3655 			    (len - dp->last_ack_recv + 1) + idx;
3656 		}
3657 
3658 		/*
3659 		 * When we sent the in-band message to our peer we
3660 		 * marked the copy in our private ring as READY. We now
3661 		 * check that the descriptor we are being ACK'ed for is in
3662 		 * fact READY, i.e. it is one we have shared with our peer.
3663 		 *
3664 		 * If its not we flag an error, but still reset the descr
3665 		 * back to FREE.
3666 		 */
3667 		for (i = dp->last_ack_recv; j < num; i = (i + 1) % len, j++) {
3668 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
3669 			mutex_enter(&priv_addr->dstate_lock);
3670 			if (priv_addr->dstate != VIO_DESC_READY) {
3671 				DERR(vswp, "%s: (%ld) desc at index %ld not "
3672 				    "READY (0x%lx)", __func__,
3673 				    ldcp->ldc_id, idx, priv_addr->dstate);
3674 				DERR(vswp, "%s: bound %d: ncookies %ld : "
3675 				    "datalen %ld", __func__,
3676 				    priv_addr->bound, priv_addr->ncookies,
3677 				    priv_addr->datalen);
3678 			}
3679 			D2(vswp, "%s: (%lld) freeing descp at %lld", __func__,
3680 			    ldcp->ldc_id, idx);
3681 			/* release resources associated with sent msg */
3682 			priv_addr->datalen = 0;
3683 			priv_addr->dstate = VIO_DESC_FREE;
3684 			mutex_exit(&priv_addr->dstate_lock);
3685 		}
3686 		/* update to next expected value */
3687 		dp->last_ack_recv = (idx + 1) % dp->num_descriptors;
3688 
3689 		break;
3690 
3691 	case VIO_SUBTYPE_NACK:
3692 		DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3693 
3694 		/*
3695 		 * We should only get a NACK if our peer doesn't like
3696 		 * something about a message we have sent it. If this
3697 		 * happens we just release the resources associated with
3698 		 * the message. (We are relying on higher layers to decide
3699 		 * whether or not to resend.
3700 		 */
3701 
3702 		/* limit check */
3703 		idx = ibnd_desc->hdr.desc_handle;
3704 
3705 		if (idx >= vsw_num_descriptors) {
3706 			DERR(vswp, "%s: corrupted NACK received (idx %lld)",
3707 			    __func__, idx);
3708 			return;
3709 		}
3710 
3711 		if ((dp = ldcp->lane_out.dringp) == NULL) {
3712 			DERR(vswp, "%s: no dring found", __func__);
3713 			return;
3714 		}
3715 
3716 		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
3717 
3718 		/* move to correct location in ring */
3719 		priv_addr += idx;
3720 
3721 		/* release resources associated with sent msg */
3722 		mutex_enter(&priv_addr->dstate_lock);
3723 		priv_addr->datalen = 0;
3724 		priv_addr->dstate = VIO_DESC_FREE;
3725 		mutex_exit(&priv_addr->dstate_lock);
3726 
3727 		break;
3728 
3729 	default:
3730 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
3731 		    ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype);
3732 	}
3733 
3734 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
3735 }
3736 
3737 static void
3738 vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t *tagp)
3739 {
3740 	_NOTE(ARGUNUSED(epkt))
3741 
3742 	vsw_t		*vswp = ldcp->ldc_vswp;
3743 	uint16_t	env = tagp->vio_subtype_env;
3744 
3745 	D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
3746 
3747 	/*
3748 	 * Error vio_subtypes have yet to be defined. So for
3749 	 * the moment we can't do anything.
3750 	 */
3751 	D2(vswp, "%s: (%x) vio_subtype env", __func__, env);
3752 
3753 	D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
3754 }
3755 
3756 /* transmit the packet over the given port */
3757 int
3758 vsw_portsend(vsw_port_t *port, mblk_t *mp)
3759 {
3760 	mblk_t		*mpt;
3761 	int		count;
3762 	vsw_ldc_t 	*ldcp = port->ldcp;
3763 	int		status = 0;
3764 
3765 	count = vsw_vlan_frame_untag(port, VSW_VNETPORT, &mp, &mpt);
3766 	if (count != 0) {
3767 		status = ldcp->tx(ldcp, mp, mpt, count);
3768 	}
3769 	return (status);
3770 }
3771 
3772 /*
3773  * Break up frames into 2 seperate chains: normal and
3774  * priority, based on the frame type. The number of
3775  * priority frames is also counted and returned.
3776  *
3777  * Params:
3778  * 	vswp:	pointer to the instance of vsw
3779  *	np:	head of packet chain to be broken
3780  *	npt:	tail of packet chain to be broken
3781  *
3782  * Returns:
3783  *	np:	head of normal data packets
3784  *	npt:	tail of normal data packets
3785  *	hp:	head of high priority packets
3786  *	hpt:	tail of high priority packets
3787  */
3788 static uint32_t
3789 vsw_get_pri_packets(vsw_t *vswp, mblk_t **np, mblk_t **npt,
3790 	mblk_t **hp, mblk_t **hpt)
3791 {
3792 	mblk_t			*tmp = NULL;
3793 	mblk_t			*smp = NULL;
3794 	mblk_t			*hmp = NULL;	/* high prio pkts head */
3795 	mblk_t			*hmpt = NULL;	/* high prio pkts tail */
3796 	mblk_t			*nmp = NULL;	/* normal pkts head */
3797 	mblk_t			*nmpt = NULL;	/* normal pkts tail */
3798 	uint32_t		count = 0;
3799 	int			i;
3800 	struct ether_header	*ehp;
3801 	uint32_t		num_types;
3802 	uint16_t		*types;
3803 
3804 	tmp = *np;
3805 	while (tmp != NULL) {
3806 
3807 		smp = tmp;
3808 		tmp = tmp->b_next;
3809 		smp->b_next = NULL;
3810 		smp->b_prev = NULL;
3811 
3812 		ehp = (struct ether_header *)smp->b_rptr;
3813 		num_types = vswp->pri_num_types;
3814 		types = vswp->pri_types;
3815 		for (i = 0; i < num_types; i++) {
3816 			if (ehp->ether_type == types[i]) {
3817 				/* high priority frame */
3818 
3819 				if (hmp != NULL) {
3820 					hmpt->b_next = smp;
3821 					hmpt = smp;
3822 				} else {
3823 					hmp = hmpt = smp;
3824 				}
3825 				count++;
3826 				break;
3827 			}
3828 		}
3829 		if (i == num_types) {
3830 			/* normal data frame */
3831 
3832 			if (nmp != NULL) {
3833 				nmpt->b_next = smp;
3834 				nmpt = smp;
3835 			} else {
3836 				nmp = nmpt = smp;
3837 			}
3838 		}
3839 	}
3840 
3841 	*hp = hmp;
3842 	*hpt = hmpt;
3843 	*np = nmp;
3844 	*npt = nmpt;
3845 
3846 	return (count);
3847 }
3848 
3849 /*
3850  * Wrapper function to transmit normal and/or priority frames over the channel.
3851  */
3852 static int
3853 vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count)
3854 {
3855 	vsw_ldc_t 		*ldcp = (vsw_ldc_t *)arg;
3856 	mblk_t			*tmp;
3857 	mblk_t			*smp;
3858 	mblk_t			*hmp;	/* high prio pkts head */
3859 	mblk_t			*hmpt;	/* high prio pkts tail */
3860 	mblk_t			*nmp;	/* normal pkts head */
3861 	mblk_t			*nmpt;	/* normal pkts tail */
3862 	uint32_t		n = 0;
3863 	vsw_t			*vswp = ldcp->ldc_vswp;
3864 
3865 	ASSERT(VSW_PRI_ETH_DEFINED(vswp));
3866 	ASSERT(count != 0);
3867 
3868 	nmp = mp;
3869 	nmpt = mpt;
3870 
3871 	/* gather any priority frames from the chain of packets */
3872 	n = vsw_get_pri_packets(vswp, &nmp, &nmpt, &hmp, &hmpt);
3873 
3874 	/* transmit priority frames */
3875 	tmp = hmp;
3876 	while (tmp != NULL) {
3877 		smp = tmp;
3878 		tmp = tmp->b_next;
3879 		smp->b_next = NULL;
3880 		vsw_ldcsend_pkt(ldcp, smp);
3881 	}
3882 
3883 	count -= n;
3884 
3885 	if (count == 0) {
3886 		/* no normal data frames to process */
3887 		return (0);
3888 	}
3889 
3890 	return (vsw_ldctx(ldcp, nmp, nmpt, count));
3891 }
3892 
3893 /*
3894  * Wrapper function to transmit normal frames over the channel.
3895  */
3896 static int
3897 vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count)
3898 {
3899 	vsw_ldc_t 	*ldcp = (vsw_ldc_t *)arg;
3900 	mblk_t		*tmp = NULL;
3901 
3902 	ASSERT(count != 0);
3903 	/*
3904 	 * If the TX thread is enabled, then queue the
3905 	 * ordinary frames and signal the tx thread.
3906 	 */
3907 	if (ldcp->tx_thread != NULL) {
3908 
3909 		mutex_enter(&ldcp->tx_thr_lock);
3910 
3911 		if ((ldcp->tx_cnt + count) >= vsw_max_tx_qcount) {
3912 			/*
3913 			 * If we reached queue limit,
3914 			 * do not queue new packets,
3915 			 * drop them.
3916 			 */
3917 			ldcp->ldc_stats.tx_qfull += count;
3918 			mutex_exit(&ldcp->tx_thr_lock);
3919 			freemsgchain(mp);
3920 			goto exit;
3921 		}
3922 		if (ldcp->tx_mhead == NULL) {
3923 			ldcp->tx_mhead = mp;
3924 			ldcp->tx_mtail = mpt;
3925 			cv_signal(&ldcp->tx_thr_cv);
3926 		} else {
3927 			ldcp->tx_mtail->b_next = mp;
3928 			ldcp->tx_mtail = mpt;
3929 		}
3930 		ldcp->tx_cnt += count;
3931 		mutex_exit(&ldcp->tx_thr_lock);
3932 	} else {
3933 		while (mp != NULL) {
3934 			tmp = mp->b_next;
3935 			mp->b_next = mp->b_prev = NULL;
3936 			(void) vsw_ldcsend(ldcp, mp, 1);
3937 			mp = tmp;
3938 		}
3939 	}
3940 
3941 exit:
3942 	return (0);
3943 }
3944 
3945 /*
3946  * This function transmits the frame in the payload of a raw data
3947  * (VIO_PKT_DATA) message. Thus, it provides an Out-Of-Band path to
3948  * send special frames with high priorities, without going through
3949  * the normal data path which uses descriptor ring mechanism.
3950  */
3951 static void
3952 vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp)
3953 {
3954 	vio_raw_data_msg_t	*pkt;
3955 	mblk_t			*bp;
3956 	mblk_t			*nmp = NULL;
3957 	vio_mblk_t		*vmp;
3958 	caddr_t			dst;
3959 	uint32_t		mblksz;
3960 	uint32_t		size;
3961 	uint32_t		nbytes;
3962 	int			rv;
3963 	vsw_t			*vswp = ldcp->ldc_vswp;
3964 	vgen_stats_t		*statsp = &ldcp->ldc_stats;
3965 
3966 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
3967 	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
3968 		(void) atomic_inc_32(&statsp->tx_pri_fail);
3969 		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
3970 		    "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
3971 		    ldcp->lane_out.lstate);
3972 		goto send_pkt_exit;
3973 	}
3974 
3975 	size = msgsize(mp);
3976 
3977 	/* frame size bigger than available payload len of raw data msg ? */
3978 	if (size > (size_t)(ldcp->msglen - VIO_PKT_DATA_HDRSIZE)) {
3979 		(void) atomic_inc_32(&statsp->tx_pri_fail);
3980 		DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__,
3981 		    ldcp->ldc_id, size);
3982 		goto send_pkt_exit;
3983 	}
3984 
3985 	if (size < ETHERMIN)
3986 		size = ETHERMIN;
3987 
3988 	/* alloc space for a raw data message */
3989 	vmp = vio_allocb(vswp->pri_tx_vmp);
3990 	if (vmp == NULL) {
3991 		(void) atomic_inc_32(&statsp->tx_pri_fail);
3992 		DWARN(vswp, "vio_allocb failed\n");
3993 		goto send_pkt_exit;
3994 	} else {
3995 		nmp = vmp->mp;
3996 	}
3997 	pkt = (vio_raw_data_msg_t *)nmp->b_rptr;
3998 
3999 	/* copy frame into the payload of raw data message */
4000 	dst = (caddr_t)pkt->data;
4001 	for (bp = mp; bp != NULL; bp = bp->b_cont) {
4002 		mblksz = MBLKL(bp);
4003 		bcopy(bp->b_rptr, dst, mblksz);
4004 		dst += mblksz;
4005 	}
4006 
4007 	vmp->state = VIO_MBLK_HAS_DATA;
4008 
4009 	/* setup the raw data msg */
4010 	pkt->tag.vio_msgtype = VIO_TYPE_DATA;
4011 	pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
4012 	pkt->tag.vio_subtype_env = VIO_PKT_DATA;
4013 	pkt->tag.vio_sid = ldcp->local_session;
4014 	nbytes = VIO_PKT_DATA_HDRSIZE + size;
4015 
4016 	/* send the msg over ldc */
4017 	rv = vsw_send_msg(ldcp, (void *)pkt, nbytes, B_TRUE);
4018 	if (rv != 0) {
4019 		(void) atomic_inc_32(&statsp->tx_pri_fail);
4020 		DWARN(vswp, "%s(%lld) Error sending priority frame\n", __func__,
4021 		    ldcp->ldc_id);
4022 		goto send_pkt_exit;
4023 	}
4024 
4025 	/* update stats */
4026 	(void) atomic_inc_64(&statsp->tx_pri_packets);
4027 	(void) atomic_add_64(&statsp->tx_pri_packets, size);
4028 
4029 send_pkt_exit:
4030 	if (nmp != NULL)
4031 		freemsg(nmp);
4032 	freemsg(mp);
4033 }
4034 
4035 /*
4036  * Transmit the packet over the given LDC channel.
4037  *
4038  * The 'retries' argument indicates how many times a packet
4039  * is retried before it is dropped. Note, the retry is done
4040  * only for a resource related failure, for all other failures
4041  * the packet is dropped immediately.
4042  */
4043 static int
4044 vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries)
4045 {
4046 	int		i;
4047 	int		rc;
4048 	int		status = 0;
4049 	vsw_port_t	*port = ldcp->ldc_port;
4050 	dring_info_t	*dp = NULL;
4051 	lane_t		*lp = &ldcp->lane_out;
4052 
4053 	for (i = 0; i < retries; ) {
4054 		/*
4055 		 * Send the message out using the appropriate
4056 		 * transmit function which will free mblock when it
4057 		 * is finished with it.
4058 		 */
4059 		mutex_enter(&port->tx_lock);
4060 		if (port->transmit != NULL) {
4061 			status = (*port->transmit)(ldcp, mp);
4062 		}
4063 		if (status == LDC_TX_SUCCESS) {
4064 			mutex_exit(&port->tx_lock);
4065 			break;
4066 		}
4067 		i++;	/* increment the counter here */
4068 
4069 		/* If its the last retry, then update the oerror */
4070 		if ((i == retries) && (status == LDC_TX_NORESOURCES)) {
4071 			ldcp->ldc_stats.oerrors++;
4072 		}
4073 		mutex_exit(&port->tx_lock);
4074 
4075 		if (status != LDC_TX_NORESOURCES) {
4076 			/*
4077 			 * No retrying required for errors un-related
4078 			 * to resources.
4079 			 */
4080 			break;
4081 		}
4082 		if (((dp = ldcp->lane_out.dringp) != NULL) &&
4083 		    ((VSW_VER_GTEQ(ldcp, 1, 2) &&
4084 		    (ldcp->lane_out.xfer_mode & VIO_DRING_MODE_V1_2)) ||
4085 		    ((VSW_VER_LT(ldcp, 1, 2) &&
4086 		    (ldcp->lane_out.xfer_mode == VIO_DRING_MODE_V1_0))))) {
4087 
4088 			/* Need to reclaim in TxDring mode. */
4089 			if (lp->dring_mode == VIO_TX_DRING) {
4090 				rc = vsw_reclaim_dring(dp, dp->end_idx);
4091 			}
4092 
4093 		} else {
4094 			/*
4095 			 * If there is no dring or the xfer_mode is
4096 			 * set to DESC_MODE(ie., OBP), then simply break here.
4097 			 */
4098 			break;
4099 		}
4100 
4101 		/*
4102 		 * Delay only if none were reclaimed
4103 		 * and its not the last retry.
4104 		 */
4105 		if ((rc == 0) && (i < retries)) {
4106 			delay(drv_usectohz(vsw_ldc_tx_delay));
4107 		}
4108 	}
4109 	freemsg(mp);
4110 	return (status);
4111 }
4112 
4113 /*
4114  * Send an in-band descriptor message over ldc.
4115  */
4116 static int
4117 vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp)
4118 {
4119 	vsw_t			*vswp = ldcp->ldc_vswp;
4120 	vnet_ibnd_desc_t	ibnd_msg;
4121 	vsw_private_desc_t	*priv_desc = NULL;
4122 	dring_info_t		*dp = NULL;
4123 	size_t			n, size = 0;
4124 	caddr_t			bufp;
4125 	mblk_t			*bp;
4126 	int			idx, i;
4127 	int			status = LDC_TX_SUCCESS;
4128 	static int		warn_msg = 1;
4129 	lane_t			*lp = &ldcp->lane_out;
4130 
4131 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4132 
4133 	ASSERT(mp != NULL);
4134 
4135 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
4136 	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
4137 		DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt",
4138 		    __func__, ldcp->ldc_id, ldcp->ldc_status,
4139 		    ldcp->lane_out.lstate);
4140 		ldcp->ldc_stats.oerrors++;
4141 		return (LDC_TX_FAILURE);
4142 	}
4143 
4144 	/*
4145 	 * The dring here is as an internal buffer,
4146 	 * rather than a transfer channel.
4147 	 */
4148 	if ((dp = ldcp->lane_out.dringp) == NULL) {
4149 		DERR(vswp, "%s(%lld): no dring for outbound lane",
4150 		    __func__, ldcp->ldc_id);
4151 		DERR(vswp, "%s(%lld) status(%d) state (0x%llx)", __func__,
4152 		    ldcp->ldc_id, ldcp->ldc_status, ldcp->lane_out.lstate);
4153 		ldcp->ldc_stats.oerrors++;
4154 		return (LDC_TX_FAILURE);
4155 	}
4156 
4157 	size = msgsize(mp);
4158 	if (size > (size_t)lp->mtu) {
4159 		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
4160 		    ldcp->ldc_id, size);
4161 		ldcp->ldc_stats.oerrors++;
4162 		return (LDC_TX_FAILURE);
4163 	}
4164 
4165 	/*
4166 	 * Find a free descriptor in our buffer ring
4167 	 */
4168 	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
4169 		if (warn_msg) {
4170 			DERR(vswp, "%s(%lld): no descriptor available for ring "
4171 			    "at 0x%llx", __func__, ldcp->ldc_id, dp);
4172 			warn_msg = 0;
4173 		}
4174 
4175 		/* nothing more we can do */
4176 		status = LDC_TX_NORESOURCES;
4177 		goto vsw_descrsend_free_exit;
4178 	} else {
4179 		D2(vswp, "%s(%lld): free private descriptor found at pos "
4180 		    "%ld addr 0x%x\n", __func__, ldcp->ldc_id, idx, priv_desc);
4181 		warn_msg = 1;
4182 	}
4183 
4184 	/* copy data into the descriptor */
4185 	bufp = priv_desc->datap;
4186 	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
4187 		n = MBLKL(bp);
4188 		bcopy(bp->b_rptr, bufp, n);
4189 		bufp += n;
4190 	}
4191 
4192 	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
4193 
4194 	/* create and send the in-band descp msg */
4195 	ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA;
4196 	ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO;
4197 	ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA;
4198 	ibnd_msg.hdr.tag.vio_sid = ldcp->local_session;
4199 
4200 	/*
4201 	 * Copy the mem cookies describing the data from the
4202 	 * private region of the descriptor ring into the inband
4203 	 * descriptor.
4204 	 */
4205 	for (i = 0; i < priv_desc->ncookies; i++) {
4206 		bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i],
4207 		    sizeof (ldc_mem_cookie_t));
4208 	}
4209 
4210 	ibnd_msg.hdr.desc_handle = idx;
4211 	ibnd_msg.ncookies = priv_desc->ncookies;
4212 	ibnd_msg.nbytes = size;
4213 
4214 	ldcp->ldc_stats.opackets++;
4215 	ldcp->ldc_stats.obytes += size;
4216 
4217 	(void) vsw_send_msg(ldcp, (void *)&ibnd_msg,
4218 	    sizeof (vnet_ibnd_desc_t), B_TRUE);
4219 
4220 vsw_descrsend_free_exit:
4221 
4222 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4223 	return (status);
4224 }
4225 
4226 static void
4227 vsw_send_ver(void *arg)
4228 {
4229 	vsw_ldc_t	*ldcp = (vsw_ldc_t *)arg;
4230 	vsw_t		*vswp = ldcp->ldc_vswp;
4231 	lane_t		*lp = &ldcp->lane_out;
4232 	vio_ver_msg_t	ver_msg;
4233 
4234 	D1(vswp, "%s enter", __func__);
4235 
4236 	ver_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4237 	ver_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4238 	ver_msg.tag.vio_subtype_env = VIO_VER_INFO;
4239 	ver_msg.tag.vio_sid = ldcp->local_session;
4240 
4241 	if (vsw_obp_ver_proto_workaround == B_FALSE) {
4242 		ver_msg.ver_major = vsw_versions[0].ver_major;
4243 		ver_msg.ver_minor = vsw_versions[0].ver_minor;
4244 	} else {
4245 		/* use the major,minor that we've ack'd */
4246 		lane_t	*lpi = &ldcp->lane_in;
4247 		ver_msg.ver_major = lpi->ver_major;
4248 		ver_msg.ver_minor = lpi->ver_minor;
4249 	}
4250 	ver_msg.dev_class = VDEV_NETWORK_SWITCH;
4251 
4252 	lp->lstate |= VSW_VER_INFO_SENT;
4253 	lp->ver_major = ver_msg.ver_major;
4254 	lp->ver_minor = ver_msg.ver_minor;
4255 
4256 	DUMP_TAG(ver_msg.tag);
4257 
4258 	(void) vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t), B_TRUE);
4259 
4260 	D1(vswp, "%s (%d): exit", __func__, ldcp->ldc_id);
4261 }
4262 
4263 static void
4264 vsw_send_attr(vsw_ldc_t *ldcp)
4265 {
4266 	vsw_t			*vswp = ldcp->ldc_vswp;
4267 	lane_t			*lp = &ldcp->lane_out;
4268 	vnet_attr_msg_t		attr_msg;
4269 
4270 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
4271 
4272 	/*
4273 	 * Subtype is set to INFO by default
4274 	 */
4275 	attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4276 	attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4277 	attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO;
4278 	attr_msg.tag.vio_sid = ldcp->local_session;
4279 
4280 	/* payload copied from default settings for lane */
4281 	attr_msg.mtu = lp->mtu;
4282 	attr_msg.addr_type = lp->addr_type;
4283 	attr_msg.xfer_mode = lp->xfer_mode;
4284 	attr_msg.ack_freq = lp->xfer_mode;
4285 	attr_msg.options = lp->dring_mode;
4286 
4287 	READ_ENTER(&vswp->if_lockrw);
4288 	attr_msg.addr = vnet_macaddr_strtoul((vswp->if_addr).ether_addr_octet);
4289 	RW_EXIT(&vswp->if_lockrw);
4290 
4291 	ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT;
4292 
4293 	DUMP_TAG(attr_msg.tag);
4294 
4295 	(void) vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t), B_TRUE);
4296 
4297 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
4298 }
4299 
4300 static void
4301 vsw_send_dring_info(vsw_ldc_t *ldcp)
4302 {
4303 	int		msgsize;
4304 	void		*msg;
4305 	vsw_t		*vswp = ldcp->ldc_vswp;
4306 	vsw_port_t	*port = ldcp->ldc_port;
4307 	lane_t		*lp = &ldcp->lane_out;
4308 	vgen_stats_t	*statsp = &ldcp->ldc_stats;
4309 
4310 	D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id);
4311 
4312 	/* dring mode has been negotiated in attr phase; save in stats */
4313 	statsp->dring_mode = lp->dring_mode;
4314 
4315 	if (lp->dring_mode == VIO_RX_DRING_DATA) {
4316 		/*
4317 		 * Change the transmit routine for RxDringData mode.
4318 		 */
4319 		port->transmit = vsw_dringsend_shm;
4320 		msg = (void *) vsw_create_rx_dring_info(ldcp);
4321 		if (msg == NULL) {
4322 			return;
4323 		}
4324 		msgsize =
4325 		    VNET_DRING_REG_EXT_MSG_SIZE(lp->dringp->data_ncookies);
4326 		ldcp->rcv_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
4327 		    vsw_ldc_rcv_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
4328 		ldcp->rx_dringdata = vsw_process_dringdata_shm;
4329 	} else {
4330 		msg = (void *) vsw_create_tx_dring_info(ldcp);
4331 		if (msg == NULL) {
4332 			return;
4333 		}
4334 		msgsize = sizeof (vio_dring_reg_msg_t);
4335 		ldcp->msg_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
4336 		    vsw_ldc_msg_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
4337 		ldcp->rx_dringdata = vsw_process_dringdata;
4338 	}
4339 
4340 	lp->lstate |= VSW_DRING_INFO_SENT;
4341 	DUMP_TAG_PTR((vio_msg_tag_t *)msg);
4342 	(void) vsw_send_msg(ldcp, msg, msgsize, B_TRUE);
4343 	kmem_free(msg, msgsize);
4344 
4345 	D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id);
4346 }
4347 
4348 static void
4349 vsw_send_rdx(vsw_ldc_t *ldcp)
4350 {
4351 	vsw_t		*vswp = ldcp->ldc_vswp;
4352 	vio_rdx_msg_t	rdx_msg;
4353 
4354 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
4355 
4356 	rdx_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4357 	rdx_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4358 	rdx_msg.tag.vio_subtype_env = VIO_RDX;
4359 	rdx_msg.tag.vio_sid = ldcp->local_session;
4360 
4361 	ldcp->lane_in.lstate |= VSW_RDX_INFO_SENT;
4362 
4363 	DUMP_TAG(rdx_msg.tag);
4364 
4365 	(void) vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t), B_TRUE);
4366 
4367 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
4368 }
4369 
4370 /*
4371  * Remove the specified address from the list of address maintained
4372  * in this port node.
4373  */
4374 mcst_addr_t *
4375 vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr)
4376 {
4377 	vsw_t		*vswp = NULL;
4378 	vsw_port_t	*port = NULL;
4379 	mcst_addr_t	*prev_p = NULL;
4380 	mcst_addr_t	*curr_p = NULL;
4381 
4382 	D1(NULL, "%s: enter : devtype %d : addr 0x%llx",
4383 	    __func__, devtype, addr);
4384 
4385 	if (devtype == VSW_VNETPORT) {
4386 		port = (vsw_port_t *)arg;
4387 		mutex_enter(&port->mca_lock);
4388 		prev_p = curr_p = port->mcap;
4389 	} else {
4390 		vswp = (vsw_t *)arg;
4391 		mutex_enter(&vswp->mca_lock);
4392 		prev_p = curr_p = vswp->mcap;
4393 	}
4394 
4395 	while (curr_p != NULL) {
4396 		if (curr_p->addr == addr) {
4397 			D2(NULL, "%s: address found", __func__);
4398 			/* match found */
4399 			if (prev_p == curr_p) {
4400 				/* list head */
4401 				if (devtype == VSW_VNETPORT)
4402 					port->mcap = curr_p->nextp;
4403 				else
4404 					vswp->mcap = curr_p->nextp;
4405 			} else {
4406 				prev_p->nextp = curr_p->nextp;
4407 			}
4408 			break;
4409 		} else {
4410 			prev_p = curr_p;
4411 			curr_p = curr_p->nextp;
4412 		}
4413 	}
4414 
4415 	if (devtype == VSW_VNETPORT)
4416 		mutex_exit(&port->mca_lock);
4417 	else
4418 		mutex_exit(&vswp->mca_lock);
4419 
4420 	D1(NULL, "%s: exit", __func__);
4421 
4422 	return (curr_p);
4423 }
4424 
4425 /*
4426  * Create a ring consisting of just a private portion and link
4427  * it into the list of rings for the outbound lane.
4428  *
4429  * These type of rings are used primarily for temporary data
4430  * storage (i.e. as data buffers).
4431  */
4432 void
4433 vsw_create_privring(vsw_ldc_t *ldcp)
4434 {
4435 	dring_info_t		*dp;
4436 	vsw_t			*vswp = ldcp->ldc_vswp;
4437 
4438 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4439 
4440 	dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
4441 	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
4442 	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
4443 	ldcp->lane_out.dringp = dp;
4444 
4445 	/* no public section */
4446 	dp->pub_addr = NULL;
4447 	dp->priv_addr = kmem_zalloc(
4448 	    (sizeof (vsw_private_desc_t) * vsw_num_descriptors), KM_SLEEP);
4449 	dp->num_descriptors = vsw_num_descriptors;
4450 
4451 	if (vsw_setup_tx_dring(ldcp, dp)) {
4452 		DERR(vswp, "%s: setup of ring failed", __func__);
4453 		vsw_destroy_tx_dring(ldcp);
4454 		return;
4455 	}
4456 
4457 	/* haven't used any descriptors yet */
4458 	dp->end_idx = 0;
4459 	dp->restart_reqd = B_TRUE;
4460 
4461 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4462 }
4463 
4464 /*
4465  * Set the default lane attributes. These are copied into
4466  * the attr msg we send to our peer. If they are not acceptable
4467  * then (currently) the handshake ends.
4468  */
4469 static void
4470 vsw_set_lane_attr(vsw_t *vswp, lane_t *lp)
4471 {
4472 	bzero(lp, sizeof (lane_t));
4473 
4474 	READ_ENTER(&vswp->if_lockrw);
4475 	ether_copy(&(vswp->if_addr), &(lp->addr));
4476 	RW_EXIT(&vswp->if_lockrw);
4477 
4478 	lp->mtu = vswp->max_frame_size;
4479 	lp->addr_type = ADDR_TYPE_MAC;
4480 	lp->xfer_mode = VIO_DRING_MODE_V1_0;
4481 	lp->ack_freq = 0;	/* for shared mode */
4482 	lp->seq_num = VNET_ISS;
4483 }
4484 
4485 /*
4486  * Map the descriptor ring exported by the peer.
4487  */
4488 static dring_info_t *
4489 vsw_map_dring(vsw_ldc_t *ldcp, void *pkt)
4490 {
4491 	dring_info_t	*dp = NULL;
4492 	lane_t		*lp = &ldcp->lane_out;
4493 
4494 	if (lp->dring_mode == VIO_RX_DRING_DATA) {
4495 		/*
4496 		 * In RxDringData mode, dring that we map in
4497 		 * becomes our transmit descriptor ring.
4498 		 */
4499 		dp =  vsw_map_tx_dring(ldcp, pkt);
4500 	} else {
4501 		/*
4502 		 * In TxDring mode, dring that we map in
4503 		 * becomes our receive descriptor ring.
4504 		 */
4505 		dp =  vsw_map_rx_dring(ldcp, pkt);
4506 	}
4507 	return (dp);
4508 }
4509 
4510 /*
4511  * Common dring mapping function used in both TxDring and RxDringData modes.
4512  */
4513 dring_info_t *
4514 vsw_map_dring_cmn(vsw_ldc_t *ldcp, vio_dring_reg_msg_t *dring_pkt)
4515 {
4516 	int		rv;
4517 	dring_info_t	*dp;
4518 	ldc_mem_info_t	minfo;
4519 	vsw_t		*vswp = ldcp->ldc_vswp;
4520 
4521 	/*
4522 	 * If the dring params are unacceptable then we NACK back.
4523 	 */
4524 	if ((dring_pkt->num_descriptors == 0) ||
4525 	    (dring_pkt->descriptor_size == 0) ||
4526 	    (dring_pkt->ncookies != 1)) {
4527 		DERR(vswp, "%s (%lld): invalid dring info",
4528 		    __func__, ldcp->ldc_id);
4529 		return (NULL);
4530 	}
4531 
4532 	dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
4533 
4534 	dp->num_descriptors = dring_pkt->num_descriptors;
4535 	dp->descriptor_size = dring_pkt->descriptor_size;
4536 	dp->options = dring_pkt->options;
4537 	dp->dring_ncookies = dring_pkt->ncookies;
4538 
4539 	/*
4540 	 * Note: should only get one cookie. Enforced in
4541 	 * the ldc layer.
4542 	 */
4543 	bcopy(&dring_pkt->cookie[0], &dp->dring_cookie[0],
4544 	    sizeof (ldc_mem_cookie_t));
4545 
4546 	rv = ldc_mem_dring_map(ldcp->ldc_handle, &dp->dring_cookie[0],
4547 	    dp->dring_ncookies, dp->num_descriptors, dp->descriptor_size,
4548 	    LDC_DIRECT_MAP, &(dp->dring_handle));
4549 	if (rv != 0) {
4550 		goto fail;
4551 	}
4552 
4553 	rv = ldc_mem_dring_info(dp->dring_handle, &minfo);
4554 	if (rv != 0) {
4555 		goto fail;
4556 	}
4557 	/* store the address of the ring */
4558 	dp->pub_addr = minfo.vaddr;
4559 
4560 	/* cache the dring mtype */
4561 	dp->dring_mtype = minfo.mtype;
4562 
4563 	/* no private section as we are importing */
4564 	dp->priv_addr = NULL;
4565 
4566 	/*
4567 	 * Using simple mono increasing int for ident at the moment.
4568 	 */
4569 	dp->ident = ldcp->next_ident;
4570 	ldcp->next_ident++;
4571 
4572 	/*
4573 	 * Acknowledge it; we send back a unique dring identifier that
4574 	 * the sending side will use in future to refer to this
4575 	 * descriptor ring.
4576 	 */
4577 	dring_pkt->dring_ident = dp->ident;
4578 
4579 	return (dp);
4580 fail:
4581 	if (dp->dring_handle != NULL) {
4582 		(void) ldc_mem_dring_unmap(dp->dring_handle);
4583 	}
4584 	kmem_free(dp, sizeof (*dp));
4585 	return (NULL);
4586 }
4587 
4588 /*
4589  * Unmap the descriptor ring exported by the peer.
4590  */
4591 static void
4592 vsw_unmap_dring(vsw_ldc_t *ldcp)
4593 {
4594 	lane_t	*lane_out = &ldcp->lane_out;
4595 
4596 	if (lane_out->dring_mode == VIO_RX_DRING_DATA) {
4597 		vsw_unmap_tx_dring(ldcp);
4598 	} else {
4599 		vsw_unmap_rx_dring(ldcp);
4600 	}
4601 }
4602 
4603 /*
4604  * Map the shared memory data buffer area exported by the peer.
4605  * Used in RxDringData mode only.
4606  */
4607 static int
4608 vsw_map_data(vsw_ldc_t *ldcp, dring_info_t *dp, void *pkt)
4609 {
4610 	int			rv;
4611 	vio_dring_reg_ext_msg_t	*emsg;
4612 	vio_dring_reg_msg_t	*msg = pkt;
4613 	uint8_t			*buf = (uint8_t *)msg->cookie;
4614 	vsw_t			*vswp = ldcp->ldc_vswp;
4615 
4616 	/* skip over dring cookies */
4617 	ASSERT(msg->ncookies == 1);
4618 	buf += (msg->ncookies * sizeof (ldc_mem_cookie_t));
4619 
4620 	emsg = (vio_dring_reg_ext_msg_t *)buf;
4621 	if (emsg->data_ncookies > VNET_DATA_AREA_COOKIES) {
4622 		return (1);
4623 	}
4624 
4625 	/* save # of data area cookies */
4626 	dp->data_ncookies = emsg->data_ncookies;
4627 
4628 	/* save data area size */
4629 	dp->data_sz = emsg->data_area_size;
4630 
4631 	/* allocate ldc mem handle for data area */
4632 	rv = ldc_mem_alloc_handle(ldcp->ldc_handle, &dp->data_handle);
4633 	if (rv != 0) {
4634 		cmn_err(CE_WARN, "ldc_mem_alloc_handle failed\n");
4635 		DWARN(vswp, "%s (%lld) ldc_mem_alloc_handle() failed: %d\n",
4636 		    __func__, ldcp->ldc_id, rv);
4637 		return (1);
4638 	}
4639 
4640 	/* map the data area */
4641 	rv = ldc_mem_map(dp->data_handle, emsg->data_cookie,
4642 	    emsg->data_ncookies, LDC_DIRECT_MAP, LDC_MEM_R,
4643 	    (caddr_t *)&dp->data_addr, NULL);
4644 	if (rv != 0) {
4645 		cmn_err(CE_WARN, "ldc_mem_map failed\n");
4646 		DWARN(vswp, "%s (%lld) ldc_mem_map() failed: %d\n",
4647 		    __func__, ldcp->ldc_id, rv);
4648 		return (1);
4649 	}
4650 
4651 	/* allocate memory for data area cookies */
4652 	dp->data_cookie = kmem_zalloc(emsg->data_ncookies *
4653 	    sizeof (ldc_mem_cookie_t), KM_SLEEP);
4654 
4655 	/* save data area cookies */
4656 	bcopy(emsg->data_cookie, dp->data_cookie,
4657 	    emsg->data_ncookies * sizeof (ldc_mem_cookie_t));
4658 
4659 	return (0);
4660 }
4661 
4662 /*
4663  * Reset and free all the resources associated with the channel.
4664  */
4665 static void
4666 vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir)
4667 {
4668 	lane_t	*lp;
4669 
4670 	D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id);
4671 
4672 	if (dir == INBOUND) {
4673 		D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane"
4674 		    " of channel %lld", __func__, ldcp->ldc_id);
4675 		lp = &ldcp->lane_in;
4676 	} else {
4677 		D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane"
4678 		    " of channel %lld", __func__, ldcp->ldc_id);
4679 		lp = &ldcp->lane_out;
4680 	}
4681 
4682 	lp->lstate = VSW_LANE_INACTIV;
4683 	lp->seq_num = VNET_ISS;
4684 
4685 	if (dir == INBOUND) {
4686 		/* Unmap the remote dring which is imported from the peer */
4687 		vsw_unmap_dring(ldcp);
4688 	} else {
4689 		/* Destroy the local dring which is exported to the peer */
4690 		vsw_destroy_dring(ldcp);
4691 	}
4692 
4693 	D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id);
4694 }
4695 
4696 /*
4697  * Destroy the descriptor ring.
4698  */
4699 static void
4700 vsw_destroy_dring(vsw_ldc_t *ldcp)
4701 {
4702 	lane_t	*lp = &ldcp->lane_out;
4703 
4704 	if (lp->dring_mode == VIO_RX_DRING_DATA) {
4705 		vsw_destroy_rx_dring(ldcp);
4706 	} else {
4707 		vsw_destroy_tx_dring(ldcp);
4708 	}
4709 }
4710 
4711 /*
4712  * vsw_ldc_tx_worker -- A per LDC worker thread to transmit data.
4713  * This thread is woken up by the vsw_portsend to transmit
4714  * packets.
4715  */
4716 static void
4717 vsw_ldc_tx_worker(void *arg)
4718 {
4719 	callb_cpr_t	cprinfo;
4720 	vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
4721 	vsw_t *vswp = ldcp->ldc_vswp;
4722 	mblk_t *mp;
4723 	mblk_t *tmp;
4724 
4725 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
4726 	CALLB_CPR_INIT(&cprinfo, &ldcp->tx_thr_lock, callb_generic_cpr,
4727 	    "vnet_tx_thread");
4728 	mutex_enter(&ldcp->tx_thr_lock);
4729 	while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP)) {
4730 
4731 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
4732 		/*
4733 		 * Wait until the data is received or a stop
4734 		 * request is received.
4735 		 */
4736 		while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP) &&
4737 		    (ldcp->tx_mhead == NULL)) {
4738 			cv_wait(&ldcp->tx_thr_cv, &ldcp->tx_thr_lock);
4739 		}
4740 		CALLB_CPR_SAFE_END(&cprinfo, &ldcp->tx_thr_lock)
4741 
4742 		/*
4743 		 * First process the stop request.
4744 		 */
4745 		if (ldcp->tx_thr_flags & VSW_WTHR_STOP) {
4746 			D2(vswp, "%s(%lld):tx thread stopped\n",
4747 			    __func__, ldcp->ldc_id);
4748 			break;
4749 		}
4750 		mp = ldcp->tx_mhead;
4751 		ldcp->tx_mhead = ldcp->tx_mtail = NULL;
4752 		ldcp->tx_cnt = 0;
4753 		mutex_exit(&ldcp->tx_thr_lock);
4754 		D2(vswp, "%s(%lld):calling vsw_ldcsend\n",
4755 		    __func__, ldcp->ldc_id);
4756 		while (mp != NULL) {
4757 			tmp = mp->b_next;
4758 			mp->b_next = mp->b_prev = NULL;
4759 			(void) vsw_ldcsend(ldcp, mp, vsw_ldc_tx_retries);
4760 			mp = tmp;
4761 		}
4762 		mutex_enter(&ldcp->tx_thr_lock);
4763 	}
4764 
4765 	/*
4766 	 * Update the run status and wakeup the thread that
4767 	 * has sent the stop request.
4768 	 */
4769 	ldcp->tx_thr_flags &= ~VSW_WTHR_STOP;
4770 	ldcp->tx_thread = NULL;
4771 	CALLB_CPR_EXIT(&cprinfo);
4772 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
4773 	thread_exit();
4774 }
4775 
4776 /* vsw_stop_tx_thread -- Co-ordinate with receive thread to stop it */
4777 static void
4778 vsw_stop_tx_thread(vsw_ldc_t *ldcp)
4779 {
4780 	kt_did_t	tid = 0;
4781 	vsw_t		*vswp = ldcp->ldc_vswp;
4782 
4783 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
4784 	/*
4785 	 * Send a stop request by setting the stop flag and
4786 	 * wait until the receive thread stops.
4787 	 */
4788 	mutex_enter(&ldcp->tx_thr_lock);
4789 	if (ldcp->tx_thread != NULL) {
4790 		tid = ldcp->tx_thread->t_did;
4791 		ldcp->tx_thr_flags |= VSW_WTHR_STOP;
4792 		cv_signal(&ldcp->tx_thr_cv);
4793 	}
4794 	mutex_exit(&ldcp->tx_thr_lock);
4795 
4796 	if (tid != 0) {
4797 		thread_join(tid);
4798 	}
4799 
4800 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
4801 }
4802 
4803 /*
4804  * Debugging routines
4805  */
4806 static void
4807 display_state(void)
4808 {
4809 	vsw_t		*vswp;
4810 	vsw_port_list_t	*plist;
4811 	vsw_port_t 	*port;
4812 	vsw_ldc_t 	*ldcp;
4813 	extern vsw_t 	*vsw_head;
4814 
4815 	cmn_err(CE_NOTE, "***** system state *****");
4816 
4817 	for (vswp = vsw_head; vswp; vswp = vswp->next) {
4818 		plist = &vswp->plist;
4819 		READ_ENTER(&plist->lockrw);
4820 		cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n",
4821 		    vswp->instance, plist->num_ports);
4822 
4823 		for (port = plist->head; port != NULL; port = port->p_next) {
4824 			cmn_err(CE_CONT, "port %d : %d ldcs attached\n",
4825 			    port->p_instance, port->num_ldcs);
4826 			ldcp = port->ldcp;
4827 			cmn_err(CE_CONT, "chan %lu : dev %d : "
4828 			    "status %d : phase %u\n",
4829 			    ldcp->ldc_id, ldcp->dev_class,
4830 			    ldcp->ldc_status, ldcp->hphase);
4831 			cmn_err(CE_CONT, "chan %lu : lsession %lu : "
4832 			    "psession %lu\n", ldcp->ldc_id,
4833 			    ldcp->local_session, ldcp->peer_session);
4834 
4835 			cmn_err(CE_CONT, "Inbound lane:\n");
4836 			display_lane(&ldcp->lane_in);
4837 			cmn_err(CE_CONT, "Outbound lane:\n");
4838 			display_lane(&ldcp->lane_out);
4839 		}
4840 		RW_EXIT(&plist->lockrw);
4841 	}
4842 	cmn_err(CE_NOTE, "***** system state *****");
4843 }
4844 
4845 static void
4846 display_lane(lane_t *lp)
4847 {
4848 	dring_info_t	*drp = lp->dringp;
4849 
4850 	cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n",
4851 	    lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu);
4852 	cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n",
4853 	    lp->addr_type, lp->addr, lp->xfer_mode);
4854 	cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp);
4855 
4856 	cmn_err(CE_CONT, "Dring info:\n");
4857 	cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n",
4858 	    drp->num_descriptors, drp->descriptor_size);
4859 	cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->dring_handle);
4860 	cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n",
4861 	    (uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr);
4862 	cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n",
4863 	    drp->ident, drp->end_idx);
4864 	display_ring(drp);
4865 }
4866 
4867 static void
4868 display_ring(dring_info_t *dringp)
4869 {
4870 	uint64_t		i;
4871 	uint64_t		priv_count = 0;
4872 	uint64_t		pub_count = 0;
4873 	vnet_public_desc_t	*pub_addr = NULL;
4874 	vsw_private_desc_t	*priv_addr = NULL;
4875 
4876 	for (i = 0; i < vsw_num_descriptors; i++) {
4877 		if (dringp->pub_addr != NULL) {
4878 			pub_addr = (vnet_public_desc_t *)dringp->pub_addr + i;
4879 
4880 			if (pub_addr->hdr.dstate == VIO_DESC_FREE)
4881 				pub_count++;
4882 		}
4883 
4884 		if (dringp->priv_addr != NULL) {
4885 			priv_addr = (vsw_private_desc_t *)dringp->priv_addr + i;
4886 
4887 			if (priv_addr->dstate == VIO_DESC_FREE)
4888 				priv_count++;
4889 		}
4890 	}
4891 	cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n",
4892 	    i, priv_count, pub_count);
4893 }
4894 
4895 static void
4896 dump_flags(uint64_t state)
4897 {
4898 	int	i;
4899 
4900 	typedef struct flag_name {
4901 		int	flag_val;
4902 		char	*flag_name;
4903 	} flag_name_t;
4904 
4905 	flag_name_t	flags[] = {
4906 		VSW_VER_INFO_SENT, "VSW_VER_INFO_SENT",
4907 		VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV",
4908 		VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV",
4909 		VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT",
4910 		VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV",
4911 		VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT",
4912 		VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT",
4913 		VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV",
4914 		VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT",
4915 		VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV",
4916 		VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT",
4917 		VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV",
4918 		VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT",
4919 		VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV",
4920 		VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT",
4921 		VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV",
4922 		VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT",
4923 		VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV",
4924 		VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT",
4925 		VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV",
4926 		VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT",
4927 		VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV",
4928 		VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT",
4929 		VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV",
4930 		VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT",
4931 		VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV",
4932 		VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT",
4933 		VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV",
4934 		VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT",
4935 		VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV",
4936 		VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE"};
4937 
4938 	DERR(NULL, "DUMP_FLAGS: %llx\n", state);
4939 	for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) {
4940 		if (state & flags[i].flag_val)
4941 			DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name);
4942 	}
4943 }
4944