xref: /titanic_51/usr/src/uts/sun4v/io/vsw_ldc.c (revision b9bd317cda1afb3a01f4812de73e8cec888cbbd7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/errno.h>
31 #include <sys/debug.h>
32 #include <sys/time.h>
33 #include <sys/sysmacros.h>
34 #include <sys/systm.h>
35 #include <sys/user.h>
36 #include <sys/stropts.h>
37 #include <sys/stream.h>
38 #include <sys/strlog.h>
39 #include <sys/strsubr.h>
40 #include <sys/cmn_err.h>
41 #include <sys/cpu.h>
42 #include <sys/kmem.h>
43 #include <sys/conf.h>
44 #include <sys/ddi.h>
45 #include <sys/sunddi.h>
46 #include <sys/ksynch.h>
47 #include <sys/stat.h>
48 #include <sys/kstat.h>
49 #include <sys/vtrace.h>
50 #include <sys/strsun.h>
51 #include <sys/dlpi.h>
52 #include <sys/ethernet.h>
53 #include <net/if.h>
54 #include <sys/varargs.h>
55 #include <sys/machsystm.h>
56 #include <sys/modctl.h>
57 #include <sys/modhash.h>
58 #include <sys/mac.h>
59 #include <sys/mac_ether.h>
60 #include <sys/taskq.h>
61 #include <sys/note.h>
62 #include <sys/mach_descrip.h>
63 #include <sys/mac.h>
64 #include <sys/mdeg.h>
65 #include <sys/ldc.h>
66 #include <sys/vsw_fdb.h>
67 #include <sys/vsw.h>
68 #include <sys/vio_mailbox.h>
69 #include <sys/vnet_mailbox.h>
70 #include <sys/vnet_common.h>
71 #include <sys/vio_util.h>
72 #include <sys/sdt.h>
73 #include <sys/atomic.h>
74 #include <sys/callb.h>
75 #include <sys/vlan.h>
76 
77 /* Port add/deletion/etc routines */
78 static	int vsw_port_delete(vsw_port_t *port);
79 static	int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id);
80 static	int vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id);
81 static	int vsw_init_ldcs(vsw_port_t *port);
82 static	int vsw_uninit_ldcs(vsw_port_t *port);
83 static	int vsw_ldc_init(vsw_ldc_t *ldcp);
84 static	int vsw_ldc_uninit(vsw_ldc_t *ldcp);
85 static	int vsw_drain_ldcs(vsw_port_t *port);
86 static	int vsw_drain_port_taskq(vsw_port_t *port);
87 static	void vsw_marker_task(void *);
88 static	int vsw_plist_del_node(vsw_t *, vsw_port_t *port);
89 int vsw_detach_ports(vsw_t *vswp);
90 int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
91 mcst_addr_t *vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr);
92 int vsw_port_detach(vsw_t *vswp, int p_instance);
93 int vsw_portsend(vsw_port_t *port, mblk_t *mp, mblk_t *mpt, uint32_t count);
94 int vsw_port_attach(vsw_port_t *portp);
95 vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);
96 void vsw_vlan_unaware_port_reset(vsw_port_t *portp);
97 int vsw_send_msg(vsw_ldc_t *, void *, int, boolean_t);
98 void vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate);
99 
100 /* Interrupt routines */
101 static	uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg);
102 
103 /* Handshake routines */
104 static	void vsw_ldc_reinit(vsw_ldc_t *);
105 static	void vsw_process_conn_evt(vsw_ldc_t *, uint16_t);
106 static	void vsw_conn_task(void *);
107 static	int vsw_check_flag(vsw_ldc_t *, int, uint64_t);
108 static	void vsw_next_milestone(vsw_ldc_t *);
109 static	int vsw_supported_version(vio_ver_msg_t *);
110 static	void vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp);
111 static	void vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp);
112 
113 /* Data processing routines */
114 static void vsw_process_pkt(void *);
115 static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t *);
116 static void vsw_process_ctrl_pkt(void *);
117 static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *);
118 static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *);
119 static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *);
120 static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *);
121 static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *);
122 static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *);
123 static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *,
124 	uint32_t);
125 static void vsw_process_data_dring_pkt(vsw_ldc_t *, void *);
126 static void vsw_process_pkt_data_nop(void *, void *, uint32_t);
127 static void vsw_process_pkt_data(void *, void *, uint32_t);
128 static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *);
129 static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *);
130 
131 /* Switching/data transmit routines */
132 static	int vsw_dringsend(vsw_ldc_t *, mblk_t *);
133 static	int vsw_descrsend(vsw_ldc_t *, mblk_t *);
134 static void vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp);
135 static int vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries);
136 static int vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count);
137 static int vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count);
138 
139 /* Packet creation routines */
140 static void vsw_send_ver(void *);
141 static void vsw_send_attr(vsw_ldc_t *);
142 static vio_dring_reg_msg_t *vsw_create_dring_info_pkt(vsw_ldc_t *);
143 static void vsw_send_dring_info(vsw_ldc_t *);
144 static void vsw_send_rdx(vsw_ldc_t *);
145 
146 /* Dring routines */
147 static dring_info_t *vsw_create_dring(vsw_ldc_t *);
148 static void vsw_create_privring(vsw_ldc_t *);
149 static int vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp);
150 static int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **,
151     int *);
152 static dring_info_t *vsw_ident2dring(lane_t *, uint64_t);
153 static int vsw_reclaim_dring(dring_info_t *dp, int start);
154 
155 static void vsw_set_lane_attr(vsw_t *, lane_t *);
156 static int vsw_check_attr(vnet_attr_msg_t *, vsw_ldc_t *);
157 static int vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg);
158 static int vsw_mem_cookie_match(ldc_mem_cookie_t *, ldc_mem_cookie_t *);
159 static int vsw_check_dring_info(vio_dring_reg_msg_t *);
160 
161 /* Rcv/Tx thread routines */
162 static void vsw_stop_tx_thread(vsw_ldc_t *ldcp);
163 static void vsw_ldc_tx_worker(void *arg);
164 static void vsw_stop_rx_thread(vsw_ldc_t *ldcp);
165 static void vsw_ldc_rx_worker(void *arg);
166 
167 /* Misc support routines */
168 static	caddr_t vsw_print_ethaddr(uint8_t *addr, char *ebuf);
169 static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t);
170 static int vsw_free_ring(dring_info_t *);
171 static void vsw_save_lmacaddr(vsw_t *vswp, uint64_t macaddr);
172 static int vsw_get_same_dest_list(struct ether_header *ehp,
173     mblk_t **rhead, mblk_t **rtail, mblk_t **mpp);
174 static mblk_t *vsw_dupmsgchain(mblk_t *mp);
175 
176 /* Debugging routines */
177 static void dump_flags(uint64_t);
178 static void display_state(void);
179 static void display_lane(lane_t *);
180 static void display_ring(dring_info_t *);
181 
182 /*
183  * Functions imported from other files.
184  */
185 extern int vsw_set_hw(vsw_t *, vsw_port_t *, int);
186 extern int vsw_unset_hw(vsw_t *, vsw_port_t *, int);
187 extern void vsw_reconfig_hw(vsw_t *);
188 extern int vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port);
189 extern void vsw_del_mcst_port(vsw_port_t *port);
190 extern int vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
191 extern int vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
192 extern void vsw_fdbe_add(vsw_t *vswp, void *port);
193 extern void vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr);
194 extern void vsw_create_vlans(void *arg, int type);
195 extern void vsw_destroy_vlans(void *arg, int type);
196 extern void vsw_vlan_add_ids(void *arg, int type);
197 extern void vsw_vlan_remove_ids(void *arg, int type);
198 extern boolean_t vsw_frame_lookup_vid(void *arg, int caller,
199 	struct ether_header *ehp, uint16_t *vidp);
200 extern mblk_t *vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp);
201 extern uint32_t vsw_vlan_frame_untag(void *arg, int type, mblk_t **np,
202 	mblk_t **npt);
203 extern boolean_t vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid);
204 extern void vsw_hio_start(vsw_t *vswp, vsw_ldc_t *ldcp);
205 extern void vsw_hio_stop(vsw_t *vswp, vsw_ldc_t *ldcp);
206 extern void vsw_process_dds_msg(vsw_t *vswp, vsw_ldc_t *ldcp, void *msg);
207 extern void vsw_hio_stop_port(vsw_port_t *portp);
208 
209 #define	VSW_NUM_VMPOOLS		3	/* number of vio mblk pools */
210 
211 /*
212  * Tunables used in this file.
213  */
214 extern int vsw_num_handshakes;
215 extern int vsw_wretries;
216 extern int vsw_desc_delay;
217 extern int vsw_read_attempts;
218 extern int vsw_ldc_tx_delay;
219 extern int vsw_ldc_tx_retries;
220 extern boolean_t vsw_ldc_rxthr_enabled;
221 extern boolean_t vsw_ldc_txthr_enabled;
222 extern uint32_t vsw_ntxds;
223 extern uint32_t vsw_max_tx_qcount;
224 extern uint32_t vsw_chain_len;
225 extern uint32_t vsw_mblk_size1;
226 extern uint32_t vsw_mblk_size2;
227 extern uint32_t vsw_mblk_size3;
228 extern uint32_t vsw_num_mblks1;
229 extern uint32_t vsw_num_mblks2;
230 extern uint32_t vsw_num_mblks3;
231 extern boolean_t vsw_obp_ver_proto_workaround;
232 
/*
 * Acquire (LDC_ENTER_LOCK) or release (LDC_EXIT_LOCK) the three per-channel
 * locks.  They are always taken in the order ldc_cblock, ldc_rxlock,
 * ldc_txlock and dropped in the reverse order.
 *
 * Each macro expands to a single statement, so it remains correct when used
 * as the body of an unbraced if/else.  Callers must supply the trailing
 * semicolon, e.g. "LDC_ENTER_LOCK(ldcp);".
 */
#define	LDC_ENTER_LOCK(ldcp)	\
	do {	\
		mutex_enter(&((ldcp)->ldc_cblock));	\
		mutex_enter(&((ldcp)->ldc_rxlock));	\
		mutex_enter(&((ldcp)->ldc_txlock));	\
		_NOTE(CONSTCOND)	\
	} while (0)
#define	LDC_EXIT_LOCK(ldcp)	\
	do {	\
		mutex_exit(&((ldcp)->ldc_txlock));	\
		mutex_exit(&((ldcp)->ldc_rxlock));	\
		mutex_exit(&((ldcp)->ldc_cblock));	\
		_NOTE(CONSTCOND)	\
	} while (0)
241 
/*
 * Compare the version recorded in the channel's outbound lane
 * (lane_out.ver_major / lane_out.ver_minor) against major.minor.
 * Each macro evaluates to non-zero when the comparison holds.
 */

/* negotiated version is exactly major.minor */
#define	VSW_VER_EQ(ldcp, major, minor)	\
	((major) == (ldcp)->lane_out.ver_major &&	\
	    (minor) == (ldcp)->lane_out.ver_minor)

/* negotiated version is strictly older than major.minor */
#define	VSW_VER_LT(ldcp, major, minor)	\
	(((major) > (ldcp)->lane_out.ver_major) ||	\
	    ((major) == (ldcp)->lane_out.ver_major &&	\
	    (minor) > (ldcp)->lane_out.ver_minor))

/* negotiated version is major.minor or newer */
#define	VSW_VER_GTEQ(ldcp, major, minor)	\
	(((major) < (ldcp)->lane_out.ver_major) ||	\
	    ((major) == (ldcp)->lane_out.ver_major &&	\
	    (minor) <= (ldcp)->lane_out.ver_minor))
255 
256 /* supported versions */
257 static	ver_sup_t	vsw_versions[] = { {1, 3} };
258 
259 /*
260  * For the moment the state dump routines have their own
261  * private flag.
262  */
263 #define	DUMP_STATE	0
264 
265 #if DUMP_STATE
266 
267 #define	DUMP_TAG(tag) \
268 {			\
269 	D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype); \
270 	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype);	\
271 	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env);	\
272 }
273 
274 #define	DUMP_TAG_PTR(tag) \
275 {			\
276 	D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype); \
277 	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype);	\
278 	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env);	\
279 }
280 
281 #define	DUMP_FLAGS(flags) dump_flags(flags);
282 #define	DISPLAY_STATE()	display_state()
283 
284 #else
285 
286 #define	DUMP_TAG(tag)
287 #define	DUMP_TAG_PTR(tag)
288 #define	DUMP_FLAGS(state)
289 #define	DISPLAY_STATE()
290 
291 #endif	/* DUMP_STATE */
292 
293 /*
294  * Attach the specified port.
295  *
296  * Returns 0 on success, 1 on failure.
297  */
298 int
299 vsw_port_attach(vsw_port_t *port)
300 {
301 	vsw_t			*vswp = port->p_vswp;
302 	vsw_port_list_t		*plist = &vswp->plist;
303 	vsw_port_t		*p, **pp;
304 	int			i;
305 	int			nids = port->num_ldcs;
306 	uint64_t		*ldcids;
307 
308 	D1(vswp, "%s: enter : port %d", __func__, port->p_instance);
309 
310 	/* port already exists? */
311 	READ_ENTER(&plist->lockrw);
312 	for (p = plist->head; p != NULL; p = p->p_next) {
313 		if (p->p_instance == port->p_instance) {
314 			DWARN(vswp, "%s: port instance %d already attached",
315 			    __func__, p->p_instance);
316 			RW_EXIT(&plist->lockrw);
317 			return (1);
318 		}
319 	}
320 	RW_EXIT(&plist->lockrw);
321 
322 	rw_init(&port->p_ldclist.lockrw, NULL, RW_DRIVER, NULL);
323 
324 	mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL);
325 	mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL);
326 
327 	mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL);
328 	cv_init(&port->state_cv, NULL, CV_DRIVER, NULL);
329 	port->state = VSW_PORT_INIT;
330 
331 	D2(vswp, "%s: %d nids", __func__, nids);
332 	ldcids = port->ldc_ids;
333 	for (i = 0; i < nids; i++) {
334 		D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[i]);
335 		if (vsw_ldc_attach(port, (uint64_t)ldcids[i]) != 0) {
336 			DERR(vswp, "%s: ldc_attach failed", __func__);
337 
338 			rw_destroy(&port->p_ldclist.lockrw);
339 
340 			cv_destroy(&port->state_cv);
341 			mutex_destroy(&port->state_lock);
342 
343 			mutex_destroy(&port->tx_lock);
344 			mutex_destroy(&port->mca_lock);
345 			kmem_free(port, sizeof (vsw_port_t));
346 			return (1);
347 		}
348 	}
349 
350 	if (vswp->switching_setup_done == B_TRUE) {
351 		/*
352 		 * If the underlying physical device has been setup,
353 		 * program the mac address of this port in it.
354 		 * Otherwise, port macaddr will be set after the physical
355 		 * device is successfully setup by the timeout handler.
356 		 */
357 		mutex_enter(&vswp->hw_lock);
358 		(void) vsw_set_hw(vswp, port, VSW_VNETPORT);
359 		mutex_exit(&vswp->hw_lock);
360 	}
361 
362 	/* create the fdb entry for this port/mac address */
363 	vsw_fdbe_add(vswp, port);
364 
365 	vsw_create_vlans(port, VSW_VNETPORT);
366 
367 	WRITE_ENTER(&plist->lockrw);
368 
369 	/* link it into the list of ports for this vsw instance */
370 	pp = (vsw_port_t **)(&plist->head);
371 	port->p_next = *pp;
372 	*pp = port;
373 	plist->num_ports++;
374 
375 	RW_EXIT(&plist->lockrw);
376 
377 	/*
378 	 * Initialise the port and any ldc's under it.
379 	 */
380 	(void) vsw_init_ldcs(port);
381 
382 	D1(vswp, "%s: exit", __func__);
383 	return (0);
384 }
385 
386 /*
387  * Detach the specified port.
388  *
389  * Returns 0 on success, 1 on failure.
390  */
391 int
392 vsw_port_detach(vsw_t *vswp, int p_instance)
393 {
394 	vsw_port_t	*port = NULL;
395 	vsw_port_list_t	*plist = &vswp->plist;
396 
397 	D1(vswp, "%s: enter: port id %d", __func__, p_instance);
398 
399 	WRITE_ENTER(&plist->lockrw);
400 
401 	if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) {
402 		RW_EXIT(&plist->lockrw);
403 		return (1);
404 	}
405 
406 	if (vsw_plist_del_node(vswp, port)) {
407 		RW_EXIT(&plist->lockrw);
408 		return (1);
409 	}
410 
411 	/* cleanup any HybridIO for this port */
412 	vsw_hio_stop_port(port);
413 
414 	/*
415 	 * No longer need to hold writer lock on port list now
416 	 * that we have unlinked the target port from the list.
417 	 */
418 	RW_EXIT(&plist->lockrw);
419 
420 	/* Remove the fdb entry for this port/mac address */
421 	vsw_fdbe_del(vswp, &(port->p_macaddr));
422 	vsw_destroy_vlans(port, VSW_VNETPORT);
423 
424 	/* Remove any multicast addresses.. */
425 	vsw_del_mcst_port(port);
426 
427 	/* Remove address if was programmed into HW. */
428 	mutex_enter(&vswp->hw_lock);
429 
430 	/*
431 	 * Port's address may not have been set in hardware. This could
432 	 * happen if the underlying physical device is not yet available and
433 	 * vsw_setup_switching_timeout() may be in progress.
434 	 * We remove its addr from hardware only if it has been set before.
435 	 */
436 	if (port->addr_set != VSW_ADDR_UNSET)
437 		(void) vsw_unset_hw(vswp, port, VSW_VNETPORT);
438 
439 	if (vswp->recfg_reqd)
440 		vsw_reconfig_hw(vswp);
441 
442 	mutex_exit(&vswp->hw_lock);
443 
444 	if (vsw_port_delete(port)) {
445 		return (1);
446 	}
447 
448 	D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance);
449 	return (0);
450 }
451 
452 /*
453  * Detach all active ports.
454  *
455  * Returns 0 on success, 1 on failure.
456  */
457 int
458 vsw_detach_ports(vsw_t *vswp)
459 {
460 	vsw_port_list_t 	*plist = &vswp->plist;
461 	vsw_port_t		*port = NULL;
462 
463 	D1(vswp, "%s: enter", __func__);
464 
465 	WRITE_ENTER(&plist->lockrw);
466 
467 	while ((port = plist->head) != NULL) {
468 		if (vsw_plist_del_node(vswp, port)) {
469 			DERR(vswp, "%s: Error deleting port %d"
470 			    " from port list", __func__, port->p_instance);
471 			RW_EXIT(&plist->lockrw);
472 			return (1);
473 		}
474 
475 		/* Remove address if was programmed into HW. */
476 		mutex_enter(&vswp->hw_lock);
477 		(void) vsw_unset_hw(vswp, port, VSW_VNETPORT);
478 		mutex_exit(&vswp->hw_lock);
479 
480 		/* Remove the fdb entry for this port/mac address */
481 		vsw_fdbe_del(vswp, &(port->p_macaddr));
482 		vsw_destroy_vlans(port, VSW_VNETPORT);
483 
484 		/* Remove any multicast addresses.. */
485 		vsw_del_mcst_port(port);
486 
487 		/*
488 		 * No longer need to hold the lock on the port list
489 		 * now that we have unlinked the target port from the
490 		 * list.
491 		 */
492 		RW_EXIT(&plist->lockrw);
493 		if (vsw_port_delete(port)) {
494 			DERR(vswp, "%s: Error deleting port %d",
495 			    __func__, port->p_instance);
496 			return (1);
497 		}
498 		WRITE_ENTER(&plist->lockrw);
499 	}
500 	RW_EXIT(&plist->lockrw);
501 
502 	D1(vswp, "%s: exit", __func__);
503 
504 	return (0);
505 }
506 
507 /*
508  * Delete the specified port.
509  *
510  * Returns 0 on success, 1 on failure.
511  */
512 static int
513 vsw_port_delete(vsw_port_t *port)
514 {
515 	vsw_ldc_list_t 		*ldcl;
516 	vsw_t			*vswp = port->p_vswp;
517 	int			num_ldcs;
518 
519 	D1(vswp, "%s: enter : port id %d", __func__, port->p_instance);
520 
521 	(void) vsw_uninit_ldcs(port);
522 
523 	/*
524 	 * Wait for any pending ctrl msg tasks which reference this
525 	 * port to finish.
526 	 */
527 	if (vsw_drain_port_taskq(port))
528 		return (1);
529 
530 	/*
531 	 * Wait for any active callbacks to finish
532 	 */
533 	if (vsw_drain_ldcs(port))
534 		return (1);
535 
536 	ldcl = &port->p_ldclist;
537 	num_ldcs = port->num_ldcs;
538 	WRITE_ENTER(&ldcl->lockrw);
539 	while (num_ldcs > 0) {
540 		if (vsw_ldc_detach(port, ldcl->head->ldc_id) != 0) {
541 			cmn_err(CE_WARN, "!vsw%d: unable to detach ldc %ld",
542 			    vswp->instance, ldcl->head->ldc_id);
543 			RW_EXIT(&ldcl->lockrw);
544 			port->num_ldcs = num_ldcs;
545 			return (1);
546 		}
547 		num_ldcs--;
548 	}
549 	RW_EXIT(&ldcl->lockrw);
550 
551 	rw_destroy(&port->p_ldclist.lockrw);
552 
553 	mutex_destroy(&port->mca_lock);
554 	mutex_destroy(&port->tx_lock);
555 
556 	cv_destroy(&port->state_cv);
557 	mutex_destroy(&port->state_lock);
558 
559 	if (port->num_ldcs != 0) {
560 		kmem_free(port->ldc_ids, port->num_ldcs * sizeof (uint64_t));
561 		port->num_ldcs = 0;
562 	}
563 	kmem_free(port, sizeof (vsw_port_t));
564 
565 	D1(vswp, "%s: exit", __func__);
566 
567 	return (0);
568 }
569 
570 /*
571  * Attach a logical domain channel (ldc) under a specified port.
572  *
573  * Returns 0 on success, 1 on failure.
574  */
575 static int
576 vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id)
577 {
578 	vsw_t 		*vswp = port->p_vswp;
579 	vsw_ldc_list_t *ldcl = &port->p_ldclist;
580 	vsw_ldc_t 	*ldcp = NULL;
581 	ldc_attr_t 	attr;
582 	ldc_status_t	istatus;
583 	int 		status = DDI_FAILURE;
584 	int		rv;
585 	char		kname[MAXNAMELEN];
586 	enum		{ PROG_init = 0x0, PROG_mblks = 0x1,
587 			    PROG_callback = 0x2, PROG_rx_thread = 0x4,
588 			    PROG_tx_thread = 0x8}
589 			progress;
590 
591 	progress = PROG_init;
592 
593 	D1(vswp, "%s: enter", __func__);
594 
595 	ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP);
596 	if (ldcp == NULL) {
597 		DERR(vswp, "%s: kmem_zalloc failed", __func__);
598 		return (1);
599 	}
600 	ldcp->ldc_id = ldc_id;
601 
602 	/* Allocate pools of receive mblks */
603 	rv = vio_init_multipools(&ldcp->vmp, VSW_NUM_VMPOOLS,
604 	    vsw_mblk_size1, vsw_mblk_size2, vsw_mblk_size3,
605 	    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3);
606 	if (rv) {
607 		DWARN(vswp, "%s: unable to create free mblk pools for"
608 		    " channel %ld (rv %d)", __func__, ldc_id, rv);
609 		kmem_free(ldcp, sizeof (vsw_ldc_t));
610 		return (1);
611 	}
612 
613 	progress |= PROG_mblks;
614 
615 	mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL);
616 	mutex_init(&ldcp->ldc_rxlock, NULL, MUTEX_DRIVER, NULL);
617 	mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL);
618 	mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL);
619 	cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL);
620 	rw_init(&ldcp->lane_in.dlistrw, NULL, RW_DRIVER, NULL);
621 	rw_init(&ldcp->lane_out.dlistrw, NULL, RW_DRIVER, NULL);
622 
623 	/* required for handshake with peer */
624 	ldcp->local_session = (uint64_t)ddi_get_lbolt();
625 	ldcp->peer_session = 0;
626 	ldcp->session_status = 0;
627 	ldcp->hss_id = 1;	/* Initial handshake session id */
628 
629 	(void) atomic_swap_32(&port->p_hio_capable, B_FALSE);
630 
631 	/* only set for outbound lane, inbound set by peer */
632 	vsw_set_lane_attr(vswp, &ldcp->lane_out);
633 
634 	attr.devclass = LDC_DEV_NT_SVC;
635 	attr.instance = ddi_get_instance(vswp->dip);
636 	attr.mode = LDC_MODE_UNRELIABLE;
637 	attr.mtu = VSW_LDC_MTU;
638 	status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle);
639 	if (status != 0) {
640 		DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)",
641 		    __func__, ldc_id, status);
642 		goto ldc_attach_fail;
643 	}
644 
645 	if (vsw_ldc_rxthr_enabled) {
646 		ldcp->rx_thr_flags = 0;
647 
648 		mutex_init(&ldcp->rx_thr_lock, NULL, MUTEX_DRIVER, NULL);
649 		cv_init(&ldcp->rx_thr_cv, NULL, CV_DRIVER, NULL);
650 		ldcp->rx_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
651 		    vsw_ldc_rx_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
652 
653 		progress |= PROG_rx_thread;
654 		if (ldcp->rx_thread == NULL) {
655 			DWARN(vswp, "%s(%lld): Failed to create worker thread",
656 			    __func__, ldc_id);
657 			goto ldc_attach_fail;
658 		}
659 	}
660 
661 	if (vsw_ldc_txthr_enabled) {
662 		ldcp->tx_thr_flags = 0;
663 		ldcp->tx_mhead = ldcp->tx_mtail = NULL;
664 
665 		mutex_init(&ldcp->tx_thr_lock, NULL, MUTEX_DRIVER, NULL);
666 		cv_init(&ldcp->tx_thr_cv, NULL, CV_DRIVER, NULL);
667 		ldcp->tx_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
668 		    vsw_ldc_tx_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
669 
670 		progress |= PROG_tx_thread;
671 		if (ldcp->tx_thread == NULL) {
672 			DWARN(vswp, "%s(%lld): Failed to create worker thread",
673 			    __func__, ldc_id);
674 			goto ldc_attach_fail;
675 		}
676 	}
677 
678 	status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp);
679 	if (status != 0) {
680 		DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)",
681 		    __func__, ldc_id, status);
682 		(void) ldc_fini(ldcp->ldc_handle);
683 		goto ldc_attach_fail;
684 	}
685 	/*
686 	 * allocate a message for ldc_read()s, big enough to hold ctrl and
687 	 * data msgs, including raw data msgs used to recv priority frames.
688 	 */
689 	ldcp->msglen = VIO_PKT_DATA_HDRSIZE + vswp->max_frame_size;
690 	ldcp->ldcmsg = kmem_alloc(ldcp->msglen, KM_SLEEP);
691 
692 	progress |= PROG_callback;
693 
694 	mutex_init(&ldcp->status_lock, NULL, MUTEX_DRIVER, NULL);
695 
696 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
697 		DERR(vswp, "%s: ldc_status failed", __func__);
698 		mutex_destroy(&ldcp->status_lock);
699 		goto ldc_attach_fail;
700 	}
701 
702 	ldcp->ldc_status = istatus;
703 	ldcp->ldc_port = port;
704 	ldcp->ldc_vswp = vswp;
705 
706 	vsw_reset_vnet_proto_ops(ldcp);
707 
708 	(void) sprintf(kname, "%sldc0x%lx", DRV_NAME, ldcp->ldc_id);
709 	ldcp->ksp = vgen_setup_kstats(DRV_NAME, vswp->instance,
710 	    kname, &ldcp->ldc_stats);
711 	if (ldcp->ksp == NULL) {
712 		DERR(vswp, "%s: kstats setup failed", __func__);
713 		goto ldc_attach_fail;
714 	}
715 
716 	/* link it into the list of channels for this port */
717 	WRITE_ENTER(&ldcl->lockrw);
718 	ldcp->ldc_next = ldcl->head;
719 	ldcl->head = ldcp;
720 	RW_EXIT(&ldcl->lockrw);
721 
722 	D1(vswp, "%s: exit", __func__);
723 	return (0);
724 
725 ldc_attach_fail:
726 
727 	if (progress & PROG_callback) {
728 		(void) ldc_unreg_callback(ldcp->ldc_handle);
729 		kmem_free(ldcp->ldcmsg, ldcp->msglen);
730 	}
731 
732 	if (progress & PROG_rx_thread) {
733 		if (ldcp->rx_thread != NULL) {
734 			vsw_stop_rx_thread(ldcp);
735 		}
736 		mutex_destroy(&ldcp->rx_thr_lock);
737 		cv_destroy(&ldcp->rx_thr_cv);
738 	}
739 
740 	if (progress & PROG_tx_thread) {
741 		if (ldcp->tx_thread != NULL) {
742 			vsw_stop_tx_thread(ldcp);
743 		}
744 		mutex_destroy(&ldcp->tx_thr_lock);
745 		cv_destroy(&ldcp->tx_thr_cv);
746 	}
747 	if (ldcp->ksp != NULL) {
748 		vgen_destroy_kstats(ldcp->ksp);
749 	}
750 	mutex_destroy(&ldcp->ldc_txlock);
751 	mutex_destroy(&ldcp->ldc_rxlock);
752 	mutex_destroy(&ldcp->ldc_cblock);
753 	mutex_destroy(&ldcp->drain_cv_lock);
754 
755 	cv_destroy(&ldcp->drain_cv);
756 
757 	rw_destroy(&ldcp->lane_in.dlistrw);
758 	rw_destroy(&ldcp->lane_out.dlistrw);
759 
760 	if (progress & PROG_mblks) {
761 		vio_destroy_multipools(&ldcp->vmp, &vswp->rxh);
762 	}
763 	kmem_free(ldcp, sizeof (vsw_ldc_t));
764 
765 	return (1);
766 }
767 
768 /*
769  * Detach a logical domain channel (ldc) belonging to a
770  * particular port.
771  *
772  * Returns 0 on success, 1 on failure.
773  */
774 static int
775 vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id)
776 {
777 	vsw_t 		*vswp = port->p_vswp;
778 	vsw_ldc_t 	*ldcp, *prev_ldcp;
779 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
780 	int 		rv;
781 
782 	prev_ldcp = ldcl->head;
783 	for (; (ldcp = prev_ldcp) != NULL; prev_ldcp = ldcp->ldc_next) {
784 		if (ldcp->ldc_id == ldc_id) {
785 			break;
786 		}
787 	}
788 
789 	/* specified ldc id not found */
790 	if (ldcp == NULL) {
791 		DERR(vswp, "%s: ldcp = NULL", __func__);
792 		return (1);
793 	}
794 
795 	D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id);
796 
797 	/* Stop the receive thread */
798 	if (ldcp->rx_thread != NULL) {
799 		vsw_stop_rx_thread(ldcp);
800 		mutex_destroy(&ldcp->rx_thr_lock);
801 		cv_destroy(&ldcp->rx_thr_cv);
802 	}
803 	kmem_free(ldcp->ldcmsg, ldcp->msglen);
804 
805 	/* Stop the tx thread */
806 	if (ldcp->tx_thread != NULL) {
807 		vsw_stop_tx_thread(ldcp);
808 		mutex_destroy(&ldcp->tx_thr_lock);
809 		cv_destroy(&ldcp->tx_thr_cv);
810 		if (ldcp->tx_mhead != NULL) {
811 			freemsgchain(ldcp->tx_mhead);
812 			ldcp->tx_mhead = ldcp->tx_mtail = NULL;
813 			ldcp->tx_cnt = 0;
814 		}
815 	}
816 
817 	/* Destory kstats */
818 	vgen_destroy_kstats(ldcp->ksp);
819 
820 	/*
821 	 * Before we can close the channel we must release any mapped
822 	 * resources (e.g. drings).
823 	 */
824 	vsw_free_lane_resources(ldcp, INBOUND);
825 	vsw_free_lane_resources(ldcp, OUTBOUND);
826 
827 	/*
828 	 * If the close fails we are in serious trouble, as won't
829 	 * be able to delete the parent port.
830 	 */
831 	if ((rv = ldc_close(ldcp->ldc_handle)) != 0) {
832 		DERR(vswp, "%s: error %d closing channel %lld",
833 		    __func__, rv, ldcp->ldc_id);
834 		return (1);
835 	}
836 
837 	(void) ldc_fini(ldcp->ldc_handle);
838 
839 	ldcp->ldc_status = LDC_INIT;
840 	ldcp->ldc_handle = NULL;
841 	ldcp->ldc_vswp = NULL;
842 
843 
844 	/*
845 	 * Most likely some mblks are still in use and
846 	 * have not been returned to the pool. These mblks are
847 	 * added to the pool that is maintained in the device instance.
848 	 * Another attempt will be made to destroy the pool
849 	 * when the device detaches.
850 	 */
851 	vio_destroy_multipools(&ldcp->vmp, &vswp->rxh);
852 
853 	/* unlink it from the list */
854 	prev_ldcp = ldcp->ldc_next;
855 
856 	mutex_destroy(&ldcp->ldc_txlock);
857 	mutex_destroy(&ldcp->ldc_rxlock);
858 	mutex_destroy(&ldcp->ldc_cblock);
859 	cv_destroy(&ldcp->drain_cv);
860 	mutex_destroy(&ldcp->drain_cv_lock);
861 	mutex_destroy(&ldcp->status_lock);
862 	rw_destroy(&ldcp->lane_in.dlistrw);
863 	rw_destroy(&ldcp->lane_out.dlistrw);
864 
865 	kmem_free(ldcp, sizeof (vsw_ldc_t));
866 
867 	return (0);
868 }
869 
870 /*
871  * Open and attempt to bring up the channel. Note that channel
872  * can only be brought up if peer has also opened channel.
873  *
874  * Returns 0 if can open and bring up channel, otherwise
875  * returns 1.
876  */
877 static int
878 vsw_ldc_init(vsw_ldc_t *ldcp)
879 {
880 	vsw_t 		*vswp = ldcp->ldc_vswp;
881 	ldc_status_t	istatus = 0;
882 	int		rv;
883 
884 	D1(vswp, "%s: enter", __func__);
885 
886 	LDC_ENTER_LOCK(ldcp);
887 
888 	/* don't start at 0 in case clients don't like that */
889 	ldcp->next_ident = 1;
890 
891 	rv = ldc_open(ldcp->ldc_handle);
892 	if (rv != 0) {
893 		DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)",
894 		    __func__, ldcp->ldc_id, rv);
895 		LDC_EXIT_LOCK(ldcp);
896 		return (1);
897 	}
898 
899 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
900 		DERR(vswp, "%s: unable to get status", __func__);
901 		LDC_EXIT_LOCK(ldcp);
902 		return (1);
903 
904 	} else if (istatus != LDC_OPEN && istatus != LDC_READY) {
905 		DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY",
906 		    __func__, ldcp->ldc_id, istatus);
907 		LDC_EXIT_LOCK(ldcp);
908 		return (1);
909 	}
910 
911 	mutex_enter(&ldcp->status_lock);
912 	ldcp->ldc_status = istatus;
913 	mutex_exit(&ldcp->status_lock);
914 
915 	rv = ldc_up(ldcp->ldc_handle);
916 	if (rv != 0) {
917 		/*
918 		 * Not a fatal error for ldc_up() to fail, as peer
919 		 * end point may simply not be ready yet.
920 		 */
921 		D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__,
922 		    ldcp->ldc_id, rv);
923 		LDC_EXIT_LOCK(ldcp);
924 		return (1);
925 	}
926 
927 	/*
928 	 * ldc_up() call is non-blocking so need to explicitly
929 	 * check channel status to see if in fact the channel
930 	 * is UP.
931 	 */
932 	mutex_enter(&ldcp->status_lock);
933 	if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) {
934 		DERR(vswp, "%s: unable to get status", __func__);
935 		mutex_exit(&ldcp->status_lock);
936 		LDC_EXIT_LOCK(ldcp);
937 		return (1);
938 
939 	}
940 
941 	if (ldcp->ldc_status == LDC_UP) {
942 		D2(vswp, "%s: channel %ld now UP (%ld)", __func__,
943 		    ldcp->ldc_id, istatus);
944 		mutex_exit(&ldcp->status_lock);
945 		LDC_EXIT_LOCK(ldcp);
946 
947 		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
948 		return (0);
949 	}
950 
951 	mutex_exit(&ldcp->status_lock);
952 	LDC_EXIT_LOCK(ldcp);
953 
954 	D1(vswp, "%s: exit", __func__);
955 	return (0);
956 }
957 
958 /* disable callbacks on the channel */
959 static int
960 vsw_ldc_uninit(vsw_ldc_t *ldcp)
961 {
962 	vsw_t	*vswp = ldcp->ldc_vswp;
963 	int	rv;
964 
965 	D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id);
966 
967 	LDC_ENTER_LOCK(ldcp);
968 
969 	rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE);
970 	if (rv != 0) {
971 		DERR(vswp, "vsw_ldc_uninit(%lld): error disabling "
972 		    "interrupts (rv = %d)\n", ldcp->ldc_id, rv);
973 		LDC_EXIT_LOCK(ldcp);
974 		return (1);
975 	}
976 
977 	mutex_enter(&ldcp->status_lock);
978 	ldcp->ldc_status = LDC_INIT;
979 	mutex_exit(&ldcp->status_lock);
980 
981 	LDC_EXIT_LOCK(ldcp);
982 
983 	D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id);
984 
985 	return (0);
986 }
987 
988 static int
989 vsw_init_ldcs(vsw_port_t *port)
990 {
991 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
992 	vsw_ldc_t	*ldcp;
993 
994 	READ_ENTER(&ldcl->lockrw);
995 	ldcp =  ldcl->head;
996 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
997 		(void) vsw_ldc_init(ldcp);
998 	}
999 	RW_EXIT(&ldcl->lockrw);
1000 
1001 	return (0);
1002 }
1003 
1004 static int
1005 vsw_uninit_ldcs(vsw_port_t *port)
1006 {
1007 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
1008 	vsw_ldc_t	*ldcp;
1009 
1010 	D1(NULL, "vsw_uninit_ldcs: enter\n");
1011 
1012 	READ_ENTER(&ldcl->lockrw);
1013 	ldcp =  ldcl->head;
1014 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
1015 		(void) vsw_ldc_uninit(ldcp);
1016 	}
1017 	RW_EXIT(&ldcl->lockrw);
1018 
1019 	D1(NULL, "vsw_uninit_ldcs: exit\n");
1020 
1021 	return (0);
1022 }
1023 
1024 /*
1025  * Wait until the callback(s) associated with the ldcs under the specified
1026  * port have completed.
1027  *
1028  * Prior to this function being invoked each channel under this port
1029  * should have been quiesced via ldc_set_cb_mode(DISABLE).
1030  *
1031  * A short explaination of what we are doing below..
1032  *
1033  * The simplest approach would be to have a reference counter in
1034  * the ldc structure which is increment/decremented by the callbacks as
1035  * they use the channel. The drain function could then simply disable any
1036  * further callbacks and do a cv_wait for the ref to hit zero. Unfortunately
1037  * there is a tiny window here - before the callback is able to get the lock
1038  * on the channel it is interrupted and this function gets to execute. It
1039  * sees that the ref count is zero and believes its free to delete the
1040  * associated data structures.
1041  *
1042  * We get around this by taking advantage of the fact that before the ldc
1043  * framework invokes a callback it sets a flag to indicate that there is a
1044  * callback active (or about to become active). If when we attempt to
1045  * unregister a callback when this active flag is set then the unregister
1046  * will fail with EWOULDBLOCK.
1047  *
1048  * If the unregister fails we do a cv_timedwait. We will either be signaled
1049  * by the callback as it is exiting (note we have to wait a short period to
1050  * allow the callback to return fully to the ldc framework and it to clear
1051  * the active flag), or by the timer expiring. In either case we again attempt
1052  * the unregister. We repeat this until we can succesfully unregister the
1053  * callback.
1054  *
1055  * The reason we use a cv_timedwait rather than a simple cv_wait is to catch
1056  * the case where the callback has finished but the ldc framework has not yet
1057  * cleared the active flag. In this case we would never get a cv_signal.
1058  */
1059 static int
1060 vsw_drain_ldcs(vsw_port_t *port)
1061 {
1062 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
1063 	vsw_ldc_t	*ldcp;
1064 	vsw_t		*vswp = port->p_vswp;
1065 
1066 	D1(vswp, "%s: enter", __func__);
1067 
1068 	READ_ENTER(&ldcl->lockrw);
1069 
1070 	ldcp = ldcl->head;
1071 
1072 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
1073 		/*
1074 		 * If we can unregister the channel callback then we
1075 		 * know that there is no callback either running or
1076 		 * scheduled to run for this channel so move on to next
1077 		 * channel in the list.
1078 		 */
1079 		mutex_enter(&ldcp->drain_cv_lock);
1080 
1081 		/* prompt active callbacks to quit */
1082 		ldcp->drain_state = VSW_LDC_DRAINING;
1083 
1084 		if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) {
1085 			D2(vswp, "%s: unreg callback for chan %ld", __func__,
1086 			    ldcp->ldc_id);
1087 			mutex_exit(&ldcp->drain_cv_lock);
1088 			continue;
1089 		} else {
1090 			/*
1091 			 * If we end up here we know that either 1) a callback
1092 			 * is currently executing, 2) is about to start (i.e.
1093 			 * the ldc framework has set the active flag but
1094 			 * has not actually invoked the callback yet, or 3)
1095 			 * has finished and has returned to the ldc framework
1096 			 * but the ldc framework has not yet cleared the
1097 			 * active bit.
1098 			 *
1099 			 * Wait for it to finish.
1100 			 */
1101 			while (ldc_unreg_callback(ldcp->ldc_handle)
1102 			    == EWOULDBLOCK)
1103 				(void) cv_timedwait(&ldcp->drain_cv,
1104 				    &ldcp->drain_cv_lock, lbolt + hz);
1105 
1106 			mutex_exit(&ldcp->drain_cv_lock);
1107 			D2(vswp, "%s: unreg callback for chan %ld after "
1108 			    "timeout", __func__, ldcp->ldc_id);
1109 		}
1110 	}
1111 	RW_EXIT(&ldcl->lockrw);
1112 
1113 	D1(vswp, "%s: exit", __func__);
1114 	return (0);
1115 }
1116 
1117 /*
1118  * Wait until all tasks which reference this port have completed.
1119  *
1120  * Prior to this function being invoked each channel under this port
1121  * should have been quiesced via ldc_set_cb_mode(DISABLE).
1122  */
1123 static int
1124 vsw_drain_port_taskq(vsw_port_t *port)
1125 {
1126 	vsw_t		*vswp = port->p_vswp;
1127 
1128 	D1(vswp, "%s: enter", __func__);
1129 
1130 	/*
1131 	 * Mark the port as in the process of being detached, and
1132 	 * dispatch a marker task to the queue so we know when all
1133 	 * relevant tasks have completed.
1134 	 */
1135 	mutex_enter(&port->state_lock);
1136 	port->state = VSW_PORT_DETACHING;
1137 
1138 	if ((vswp->taskq_p == NULL) ||
1139 	    (ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task,
1140 	    port, DDI_NOSLEEP) != DDI_SUCCESS)) {
1141 		DERR(vswp, "%s: unable to dispatch marker task",
1142 		    __func__);
1143 		mutex_exit(&port->state_lock);
1144 		return (1);
1145 	}
1146 
1147 	/*
1148 	 * Wait for the marker task to finish.
1149 	 */
1150 	while (port->state != VSW_PORT_DETACHABLE)
1151 		cv_wait(&port->state_cv, &port->state_lock);
1152 
1153 	mutex_exit(&port->state_lock);
1154 
1155 	D1(vswp, "%s: exit", __func__);
1156 
1157 	return (0);
1158 }
1159 
1160 static void
1161 vsw_marker_task(void *arg)
1162 {
1163 	vsw_port_t	*port = arg;
1164 	vsw_t		*vswp = port->p_vswp;
1165 
1166 	D1(vswp, "%s: enter", __func__);
1167 
1168 	mutex_enter(&port->state_lock);
1169 
1170 	/*
1171 	 * No further tasks should be dispatched which reference
1172 	 * this port so ok to mark it as safe to detach.
1173 	 */
1174 	port->state = VSW_PORT_DETACHABLE;
1175 
1176 	cv_signal(&port->state_cv);
1177 
1178 	mutex_exit(&port->state_lock);
1179 
1180 	D1(vswp, "%s: exit", __func__);
1181 }
1182 
1183 vsw_port_t *
1184 vsw_lookup_port(vsw_t *vswp, int p_instance)
1185 {
1186 	vsw_port_list_t *plist = &vswp->plist;
1187 	vsw_port_t	*port;
1188 
1189 	for (port = plist->head; port != NULL; port = port->p_next) {
1190 		if (port->p_instance == p_instance) {
1191 			D2(vswp, "vsw_lookup_port: found p_instance\n");
1192 			return (port);
1193 		}
1194 	}
1195 
1196 	return (NULL);
1197 }
1198 
1199 void
1200 vsw_vlan_unaware_port_reset(vsw_port_t *portp)
1201 {
1202 	vsw_ldc_list_t 	*ldclp;
1203 	vsw_ldc_t	*ldcp;
1204 
1205 	ldclp = &portp->p_ldclist;
1206 
1207 	READ_ENTER(&ldclp->lockrw);
1208 
1209 	/*
1210 	 * NOTE: for now, we will assume we have a single channel.
1211 	 */
1212 	if (ldclp->head == NULL) {
1213 		RW_EXIT(&ldclp->lockrw);
1214 		return;
1215 	}
1216 	ldcp = ldclp->head;
1217 
1218 	mutex_enter(&ldcp->ldc_cblock);
1219 
1220 	/*
1221 	 * If the peer is vlan_unaware(ver < 1.3), reset channel and terminate
1222 	 * the connection. See comments in vsw_set_vnet_proto_ops().
1223 	 */
1224 	if (ldcp->hphase == VSW_MILESTONE4 && VSW_VER_LT(ldcp, 1, 3) &&
1225 	    portp->nvids != 0) {
1226 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1227 	}
1228 
1229 	mutex_exit(&ldcp->ldc_cblock);
1230 
1231 	RW_EXIT(&ldclp->lockrw);
1232 }
1233 
1234 void
1235 vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate)
1236 {
1237 	vsw_ldc_list_t	*ldclp;
1238 	vsw_ldc_t	*ldcp;
1239 
1240 	ldclp = &portp->p_ldclist;
1241 
1242 	READ_ENTER(&ldclp->lockrw);
1243 
1244 	/*
1245 	 * NOTE: for now, we will assume we have a single channel.
1246 	 */
1247 	if (ldclp->head == NULL) {
1248 		RW_EXIT(&ldclp->lockrw);
1249 		return;
1250 	}
1251 	ldcp = ldclp->head;
1252 
1253 	mutex_enter(&ldcp->ldc_cblock);
1254 
1255 	/*
1256 	 * If the peer is HybridIO capable (ver >= 1.3), reset channel
1257 	 * to trigger re-negotiation, which inturn trigger HybridIO
1258 	 * setup/cleanup.
1259 	 */
1260 	if ((ldcp->hphase == VSW_MILESTONE4) &&
1261 	    (portp->p_hio_capable == B_TRUE)) {
1262 		if (immediate == B_TRUE) {
1263 			(void) ldc_down(ldcp->ldc_handle);
1264 		} else {
1265 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1266 		}
1267 	}
1268 
1269 	mutex_exit(&ldcp->ldc_cblock);
1270 
1271 	RW_EXIT(&ldclp->lockrw);
1272 }
1273 
1274 /*
1275  * Search for and remove the specified port from the port
1276  * list. Returns 0 if able to locate and remove port, otherwise
1277  * returns 1.
1278  */
1279 static int
1280 vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port)
1281 {
1282 	vsw_port_list_t *plist = &vswp->plist;
1283 	vsw_port_t	*curr_p, *prev_p;
1284 
1285 	if (plist->head == NULL)
1286 		return (1);
1287 
1288 	curr_p = prev_p = plist->head;
1289 
1290 	while (curr_p != NULL) {
1291 		if (curr_p == port) {
1292 			if (prev_p == curr_p) {
1293 				plist->head = curr_p->p_next;
1294 			} else {
1295 				prev_p->p_next = curr_p->p_next;
1296 			}
1297 			plist->num_ports--;
1298 			break;
1299 		} else {
1300 			prev_p = curr_p;
1301 			curr_p = curr_p->p_next;
1302 		}
1303 	}
1304 	return (0);
1305 }
1306 
1307 /*
1308  * Interrupt handler for ldc messages.
1309  */
1310 static uint_t
1311 vsw_ldc_cb(uint64_t event, caddr_t arg)
1312 {
1313 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
1314 	vsw_t 		*vswp = ldcp->ldc_vswp;
1315 
1316 	D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
1317 
1318 	mutex_enter(&ldcp->ldc_cblock);
1319 	ldcp->ldc_stats.callbacks++;
1320 
1321 	mutex_enter(&ldcp->status_lock);
1322 	if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) {
1323 		mutex_exit(&ldcp->status_lock);
1324 		mutex_exit(&ldcp->ldc_cblock);
1325 		return (LDC_SUCCESS);
1326 	}
1327 	mutex_exit(&ldcp->status_lock);
1328 
1329 	if (event & LDC_EVT_UP) {
1330 		/*
1331 		 * Channel has come up.
1332 		 */
1333 		D2(vswp, "%s: id(%ld) event(%llx) UP: status(%ld)",
1334 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1335 
1336 		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
1337 
1338 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
1339 	}
1340 
1341 	if (event & LDC_EVT_READ) {
1342 		/*
1343 		 * Data available for reading.
1344 		 */
1345 		D2(vswp, "%s: id(ld) event(%llx) data READ",
1346 		    __func__, ldcp->ldc_id, event);
1347 
1348 		if (ldcp->rx_thread != NULL) {
1349 			/*
1350 			 * If the receive thread is enabled, then
1351 			 * wakeup the receive thread to process the
1352 			 * LDC messages.
1353 			 */
1354 			mutex_exit(&ldcp->ldc_cblock);
1355 			mutex_enter(&ldcp->rx_thr_lock);
1356 			if (!(ldcp->rx_thr_flags & VSW_WTHR_DATARCVD)) {
1357 				ldcp->rx_thr_flags |= VSW_WTHR_DATARCVD;
1358 				cv_signal(&ldcp->rx_thr_cv);
1359 			}
1360 			mutex_exit(&ldcp->rx_thr_lock);
1361 			mutex_enter(&ldcp->ldc_cblock);
1362 		} else {
1363 			vsw_process_pkt(ldcp);
1364 		}
1365 
1366 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
1367 
1368 		goto vsw_cb_exit;
1369 	}
1370 
1371 	if (event & (LDC_EVT_DOWN | LDC_EVT_RESET)) {
1372 		D2(vswp, "%s: id(%ld) event (%lx) DOWN/RESET: status(%ld)",
1373 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1374 
1375 		vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
1376 	}
1377 
1378 	/*
1379 	 * Catch either LDC_EVT_WRITE which we don't support or any
1380 	 * unknown event.
1381 	 */
1382 	if (event &
1383 	    ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ)) {
1384 		DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)",
1385 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1386 	}
1387 
1388 vsw_cb_exit:
1389 	mutex_exit(&ldcp->ldc_cblock);
1390 
1391 	/*
1392 	 * Let the drain function know we are finishing if it
1393 	 * is waiting.
1394 	 */
1395 	mutex_enter(&ldcp->drain_cv_lock);
1396 	if (ldcp->drain_state == VSW_LDC_DRAINING)
1397 		cv_signal(&ldcp->drain_cv);
1398 	mutex_exit(&ldcp->drain_cv_lock);
1399 
1400 	return (LDC_SUCCESS);
1401 }
1402 
1403 /*
1404  * Reinitialise data structures associated with the channel.
1405  */
1406 static void
1407 vsw_ldc_reinit(vsw_ldc_t *ldcp)
1408 {
1409 	vsw_t		*vswp = ldcp->ldc_vswp;
1410 	vsw_port_t	*port;
1411 	vsw_ldc_list_t	*ldcl;
1412 
1413 	D1(vswp, "%s: enter", __func__);
1414 
1415 	port = ldcp->ldc_port;
1416 	ldcl = &port->p_ldclist;
1417 
1418 	READ_ENTER(&ldcl->lockrw);
1419 
1420 	D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__,
1421 	    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
1422 
1423 	vsw_free_lane_resources(ldcp, INBOUND);
1424 	vsw_free_lane_resources(ldcp, OUTBOUND);
1425 	RW_EXIT(&ldcl->lockrw);
1426 
1427 	ldcp->lane_in.lstate = 0;
1428 	ldcp->lane_out.lstate = 0;
1429 
1430 	/* Remove the fdb entry for this port/mac address */
1431 	vsw_fdbe_del(vswp, &(port->p_macaddr));
1432 
1433 	/* remove the port from vlans it has been assigned to */
1434 	vsw_vlan_remove_ids(port, VSW_VNETPORT);
1435 
1436 	/*
1437 	 * Remove parent port from any multicast groups
1438 	 * it may have registered with. Client must resend
1439 	 * multicast add command after handshake completes.
1440 	 */
1441 	vsw_del_mcst_port(port);
1442 
1443 	ldcp->peer_session = 0;
1444 	ldcp->session_status = 0;
1445 	ldcp->hcnt = 0;
1446 	ldcp->hphase = VSW_MILESTONE0;
1447 
1448 	vsw_reset_vnet_proto_ops(ldcp);
1449 
1450 	D1(vswp, "%s: exit", __func__);
1451 }
1452 
1453 /*
1454  * Process a connection event.
1455  *
1456  * Note - care must be taken to ensure that this function is
1457  * not called with the dlistrw lock held.
1458  */
1459 static void
1460 vsw_process_conn_evt(vsw_ldc_t *ldcp, uint16_t evt)
1461 {
1462 	vsw_t		*vswp = ldcp->ldc_vswp;
1463 	vsw_conn_evt_t	*conn = NULL;
1464 
1465 	D1(vswp, "%s: enter", __func__);
1466 
1467 	/*
1468 	 * Check if either a reset or restart event is pending
1469 	 * or in progress. If so just return.
1470 	 *
1471 	 * A VSW_CONN_RESET event originates either with a LDC_RESET_EVT
1472 	 * being received by the callback handler, or a ECONNRESET error
1473 	 * code being returned from a ldc_read() or ldc_write() call.
1474 	 *
1475 	 * A VSW_CONN_RESTART event occurs when some error checking code
1476 	 * decides that there is a problem with data from the channel,
1477 	 * and that the handshake should be restarted.
1478 	 */
1479 	if (((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART)) &&
1480 	    (ldstub((uint8_t *)&ldcp->reset_active)))
1481 		return;
1482 
1483 	/*
1484 	 * If it is an LDC_UP event we first check the recorded
1485 	 * state of the channel. If this is UP then we know that
1486 	 * the channel moving to the UP state has already been dealt
1487 	 * with and don't need to dispatch a  new task.
1488 	 *
1489 	 * The reason for this check is that when we do a ldc_up(),
1490 	 * depending on the state of the peer, we may or may not get
1491 	 * a LDC_UP event. As we can't depend on getting a LDC_UP evt
1492 	 * every time we do ldc_up() we explicitly check the channel
1493 	 * status to see has it come up (ldc_up() is asynch and will
1494 	 * complete at some undefined time), and take the appropriate
1495 	 * action.
1496 	 *
1497 	 * The flip side of this is that we may get a LDC_UP event
1498 	 * when we have already seen that the channel is up and have
1499 	 * dealt with that.
1500 	 */
1501 	mutex_enter(&ldcp->status_lock);
1502 	if (evt == VSW_CONN_UP) {
1503 		if ((ldcp->ldc_status == LDC_UP) || (ldcp->reset_active != 0)) {
1504 			mutex_exit(&ldcp->status_lock);
1505 			return;
1506 		}
1507 	}
1508 	mutex_exit(&ldcp->status_lock);
1509 
1510 	/*
1511 	 * The transaction group id allows us to identify and discard
1512 	 * any tasks which are still pending on the taskq and refer
1513 	 * to the handshake session we are about to restart or reset.
1514 	 * These stale messages no longer have any real meaning.
1515 	 */
1516 	(void) atomic_inc_32(&ldcp->hss_id);
1517 
1518 	ASSERT(vswp->taskq_p != NULL);
1519 
1520 	if ((conn = kmem_zalloc(sizeof (vsw_conn_evt_t), KM_NOSLEEP)) == NULL) {
1521 		cmn_err(CE_WARN, "!vsw%d: unable to allocate memory for"
1522 		    " connection event", vswp->instance);
1523 		goto err_exit;
1524 	}
1525 
1526 	conn->evt = evt;
1527 	conn->ldcp = ldcp;
1528 
1529 	if (ddi_taskq_dispatch(vswp->taskq_p, vsw_conn_task, conn,
1530 	    DDI_NOSLEEP) != DDI_SUCCESS) {
1531 		cmn_err(CE_WARN, "!vsw%d: Can't dispatch connection task",
1532 		    vswp->instance);
1533 
1534 		kmem_free(conn, sizeof (vsw_conn_evt_t));
1535 		goto err_exit;
1536 	}
1537 
1538 	D1(vswp, "%s: exit", __func__);
1539 	return;
1540 
1541 err_exit:
1542 	/*
1543 	 * Have mostly likely failed due to memory shortage. Clear the flag so
1544 	 * that future requests will at least be attempted and will hopefully
1545 	 * succeed.
1546 	 */
1547 	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
1548 		ldcp->reset_active = 0;
1549 }
1550 
1551 /*
1552  * Deal with events relating to a connection. Invoked from a taskq.
1553  */
1554 static void
1555 vsw_conn_task(void *arg)
1556 {
1557 	vsw_conn_evt_t	*conn = (vsw_conn_evt_t *)arg;
1558 	vsw_ldc_t	*ldcp = NULL;
1559 	vsw_port_t	*portp;
1560 	vsw_t		*vswp = NULL;
1561 	uint16_t	evt;
1562 	ldc_status_t	curr_status;
1563 
1564 	ldcp = conn->ldcp;
1565 	evt = conn->evt;
1566 	vswp = ldcp->ldc_vswp;
1567 	portp = ldcp->ldc_port;
1568 
1569 	D1(vswp, "%s: enter", __func__);
1570 
1571 	/* can safely free now have copied out data */
1572 	kmem_free(conn, sizeof (vsw_conn_evt_t));
1573 
1574 	mutex_enter(&ldcp->status_lock);
1575 	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
1576 		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
1577 		    "channel %ld", vswp->instance, ldcp->ldc_id);
1578 		mutex_exit(&ldcp->status_lock);
1579 		return;
1580 	}
1581 
1582 	/*
1583 	 * If we wish to restart the handshake on this channel, then if
1584 	 * the channel is UP we bring it DOWN to flush the underlying
1585 	 * ldc queue.
1586 	 */
1587 	if ((evt == VSW_CONN_RESTART) && (curr_status == LDC_UP))
1588 		(void) ldc_down(ldcp->ldc_handle);
1589 
1590 	if ((portp->p_hio_capable) && (portp->p_hio_enabled)) {
1591 		vsw_hio_stop(vswp, ldcp);
1592 	}
1593 
1594 	/*
1595 	 * re-init all the associated data structures.
1596 	 */
1597 	vsw_ldc_reinit(ldcp);
1598 
1599 	/*
1600 	 * Bring the channel back up (note it does no harm to
1601 	 * do this even if the channel is already UP, Just
1602 	 * becomes effectively a no-op).
1603 	 */
1604 	(void) ldc_up(ldcp->ldc_handle);
1605 
1606 	/*
1607 	 * Check if channel is now UP. This will only happen if
1608 	 * peer has also done a ldc_up().
1609 	 */
1610 	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
1611 		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
1612 		    "channel %ld", vswp->instance, ldcp->ldc_id);
1613 		mutex_exit(&ldcp->status_lock);
1614 		return;
1615 	}
1616 
1617 	ldcp->ldc_status = curr_status;
1618 
1619 	/* channel UP so restart handshake by sending version info */
1620 	if (curr_status == LDC_UP) {
1621 		if (ldcp->hcnt++ > vsw_num_handshakes) {
1622 			cmn_err(CE_WARN, "!vsw%d: exceeded number of permitted"
1623 			    " handshake attempts (%d) on channel %ld",
1624 			    vswp->instance, ldcp->hcnt, ldcp->ldc_id);
1625 			mutex_exit(&ldcp->status_lock);
1626 			return;
1627 		}
1628 
1629 		if (vsw_obp_ver_proto_workaround == B_FALSE &&
1630 		    (ddi_taskq_dispatch(vswp->taskq_p, vsw_send_ver, ldcp,
1631 		    DDI_NOSLEEP) != DDI_SUCCESS)) {
1632 			cmn_err(CE_WARN, "!vsw%d: Can't dispatch version task",
1633 			    vswp->instance);
1634 
1635 			/*
1636 			 * Don't count as valid restart attempt if couldn't
1637 			 * send version msg.
1638 			 */
1639 			if (ldcp->hcnt > 0)
1640 				ldcp->hcnt--;
1641 		}
1642 	}
1643 
1644 	/*
1645 	 * Mark that the process is complete by clearing the flag.
1646 	 *
1647 	 * Note is it possible that the taskq dispatch above may have failed,
1648 	 * most likely due to memory shortage. We still clear the flag so
1649 	 * future attempts will at least be attempted and will hopefully
1650 	 * succeed.
1651 	 */
1652 	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
1653 		ldcp->reset_active = 0;
1654 
1655 	mutex_exit(&ldcp->status_lock);
1656 
1657 	D1(vswp, "%s: exit", __func__);
1658 }
1659 
1660 /*
1661  * returns 0 if legal for event signified by flag to have
1662  * occured at the time it did. Otherwise returns 1.
1663  */
1664 int
1665 vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag)
1666 {
1667 	vsw_t		*vswp = ldcp->ldc_vswp;
1668 	uint64_t	state;
1669 	uint64_t	phase;
1670 
1671 	if (dir == INBOUND)
1672 		state = ldcp->lane_in.lstate;
1673 	else
1674 		state = ldcp->lane_out.lstate;
1675 
1676 	phase = ldcp->hphase;
1677 
1678 	switch (flag) {
1679 	case VSW_VER_INFO_RECV:
1680 		if (phase > VSW_MILESTONE0) {
1681 			DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV"
1682 			    " when in state %d\n", ldcp->ldc_id, phase);
1683 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1684 			return (1);
1685 		}
1686 		break;
1687 
1688 	case VSW_VER_ACK_RECV:
1689 	case VSW_VER_NACK_RECV:
1690 		if (!(state & VSW_VER_INFO_SENT)) {
1691 			DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK or "
1692 			    "VER_NACK when in state %d\n", ldcp->ldc_id, phase);
1693 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1694 			return (1);
1695 		} else
1696 			state &= ~VSW_VER_INFO_SENT;
1697 		break;
1698 
1699 	case VSW_ATTR_INFO_RECV:
1700 		if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) {
1701 			DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV"
1702 			    " when in state %d\n", ldcp->ldc_id, phase);
1703 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1704 			return (1);
1705 		}
1706 		break;
1707 
1708 	case VSW_ATTR_ACK_RECV:
1709 	case VSW_ATTR_NACK_RECV:
1710 		if (!(state & VSW_ATTR_INFO_SENT)) {
1711 			DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK"
1712 			    " or ATTR_NACK when in state %d\n",
1713 			    ldcp->ldc_id, phase);
1714 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1715 			return (1);
1716 		} else
1717 			state &= ~VSW_ATTR_INFO_SENT;
1718 		break;
1719 
1720 	case VSW_DRING_INFO_RECV:
1721 		if (phase < VSW_MILESTONE1) {
1722 			DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV"
1723 			    " when in state %d\n", ldcp->ldc_id, phase);
1724 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1725 			return (1);
1726 		}
1727 		break;
1728 
1729 	case VSW_DRING_ACK_RECV:
1730 	case VSW_DRING_NACK_RECV:
1731 		if (!(state & VSW_DRING_INFO_SENT)) {
1732 			DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK "
1733 			    " or DRING_NACK when in state %d\n",
1734 			    ldcp->ldc_id, phase);
1735 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1736 			return (1);
1737 		} else
1738 			state &= ~VSW_DRING_INFO_SENT;
1739 		break;
1740 
1741 	case VSW_RDX_INFO_RECV:
1742 		if (phase < VSW_MILESTONE3) {
1743 			DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV"
1744 			    " when in state %d\n", ldcp->ldc_id, phase);
1745 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1746 			return (1);
1747 		}
1748 		break;
1749 
1750 	case VSW_RDX_ACK_RECV:
1751 	case VSW_RDX_NACK_RECV:
1752 		if (!(state & VSW_RDX_INFO_SENT)) {
1753 			DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK or "
1754 			    "RDX_NACK when in state %d\n", ldcp->ldc_id, phase);
1755 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1756 			return (1);
1757 		} else
1758 			state &= ~VSW_RDX_INFO_SENT;
1759 		break;
1760 
1761 	case VSW_MCST_INFO_RECV:
1762 		if (phase < VSW_MILESTONE3) {
1763 			DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV"
1764 			    " when in state %d\n", ldcp->ldc_id, phase);
1765 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1766 			return (1);
1767 		}
1768 		break;
1769 
1770 	default:
1771 		DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)",
1772 		    ldcp->ldc_id, flag);
1773 		return (1);
1774 	}
1775 
1776 	if (dir == INBOUND)
1777 		ldcp->lane_in.lstate = state;
1778 	else
1779 		ldcp->lane_out.lstate = state;
1780 
1781 	D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id);
1782 
1783 	return (0);
1784 }
1785 
1786 void
1787 vsw_next_milestone(vsw_ldc_t *ldcp)
1788 {
1789 	vsw_t		*vswp = ldcp->ldc_vswp;
1790 	vsw_port_t	*portp = ldcp->ldc_port;
1791 
1792 	D1(vswp, "%s (chan %lld): enter (phase %ld)", __func__,
1793 	    ldcp->ldc_id, ldcp->hphase);
1794 
1795 	DUMP_FLAGS(ldcp->lane_in.lstate);
1796 	DUMP_FLAGS(ldcp->lane_out.lstate);
1797 
1798 	switch (ldcp->hphase) {
1799 
1800 	case VSW_MILESTONE0:
1801 		/*
1802 		 * If we haven't started to handshake with our peer,
1803 		 * start to do so now.
1804 		 */
1805 		if (ldcp->lane_out.lstate == 0) {
1806 			D2(vswp, "%s: (chan %lld) starting handshake "
1807 			    "with peer", __func__, ldcp->ldc_id);
1808 			vsw_process_conn_evt(ldcp, VSW_CONN_UP);
1809 		}
1810 
1811 		/*
1812 		 * Only way to pass this milestone is to have successfully
1813 		 * negotiated version info.
1814 		 */
1815 		if ((ldcp->lane_in.lstate & VSW_VER_ACK_SENT) &&
1816 		    (ldcp->lane_out.lstate & VSW_VER_ACK_RECV)) {
1817 
1818 			D2(vswp, "%s: (chan %lld) leaving milestone 0",
1819 			    __func__, ldcp->ldc_id);
1820 
1821 			vsw_set_vnet_proto_ops(ldcp);
1822 
1823 			/*
1824 			 * Next milestone is passed when attribute
1825 			 * information has been successfully exchanged.
1826 			 */
1827 			ldcp->hphase = VSW_MILESTONE1;
1828 			vsw_send_attr(ldcp);
1829 
1830 		}
1831 		break;
1832 
1833 	case VSW_MILESTONE1:
1834 		/*
1835 		 * Only way to pass this milestone is to have successfully
1836 		 * negotiated attribute information.
1837 		 */
1838 		if (ldcp->lane_in.lstate & VSW_ATTR_ACK_SENT) {
1839 
1840 			ldcp->hphase = VSW_MILESTONE2;
1841 
1842 			/*
1843 			 * If the peer device has said it wishes to
1844 			 * use descriptor rings then we send it our ring
1845 			 * info, otherwise we just set up a private ring
1846 			 * which we use an internal buffer
1847 			 */
1848 			if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
1849 			    (ldcp->lane_in.xfer_mode & VIO_DRING_MODE_V1_2)) ||
1850 			    (VSW_VER_LT(ldcp, 1, 2) &&
1851 			    (ldcp->lane_in.xfer_mode ==
1852 			    VIO_DRING_MODE_V1_0))) {
1853 				vsw_send_dring_info(ldcp);
1854 			}
1855 		}
1856 		break;
1857 
1858 	case VSW_MILESTONE2:
1859 		/*
1860 		 * If peer has indicated in its attribute message that
1861 		 * it wishes to use descriptor rings then the only way
1862 		 * to pass this milestone is for us to have received
1863 		 * valid dring info.
1864 		 *
1865 		 * If peer is not using descriptor rings then just fall
1866 		 * through.
1867 		 */
1868 		if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
1869 		    (ldcp->lane_in.xfer_mode & VIO_DRING_MODE_V1_2)) ||
1870 		    (VSW_VER_LT(ldcp, 1, 2) &&
1871 		    (ldcp->lane_in.xfer_mode ==
1872 		    VIO_DRING_MODE_V1_0))) {
1873 			if (!(ldcp->lane_in.lstate & VSW_DRING_ACK_SENT))
1874 				break;
1875 		}
1876 
1877 		D2(vswp, "%s: (chan %lld) leaving milestone 2",
1878 		    __func__, ldcp->ldc_id);
1879 
1880 		ldcp->hphase = VSW_MILESTONE3;
1881 		vsw_send_rdx(ldcp);
1882 		break;
1883 
1884 	case VSW_MILESTONE3:
1885 		/*
1886 		 * Pass this milestone when all paramaters have been
1887 		 * successfully exchanged and RDX sent in both directions.
1888 		 *
1889 		 * Mark outbound lane as available to transmit data.
1890 		 */
1891 		if ((ldcp->lane_out.lstate & VSW_RDX_ACK_SENT) &&
1892 		    (ldcp->lane_in.lstate & VSW_RDX_ACK_RECV)) {
1893 
1894 			D2(vswp, "%s: (chan %lld) leaving milestone 3",
1895 			    __func__, ldcp->ldc_id);
1896 			D2(vswp, "%s: ** handshake complete (0x%llx : "
1897 			    "0x%llx) **", __func__, ldcp->lane_in.lstate,
1898 			    ldcp->lane_out.lstate);
1899 			ldcp->lane_out.lstate |= VSW_LANE_ACTIVE;
1900 			ldcp->hphase = VSW_MILESTONE4;
1901 			ldcp->hcnt = 0;
1902 			DISPLAY_STATE();
1903 			/* Start HIO if enabled and capable */
1904 			if ((portp->p_hio_enabled) && (portp->p_hio_capable)) {
1905 				D2(vswp, "%s: start HybridIO setup", __func__);
1906 				vsw_hio_start(vswp, ldcp);
1907 			}
1908 		} else {
1909 			D2(vswp, "%s: still in milestone 3 (0x%llx : 0x%llx)",
1910 			    __func__, ldcp->lane_in.lstate,
1911 			    ldcp->lane_out.lstate);
1912 		}
1913 		break;
1914 
1915 	case VSW_MILESTONE4:
1916 		D2(vswp, "%s: (chan %lld) in milestone 4", __func__,
1917 		    ldcp->ldc_id);
1918 		break;
1919 
1920 	default:
1921 		DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__,
1922 		    ldcp->ldc_id, ldcp->hphase);
1923 	}
1924 
1925 	D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id,
1926 	    ldcp->hphase);
1927 }
1928 
1929 /*
1930  * Check if major version is supported.
1931  *
1932  * Returns 0 if finds supported major number, and if necessary
1933  * adjusts the minor field.
1934  *
1935  * Returns 1 if can't match major number exactly. Sets mjor/minor
1936  * to next lowest support values, or to zero if no other values possible.
1937  */
1938 static int
1939 vsw_supported_version(vio_ver_msg_t *vp)
1940 {
1941 	int	i;
1942 
1943 	D1(NULL, "vsw_supported_version: enter");
1944 
1945 	for (i = 0; i < VSW_NUM_VER; i++) {
1946 		if (vsw_versions[i].ver_major == vp->ver_major) {
1947 			/*
1948 			 * Matching or lower major version found. Update
1949 			 * minor number if necessary.
1950 			 */
1951 			if (vp->ver_minor > vsw_versions[i].ver_minor) {
1952 				D2(NULL, "%s: adjusting minor value from %d "
1953 				    "to %d", __func__, vp->ver_minor,
1954 				    vsw_versions[i].ver_minor);
1955 				vp->ver_minor = vsw_versions[i].ver_minor;
1956 			}
1957 
1958 			return (0);
1959 		}
1960 
1961 		/*
1962 		 * If the message contains a higher major version number, set
1963 		 * the message's major/minor versions to the current values
1964 		 * and return false, so this message will get resent with
1965 		 * these values.
1966 		 */
1967 		if (vsw_versions[i].ver_major < vp->ver_major) {
1968 			D2(NULL, "%s: adjusting major and minor "
1969 			    "values to %d, %d\n",
1970 			    __func__, vsw_versions[i].ver_major,
1971 			    vsw_versions[i].ver_minor);
1972 			vp->ver_major = vsw_versions[i].ver_major;
1973 			vp->ver_minor = vsw_versions[i].ver_minor;
1974 			return (1);
1975 		}
1976 	}
1977 
1978 	/* No match was possible, zero out fields */
1979 	vp->ver_major = 0;
1980 	vp->ver_minor = 0;
1981 
1982 	D1(NULL, "vsw_supported_version: exit");
1983 
1984 	return (1);
1985 }
1986 
1987 /*
1988  * Set vnet-protocol-version dependent functions based on version.
1989  */
1990 static void
1991 vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp)
1992 {
1993 	vsw_t	*vswp = ldcp->ldc_vswp;
1994 	lane_t	*lp = &ldcp->lane_out;
1995 
1996 	if (VSW_VER_GTEQ(ldcp, 1, 3)) {
1997 		/*
1998 		 * If the version negotiated with peer is >= 1.3,
1999 		 * set the mtu in our attributes to max_frame_size.
2000 		 */
2001 		lp->mtu = vswp->max_frame_size;
2002 	} else {
2003 		vsw_port_t	*portp = ldcp->ldc_port;
2004 		/*
2005 		 * Pre-1.3 peers expect max frame size of ETHERMAX.
2006 		 * We can negotiate that size with those peers provided the
2007 		 * following conditions are true:
2008 		 * - Our max_frame_size is greater only by VLAN_TAGSZ (4).
2009 		 * - Only pvid is defined for our peer and there are no vids.
2010 		 * If the above conditions are true, then we can send/recv only
2011 		 * untagged frames of max size ETHERMAX. Note that pvid of the
2012 		 * peer can be different, as vsw has to serve the vnet in that
2013 		 * vlan even if itself is not assigned to that vlan.
2014 		 */
2015 		if ((vswp->max_frame_size == ETHERMAX + VLAN_TAGSZ) &&
2016 		    portp->nvids == 0) {
2017 			lp->mtu = ETHERMAX;
2018 		}
2019 	}
2020 
2021 	if (VSW_VER_GTEQ(ldcp, 1, 2)) {
2022 		/* Versions >= 1.2 */
2023 
2024 		if (VSW_PRI_ETH_DEFINED(vswp)) {
2025 			/*
2026 			 * enable priority routines and pkt mode only if
2027 			 * at least one pri-eth-type is specified in MD.
2028 			 */
2029 			ldcp->tx = vsw_ldctx_pri;
2030 			ldcp->rx_pktdata = vsw_process_pkt_data;
2031 
2032 			/* set xfer mode for vsw_send_attr() */
2033 			lp->xfer_mode = VIO_PKT_MODE | VIO_DRING_MODE_V1_2;
2034 		} else {
2035 			/* no priority eth types defined in MD */
2036 
2037 			ldcp->tx = vsw_ldctx;
2038 			ldcp->rx_pktdata = vsw_process_pkt_data_nop;
2039 
2040 			/* set xfer mode for vsw_send_attr() */
2041 			lp->xfer_mode = VIO_DRING_MODE_V1_2;
2042 		}
2043 
2044 	} else {
2045 		/* Versions prior to 1.2  */
2046 
2047 		vsw_reset_vnet_proto_ops(ldcp);
2048 	}
2049 }
2050 
2051 /*
2052  * Reset vnet-protocol-version dependent functions to v1.0.
2053  */
2054 static void
2055 vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp)
2056 {
2057 	lane_t	*lp = &ldcp->lane_out;
2058 
2059 	ldcp->tx = vsw_ldctx;
2060 	ldcp->rx_pktdata = vsw_process_pkt_data_nop;
2061 
2062 	/* set xfer mode for vsw_send_attr() */
2063 	lp->xfer_mode = VIO_DRING_MODE_V1_0;
2064 }
2065 
2066 /*
2067  * Main routine for processing messages received over LDC.
2068  */
2069 static void
2070 vsw_process_pkt(void *arg)
2071 {
2072 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
2073 	vsw_t 		*vswp = ldcp->ldc_vswp;
2074 	size_t		msglen;
2075 	vio_msg_tag_t	*tagp;
2076 	uint64_t	*ldcmsg;
2077 	int 		rv = 0;
2078 
2079 
2080 	D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
2081 
2082 	ASSERT(MUTEX_HELD(&ldcp->ldc_cblock));
2083 
2084 	ldcmsg = ldcp->ldcmsg;
2085 	/*
2086 	 * If channel is up read messages until channel is empty.
2087 	 */
2088 	do {
2089 		msglen = ldcp->msglen;
2090 		rv = ldc_read(ldcp->ldc_handle, (caddr_t)ldcmsg, &msglen);
2091 
2092 		if (rv != 0) {
2093 			DERR(vswp, "%s :ldc_read err id(%lld) rv(%d) len(%d)\n",
2094 			    __func__, ldcp->ldc_id, rv, msglen);
2095 		}
2096 
2097 		/* channel has been reset */
2098 		if (rv == ECONNRESET) {
2099 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
2100 			break;
2101 		}
2102 
2103 		if (msglen == 0) {
2104 			D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__,
2105 			    ldcp->ldc_id);
2106 			break;
2107 		}
2108 
2109 		D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__,
2110 		    ldcp->ldc_id, msglen);
2111 
2112 		/*
2113 		 * Figure out what sort of packet we have gotten by
2114 		 * examining the msg tag, and then switch it appropriately.
2115 		 */
2116 		tagp = (vio_msg_tag_t *)ldcmsg;
2117 
2118 		switch (tagp->vio_msgtype) {
2119 		case VIO_TYPE_CTRL:
2120 			vsw_dispatch_ctrl_task(ldcp, ldcmsg, tagp);
2121 			break;
2122 		case VIO_TYPE_DATA:
2123 			vsw_process_data_pkt(ldcp, ldcmsg, tagp, msglen);
2124 			break;
2125 		case VIO_TYPE_ERR:
2126 			vsw_process_err_pkt(ldcp, ldcmsg, tagp);
2127 			break;
2128 		default:
2129 			DERR(vswp, "%s: Unknown tag(%lx) ", __func__,
2130 			    "id(%lx)\n", tagp->vio_msgtype, ldcp->ldc_id);
2131 			break;
2132 		}
2133 	} while (msglen);
2134 
2135 	D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id);
2136 }
2137 
2138 /*
2139  * Dispatch a task to process a VIO control message.
2140  */
2141 static void
2142 vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t *tagp)
2143 {
2144 	vsw_ctrl_task_t		*ctaskp = NULL;
2145 	vsw_port_t		*port = ldcp->ldc_port;
2146 	vsw_t			*vswp = port->p_vswp;
2147 
2148 	D1(vswp, "%s: enter", __func__);
2149 
2150 	/*
2151 	 * We need to handle RDX ACK messages in-band as once they
2152 	 * are exchanged it is possible that we will get an
2153 	 * immediate (legitimate) data packet.
2154 	 */
2155 	if ((tagp->vio_subtype_env == VIO_RDX) &&
2156 	    (tagp->vio_subtype == VIO_SUBTYPE_ACK)) {
2157 
2158 		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_ACK_RECV))
2159 			return;
2160 
2161 		ldcp->lane_in.lstate |= VSW_RDX_ACK_RECV;
2162 		D2(vswp, "%s (%ld) handling RDX_ACK in place "
2163 		    "(ostate 0x%llx : hphase %d)", __func__,
2164 		    ldcp->ldc_id, ldcp->lane_in.lstate, ldcp->hphase);
2165 		vsw_next_milestone(ldcp);
2166 		return;
2167 	}
2168 
2169 	ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP);
2170 
2171 	if (ctaskp == NULL) {
2172 		DERR(vswp, "%s: unable to alloc space for ctrl msg", __func__);
2173 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2174 		return;
2175 	}
2176 
2177 	ctaskp->ldcp = ldcp;
2178 	bcopy((def_msg_t *)cpkt, &ctaskp->pktp, sizeof (def_msg_t));
2179 	ctaskp->hss_id = ldcp->hss_id;
2180 
2181 	/*
2182 	 * Dispatch task to processing taskq if port is not in
2183 	 * the process of being detached.
2184 	 */
2185 	mutex_enter(&port->state_lock);
2186 	if (port->state == VSW_PORT_INIT) {
2187 		if ((vswp->taskq_p == NULL) ||
2188 		    (ddi_taskq_dispatch(vswp->taskq_p, vsw_process_ctrl_pkt,
2189 		    ctaskp, DDI_NOSLEEP) != DDI_SUCCESS)) {
2190 			DERR(vswp, "%s: unable to dispatch task to taskq",
2191 			    __func__);
2192 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2193 			mutex_exit(&port->state_lock);
2194 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2195 			return;
2196 		}
2197 	} else {
2198 		DWARN(vswp, "%s: port %d detaching, not dispatching "
2199 		    "task", __func__, port->p_instance);
2200 	}
2201 
2202 	mutex_exit(&port->state_lock);
2203 
2204 	D2(vswp, "%s: dispatched task to taskq for chan %d", __func__,
2205 	    ldcp->ldc_id);
2206 	D1(vswp, "%s: exit", __func__);
2207 }
2208 
2209 /*
2210  * Process a VIO ctrl message. Invoked from taskq.
2211  */
2212 static void
2213 vsw_process_ctrl_pkt(void *arg)
2214 {
2215 	vsw_ctrl_task_t	*ctaskp = (vsw_ctrl_task_t *)arg;
2216 	vsw_ldc_t	*ldcp = ctaskp->ldcp;
2217 	vsw_t 		*vswp = ldcp->ldc_vswp;
2218 	vio_msg_tag_t	tag;
2219 	uint16_t	env;
2220 
2221 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2222 
2223 	bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t));
2224 	env = tag.vio_subtype_env;
2225 
2226 	/* stale pkt check */
2227 	if (ctaskp->hss_id < ldcp->hss_id) {
2228 		DWARN(vswp, "%s: discarding stale packet belonging to earlier"
2229 		    " (%ld) handshake session", __func__, ctaskp->hss_id);
2230 		return;
2231 	}
2232 
2233 	/* session id check */
2234 	if (ldcp->session_status & VSW_PEER_SESSION) {
2235 		if (ldcp->peer_session != tag.vio_sid) {
2236 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
2237 			    __func__, ldcp->ldc_id, tag.vio_sid);
2238 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2239 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2240 			return;
2241 		}
2242 	}
2243 
2244 	/*
2245 	 * Switch on vio_subtype envelope, then let lower routines
2246 	 * decide if its an INFO, ACK or NACK packet.
2247 	 */
2248 	switch (env) {
2249 	case VIO_VER_INFO:
2250 		vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp);
2251 		break;
2252 	case VIO_DRING_REG:
2253 		vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp);
2254 		break;
2255 	case VIO_DRING_UNREG:
2256 		vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp);
2257 		break;
2258 	case VIO_ATTR_INFO:
2259 		vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp);
2260 		break;
2261 	case VNET_MCAST_INFO:
2262 		vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp);
2263 		break;
2264 	case VIO_RDX:
2265 		vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp);
2266 		break;
2267 	case VIO_DDS_INFO:
2268 		vsw_process_dds_msg(vswp, ldcp, &ctaskp->pktp);
2269 		break;
2270 	default:
2271 		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env);
2272 	}
2273 
2274 	kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2275 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
2276 }
2277 
2278 /*
2279  * Version negotiation. We can end up here either because our peer
2280  * has responded to a handshake message we have sent it, or our peer
2281  * has initiated a handshake with us. If its the former then can only
2282  * be ACK or NACK, if its the later can only be INFO.
2283  *
2284  * If its an ACK we move to the next stage of the handshake, namely
2285  * attribute exchange. If its a NACK we see if we can specify another
2286  * version, if we can't we stop.
2287  *
2288  * If it is an INFO we reset all params associated with communication
2289  * in that direction over this channel (remember connection is
2290  * essentially 2 independent simplex channels).
2291  */
2292 void
2293 vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt)
2294 {
2295 	vio_ver_msg_t	*ver_pkt;
2296 	vsw_t 		*vswp = ldcp->ldc_vswp;
2297 
2298 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2299 
2300 	/*
2301 	 * We know this is a ctrl/version packet so
2302 	 * cast it into the correct structure.
2303 	 */
2304 	ver_pkt = (vio_ver_msg_t *)pkt;
2305 
2306 	switch (ver_pkt->tag.vio_subtype) {
2307 	case VIO_SUBTYPE_INFO:
2308 		D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n");
2309 
2310 		/*
2311 		 * Record the session id, which we will use from now
2312 		 * until we see another VER_INFO msg. Even then the
2313 		 * session id in most cases will be unchanged, execpt
2314 		 * if channel was reset.
2315 		 */
2316 		if ((ldcp->session_status & VSW_PEER_SESSION) &&
2317 		    (ldcp->peer_session != ver_pkt->tag.vio_sid)) {
2318 			DERR(vswp, "%s: updating session id for chan %lld "
2319 			    "from %llx to %llx", __func__, ldcp->ldc_id,
2320 			    ldcp->peer_session, ver_pkt->tag.vio_sid);
2321 		}
2322 
2323 		ldcp->peer_session = ver_pkt->tag.vio_sid;
2324 		ldcp->session_status |= VSW_PEER_SESSION;
2325 
2326 		/* Legal message at this time ? */
2327 		if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV))
2328 			return;
2329 
2330 		/*
2331 		 * First check the device class. Currently only expect
2332 		 * to be talking to a network device. In the future may
2333 		 * also talk to another switch.
2334 		 */
2335 		if (ver_pkt->dev_class != VDEV_NETWORK) {
2336 			DERR(vswp, "%s: illegal device class %d", __func__,
2337 			    ver_pkt->dev_class);
2338 
2339 			ver_pkt->tag.vio_sid = ldcp->local_session;
2340 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2341 
2342 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2343 
2344 			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2345 			    sizeof (vio_ver_msg_t), B_TRUE);
2346 
2347 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
2348 			vsw_next_milestone(ldcp);
2349 			return;
2350 		} else {
2351 			ldcp->dev_class = ver_pkt->dev_class;
2352 		}
2353 
2354 		/*
2355 		 * Now check the version.
2356 		 */
2357 		if (vsw_supported_version(ver_pkt) == 0) {
2358 			/*
2359 			 * Support this major version and possibly
2360 			 * adjusted minor version.
2361 			 */
2362 
2363 			D2(vswp, "%s: accepted ver %d:%d", __func__,
2364 			    ver_pkt->ver_major, ver_pkt->ver_minor);
2365 
2366 			/* Store accepted values */
2367 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
2368 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
2369 
2370 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2371 
2372 			ldcp->lane_in.lstate |= VSW_VER_ACK_SENT;
2373 
2374 			if (vsw_obp_ver_proto_workaround == B_TRUE) {
2375 				/*
2376 				 * Send a version info message
2377 				 * using the accepted version that
2378 				 * we are about to ack. Also note that
2379 				 * we send our ver info before we ack.
2380 				 * Otherwise, as soon as receiving the
2381 				 * ack, obp sends attr info msg, which
2382 				 * breaks vsw_check_flag() invoked
2383 				 * from vsw_process_ctrl_attr_pkt();
2384 				 * as we also need VSW_VER_ACK_RECV to
2385 				 * be set in lane_out.lstate, before
2386 				 * we can receive attr info.
2387 				 */
2388 				vsw_send_ver(ldcp);
2389 			}
2390 		} else {
2391 			/*
2392 			 * NACK back with the next lower major/minor
2393 			 * pairing we support (if don't suuport any more
2394 			 * versions then they will be set to zero.
2395 			 */
2396 
2397 			D2(vswp, "%s: replying with ver %d:%d", __func__,
2398 			    ver_pkt->ver_major, ver_pkt->ver_minor);
2399 
2400 			/* Store updated values */
2401 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
2402 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
2403 
2404 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2405 
2406 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
2407 		}
2408 
2409 		DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2410 		ver_pkt->tag.vio_sid = ldcp->local_session;
2411 		(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2412 		    sizeof (vio_ver_msg_t), B_TRUE);
2413 
2414 		vsw_next_milestone(ldcp);
2415 		break;
2416 
2417 	case VIO_SUBTYPE_ACK:
2418 		D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__);
2419 
2420 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV))
2421 			return;
2422 
2423 		/* Store updated values */
2424 		ldcp->lane_out.ver_major = ver_pkt->ver_major;
2425 		ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
2426 
2427 		ldcp->lane_out.lstate |= VSW_VER_ACK_RECV;
2428 		vsw_next_milestone(ldcp);
2429 
2430 		break;
2431 
2432 	case VIO_SUBTYPE_NACK:
2433 		D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__);
2434 
2435 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV))
2436 			return;
2437 
2438 		/*
2439 		 * If our peer sent us a NACK with the ver fields set to
2440 		 * zero then there is nothing more we can do. Otherwise see
2441 		 * if we support either the version suggested, or a lesser
2442 		 * one.
2443 		 */
2444 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
2445 			DERR(vswp, "%s: peer unable to negotiate any "
2446 			    "further.", __func__);
2447 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
2448 			vsw_next_milestone(ldcp);
2449 			return;
2450 		}
2451 
2452 		/*
2453 		 * Check to see if we support this major version or
2454 		 * a lower one. If we don't then maj/min will be set
2455 		 * to zero.
2456 		 */
2457 		(void) vsw_supported_version(ver_pkt);
2458 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
2459 			/* Nothing more we can do */
2460 			DERR(vswp, "%s: version negotiation failed.\n",
2461 			    __func__);
2462 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
2463 			vsw_next_milestone(ldcp);
2464 		} else {
2465 			/* found a supported major version */
2466 			ldcp->lane_out.ver_major = ver_pkt->ver_major;
2467 			ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
2468 
2469 			D2(vswp, "%s: resending with updated values (%x, %x)",
2470 			    __func__, ver_pkt->ver_major, ver_pkt->ver_minor);
2471 
2472 			ldcp->lane_out.lstate |= VSW_VER_INFO_SENT;
2473 			ver_pkt->tag.vio_sid = ldcp->local_session;
2474 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
2475 
2476 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2477 
2478 			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2479 			    sizeof (vio_ver_msg_t), B_TRUE);
2480 
2481 			vsw_next_milestone(ldcp);
2482 
2483 		}
2484 		break;
2485 
2486 	default:
2487 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
2488 		    ver_pkt->tag.vio_subtype);
2489 	}
2490 
2491 	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
2492 }
2493 
2494 /*
2495  * Process an attribute packet. We can end up here either because our peer
2496  * has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or our
2497  * peer has sent us an attribute INFO message
2498  *
2499  * If its an ACK we then move to the next stage of the handshake which
2500  * is to send our descriptor ring info to our peer. If its a NACK then
2501  * there is nothing more we can (currently) do.
2502  *
2503  * If we get a valid/acceptable INFO packet (and we have already negotiated
2504  * a version) we ACK back and set channel state to ATTR_RECV, otherwise we
2505  * NACK back and reset channel state to INACTIV.
2506  *
2507  * FUTURE: in time we will probably negotiate over attributes, but for
2508  * the moment unacceptable attributes are regarded as a fatal error.
2509  *
2510  */
2511 void
2512 vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt)
2513 {
2514 	vnet_attr_msg_t		*attr_pkt;
2515 	vsw_t			*vswp = ldcp->ldc_vswp;
2516 	vsw_port_t		*port = ldcp->ldc_port;
2517 	uint64_t		macaddr = 0;
2518 	int			i;
2519 
2520 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
2521 
2522 	/*
2523 	 * We know this is a ctrl/attr packet so
2524 	 * cast it into the correct structure.
2525 	 */
2526 	attr_pkt = (vnet_attr_msg_t *)pkt;
2527 
2528 	switch (attr_pkt->tag.vio_subtype) {
2529 	case VIO_SUBTYPE_INFO:
2530 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2531 
2532 		if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV))
2533 			return;
2534 
2535 		/*
2536 		 * If the attributes are unacceptable then we NACK back.
2537 		 */
2538 		if (vsw_check_attr(attr_pkt, ldcp)) {
2539 
2540 			DERR(vswp, "%s (chan %d): invalid attributes",
2541 			    __func__, ldcp->ldc_id);
2542 
2543 			vsw_free_lane_resources(ldcp, INBOUND);
2544 
2545 			attr_pkt->tag.vio_sid = ldcp->local_session;
2546 			attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2547 
2548 			DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
2549 			ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT;
2550 			(void) vsw_send_msg(ldcp, (void *)attr_pkt,
2551 			    sizeof (vnet_attr_msg_t), B_TRUE);
2552 
2553 			vsw_next_milestone(ldcp);
2554 			return;
2555 		}
2556 
2557 		/*
2558 		 * Otherwise store attributes for this lane and update
2559 		 * lane state.
2560 		 */
2561 		ldcp->lane_in.mtu = attr_pkt->mtu;
2562 		ldcp->lane_in.addr = attr_pkt->addr;
2563 		ldcp->lane_in.addr_type = attr_pkt->addr_type;
2564 		ldcp->lane_in.xfer_mode = attr_pkt->xfer_mode;
2565 		ldcp->lane_in.ack_freq = attr_pkt->ack_freq;
2566 
2567 		macaddr = ldcp->lane_in.addr;
2568 		for (i = ETHERADDRL - 1; i >= 0; i--) {
2569 			port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF;
2570 			macaddr >>= 8;
2571 		}
2572 
2573 		/* create the fdb entry for this port/mac address */
2574 		vsw_fdbe_add(vswp, port);
2575 
2576 		/* add the port to the specified vlans */
2577 		vsw_vlan_add_ids(port, VSW_VNETPORT);
2578 
2579 		/* setup device specifc xmit routines */
2580 		mutex_enter(&port->tx_lock);
2581 		if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
2582 		    (ldcp->lane_in.xfer_mode & VIO_DRING_MODE_V1_2)) ||
2583 		    (VSW_VER_LT(ldcp, 1, 2) &&
2584 		    (ldcp->lane_in.xfer_mode == VIO_DRING_MODE_V1_0))) {
2585 			D2(vswp, "%s: mode = VIO_DRING_MODE", __func__);
2586 			port->transmit = vsw_dringsend;
2587 		} else if (ldcp->lane_in.xfer_mode == VIO_DESC_MODE) {
2588 			D2(vswp, "%s: mode = VIO_DESC_MODE", __func__);
2589 			vsw_create_privring(ldcp);
2590 			port->transmit = vsw_descrsend;
2591 			ldcp->lane_out.xfer_mode = VIO_DESC_MODE;
2592 		}
2593 
2594 		/*
2595 		 * HybridIO is supported only vnet, not by OBP.
2596 		 * So, set hio_capable to true only when in DRING mode.
2597 		 */
2598 		if (VSW_VER_GTEQ(ldcp, 1, 3) &&
2599 		    (ldcp->lane_in.xfer_mode != VIO_DESC_MODE)) {
2600 			(void) atomic_swap_32(&port->p_hio_capable, B_TRUE);
2601 		} else {
2602 			(void) atomic_swap_32(&port->p_hio_capable, B_FALSE);
2603 		}
2604 
2605 		mutex_exit(&port->tx_lock);
2606 
2607 		attr_pkt->tag.vio_sid = ldcp->local_session;
2608 		attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2609 
2610 		DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
2611 
2612 		ldcp->lane_in.lstate |= VSW_ATTR_ACK_SENT;
2613 
2614 		(void) vsw_send_msg(ldcp, (void *)attr_pkt,
2615 		    sizeof (vnet_attr_msg_t), B_TRUE);
2616 
2617 		vsw_next_milestone(ldcp);
2618 		break;
2619 
2620 	case VIO_SUBTYPE_ACK:
2621 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
2622 
2623 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV))
2624 			return;
2625 
2626 		ldcp->lane_out.lstate |= VSW_ATTR_ACK_RECV;
2627 		vsw_next_milestone(ldcp);
2628 		break;
2629 
2630 	case VIO_SUBTYPE_NACK:
2631 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
2632 
2633 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV))
2634 			return;
2635 
2636 		ldcp->lane_out.lstate |= VSW_ATTR_NACK_RECV;
2637 		vsw_next_milestone(ldcp);
2638 		break;
2639 
2640 	default:
2641 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
2642 		    attr_pkt->tag.vio_subtype);
2643 	}
2644 
2645 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
2646 }
2647 
2648 /*
2649  * Process a dring info packet. We can end up here either because our peer
2650  * has ACK/NACK'ed back to an earlier DRING msg we had sent it, or our
2651  * peer has sent us a dring INFO message.
2652  *
2653  * If we get a valid/acceptable INFO packet (and we have already negotiated
2654  * a version) we ACK back and update the lane state, otherwise we NACK back.
2655  *
2656  * FUTURE: nothing to stop client from sending us info on multiple dring's
2657  * but for the moment we will just use the first one we are given.
2658  *
2659  */
2660 void
2661 vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt)
2662 {
2663 	vio_dring_reg_msg_t	*dring_pkt;
2664 	vsw_t			*vswp = ldcp->ldc_vswp;
2665 	ldc_mem_info_t		minfo;
2666 	dring_info_t		*dp, *dbp;
2667 	int			dring_found = 0;
2668 
2669 	/*
2670 	 * We know this is a ctrl/dring packet so
2671 	 * cast it into the correct structure.
2672 	 */
2673 	dring_pkt = (vio_dring_reg_msg_t *)pkt;
2674 
2675 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
2676 
2677 	switch (dring_pkt->tag.vio_subtype) {
2678 	case VIO_SUBTYPE_INFO:
2679 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2680 
2681 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
2682 			return;
2683 
2684 		/*
2685 		 * If the dring params are unacceptable then we NACK back.
2686 		 */
2687 		if (vsw_check_dring_info(dring_pkt)) {
2688 
2689 			DERR(vswp, "%s (%lld): invalid dring info",
2690 			    __func__, ldcp->ldc_id);
2691 
2692 			vsw_free_lane_resources(ldcp, INBOUND);
2693 
2694 			dring_pkt->tag.vio_sid = ldcp->local_session;
2695 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2696 
2697 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
2698 
2699 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
2700 
2701 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
2702 			    sizeof (vio_dring_reg_msg_t), B_TRUE);
2703 
2704 			vsw_next_milestone(ldcp);
2705 			return;
2706 		}
2707 
2708 		/*
2709 		 * Otherwise, attempt to map in the dring using the
2710 		 * cookie. If that succeeds we send back a unique dring
2711 		 * identifier that the sending side will use in future
2712 		 * to refer to this descriptor ring.
2713 		 */
2714 		dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
2715 
2716 		dp->num_descriptors = dring_pkt->num_descriptors;
2717 		dp->descriptor_size = dring_pkt->descriptor_size;
2718 		dp->options = dring_pkt->options;
2719 		dp->ncookies = dring_pkt->ncookies;
2720 
2721 		/*
2722 		 * Note: should only get one cookie. Enforced in
2723 		 * the ldc layer.
2724 		 */
2725 		bcopy(&dring_pkt->cookie[0], &dp->cookie[0],
2726 		    sizeof (ldc_mem_cookie_t));
2727 
2728 		D2(vswp, "%s: num_desc %ld : desc_size %ld", __func__,
2729 		    dp->num_descriptors, dp->descriptor_size);
2730 		D2(vswp, "%s: options 0x%lx: ncookies %ld", __func__,
2731 		    dp->options, dp->ncookies);
2732 
2733 		if ((ldc_mem_dring_map(ldcp->ldc_handle, &dp->cookie[0],
2734 		    dp->ncookies, dp->num_descriptors, dp->descriptor_size,
2735 		    LDC_DIRECT_MAP, &(dp->handle))) != 0) {
2736 
2737 			DERR(vswp, "%s: dring_map failed\n", __func__);
2738 
2739 			kmem_free(dp, sizeof (dring_info_t));
2740 			vsw_free_lane_resources(ldcp, INBOUND);
2741 
2742 			dring_pkt->tag.vio_sid = ldcp->local_session;
2743 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2744 
2745 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
2746 
2747 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
2748 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
2749 			    sizeof (vio_dring_reg_msg_t), B_TRUE);
2750 
2751 			vsw_next_milestone(ldcp);
2752 			return;
2753 		}
2754 
2755 		if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
2756 
2757 			DERR(vswp, "%s: dring_addr failed\n", __func__);
2758 
2759 			kmem_free(dp, sizeof (dring_info_t));
2760 			vsw_free_lane_resources(ldcp, INBOUND);
2761 
2762 			dring_pkt->tag.vio_sid = ldcp->local_session;
2763 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2764 
2765 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
2766 
2767 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
2768 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
2769 			    sizeof (vio_dring_reg_msg_t), B_TRUE);
2770 
2771 			vsw_next_milestone(ldcp);
2772 			return;
2773 		} else {
2774 			/* store the address of the pub part of ring */
2775 			dp->pub_addr = minfo.vaddr;
2776 
2777 			/* cache the dring mtype */
2778 			dp->dring_mtype = minfo.mtype;
2779 		}
2780 
2781 		/* no private section as we are importing */
2782 		dp->priv_addr = NULL;
2783 
2784 		/*
2785 		 * Using simple mono increasing int for ident at
2786 		 * the moment.
2787 		 */
2788 		dp->ident = ldcp->next_ident;
2789 		ldcp->next_ident++;
2790 
2791 		dp->end_idx = 0;
2792 		dp->next = NULL;
2793 
2794 		/*
2795 		 * Link it onto the end of the list of drings
2796 		 * for this lane.
2797 		 */
2798 		if (ldcp->lane_in.dringp == NULL) {
2799 			D2(vswp, "%s: adding first INBOUND dring", __func__);
2800 			ldcp->lane_in.dringp = dp;
2801 		} else {
2802 			dbp = ldcp->lane_in.dringp;
2803 
2804 			while (dbp->next != NULL)
2805 				dbp = dbp->next;
2806 
2807 			dbp->next = dp;
2808 		}
2809 
2810 		/* acknowledge it */
2811 		dring_pkt->tag.vio_sid = ldcp->local_session;
2812 		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2813 		dring_pkt->dring_ident = dp->ident;
2814 
2815 		(void) vsw_send_msg(ldcp, (void *)dring_pkt,
2816 		    sizeof (vio_dring_reg_msg_t), B_TRUE);
2817 
2818 		ldcp->lane_in.lstate |= VSW_DRING_ACK_SENT;
2819 		vsw_next_milestone(ldcp);
2820 		break;
2821 
2822 	case VIO_SUBTYPE_ACK:
2823 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
2824 
2825 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV))
2826 			return;
2827 
2828 		/*
2829 		 * Peer is acknowledging our dring info and will have
2830 		 * sent us a dring identifier which we will use to
2831 		 * refer to this ring w.r.t. our peer.
2832 		 */
2833 		dp = ldcp->lane_out.dringp;
2834 		if (dp != NULL) {
2835 			/*
2836 			 * Find the ring this ident should be associated
2837 			 * with.
2838 			 */
2839 			if (vsw_dring_match(dp, dring_pkt)) {
2840 				dring_found = 1;
2841 
2842 			} else while (dp != NULL) {
2843 				if (vsw_dring_match(dp, dring_pkt)) {
2844 					dring_found = 1;
2845 					break;
2846 				}
2847 				dp = dp->next;
2848 			}
2849 
2850 			if (dring_found == 0) {
2851 				DERR(NULL, "%s: unrecognised ring cookie",
2852 				    __func__);
2853 				vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2854 				return;
2855 			}
2856 
2857 		} else {
2858 			DERR(vswp, "%s: DRING ACK received but no drings "
2859 			    "allocated", __func__);
2860 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2861 			return;
2862 		}
2863 
2864 		/* store ident */
2865 		dp->ident = dring_pkt->dring_ident;
2866 		ldcp->lane_out.lstate |= VSW_DRING_ACK_RECV;
2867 		vsw_next_milestone(ldcp);
2868 		break;
2869 
2870 	case VIO_SUBTYPE_NACK:
2871 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
2872 
2873 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV))
2874 			return;
2875 
2876 		ldcp->lane_out.lstate |= VSW_DRING_NACK_RECV;
2877 		vsw_next_milestone(ldcp);
2878 		break;
2879 
2880 	default:
2881 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
2882 		    dring_pkt->tag.vio_subtype);
2883 	}
2884 
2885 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
2886 }
2887 
2888 /*
2889  * Process a request from peer to unregister a dring.
2890  *
2891  * For the moment we just restart the handshake if our
2892  * peer endpoint attempts to unregister a dring.
2893  */
2894 void
2895 vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt)
2896 {
2897 	vsw_t			*vswp = ldcp->ldc_vswp;
2898 	vio_dring_unreg_msg_t	*dring_pkt;
2899 
2900 	/*
2901 	 * We know this is a ctrl/dring packet so
2902 	 * cast it into the correct structure.
2903 	 */
2904 	dring_pkt = (vio_dring_unreg_msg_t *)pkt;
2905 
2906 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2907 
2908 	switch (dring_pkt->tag.vio_subtype) {
2909 	case VIO_SUBTYPE_INFO:
2910 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2911 
2912 		DWARN(vswp, "%s: restarting handshake..", __func__);
2913 		break;
2914 
2915 	case VIO_SUBTYPE_ACK:
2916 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
2917 
2918 		DWARN(vswp, "%s: restarting handshake..", __func__);
2919 		break;
2920 
2921 	case VIO_SUBTYPE_NACK:
2922 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
2923 
2924 		DWARN(vswp, "%s: restarting handshake..", __func__);
2925 		break;
2926 
2927 	default:
2928 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
2929 		    dring_pkt->tag.vio_subtype);
2930 	}
2931 
2932 	vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2933 
2934 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
2935 }
2936 
/*
 * Turn the multicast message pkt into a NACK carrying our session id
 * and send it back over ldcp. Wrapped in do/while so that it expands to
 * a single statement; invoke it with a trailing semicolon.
 */
#define	SND_MCST_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		(void) vsw_send_msg((ldcp), (void *)(pkt), \
		    sizeof (vnet_mcast_msg_t), B_TRUE); \
		_NOTE(CONSTCOND) \
	} while (0)
2942 
2943 /*
2944  * Process a multicast request from a vnet.
2945  *
2946  * Vnet's specify a multicast address that they are interested in. This
2947  * address is used as a key into the hash table which forms the multicast
2948  * forwarding database (mFDB).
2949  *
2950  * The table keys are the multicast addresses, while the table entries
2951  * are pointers to lists of ports which wish to receive packets for the
2952  * specified multicast address.
2953  *
2954  * When a multicast packet is being switched we use the address as a key
2955  * into the hash table, and then walk the appropriate port list forwarding
2956  * the pkt to each port in turn.
2957  *
2958  * If a vnet is no longer interested in a particular multicast grouping
2959  * we simply find the correct location in the hash table and then delete
2960  * the relevant port from the port list.
2961  *
2962  * To deal with the case whereby a port is being deleted without first
2963  * removing itself from the lists in the hash table, we maintain a list
2964  * of multicast addresses the port has registered an interest in, within
2965  * the port structure itself. We then simply walk that list of addresses
2966  * using them as keys into the hash table and remove the port from the
2967  * appropriate lists.
2968  */
2969 static void
2970 vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt)
2971 {
2972 	vnet_mcast_msg_t	*mcst_pkt;
2973 	vsw_port_t		*port = ldcp->ldc_port;
2974 	vsw_t			*vswp = ldcp->ldc_vswp;
2975 	int			i;
2976 
2977 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2978 
2979 	/*
2980 	 * We know this is a ctrl/mcast packet so
2981 	 * cast it into the correct structure.
2982 	 */
2983 	mcst_pkt = (vnet_mcast_msg_t *)pkt;
2984 
2985 	switch (mcst_pkt->tag.vio_subtype) {
2986 	case VIO_SUBTYPE_INFO:
2987 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2988 
2989 		/*
2990 		 * Check if in correct state to receive a multicast
2991 		 * message (i.e. handshake complete). If not reset
2992 		 * the handshake.
2993 		 */
2994 		if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV))
2995 			return;
2996 
2997 		/*
2998 		 * Before attempting to add or remove address check
2999 		 * that they are valid multicast addresses.
3000 		 * If not, then NACK back.
3001 		 */
3002 		for (i = 0; i < mcst_pkt->count; i++) {
3003 			if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) {
3004 				DERR(vswp, "%s: invalid multicast address",
3005 				    __func__);
3006 				SND_MCST_NACK(ldcp, mcst_pkt);
3007 				return;
3008 			}
3009 		}
3010 
3011 		/*
3012 		 * Now add/remove the addresses. If this fails we
3013 		 * NACK back.
3014 		 */
3015 		if (vsw_add_rem_mcst(mcst_pkt, port) != 0) {
3016 			SND_MCST_NACK(ldcp, mcst_pkt);
3017 			return;
3018 		}
3019 
3020 		mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3021 		mcst_pkt->tag.vio_sid = ldcp->local_session;
3022 
3023 		DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt);
3024 
3025 		(void) vsw_send_msg(ldcp, (void *)mcst_pkt,
3026 		    sizeof (vnet_mcast_msg_t), B_TRUE);
3027 		break;
3028 
3029 	case VIO_SUBTYPE_ACK:
3030 		DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3031 
3032 		/*
3033 		 * We shouldn't ever get a multicast ACK message as
3034 		 * at the moment we never request multicast addresses
3035 		 * to be set on some other device. This may change in
3036 		 * the future if we have cascading switches.
3037 		 */
3038 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV))
3039 			return;
3040 
3041 				/* Do nothing */
3042 		break;
3043 
3044 	case VIO_SUBTYPE_NACK:
3045 		DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3046 
3047 		/*
3048 		 * We shouldn't get a multicast NACK packet for the
3049 		 * same reasons as we shouldn't get a ACK packet.
3050 		 */
3051 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV))
3052 			return;
3053 
3054 				/* Do nothing */
3055 		break;
3056 
3057 	default:
3058 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
3059 		    mcst_pkt->tag.vio_subtype);
3060 	}
3061 
3062 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3063 }
3064 
3065 static void
3066 vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt)
3067 {
3068 	vio_rdx_msg_t	*rdx_pkt;
3069 	vsw_t		*vswp = ldcp->ldc_vswp;
3070 
3071 	/*
3072 	 * We know this is a ctrl/rdx packet so
3073 	 * cast it into the correct structure.
3074 	 */
3075 	rdx_pkt = (vio_rdx_msg_t *)pkt;
3076 
3077 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
3078 
3079 	switch (rdx_pkt->tag.vio_subtype) {
3080 	case VIO_SUBTYPE_INFO:
3081 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3082 
3083 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_INFO_RECV))
3084 			return;
3085 
3086 		rdx_pkt->tag.vio_sid = ldcp->local_session;
3087 		rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3088 
3089 		DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt);
3090 
3091 		ldcp->lane_out.lstate |= VSW_RDX_ACK_SENT;
3092 
3093 		(void) vsw_send_msg(ldcp, (void *)rdx_pkt,
3094 		    sizeof (vio_rdx_msg_t), B_TRUE);
3095 
3096 		vsw_next_milestone(ldcp);
3097 		break;
3098 
3099 	case VIO_SUBTYPE_ACK:
3100 		/*
3101 		 * Should be handled in-band by callback handler.
3102 		 */
3103 		DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__);
3104 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3105 		break;
3106 
3107 	case VIO_SUBTYPE_NACK:
3108 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3109 
3110 		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_NACK_RECV))
3111 			return;
3112 
3113 		ldcp->lane_in.lstate |= VSW_RDX_NACK_RECV;
3114 		vsw_next_milestone(ldcp);
3115 		break;
3116 
3117 	default:
3118 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3119 		    rdx_pkt->tag.vio_subtype);
3120 	}
3121 
3122 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3123 }
3124 
3125 static void
3126 vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t *tagp,
3127 	uint32_t msglen)
3128 {
3129 	uint16_t	env = tagp->vio_subtype_env;
3130 	vsw_t		*vswp = ldcp->ldc_vswp;
3131 
3132 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3133 
3134 	/* session id check */
3135 	if (ldcp->session_status & VSW_PEER_SESSION) {
3136 		if (ldcp->peer_session != tagp->vio_sid) {
3137 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
3138 			    __func__, ldcp->ldc_id, tagp->vio_sid);
3139 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3140 			return;
3141 		}
3142 	}
3143 
3144 	/*
3145 	 * It is an error for us to be getting data packets
3146 	 * before the handshake has completed.
3147 	 */
3148 	if (ldcp->hphase != VSW_MILESTONE4) {
3149 		DERR(vswp, "%s: got data packet before handshake complete "
3150 		    "hphase %d (%x: %x)", __func__, ldcp->hphase,
3151 		    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
3152 		DUMP_FLAGS(ldcp->lane_in.lstate);
3153 		DUMP_FLAGS(ldcp->lane_out.lstate);
3154 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3155 		return;
3156 	}
3157 
3158 	/*
3159 	 * To reduce the locking contention, release the
3160 	 * ldc_cblock here and re-acquire it once we are done
3161 	 * receiving packets.
3162 	 */
3163 	mutex_exit(&ldcp->ldc_cblock);
3164 	mutex_enter(&ldcp->ldc_rxlock);
3165 
3166 	/*
3167 	 * Switch on vio_subtype envelope, then let lower routines
3168 	 * decide if its an INFO, ACK or NACK packet.
3169 	 */
3170 	if (env == VIO_DRING_DATA) {
3171 		vsw_process_data_dring_pkt(ldcp, dpkt);
3172 	} else if (env == VIO_PKT_DATA) {
3173 		ldcp->rx_pktdata(ldcp, dpkt, msglen);
3174 	} else if (env == VIO_DESC_DATA) {
3175 		vsw_process_data_ibnd_pkt(ldcp, dpkt);
3176 	} else {
3177 		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env);
3178 	}
3179 
3180 	mutex_exit(&ldcp->ldc_rxlock);
3181 	mutex_enter(&ldcp->ldc_cblock);
3182 
3183 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3184 }
3185 
/*
 * Turn a received dring message into a NACK and send it back to the peer.
 *
 * The expansion is a single statement (do/while(0)) with parenthesized
 * arguments, so it is safe to use as the body of an unbraced if/else.
 * Note that both arguments are still evaluated more than once.
 */
#define	SND_DRING_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		(void) vsw_send_msg((ldcp), (void *)(pkt), \
		    sizeof (vio_dring_msg_t), B_TRUE); \
	_NOTE(CONSTCOND) \
	} while (0)
3191 
3192 static void
3193 vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt)
3194 {
3195 	vio_dring_msg_t		*dring_pkt;
3196 	vnet_public_desc_t	desc, *pub_addr = NULL;
3197 	vsw_private_desc_t	*priv_addr = NULL;
3198 	dring_info_t		*dp = NULL;
3199 	vsw_t			*vswp = ldcp->ldc_vswp;
3200 	mblk_t			*mp = NULL;
3201 	mblk_t			*bp = NULL;
3202 	mblk_t			*bpt = NULL;
3203 	size_t			nbytes = 0;
3204 	uint64_t		chain = 0;
3205 	uint64_t		len;
3206 	uint32_t		pos, start;
3207 	uint32_t		range_start, range_end;
3208 	int32_t			end, num, cnt = 0;
3209 	int			i, rv, rng_rv = 0, msg_rv = 0;
3210 	boolean_t		prev_desc_ack = B_FALSE;
3211 	int			read_attempts = 0;
3212 	struct ether_header	*ehp;
3213 
3214 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3215 
3216 	/*
3217 	 * We know this is a data/dring packet so
3218 	 * cast it into the correct structure.
3219 	 */
3220 	dring_pkt = (vio_dring_msg_t *)dpkt;
3221 
3222 	/*
3223 	 * Switch on the vio_subtype. If its INFO then we need to
3224 	 * process the data. If its an ACK we need to make sure
3225 	 * it makes sense (i.e did we send an earlier data/info),
3226 	 * and if its a NACK then we maybe attempt a retry.
3227 	 */
3228 	switch (dring_pkt->tag.vio_subtype) {
3229 	case VIO_SUBTYPE_INFO:
3230 		D2(vswp, "%s(%lld): VIO_SUBTYPE_INFO", __func__, ldcp->ldc_id);
3231 
3232 		READ_ENTER(&ldcp->lane_in.dlistrw);
3233 		if ((dp = vsw_ident2dring(&ldcp->lane_in,
3234 		    dring_pkt->dring_ident)) == NULL) {
3235 			RW_EXIT(&ldcp->lane_in.dlistrw);
3236 
3237 			DERR(vswp, "%s(%lld): unable to find dring from "
3238 			    "ident 0x%llx", __func__, ldcp->ldc_id,
3239 			    dring_pkt->dring_ident);
3240 
3241 			SND_DRING_NACK(ldcp, dring_pkt);
3242 			return;
3243 		}
3244 
3245 		start = pos = dring_pkt->start_idx;
3246 		end = dring_pkt->end_idx;
3247 		len = dp->num_descriptors;
3248 
3249 		range_start = range_end = pos;
3250 
3251 		D2(vswp, "%s(%lld): start index %ld : end %ld\n",
3252 		    __func__, ldcp->ldc_id, start, end);
3253 
3254 		if (end == -1) {
3255 			num = -1;
3256 		} else if (end >= 0) {
3257 			num = end >= pos ? end - pos + 1: (len - pos + 1) + end;
3258 
3259 			/* basic sanity check */
3260 			if (end > len) {
3261 				RW_EXIT(&ldcp->lane_in.dlistrw);
3262 				DERR(vswp, "%s(%lld): endpoint %lld outside "
3263 				    "ring length %lld", __func__,
3264 				    ldcp->ldc_id, end, len);
3265 
3266 				SND_DRING_NACK(ldcp, dring_pkt);
3267 				return;
3268 			}
3269 		} else {
3270 			RW_EXIT(&ldcp->lane_in.dlistrw);
3271 			DERR(vswp, "%s(%lld): invalid endpoint %lld",
3272 			    __func__, ldcp->ldc_id, end);
3273 			SND_DRING_NACK(ldcp, dring_pkt);
3274 			return;
3275 		}
3276 
3277 		while (cnt != num) {
3278 vsw_recheck_desc:
3279 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + pos;
3280 
3281 			if ((rng_rv = vnet_dring_entry_copy(pub_addr,
3282 			    &desc, dp->dring_mtype, dp->handle,
3283 			    pos, pos)) != 0) {
3284 				DERR(vswp, "%s(%lld): unable to copy "
3285 				    "descriptor at pos %d: err %d",
3286 				    __func__, pos, ldcp->ldc_id, rng_rv);
3287 				ldcp->ldc_stats.ierrors++;
3288 				break;
3289 			}
3290 
3291 			/*
3292 			 * When given a bounded range of descriptors
3293 			 * to process, its an error to hit a descriptor
3294 			 * which is not ready. In the non-bounded case
3295 			 * (end_idx == -1) this simply indicates we have
3296 			 * reached the end of the current active range.
3297 			 */
3298 			if (desc.hdr.dstate != VIO_DESC_READY) {
3299 				/* unbound - no error */
3300 				if (end == -1) {
3301 					if (read_attempts == vsw_read_attempts)
3302 						break;
3303 
3304 					delay(drv_usectohz(vsw_desc_delay));
3305 					read_attempts++;
3306 					goto vsw_recheck_desc;
3307 				}
3308 
3309 				/* bounded - error - so NACK back */
3310 				RW_EXIT(&ldcp->lane_in.dlistrw);
3311 				DERR(vswp, "%s(%lld): descriptor not READY "
3312 				    "(%d)", __func__, ldcp->ldc_id,
3313 				    desc.hdr.dstate);
3314 				SND_DRING_NACK(ldcp, dring_pkt);
3315 				return;
3316 			}
3317 
3318 			DTRACE_PROBE1(read_attempts, int, read_attempts);
3319 
3320 			range_end = pos;
3321 
3322 			/*
3323 			 * If we ACK'd the previous descriptor then now
3324 			 * record the new range start position for later
3325 			 * ACK's.
3326 			 */
3327 			if (prev_desc_ack) {
3328 				range_start = pos;
3329 
3330 				D2(vswp, "%s(%lld): updating range start to be "
3331 				    "%d", __func__, ldcp->ldc_id, range_start);
3332 
3333 				prev_desc_ack = B_FALSE;
3334 			}
3335 
3336 			D2(vswp, "%s(%lld): processing desc %lld at pos"
3337 			    " 0x%llx : dstate 0x%lx : datalen 0x%lx",
3338 			    __func__, ldcp->ldc_id, pos, &desc,
3339 			    desc.hdr.dstate, desc.nbytes);
3340 
3341 			/*
3342 			 * Ensure that we ask ldc for an aligned
3343 			 * number of bytes. Data is padded to align on 8
3344 			 * byte boundary, desc.nbytes is actual data length,
3345 			 * i.e. minus that padding.
3346 			 */
3347 			nbytes = (desc.nbytes + VNET_IPALIGN + 7) & ~7;
3348 
3349 			mp = vio_multipool_allocb(&ldcp->vmp, nbytes);
3350 			if (mp == NULL) {
3351 				ldcp->ldc_stats.rx_vio_allocb_fail++;
3352 				/*
3353 				 * No free receive buffers available, so
3354 				 * fallback onto allocb(9F). Make sure that
3355 				 * we get a data buffer which is a multiple
3356 				 * of 8 as this is required by ldc_mem_copy.
3357 				 */
3358 				DTRACE_PROBE(allocb);
3359 				if ((mp = allocb(desc.nbytes + VNET_IPALIGN + 8,
3360 				    BPRI_MED)) == NULL) {
3361 					DERR(vswp, "%s(%ld): allocb failed",
3362 					    __func__, ldcp->ldc_id);
3363 					rng_rv = vnet_dring_entry_set_dstate(
3364 					    pub_addr, dp->dring_mtype,
3365 					    dp->handle, pos, pos,
3366 					    VIO_DESC_DONE);
3367 					ldcp->ldc_stats.ierrors++;
3368 					ldcp->ldc_stats.rx_allocb_fail++;
3369 					break;
3370 				}
3371 			}
3372 
3373 			rv = ldc_mem_copy(ldcp->ldc_handle,
3374 			    (caddr_t)mp->b_rptr, 0, &nbytes,
3375 			    desc.memcookie, desc.ncookies, LDC_COPY_IN);
3376 			if (rv != 0) {
3377 				DERR(vswp, "%s(%d): unable to copy in data "
3378 				    "from %d cookies in desc %d (rv %d)",
3379 				    __func__, ldcp->ldc_id, desc.ncookies,
3380 				    pos, rv);
3381 				freemsg(mp);
3382 
3383 				rng_rv = vnet_dring_entry_set_dstate(pub_addr,
3384 				    dp->dring_mtype, dp->handle, pos, pos,
3385 				    VIO_DESC_DONE);
3386 				ldcp->ldc_stats.ierrors++;
3387 				break;
3388 			} else {
3389 				D2(vswp, "%s(%d): copied in %ld bytes"
3390 				    " using %d cookies", __func__,
3391 				    ldcp->ldc_id, nbytes, desc.ncookies);
3392 			}
3393 
3394 			/* adjust the read pointer to skip over the padding */
3395 			mp->b_rptr += VNET_IPALIGN;
3396 
3397 			/* point to the actual end of data */
3398 			mp->b_wptr = mp->b_rptr + desc.nbytes;
3399 
3400 			/* update statistics */
3401 			ehp = (struct ether_header *)mp->b_rptr;
3402 			if (IS_BROADCAST(ehp))
3403 				ldcp->ldc_stats.brdcstrcv++;
3404 			else if (IS_MULTICAST(ehp))
3405 				ldcp->ldc_stats.multircv++;
3406 
3407 			ldcp->ldc_stats.ipackets++;
3408 			ldcp->ldc_stats.rbytes += desc.nbytes;
3409 
3410 			/*
3411 			 * IPALIGN space can be used for VLAN_TAG
3412 			 */
3413 			(void) vsw_vlan_frame_pretag(ldcp->ldc_port,
3414 			    VSW_VNETPORT, mp);
3415 
3416 			/* build a chain of received packets */
3417 			if (bp == NULL) {
3418 				/* first pkt */
3419 				bp = mp;
3420 				bp->b_next = bp->b_prev = NULL;
3421 				bpt = bp;
3422 				chain = 1;
3423 			} else {
3424 				mp->b_next = mp->b_prev = NULL;
3425 				bpt->b_next = mp;
3426 				bpt = mp;
3427 				chain++;
3428 			}
3429 
3430 			/* mark we are finished with this descriptor */
3431 			if ((rng_rv = vnet_dring_entry_set_dstate(pub_addr,
3432 			    dp->dring_mtype, dp->handle, pos, pos,
3433 			    VIO_DESC_DONE)) != 0) {
3434 				DERR(vswp, "%s(%lld): unable to update "
3435 				    "dstate at pos %d: err %d",
3436 				    __func__, pos, ldcp->ldc_id, rng_rv);
3437 				ldcp->ldc_stats.ierrors++;
3438 				break;
3439 			}
3440 
3441 			/*
3442 			 * Send an ACK back to peer if requested.
3443 			 */
3444 			if (desc.hdr.ack) {
3445 				dring_pkt->start_idx = range_start;
3446 				dring_pkt->end_idx = range_end;
3447 
3448 				DERR(vswp, "%s(%lld): processed %d %d, ACK"
3449 				    " requested", __func__, ldcp->ldc_id,
3450 				    dring_pkt->start_idx, dring_pkt->end_idx);
3451 
3452 				dring_pkt->dring_process_state = VIO_DP_ACTIVE;
3453 				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3454 				dring_pkt->tag.vio_sid = ldcp->local_session;
3455 
3456 				msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
3457 				    sizeof (vio_dring_msg_t), B_FALSE);
3458 
3459 				/*
3460 				 * Check if ACK was successfully sent. If not
3461 				 * we break and deal with that below.
3462 				 */
3463 				if (msg_rv != 0)
3464 					break;
3465 
3466 				prev_desc_ack = B_TRUE;
3467 				range_start = pos;
3468 			}
3469 
3470 			/* next descriptor */
3471 			pos = (pos + 1) % len;
3472 			cnt++;
3473 
3474 			/*
3475 			 * Break out of loop here and stop processing to
3476 			 * allow some other network device (or disk) to
3477 			 * get access to the cpu.
3478 			 */
3479 			if (chain > vsw_chain_len) {
3480 				D3(vswp, "%s(%lld): switching chain of %d "
3481 				    "msgs", __func__, ldcp->ldc_id, chain);
3482 				break;
3483 			}
3484 		}
3485 		RW_EXIT(&ldcp->lane_in.dlistrw);
3486 
3487 		/* send the chain of packets to be switched */
3488 		if (bp != NULL) {
3489 			DTRACE_PROBE1(vsw_rcv_msgs, int, chain);
3490 			D3(vswp, "%s(%lld): switching chain of %d msgs",
3491 			    __func__, ldcp->ldc_id, chain);
3492 			vswp->vsw_switch_frame(vswp, bp, VSW_VNETPORT,
3493 			    ldcp->ldc_port, NULL);
3494 		}
3495 
3496 		/*
3497 		 * If when we encountered an error when attempting to
3498 		 * access an imported dring, initiate a connection reset.
3499 		 */
3500 		if (rng_rv != 0) {
3501 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3502 			break;
3503 		}
3504 
3505 		/*
3506 		 * If when we attempted to send the ACK we found that the
3507 		 * channel had been reset then now handle this. We deal with
3508 		 * it here as we cannot reset the channel while holding the
3509 		 * dlistrw lock, and we don't want to acquire/release it
3510 		 * continuously in the above loop, as a channel reset should
3511 		 * be a rare event.
3512 		 */
3513 		if (msg_rv == ECONNRESET) {
3514 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
3515 			break;
3516 		}
3517 
3518 		DTRACE_PROBE1(msg_cnt, int, cnt);
3519 
3520 		/*
3521 		 * We are now finished so ACK back with the state
3522 		 * set to STOPPING so our peer knows we are finished
3523 		 */
3524 		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3525 		dring_pkt->tag.vio_sid = ldcp->local_session;
3526 
3527 		dring_pkt->dring_process_state = VIO_DP_STOPPED;
3528 
3529 		DTRACE_PROBE(stop_process_sent);
3530 
3531 		/*
3532 		 * We have not processed any more descriptors beyond
3533 		 * the last one we ACK'd.
3534 		 */
3535 		if (prev_desc_ack)
3536 			range_start = range_end;
3537 
3538 		dring_pkt->start_idx = range_start;
3539 		dring_pkt->end_idx = range_end;
3540 
3541 		D2(vswp, "%s(%lld) processed : %d : %d, now stopping",
3542 		    __func__, ldcp->ldc_id, dring_pkt->start_idx,
3543 		    dring_pkt->end_idx);
3544 
3545 		(void) vsw_send_msg(ldcp, (void *)dring_pkt,
3546 		    sizeof (vio_dring_msg_t), B_TRUE);
3547 		break;
3548 
3549 	case VIO_SUBTYPE_ACK:
3550 		D2(vswp, "%s(%lld): VIO_SUBTYPE_ACK", __func__, ldcp->ldc_id);
3551 		/*
3552 		 * Verify that the relevant descriptors are all
3553 		 * marked as DONE
3554 		 */
3555 		READ_ENTER(&ldcp->lane_out.dlistrw);
3556 		if ((dp = vsw_ident2dring(&ldcp->lane_out,
3557 		    dring_pkt->dring_ident)) == NULL) {
3558 			RW_EXIT(&ldcp->lane_out.dlistrw);
3559 			DERR(vswp, "%s: unknown ident in ACK", __func__);
3560 			return;
3561 		}
3562 
3563 		start = end = 0;
3564 		start = dring_pkt->start_idx;
3565 		end = dring_pkt->end_idx;
3566 		len = dp->num_descriptors;
3567 
3568 
3569 		mutex_enter(&dp->dlock);
3570 		dp->last_ack_recv = end;
3571 		ldcp->ldc_stats.dring_data_acks++;
3572 		mutex_exit(&dp->dlock);
3573 
3574 		(void) vsw_reclaim_dring(dp, start);
3575 
3576 		/*
3577 		 * If our peer is stopping processing descriptors then
3578 		 * we check to make sure it has processed all the descriptors
3579 		 * we have updated. If not then we send it a new message
3580 		 * to prompt it to restart.
3581 		 */
3582 		if (dring_pkt->dring_process_state == VIO_DP_STOPPED) {
3583 			DTRACE_PROBE(stop_process_recv);
3584 			D2(vswp, "%s(%lld): got stopping msg : %d : %d",
3585 			    __func__, ldcp->ldc_id, dring_pkt->start_idx,
3586 			    dring_pkt->end_idx);
3587 
3588 			/*
3589 			 * Check next descriptor in public section of ring.
3590 			 * If its marked as READY then we need to prompt our
3591 			 * peer to start processing the ring again.
3592 			 */
3593 			i = (end + 1) % len;
3594 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
3595 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
3596 
3597 			/*
3598 			 * Hold the restart lock across all of this to
3599 			 * make sure that its not possible for us to
3600 			 * decide that a msg needs to be sent in the future
3601 			 * but the sending code having already checked is
3602 			 * about to exit.
3603 			 */
3604 			mutex_enter(&dp->restart_lock);
3605 			ldcp->ldc_stats.dring_stopped_acks++;
3606 			mutex_enter(&priv_addr->dstate_lock);
3607 			if (pub_addr->hdr.dstate == VIO_DESC_READY) {
3608 
3609 				mutex_exit(&priv_addr->dstate_lock);
3610 
3611 				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
3612 				dring_pkt->tag.vio_sid = ldcp->local_session;
3613 
3614 				dring_pkt->start_idx = (end + 1) % len;
3615 				dring_pkt->end_idx = -1;
3616 
3617 				D2(vswp, "%s(%lld) : sending restart msg:"
3618 				    " %d : %d", __func__, ldcp->ldc_id,
3619 				    dring_pkt->start_idx, dring_pkt->end_idx);
3620 
3621 				msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
3622 				    sizeof (vio_dring_msg_t), B_FALSE);
3623 				ldcp->ldc_stats.dring_data_msgs++;
3624 
3625 			} else {
3626 				mutex_exit(&priv_addr->dstate_lock);
3627 				dp->restart_reqd = B_TRUE;
3628 			}
3629 			mutex_exit(&dp->restart_lock);
3630 		}
3631 		RW_EXIT(&ldcp->lane_out.dlistrw);
3632 
3633 		/* only do channel reset after dropping dlistrw lock */
3634 		if (msg_rv == ECONNRESET)
3635 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
3636 
3637 		break;
3638 
3639 	case VIO_SUBTYPE_NACK:
3640 		DWARN(vswp, "%s(%lld): VIO_SUBTYPE_NACK",
3641 		    __func__, ldcp->ldc_id);
3642 		/*
3643 		 * Something is badly wrong if we are getting NACK's
3644 		 * for our data pkts. So reset the channel.
3645 		 */
3646 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3647 
3648 		break;
3649 
3650 	default:
3651 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
3652 		    ldcp->ldc_id, dring_pkt->tag.vio_subtype);
3653 	}
3654 
3655 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
3656 }
3657 
3658 /*
3659  * dummy pkt data handler function for vnet protocol version 1.0
3660  */
3661 static void
3662 vsw_process_pkt_data_nop(void *arg1, void *arg2, uint32_t msglen)
3663 {
3664 	_NOTE(ARGUNUSED(arg1, arg2, msglen))
3665 }
3666 
3667 /*
3668  * This function handles raw pkt data messages received over the channel.
3669  * Currently, only priority-eth-type frames are received through this mechanism.
3670  * In this case, the frame(data) is present within the message itself which
3671  * is copied into an mblk before switching it.
3672  */
3673 static void
3674 vsw_process_pkt_data(void *arg1, void *arg2, uint32_t msglen)
3675 {
3676 	vsw_ldc_t		*ldcp = (vsw_ldc_t *)arg1;
3677 	vio_raw_data_msg_t	*dpkt = (vio_raw_data_msg_t *)arg2;
3678 	uint32_t		size;
3679 	mblk_t			*mp;
3680 	vsw_t			*vswp = ldcp->ldc_vswp;
3681 	vgen_stats_t		*statsp = &ldcp->ldc_stats;
3682 	lane_t			*lp = &ldcp->lane_out;
3683 
3684 	size = msglen - VIO_PKT_DATA_HDRSIZE;
3685 	if (size < ETHERMIN || size > lp->mtu) {
3686 		(void) atomic_inc_32(&statsp->rx_pri_fail);
3687 		DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__,
3688 		    ldcp->ldc_id, size);
3689 		return;
3690 	}
3691 
3692 	mp = vio_multipool_allocb(&ldcp->vmp, size + VLAN_TAGSZ);
3693 	if (mp == NULL) {
3694 		mp = allocb(size + VLAN_TAGSZ, BPRI_MED);
3695 		if (mp == NULL) {
3696 			(void) atomic_inc_32(&statsp->rx_pri_fail);
3697 			DWARN(vswp, "%s(%lld) allocb failure, "
3698 			    "unable to process priority frame\n", __func__,
3699 			    ldcp->ldc_id);
3700 			return;
3701 		}
3702 	}
3703 
3704 	/* skip over the extra space for vlan tag */
3705 	mp->b_rptr += VLAN_TAGSZ;
3706 
3707 	/* copy the frame from the payload of raw data msg into the mblk */
3708 	bcopy(dpkt->data, mp->b_rptr, size);
3709 	mp->b_wptr = mp->b_rptr + size;
3710 
3711 	/* update stats */
3712 	(void) atomic_inc_64(&statsp->rx_pri_packets);
3713 	(void) atomic_add_64(&statsp->rx_pri_bytes, size);
3714 
3715 	/*
3716 	 * VLAN_TAGSZ of extra space has been pre-alloc'd if tag is needed.
3717 	 */
3718 	(void) vsw_vlan_frame_pretag(ldcp->ldc_port, VSW_VNETPORT, mp);
3719 
3720 	/* switch the frame to destination */
3721 	vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT, ldcp->ldc_port, NULL);
3722 }
3723 
3724 /*
3725  * Process an in-band descriptor message (most likely from
3726  * OBP).
3727  */
3728 static void
3729 vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt)
3730 {
3731 	vnet_ibnd_desc_t	*ibnd_desc;
3732 	dring_info_t		*dp = NULL;
3733 	vsw_private_desc_t	*priv_addr = NULL;
3734 	vsw_t			*vswp = ldcp->ldc_vswp;
3735 	mblk_t			*mp = NULL;
3736 	size_t			nbytes = 0;
3737 	size_t			off = 0;
3738 	uint64_t		idx = 0;
3739 	uint32_t		num = 1, len, datalen = 0;
3740 	uint64_t		ncookies = 0;
3741 	int			i, rv;
3742 	int			j = 0;
3743 
3744 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3745 
3746 	ibnd_desc = (vnet_ibnd_desc_t *)pkt;
3747 
3748 	switch (ibnd_desc->hdr.tag.vio_subtype) {
3749 	case VIO_SUBTYPE_INFO:
3750 		D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3751 
3752 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
3753 			return;
3754 
3755 		/*
3756 		 * Data is padded to align on a 8 byte boundary,
3757 		 * nbytes is actual data length, i.e. minus that
3758 		 * padding.
3759 		 */
3760 		datalen = ibnd_desc->nbytes;
3761 
3762 		D2(vswp, "%s(%lld): processing inband desc : "
3763 		    ": datalen 0x%lx", __func__, ldcp->ldc_id, datalen);
3764 
3765 		ncookies = ibnd_desc->ncookies;
3766 
3767 		/*
3768 		 * allocb(9F) returns an aligned data block. We
3769 		 * need to ensure that we ask ldc for an aligned
3770 		 * number of bytes also.
3771 		 */
3772 		nbytes = datalen;
3773 		if (nbytes & 0x7) {
3774 			off = 8 - (nbytes & 0x7);
3775 			nbytes += off;
3776 		}
3777 
3778 		/* alloc extra space for VLAN_TAG */
3779 		mp = allocb(datalen + 8, BPRI_MED);
3780 		if (mp == NULL) {
3781 			DERR(vswp, "%s(%lld): allocb failed",
3782 			    __func__, ldcp->ldc_id);
3783 			ldcp->ldc_stats.rx_allocb_fail++;
3784 			return;
3785 		}
3786 
3787 		/* skip over the extra space for VLAN_TAG */
3788 		mp->b_rptr += 8;
3789 
3790 		rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr,
3791 		    0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies,
3792 		    LDC_COPY_IN);
3793 
3794 		if (rv != 0) {
3795 			DERR(vswp, "%s(%d): unable to copy in data from "
3796 			    "%d cookie(s)", __func__, ldcp->ldc_id, ncookies);
3797 			freemsg(mp);
3798 			ldcp->ldc_stats.ierrors++;
3799 			return;
3800 		}
3801 
3802 		D2(vswp, "%s(%d): copied in %ld bytes using %d cookies",
3803 		    __func__, ldcp->ldc_id, nbytes, ncookies);
3804 
3805 		/* point to the actual end of data */
3806 		mp->b_wptr = mp->b_rptr + datalen;
3807 		ldcp->ldc_stats.ipackets++;
3808 		ldcp->ldc_stats.rbytes += datalen;
3809 
3810 		/*
3811 		 * We ACK back every in-band descriptor message we process
3812 		 */
3813 		ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK;
3814 		ibnd_desc->hdr.tag.vio_sid = ldcp->local_session;
3815 		(void) vsw_send_msg(ldcp, (void *)ibnd_desc,
3816 		    sizeof (vnet_ibnd_desc_t), B_TRUE);
3817 
3818 		/*
3819 		 * there is extra space alloc'd for VLAN_TAG
3820 		 */
3821 		(void) vsw_vlan_frame_pretag(ldcp->ldc_port, VSW_VNETPORT, mp);
3822 
3823 		/* send the packet to be switched */
3824 		vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT,
3825 		    ldcp->ldc_port, NULL);
3826 
3827 		break;
3828 
3829 	case VIO_SUBTYPE_ACK:
3830 		D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3831 
3832 		/* Verify the ACK is valid */
3833 		idx = ibnd_desc->hdr.desc_handle;
3834 
3835 		if (idx >= vsw_ntxds) {
3836 			cmn_err(CE_WARN, "!vsw%d: corrupted ACK received "
3837 			    "(idx %ld)", vswp->instance, idx);
3838 			return;
3839 		}
3840 
3841 		if ((dp = ldcp->lane_out.dringp) == NULL) {
3842 			DERR(vswp, "%s: no dring found", __func__);
3843 			return;
3844 		}
3845 
3846 		len = dp->num_descriptors;
3847 		/*
3848 		 * If the descriptor we are being ACK'ed for is not the
3849 		 * one we expected, then pkts were lost somwhere, either
3850 		 * when we tried to send a msg, or a previous ACK msg from
3851 		 * our peer. In either case we now reclaim the descriptors
3852 		 * in the range from the last ACK we received up to the
3853 		 * current ACK.
3854 		 */
3855 		if (idx != dp->last_ack_recv) {
3856 			DWARN(vswp, "%s: dropped pkts detected, (%ld, %ld)",
3857 			    __func__, dp->last_ack_recv, idx);
3858 			num = idx >= dp->last_ack_recv ?
3859 			    idx - dp->last_ack_recv + 1:
3860 			    (len - dp->last_ack_recv + 1) + idx;
3861 		}
3862 
3863 		/*
3864 		 * When we sent the in-band message to our peer we
3865 		 * marked the copy in our private ring as READY. We now
3866 		 * check that the descriptor we are being ACK'ed for is in
3867 		 * fact READY, i.e. it is one we have shared with our peer.
3868 		 *
3869 		 * If its not we flag an error, but still reset the descr
3870 		 * back to FREE.
3871 		 */
3872 		for (i = dp->last_ack_recv; j < num; i = (i + 1) % len, j++) {
3873 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
3874 			mutex_enter(&priv_addr->dstate_lock);
3875 			if (priv_addr->dstate != VIO_DESC_READY) {
3876 				DERR(vswp, "%s: (%ld) desc at index %ld not "
3877 				    "READY (0x%lx)", __func__,
3878 				    ldcp->ldc_id, idx, priv_addr->dstate);
3879 				DERR(vswp, "%s: bound %d: ncookies %ld : "
3880 				    "datalen %ld", __func__,
3881 				    priv_addr->bound, priv_addr->ncookies,
3882 				    priv_addr->datalen);
3883 			}
3884 			D2(vswp, "%s: (%lld) freeing descp at %lld", __func__,
3885 			    ldcp->ldc_id, idx);
3886 			/* release resources associated with sent msg */
3887 			priv_addr->datalen = 0;
3888 			priv_addr->dstate = VIO_DESC_FREE;
3889 			mutex_exit(&priv_addr->dstate_lock);
3890 		}
3891 		/* update to next expected value */
3892 		dp->last_ack_recv = (idx + 1) % dp->num_descriptors;
3893 
3894 		break;
3895 
3896 	case VIO_SUBTYPE_NACK:
3897 		DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3898 
3899 		/*
3900 		 * We should only get a NACK if our peer doesn't like
3901 		 * something about a message we have sent it. If this
3902 		 * happens we just release the resources associated with
3903 		 * the message. (We are relying on higher layers to decide
3904 		 * whether or not to resend.
3905 		 */
3906 
3907 		/* limit check */
3908 		idx = ibnd_desc->hdr.desc_handle;
3909 
3910 		if (idx >= vsw_ntxds) {
3911 			DERR(vswp, "%s: corrupted NACK received (idx %lld)",
3912 			    __func__, idx);
3913 			return;
3914 		}
3915 
3916 		if ((dp = ldcp->lane_out.dringp) == NULL) {
3917 			DERR(vswp, "%s: no dring found", __func__);
3918 			return;
3919 		}
3920 
3921 		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
3922 
3923 		/* move to correct location in ring */
3924 		priv_addr += idx;
3925 
3926 		/* release resources associated with sent msg */
3927 		mutex_enter(&priv_addr->dstate_lock);
3928 		priv_addr->datalen = 0;
3929 		priv_addr->dstate = VIO_DESC_FREE;
3930 		mutex_exit(&priv_addr->dstate_lock);
3931 
3932 		break;
3933 
3934 	default:
3935 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
3936 		    ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype);
3937 	}
3938 
3939 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
3940 }
3941 
3942 static void
3943 vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t *tagp)
3944 {
3945 	_NOTE(ARGUNUSED(epkt))
3946 
3947 	vsw_t		*vswp = ldcp->ldc_vswp;
3948 	uint16_t	env = tagp->vio_subtype_env;
3949 
3950 	D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
3951 
3952 	/*
3953 	 * Error vio_subtypes have yet to be defined. So for
3954 	 * the moment we can't do anything.
3955 	 */
3956 	D2(vswp, "%s: (%x) vio_subtype env", __func__, env);
3957 
3958 	D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
3959 }
3960 
3961 /* transmit the packet over the given port */
3962 int
3963 vsw_portsend(vsw_port_t *port, mblk_t *mp, mblk_t *mpt, uint32_t count)
3964 {
3965 	vsw_ldc_list_t 	*ldcl = &port->p_ldclist;
3966 	vsw_ldc_t 	*ldcp;
3967 	int		status = 0;
3968 	uint32_t	n;
3969 
3970 	READ_ENTER(&ldcl->lockrw);
3971 	/*
3972 	 * Note for now, we have a single channel.
3973 	 */
3974 	ldcp = ldcl->head;
3975 	if (ldcp == NULL) {
3976 		DERR(port->p_vswp, "vsw_portsend: no ldc: dropping packet\n");
3977 		freemsgchain(mp);
3978 		RW_EXIT(&ldcl->lockrw);
3979 		return (1);
3980 	}
3981 
3982 	n = vsw_vlan_frame_untag(port, VSW_VNETPORT, &mp, &mpt);
3983 
3984 	count -= n;
3985 	if (count == 0) {
3986 		goto vsw_portsend_exit;
3987 	}
3988 
3989 	status = ldcp->tx(ldcp, mp, mpt, count);
3990 
3991 vsw_portsend_exit:
3992 	RW_EXIT(&ldcl->lockrw);
3993 
3994 	return (status);
3995 }
3996 
3997 /*
3998  * Break up frames into 2 seperate chains: normal and
3999  * priority, based on the frame type. The number of
4000  * priority frames is also counted and returned.
4001  *
4002  * Params:
4003  * 	vswp:	pointer to the instance of vsw
4004  *	np:	head of packet chain to be broken
4005  *	npt:	tail of packet chain to be broken
4006  *
4007  * Returns:
4008  *	np:	head of normal data packets
4009  *	npt:	tail of normal data packets
4010  *	hp:	head of high priority packets
4011  *	hpt:	tail of high priority packets
4012  */
4013 static uint32_t
4014 vsw_get_pri_packets(vsw_t *vswp, mblk_t **np, mblk_t **npt,
4015 	mblk_t **hp, mblk_t **hpt)
4016 {
4017 	mblk_t			*tmp = NULL;
4018 	mblk_t			*smp = NULL;
4019 	mblk_t			*hmp = NULL;	/* high prio pkts head */
4020 	mblk_t			*hmpt = NULL;	/* high prio pkts tail */
4021 	mblk_t			*nmp = NULL;	/* normal pkts head */
4022 	mblk_t			*nmpt = NULL;	/* normal pkts tail */
4023 	uint32_t		count = 0;
4024 	int			i;
4025 	struct ether_header	*ehp;
4026 	uint32_t		num_types;
4027 	uint16_t		*types;
4028 
4029 	tmp = *np;
4030 	while (tmp != NULL) {
4031 
4032 		smp = tmp;
4033 		tmp = tmp->b_next;
4034 		smp->b_next = NULL;
4035 		smp->b_prev = NULL;
4036 
4037 		ehp = (struct ether_header *)smp->b_rptr;
4038 		num_types = vswp->pri_num_types;
4039 		types = vswp->pri_types;
4040 		for (i = 0; i < num_types; i++) {
4041 			if (ehp->ether_type == types[i]) {
4042 				/* high priority frame */
4043 
4044 				if (hmp != NULL) {
4045 					hmpt->b_next = smp;
4046 					hmpt = smp;
4047 				} else {
4048 					hmp = hmpt = smp;
4049 				}
4050 				count++;
4051 				break;
4052 			}
4053 		}
4054 		if (i == num_types) {
4055 			/* normal data frame */
4056 
4057 			if (nmp != NULL) {
4058 				nmpt->b_next = smp;
4059 				nmpt = smp;
4060 			} else {
4061 				nmp = nmpt = smp;
4062 			}
4063 		}
4064 	}
4065 
4066 	*hp = hmp;
4067 	*hpt = hmpt;
4068 	*np = nmp;
4069 	*npt = nmpt;
4070 
4071 	return (count);
4072 }
4073 
4074 /*
4075  * Wrapper function to transmit normal and/or priority frames over the channel.
4076  */
4077 static int
4078 vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count)
4079 {
4080 	vsw_ldc_t 		*ldcp = (vsw_ldc_t *)arg;
4081 	mblk_t			*tmp;
4082 	mblk_t			*smp;
4083 	mblk_t			*hmp;	/* high prio pkts head */
4084 	mblk_t			*hmpt;	/* high prio pkts tail */
4085 	mblk_t			*nmp;	/* normal pkts head */
4086 	mblk_t			*nmpt;	/* normal pkts tail */
4087 	uint32_t		n = 0;
4088 	vsw_t			*vswp = ldcp->ldc_vswp;
4089 
4090 	ASSERT(VSW_PRI_ETH_DEFINED(vswp));
4091 	ASSERT(count != 0);
4092 
4093 	nmp = mp;
4094 	nmpt = mpt;
4095 
4096 	/* gather any priority frames from the chain of packets */
4097 	n = vsw_get_pri_packets(vswp, &nmp, &nmpt, &hmp, &hmpt);
4098 
4099 	/* transmit priority frames */
4100 	tmp = hmp;
4101 	while (tmp != NULL) {
4102 		smp = tmp;
4103 		tmp = tmp->b_next;
4104 		smp->b_next = NULL;
4105 		vsw_ldcsend_pkt(ldcp, smp);
4106 	}
4107 
4108 	count -= n;
4109 
4110 	if (count == 0) {
4111 		/* no normal data frames to process */
4112 		return (0);
4113 	}
4114 
4115 	return (vsw_ldctx(ldcp, nmp, nmpt, count));
4116 }
4117 
4118 /*
4119  * Wrapper function to transmit normal frames over the channel.
4120  */
4121 static int
4122 vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count)
4123 {
4124 	vsw_ldc_t 	*ldcp = (vsw_ldc_t *)arg;
4125 	mblk_t		*tmp = NULL;
4126 
4127 	ASSERT(count != 0);
4128 	/*
4129 	 * If the TX thread is enabled, then queue the
4130 	 * ordinary frames and signal the tx thread.
4131 	 */
4132 	if (ldcp->tx_thread != NULL) {
4133 
4134 		mutex_enter(&ldcp->tx_thr_lock);
4135 
4136 		if ((ldcp->tx_cnt + count) >= vsw_max_tx_qcount) {
4137 			/*
4138 			 * If we reached queue limit,
4139 			 * do not queue new packets,
4140 			 * drop them.
4141 			 */
4142 			ldcp->ldc_stats.tx_qfull += count;
4143 			mutex_exit(&ldcp->tx_thr_lock);
4144 			freemsgchain(mp);
4145 			goto exit;
4146 		}
4147 		if (ldcp->tx_mhead == NULL) {
4148 			ldcp->tx_mhead = mp;
4149 			ldcp->tx_mtail = mpt;
4150 			cv_signal(&ldcp->tx_thr_cv);
4151 		} else {
4152 			ldcp->tx_mtail->b_next = mp;
4153 			ldcp->tx_mtail = mpt;
4154 		}
4155 		ldcp->tx_cnt += count;
4156 		mutex_exit(&ldcp->tx_thr_lock);
4157 	} else {
4158 		while (mp != NULL) {
4159 			tmp = mp->b_next;
4160 			mp->b_next = mp->b_prev = NULL;
4161 			(void) vsw_ldcsend(ldcp, mp, 1);
4162 			mp = tmp;
4163 		}
4164 	}
4165 
4166 exit:
4167 	return (0);
4168 }
4169 
4170 /*
4171  * This function transmits the frame in the payload of a raw data
4172  * (VIO_PKT_DATA) message. Thus, it provides an Out-Of-Band path to
4173  * send special frames with high priorities, without going through
4174  * the normal data path which uses descriptor ring mechanism.
4175  */
4176 static void
4177 vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp)
4178 {
4179 	vio_raw_data_msg_t	*pkt;
4180 	mblk_t			*bp;
4181 	mblk_t			*nmp = NULL;
4182 	caddr_t			dst;
4183 	uint32_t		mblksz;
4184 	uint32_t		size;
4185 	uint32_t		nbytes;
4186 	int			rv;
4187 	vsw_t			*vswp = ldcp->ldc_vswp;
4188 	vgen_stats_t		*statsp = &ldcp->ldc_stats;
4189 
4190 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
4191 	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
4192 		(void) atomic_inc_32(&statsp->tx_pri_fail);
4193 		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
4194 		    "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
4195 		    ldcp->lane_out.lstate);
4196 		goto send_pkt_exit;
4197 	}
4198 
4199 	size = msgsize(mp);
4200 
4201 	/* frame size bigger than available payload len of raw data msg ? */
4202 	if (size > (size_t)(ldcp->msglen - VIO_PKT_DATA_HDRSIZE)) {
4203 		(void) atomic_inc_32(&statsp->tx_pri_fail);
4204 		DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__,
4205 		    ldcp->ldc_id, size);
4206 		goto send_pkt_exit;
4207 	}
4208 
4209 	if (size < ETHERMIN)
4210 		size = ETHERMIN;
4211 
4212 	/* alloc space for a raw data message */
4213 	nmp = vio_allocb(vswp->pri_tx_vmp);
4214 	if (nmp == NULL) {
4215 		(void) atomic_inc_32(&statsp->tx_pri_fail);
4216 		DWARN(vswp, "vio_allocb failed\n");
4217 		goto send_pkt_exit;
4218 	}
4219 	pkt = (vio_raw_data_msg_t *)nmp->b_rptr;
4220 
4221 	/* copy frame into the payload of raw data message */
4222 	dst = (caddr_t)pkt->data;
4223 	for (bp = mp; bp != NULL; bp = bp->b_cont) {
4224 		mblksz = MBLKL(bp);
4225 		bcopy(bp->b_rptr, dst, mblksz);
4226 		dst += mblksz;
4227 	}
4228 
4229 	/* setup the raw data msg */
4230 	pkt->tag.vio_msgtype = VIO_TYPE_DATA;
4231 	pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
4232 	pkt->tag.vio_subtype_env = VIO_PKT_DATA;
4233 	pkt->tag.vio_sid = ldcp->local_session;
4234 	nbytes = VIO_PKT_DATA_HDRSIZE + size;
4235 
4236 	/* send the msg over ldc */
4237 	rv = vsw_send_msg(ldcp, (void *)pkt, nbytes, B_TRUE);
4238 	if (rv != 0) {
4239 		(void) atomic_inc_32(&statsp->tx_pri_fail);
4240 		DWARN(vswp, "%s(%lld) Error sending priority frame\n", __func__,
4241 		    ldcp->ldc_id);
4242 		goto send_pkt_exit;
4243 	}
4244 
4245 	/* update stats */
4246 	(void) atomic_inc_64(&statsp->tx_pri_packets);
4247 	(void) atomic_add_64(&statsp->tx_pri_packets, size);
4248 
4249 send_pkt_exit:
4250 	if (nmp != NULL)
4251 		freemsg(nmp);
4252 	freemsg(mp);
4253 }
4254 
/*
 * Transmit the packet over the given LDC channel.
 *
 * The 'retries' argument indicates how many times a packet
 * is retried before it is dropped. Note, the retry is done
 * only for a resource related failure, for all other failures
 * the packet is dropped immediately.
 *
 * The mblk (mp) is always freed by this function before it returns.
 * The return value is the status of the last transmit attempt (or 0
 * when no attempt was made, e.g. 'retries' is 0 or the port has no
 * transmit function).
 */
static int
vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries)
{
	int i;
	int rc;
	int status = 0;
	vsw_port_t *port = ldcp->ldc_port;
	dring_info_t *dp = NULL;


	for (i = 0; i < retries; ) {
		/*
		 * Send the message out using the port's transmit
		 * function. The mblk is not freed here; it is freed
		 * once at the end of this function, after the last
		 * attempt.
		 */
		mutex_enter(&port->tx_lock);
		if (port->transmit != NULL) {
			status = (*port->transmit)(ldcp, mp);
		}
		if (status == LDC_TX_SUCCESS) {
			mutex_exit(&port->tx_lock);
			break;
		}
		i++;	/* increment the counter here */

		/* If its the last retry, then update the oerror */
		if ((i == retries) && (status == LDC_TX_NORESOURCES)) {
			ldcp->ldc_stats.oerrors++;
		}
		mutex_exit(&port->tx_lock);

		if (status != LDC_TX_NORESOURCES) {
			/*
			 * No retrying required for errors un-related
			 * to resources.
			 */
			break;
		}

		/*
		 * Out of resources: try to reclaim descriptors from the
		 * outbound dring before the next attempt. This is only
		 * done when a dring exists and the negotiated transfer
		 * mode for the channel's version is a dring mode.
		 */
		READ_ENTER(&ldcp->lane_out.dlistrw);
		if (((dp = ldcp->lane_out.dringp) != NULL) &&
		    ((VSW_VER_GTEQ(ldcp, 1, 2) &&
		    (ldcp->lane_out.xfer_mode & VIO_DRING_MODE_V1_2)) ||
		    ((VSW_VER_LT(ldcp, 1, 2) &&
		    (ldcp->lane_out.xfer_mode == VIO_DRING_MODE_V1_0))))) {
			rc = vsw_reclaim_dring(dp, dp->end_idx);
		} else {
			/*
			 * If there is no dring or the xfer_mode is
			 * set to DESC_MODE(ie., OBP), then simply break here.
			 */
			RW_EXIT(&ldcp->lane_out.dlistrw);
			break;
		}
		RW_EXIT(&ldcp->lane_out.dlistrw);

		/*
		 * Delay only if none were reclaimed
		 * and its not the last retry.
		 */
		if ((rc == 0) && (i < retries)) {
			delay(drv_usectohz(vsw_ldc_tx_delay));
		}
	}
	/* success or failure, the packet is consumed here */
	freemsg(mp);
	return (status);
}
4330 
/*
 * Send packet out via descriptor ring to a logical device.
 *
 * The frame is copied into the data buffer of a free private
 * descriptor, the matching public descriptor is marked READY and, if
 * the ring's restart_reqd flag is set, a VIO_DRING_DATA message is sent
 * to prompt the peer to read the ring.
 *
 * This function does not free mp.
 *
 * Returns:
 *	LDC_TX_SUCCESS		frame placed in the ring
 *	LDC_TX_NORESOURCES	no free descriptor was available
 *	LDC_TX_FAILURE		channel/lane not ready, no dring, or the
 *				frame is larger than the lane mtu
 */
static int
vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp)
{
	vio_dring_msg_t		dring_pkt;
	dring_info_t		*dp = NULL;
	vsw_private_desc_t	*priv_desc = NULL;
	vnet_public_desc_t	*pub = NULL;
	vsw_t			*vswp = ldcp->ldc_vswp;
	mblk_t			*bp;
	size_t			n, size;
	caddr_t			bufp;
	int			idx;
	int			status = LDC_TX_SUCCESS;
	struct ether_header	*ehp = (struct ether_header *)mp->b_rptr;
	lane_t			*lp = &ldcp->lane_out;

	D1(vswp, "%s(%lld): enter\n", __func__, ldcp->ldc_id);

	/* TODO: make test a macro */
	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
		    "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
		    ldcp->lane_out.lstate);
		ldcp->ldc_stats.oerrors++;
		return (LDC_TX_FAILURE);
	}

	/*
	 * Note - using first ring only, this may change
	 * in the future.
	 *
	 * The dring list lock is held as reader from here until just
	 * before any message is sent over the channel.
	 */
	READ_ENTER(&ldcp->lane_out.dlistrw);
	if ((dp = ldcp->lane_out.dringp) == NULL) {
		RW_EXIT(&ldcp->lane_out.dlistrw);
		DERR(vswp, "%s(%lld): no dring for outbound lane on"
		    " channel %d", __func__, ldcp->ldc_id, ldcp->ldc_id);
		ldcp->ldc_stats.oerrors++;
		return (LDC_TX_FAILURE);
	}

	/* frames larger than the lane mtu are rejected */
	size = msgsize(mp);
	if (size > (size_t)lp->mtu) {
		RW_EXIT(&ldcp->lane_out.dlistrw);
		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
		    ldcp->ldc_id, size);
		ldcp->ldc_stats.oerrors++;
		return (LDC_TX_FAILURE);
	}

	/*
	 * Find a free descriptor
	 *
	 * Note: for the moment we are assuming that we will only
	 * have one dring going from the switch to each of its
	 * peers. This may change in the future.
	 */
	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
		D2(vswp, "%s(%lld): no descriptor available for ring "
		    "at 0x%llx", __func__, ldcp->ldc_id, dp);

		/* nothing more we can do */
		status = LDC_TX_NORESOURCES;
		ldcp->ldc_stats.tx_no_desc++;
		goto vsw_dringsend_free_exit;
	} else {
		D2(vswp, "%s(%lld): free private descriptor found at pos %ld "
		    "addr 0x%llx\n", __func__, ldcp->ldc_id, idx, priv_desc);
	}

	/*
	 * copy data into the descriptor; the frame starts
	 * VNET_IPALIGN bytes into the descriptor's data buffer.
	 */
	bufp = priv_desc->datap;
	bufp += VNET_IPALIGN;
	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
		n = MBLKL(bp);
		bcopy(bp->b_rptr, bufp, n);
		bufp += n;
	}

	/* the recorded length is never less than ETHERMIN */
	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;

	pub = priv_desc->descp;
	pub->nbytes = priv_desc->datalen;

	/* update statistics */
	if (IS_BROADCAST(ehp))
		ldcp->ldc_stats.brdcstxmt++;
	else if (IS_MULTICAST(ehp))
		ldcp->ldc_stats.multixmt++;
	ldcp->ldc_stats.opackets++;
	ldcp->ldc_stats.obytes += priv_desc->datalen;

	/* hand the descriptor over by marking it ready */
	mutex_enter(&priv_desc->dstate_lock);
	pub->hdr.dstate = VIO_DESC_READY;
	mutex_exit(&priv_desc->dstate_lock);

	/*
	 * Determine whether or not we need to send a message to our
	 * peer prompting them to read our newly updated descriptor(s).
	 */
	mutex_enter(&dp->restart_lock);
	if (dp->restart_reqd) {
		dp->restart_reqd = B_FALSE;
		ldcp->ldc_stats.dring_data_msgs++;
		mutex_exit(&dp->restart_lock);

		/*
		 * Send a vio_dring_msg to peer to prompt them to read
		 * the updated descriptor ring.
		 */
		dring_pkt.tag.vio_msgtype = VIO_TYPE_DATA;
		dring_pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
		dring_pkt.tag.vio_subtype_env = VIO_DRING_DATA;
		dring_pkt.tag.vio_sid = ldcp->local_session;

		/* Note - for now using first ring */
		dring_pkt.dring_ident = dp->ident;

		/*
		 * If last_ack_recv is -1 then we know we've not
		 * received any ack's yet, so this must be the first
		 * msg sent, so set the start to the beginning of the ring.
		 */
		mutex_enter(&dp->dlock);
		if (dp->last_ack_recv == -1) {
			dring_pkt.start_idx = 0;
		} else {
			dring_pkt.start_idx =
			    (dp->last_ack_recv + 1) % dp->num_descriptors;
		}
		dring_pkt.end_idx = -1;
		mutex_exit(&dp->dlock);

		D3(vswp, "%s(%lld): dring 0x%llx : ident 0x%llx\n", __func__,
		    ldcp->ldc_id, dp, dring_pkt.dring_ident);
		D3(vswp, "%s(%lld): start %lld : end %lld :\n",
		    __func__, ldcp->ldc_id, dring_pkt.start_idx,
		    dring_pkt.end_idx);

		/*
		 * The dring list lock is dropped before sending, since
		 * vsw_send_msg() may process a channel reset.
		 */
		RW_EXIT(&ldcp->lane_out.dlistrw);

		/*
		 * NOTE(review): the result of the send is ignored and
		 * only the tag, dring_ident, start_idx and end_idx
		 * fields of dring_pkt are set in this function.
		 */
		(void) vsw_send_msg(ldcp, (void *)&dring_pkt,
		    sizeof (vio_dring_msg_t), B_TRUE);

		return (status);

	} else {
		mutex_exit(&dp->restart_lock);
		D2(vswp, "%s(%lld): updating descp %d", __func__,
		    ldcp->ldc_id, idx);
	}

vsw_dringsend_free_exit:

	/* reached with the dring list lock still held */
	RW_EXIT(&ldcp->lane_out.dlistrw);

	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
	return (status);
}
4493 
/*
 * Send an in-band descriptor message over ldc.
 *
 * The frame is copied into the data buffer of a free private
 * descriptor and a VIO_DESC_DATA message carrying that descriptor's
 * memory cookies is sent to the peer.
 *
 * This function does not free mp.
 *
 * Returns:
 *	LDC_TX_SUCCESS		message built and handed to vsw_send_msg()
 *	LDC_TX_NORESOURCES	no free descriptor was available
 *	LDC_TX_FAILURE		channel/lane not ready, no dring, or the
 *				frame is larger than the lane mtu
 */
static int
vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp)
{
	vsw_t			*vswp = ldcp->ldc_vswp;
	vnet_ibnd_desc_t	ibnd_msg;
	vsw_private_desc_t	*priv_desc = NULL;
	dring_info_t		*dp = NULL;
	size_t			n, size = 0;
	caddr_t			bufp;
	mblk_t			*bp;
	int			idx, i;
	int			status = LDC_TX_SUCCESS;
	/* limits the "no descriptor" error to one report per shortage */
	static int		warn_msg = 1;
	lane_t			*lp = &ldcp->lane_out;

	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

	ASSERT(mp != NULL);

	/* drop the frame unless the lane is active and the channel is up */
	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
		DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt",
		    __func__, ldcp->ldc_id, ldcp->ldc_status,
		    ldcp->lane_out.lstate);
		ldcp->ldc_stats.oerrors++;
		return (LDC_TX_FAILURE);
	}

	/*
	 * only expect single dring to exist, which we use
	 * as an internal buffer, rather than a transfer channel.
	 */
	READ_ENTER(&ldcp->lane_out.dlistrw);
	if ((dp = ldcp->lane_out.dringp) == NULL) {
		DERR(vswp, "%s(%lld): no dring for outbound lane",
		    __func__, ldcp->ldc_id);
		DERR(vswp, "%s(%lld) status(%d) state (0x%llx)", __func__,
		    ldcp->ldc_id, ldcp->ldc_status, ldcp->lane_out.lstate);
		RW_EXIT(&ldcp->lane_out.dlistrw);
		ldcp->ldc_stats.oerrors++;
		return (LDC_TX_FAILURE);
	}

	/* frames larger than the lane mtu are rejected */
	size = msgsize(mp);
	if (size > (size_t)lp->mtu) {
		RW_EXIT(&ldcp->lane_out.dlistrw);
		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
		    ldcp->ldc_id, size);
		ldcp->ldc_stats.oerrors++;
		return (LDC_TX_FAILURE);
	}

	/*
	 * Find a free descriptor in our buffer ring
	 */
	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
		RW_EXIT(&ldcp->lane_out.dlistrw);
		if (warn_msg) {
			DERR(vswp, "%s(%lld): no descriptor available for ring "
			    "at 0x%llx", __func__, ldcp->ldc_id, dp);
			warn_msg = 0;
		}

		/* nothing more we can do */
		status = LDC_TX_NORESOURCES;
		goto vsw_descrsend_free_exit;
	} else {
		D2(vswp, "%s(%lld): free private descriptor found at pos "
		    "%ld addr 0x%x\n", __func__, ldcp->ldc_id, idx, priv_desc);
		warn_msg = 1;
	}

	/* copy data into the descriptor */
	bufp = priv_desc->datap;
	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
		n = MBLKL(bp);
		bcopy(bp->b_rptr, bufp, n);
		bufp += n;
	}

	/* the recorded length is never less than ETHERMIN */
	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;

	/* create and send the in-band descp msg */
	ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA;
	ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO;
	ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA;
	ibnd_msg.hdr.tag.vio_sid = ldcp->local_session;

	/*
	 * Copy the mem cookies describing the data from the
	 * private region of the descriptor ring into the inband
	 * descriptor.
	 */
	for (i = 0; i < priv_desc->ncookies; i++) {
		bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i],
		    sizeof (ldc_mem_cookie_t));
	}

	/*
	 * NOTE(review): nbytes and the obytes statistic use the actual
	 * frame size, while priv_desc->datalen above is padded up to
	 * ETHERMIN - confirm this difference is intended.
	 */
	ibnd_msg.hdr.desc_handle = idx;
	ibnd_msg.ncookies = priv_desc->ncookies;
	ibnd_msg.nbytes = size;

	ldcp->ldc_stats.opackets++;
	ldcp->ldc_stats.obytes += size;

	/* drop the dring list lock before writing to the channel */
	RW_EXIT(&ldcp->lane_out.dlistrw);

	/* the result of the send is ignored; status stays SUCCESS */
	(void) vsw_send_msg(ldcp, (void *)&ibnd_msg,
	    sizeof (vnet_ibnd_desc_t), B_TRUE);

vsw_descrsend_free_exit:

	/* the dring list lock has already been released on every path */
	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
	return (status);
}
4612 
4613 static void
4614 vsw_send_ver(void *arg)
4615 {
4616 	vsw_ldc_t	*ldcp = (vsw_ldc_t *)arg;
4617 	vsw_t		*vswp = ldcp->ldc_vswp;
4618 	lane_t		*lp = &ldcp->lane_out;
4619 	vio_ver_msg_t	ver_msg;
4620 
4621 	D1(vswp, "%s enter", __func__);
4622 
4623 	ver_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4624 	ver_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4625 	ver_msg.tag.vio_subtype_env = VIO_VER_INFO;
4626 	ver_msg.tag.vio_sid = ldcp->local_session;
4627 
4628 	if (vsw_obp_ver_proto_workaround == B_FALSE) {
4629 		ver_msg.ver_major = vsw_versions[0].ver_major;
4630 		ver_msg.ver_minor = vsw_versions[0].ver_minor;
4631 	} else {
4632 		/* use the major,minor that we've ack'd */
4633 		lane_t	*lpi = &ldcp->lane_in;
4634 		ver_msg.ver_major = lpi->ver_major;
4635 		ver_msg.ver_minor = lpi->ver_minor;
4636 	}
4637 	ver_msg.dev_class = VDEV_NETWORK_SWITCH;
4638 
4639 	lp->lstate |= VSW_VER_INFO_SENT;
4640 	lp->ver_major = ver_msg.ver_major;
4641 	lp->ver_minor = ver_msg.ver_minor;
4642 
4643 	DUMP_TAG(ver_msg.tag);
4644 
4645 	(void) vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t), B_TRUE);
4646 
4647 	D1(vswp, "%s (%d): exit", __func__, ldcp->ldc_id);
4648 }
4649 
4650 static void
4651 vsw_send_attr(vsw_ldc_t *ldcp)
4652 {
4653 	vsw_t			*vswp = ldcp->ldc_vswp;
4654 	lane_t			*lp = &ldcp->lane_out;
4655 	vnet_attr_msg_t		attr_msg;
4656 
4657 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
4658 
4659 	/*
4660 	 * Subtype is set to INFO by default
4661 	 */
4662 	attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4663 	attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4664 	attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO;
4665 	attr_msg.tag.vio_sid = ldcp->local_session;
4666 
4667 	/* payload copied from default settings for lane */
4668 	attr_msg.mtu = lp->mtu;
4669 	attr_msg.addr_type = lp->addr_type;
4670 	attr_msg.xfer_mode = lp->xfer_mode;
4671 	attr_msg.ack_freq = lp->xfer_mode;
4672 
4673 	READ_ENTER(&vswp->if_lockrw);
4674 	attr_msg.addr = vnet_macaddr_strtoul((vswp->if_addr).ether_addr_octet);
4675 	RW_EXIT(&vswp->if_lockrw);
4676 
4677 	ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT;
4678 
4679 	DUMP_TAG(attr_msg.tag);
4680 
4681 	(void) vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t), B_TRUE);
4682 
4683 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
4684 }
4685 
4686 /*
4687  * Create dring info msg (which also results in the creation of
4688  * a dring).
4689  */
4690 static vio_dring_reg_msg_t *
4691 vsw_create_dring_info_pkt(vsw_ldc_t *ldcp)
4692 {
4693 	vio_dring_reg_msg_t	*mp;
4694 	dring_info_t		*dp;
4695 	vsw_t			*vswp = ldcp->ldc_vswp;
4696 
4697 	D1(vswp, "vsw_create_dring_info_pkt enter\n");
4698 
4699 	/*
4700 	 * If we can't create a dring, obviously no point sending
4701 	 * a message.
4702 	 */
4703 	if ((dp = vsw_create_dring(ldcp)) == NULL)
4704 		return (NULL);
4705 
4706 	mp = kmem_zalloc(sizeof (vio_dring_reg_msg_t), KM_SLEEP);
4707 
4708 	mp->tag.vio_msgtype = VIO_TYPE_CTRL;
4709 	mp->tag.vio_subtype = VIO_SUBTYPE_INFO;
4710 	mp->tag.vio_subtype_env = VIO_DRING_REG;
4711 	mp->tag.vio_sid = ldcp->local_session;
4712 
4713 	/* payload */
4714 	mp->num_descriptors = dp->num_descriptors;
4715 	mp->descriptor_size = dp->descriptor_size;
4716 	mp->options = dp->options;
4717 	mp->ncookies = dp->ncookies;
4718 	bcopy(&dp->cookie[0], &mp->cookie[0], sizeof (ldc_mem_cookie_t));
4719 
4720 	mp->dring_ident = 0;
4721 
4722 	D1(vswp, "vsw_create_dring_info_pkt exit\n");
4723 
4724 	return (mp);
4725 }
4726 
4727 static void
4728 vsw_send_dring_info(vsw_ldc_t *ldcp)
4729 {
4730 	vio_dring_reg_msg_t	*dring_msg;
4731 	vsw_t			*vswp = ldcp->ldc_vswp;
4732 
4733 	D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id);
4734 
4735 	dring_msg = vsw_create_dring_info_pkt(ldcp);
4736 	if (dring_msg == NULL) {
4737 		cmn_err(CE_WARN, "!vsw%d: %s: error creating msg",
4738 		    vswp->instance, __func__);
4739 		return;
4740 	}
4741 
4742 	ldcp->lane_out.lstate |= VSW_DRING_INFO_SENT;
4743 
4744 	DUMP_TAG_PTR((vio_msg_tag_t *)dring_msg);
4745 
4746 	(void) vsw_send_msg(ldcp, dring_msg,
4747 	    sizeof (vio_dring_reg_msg_t), B_TRUE);
4748 
4749 	kmem_free(dring_msg, sizeof (vio_dring_reg_msg_t));
4750 
4751 	D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id);
4752 }
4753 
4754 static void
4755 vsw_send_rdx(vsw_ldc_t *ldcp)
4756 {
4757 	vsw_t		*vswp = ldcp->ldc_vswp;
4758 	vio_rdx_msg_t	rdx_msg;
4759 
4760 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
4761 
4762 	rdx_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4763 	rdx_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4764 	rdx_msg.tag.vio_subtype_env = VIO_RDX;
4765 	rdx_msg.tag.vio_sid = ldcp->local_session;
4766 
4767 	ldcp->lane_in.lstate |= VSW_RDX_INFO_SENT;
4768 
4769 	DUMP_TAG(rdx_msg.tag);
4770 
4771 	(void) vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t), B_TRUE);
4772 
4773 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
4774 }
4775 
4776 /*
4777  * Generic routine to send message out over ldc channel.
4778  *
4779  * It is possible that when we attempt to write over the ldc channel
4780  * that we get notified that it has been reset. Depending on the value
4781  * of the handle_reset flag we either handle that event here or simply
4782  * notify the caller that the channel was reset.
4783  */
4784 int
4785 vsw_send_msg(vsw_ldc_t *ldcp, void *msgp, int size, boolean_t handle_reset)
4786 {
4787 	int			rv;
4788 	size_t			msglen = size;
4789 	vio_msg_tag_t		*tag = (vio_msg_tag_t *)msgp;
4790 	vsw_t			*vswp = ldcp->ldc_vswp;
4791 	vio_dring_msg_t		*dmsg;
4792 	vio_raw_data_msg_t	*rmsg;
4793 	vnet_ibnd_desc_t	*imsg;
4794 	boolean_t		data_msg = B_FALSE;
4795 
4796 	D1(vswp, "vsw_send_msg (%lld) enter : sending %d bytes",
4797 	    ldcp->ldc_id, size);
4798 
4799 	D2(vswp, "send_msg: type 0x%llx", tag->vio_msgtype);
4800 	D2(vswp, "send_msg: stype 0x%llx", tag->vio_subtype);
4801 	D2(vswp, "send_msg: senv 0x%llx", tag->vio_subtype_env);
4802 
4803 	mutex_enter(&ldcp->ldc_txlock);
4804 
4805 	if (tag->vio_subtype == VIO_SUBTYPE_INFO) {
4806 		if (tag->vio_subtype_env == VIO_DRING_DATA) {
4807 			dmsg = (vio_dring_msg_t *)tag;
4808 			dmsg->seq_num = ldcp->lane_out.seq_num;
4809 			data_msg = B_TRUE;
4810 		} else if (tag->vio_subtype_env == VIO_PKT_DATA) {
4811 			rmsg = (vio_raw_data_msg_t *)tag;
4812 			rmsg->seq_num = ldcp->lane_out.seq_num;
4813 			data_msg = B_TRUE;
4814 		} else if (tag->vio_subtype_env == VIO_DESC_DATA) {
4815 			imsg = (vnet_ibnd_desc_t *)tag;
4816 			imsg->hdr.seq_num = ldcp->lane_out.seq_num;
4817 			data_msg = B_TRUE;
4818 		}
4819 	}
4820 
4821 	do {
4822 		msglen = size;
4823 		rv = ldc_write(ldcp->ldc_handle, (caddr_t)msgp, &msglen);
4824 	} while (rv == EWOULDBLOCK && --vsw_wretries > 0);
4825 
4826 	if (rv == 0 && data_msg == B_TRUE) {
4827 		ldcp->lane_out.seq_num++;
4828 	}
4829 
4830 	if ((rv != 0) || (msglen != size)) {
4831 		DERR(vswp, "vsw_send_msg:ldc_write failed: chan(%lld) rv(%d) "
4832 		    "size (%d) msglen(%d)\n", ldcp->ldc_id, rv, size, msglen);
4833 		ldcp->ldc_stats.oerrors++;
4834 	}
4835 
4836 	mutex_exit(&ldcp->ldc_txlock);
4837 
4838 	/*
4839 	 * If channel has been reset we either handle it here or
4840 	 * simply report back that it has been reset and let caller
4841 	 * decide what to do.
4842 	 */
4843 	if (rv == ECONNRESET) {
4844 		DWARN(vswp, "%s (%lld) channel reset", __func__, ldcp->ldc_id);
4845 
4846 		/*
4847 		 * N.B - must never be holding the dlistrw lock when
4848 		 * we do a reset of the channel.
4849 		 */
4850 		if (handle_reset) {
4851 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
4852 		}
4853 	}
4854 
4855 	return (rv);
4856 }
4857 
4858 /*
4859  * Remove the specified address from the list of address maintained
4860  * in this port node.
4861  */
4862 mcst_addr_t *
4863 vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr)
4864 {
4865 	vsw_t		*vswp = NULL;
4866 	vsw_port_t	*port = NULL;
4867 	mcst_addr_t	*prev_p = NULL;
4868 	mcst_addr_t	*curr_p = NULL;
4869 
4870 	D1(NULL, "%s: enter : devtype %d : addr 0x%llx",
4871 	    __func__, devtype, addr);
4872 
4873 	if (devtype == VSW_VNETPORT) {
4874 		port = (vsw_port_t *)arg;
4875 		mutex_enter(&port->mca_lock);
4876 		prev_p = curr_p = port->mcap;
4877 	} else {
4878 		vswp = (vsw_t *)arg;
4879 		mutex_enter(&vswp->mca_lock);
4880 		prev_p = curr_p = vswp->mcap;
4881 	}
4882 
4883 	while (curr_p != NULL) {
4884 		if (curr_p->addr == addr) {
4885 			D2(NULL, "%s: address found", __func__);
4886 			/* match found */
4887 			if (prev_p == curr_p) {
4888 				/* list head */
4889 				if (devtype == VSW_VNETPORT)
4890 					port->mcap = curr_p->nextp;
4891 				else
4892 					vswp->mcap = curr_p->nextp;
4893 			} else {
4894 				prev_p->nextp = curr_p->nextp;
4895 			}
4896 			break;
4897 		} else {
4898 			prev_p = curr_p;
4899 			curr_p = curr_p->nextp;
4900 		}
4901 	}
4902 
4903 	if (devtype == VSW_VNETPORT)
4904 		mutex_exit(&port->mca_lock);
4905 	else
4906 		mutex_exit(&vswp->mca_lock);
4907 
4908 	D1(NULL, "%s: exit", __func__);
4909 
4910 	return (curr_p);
4911 }
4912 
/*
 * Creates a descriptor ring (dring) and links it into the
 * list of outbound drings for this channel.
 *
 * The ring has vsw_ntxds descriptors: a public section created through
 * ldc_mem_dring_create() and bound to the channel, and a private
 * section allocated here and initialised by vsw_setup_ring().
 *
 * Returns the new dring, or NULL if creation failed.
 */
static dring_info_t *
vsw_create_dring(vsw_ldc_t *ldcp)
{
	vsw_private_desc_t	*priv_addr = NULL;
	vsw_t			*vswp = ldcp->ldc_vswp;
	ldc_mem_info_t		minfo;
	dring_info_t		*dp, *tp;
	int			i;

	dp = (dring_info_t *)kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);

	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);

	/* create public section of ring */
	if ((ldc_mem_dring_create(vsw_ntxds,
	    VSW_PUB_SIZE, &dp->handle)) != 0) {

		DERR(vswp, "vsw_create_dring(%lld): ldc dring create "
		    "failed", ldcp->ldc_id);
		goto create_fail_exit;
	}

	ASSERT(dp->handle != NULL);

	/*
	 * Get the base address of the public section of the ring.
	 */
	if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
		DERR(vswp, "vsw_create_dring(%lld): dring info failed\n",
		    ldcp->ldc_id);
		goto dring_fail_exit;
	} else {
		ASSERT(minfo.vaddr != 0);
		dp->pub_addr = minfo.vaddr;
	}

	dp->num_descriptors = vsw_ntxds;
	dp->descriptor_size = VSW_PUB_SIZE;
	dp->options = VIO_TX_DRING;
	dp->ncookies = 1;	/* guaranteed by ldc */

	/*
	 * create private portion of ring
	 */
	dp->priv_addr = (vsw_private_desc_t *)kmem_zalloc(
	    (sizeof (vsw_private_desc_t) * vsw_ntxds), KM_SLEEP);

	if (vsw_setup_ring(ldcp, dp)) {
		DERR(vswp, "%s: unable to setup ring", __func__);
		goto dring_fail_exit;
	}

	/* haven't used any descriptors yet */
	dp->end_idx = 0;
	dp->last_ack_recv = -1;

	/* bind dring to the channel */
	if ((ldc_mem_dring_bind(ldcp->ldc_handle, dp->handle,
	    LDC_DIRECT_MAP | LDC_SHADOW_MAP, LDC_MEM_RW,
	    &dp->cookie[0], &dp->ncookies)) != 0) {
		DERR(vswp, "vsw_create_dring: unable to bind to channel "
		    "%lld", ldcp->ldc_id);
		goto dring_fail_exit;
	}

	/*
	 * restart_lock is only initialised once nothing else can fail,
	 * so the failure paths below do not need to destroy it.
	 */
	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
	dp->restart_reqd = B_TRUE;

	/*
	 * Only ever create rings for outgoing lane. Link it onto
	 * end of list.
	 */
	WRITE_ENTER(&ldcp->lane_out.dlistrw);
	if (ldcp->lane_out.dringp == NULL) {
		D2(vswp, "vsw_create_dring: adding first outbound ring");
		ldcp->lane_out.dringp = dp;
	} else {
		tp = ldcp->lane_out.dringp;
		while (tp->next != NULL)
			tp = tp->next;

		tp->next = dp;
	}
	RW_EXIT(&ldcp->lane_out.dlistrw);

	return (dp);

	/* failure after the public section of the ring was created */
dring_fail_exit:
	(void) ldc_mem_dring_destroy(dp->handle);

	/*
	 * Common failure path: release the private descriptors (if they
	 * were allocated), then the dring structure itself.
	 *
	 * NOTE(review): this path frees the memory handles and the
	 * private descriptor array only. It does not unbind the
	 * handles, destroy the per-descriptor dstate_lock mutexes or
	 * free dp->data_addr, all of which vsw_setup_ring() sets up.
	 * Whether that leaks (e.g. when the dring bind fails after a
	 * successful setup), or whether a failed vsw_setup_ring() has
	 * already released handles that are freed again here, depends
	 * on the cleanup done inside vsw_setup_ring() - confirm.
	 */
create_fail_exit:
	if (dp->priv_addr != NULL) {
		priv_addr = dp->priv_addr;
		for (i = 0; i < vsw_ntxds; i++) {
			if (priv_addr->memhandle != NULL)
				(void) ldc_mem_free_handle(
				    priv_addr->memhandle);
			priv_addr++;
		}
		kmem_free(dp->priv_addr,
		    (sizeof (vsw_private_desc_t) * vsw_ntxds));
	}
	mutex_destroy(&dp->dlock);

	kmem_free(dp, sizeof (dring_info_t));
	return (NULL);
}
5026 
5027 /*
5028  * Create a ring consisting of just a private portion and link
5029  * it into the list of rings for the outbound lane.
5030  *
5031  * These type of rings are used primarily for temporary data
5032  * storage (i.e. as data buffers).
5033  */
5034 void
5035 vsw_create_privring(vsw_ldc_t *ldcp)
5036 {
5037 	dring_info_t		*dp, *tp;
5038 	vsw_t			*vswp = ldcp->ldc_vswp;
5039 
5040 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
5041 
5042 	dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
5043 
5044 	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
5045 
5046 	/* no public section */
5047 	dp->pub_addr = NULL;
5048 
5049 	dp->priv_addr = kmem_zalloc(
5050 	    (sizeof (vsw_private_desc_t) * vsw_ntxds), KM_SLEEP);
5051 
5052 	dp->num_descriptors = vsw_ntxds;
5053 
5054 	if (vsw_setup_ring(ldcp, dp)) {
5055 		DERR(vswp, "%s: setup of ring failed", __func__);
5056 		kmem_free(dp->priv_addr,
5057 		    (sizeof (vsw_private_desc_t) * vsw_ntxds));
5058 		mutex_destroy(&dp->dlock);
5059 		kmem_free(dp, sizeof (dring_info_t));
5060 		return;
5061 	}
5062 
5063 	/* haven't used any descriptors yet */
5064 	dp->end_idx = 0;
5065 
5066 	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
5067 	dp->restart_reqd = B_TRUE;
5068 
5069 	/*
5070 	 * Only ever create rings for outgoing lane. Link it onto
5071 	 * end of list.
5072 	 */
5073 	WRITE_ENTER(&ldcp->lane_out.dlistrw);
5074 	if (ldcp->lane_out.dringp == NULL) {
5075 		D2(vswp, "%s: adding first outbound privring", __func__);
5076 		ldcp->lane_out.dringp = dp;
5077 	} else {
5078 		tp = ldcp->lane_out.dringp;
5079 		while (tp->next != NULL)
5080 			tp = tp->next;
5081 
5082 		tp->next = dp;
5083 	}
5084 	RW_EXIT(&ldcp->lane_out.dlistrw);
5085 
5086 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
5087 }
5088 
5089 /*
5090  * Setup the descriptors in the dring. Returns 0 on success, 1 on
5091  * failure.
5092  */
5093 int
5094 vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp)
5095 {
5096 	vnet_public_desc_t	*pub_addr = NULL;
5097 	vsw_private_desc_t	*priv_addr = NULL;
5098 	vsw_t			*vswp = ldcp->ldc_vswp;
5099 	uint64_t		*tmpp;
5100 	uint64_t		offset = 0;
5101 	uint32_t		ncookies = 0;
5102 	static char		*name = "vsw_setup_ring";
5103 	int			i, j, nc, rv;
5104 	size_t			data_sz;
5105 
5106 	priv_addr = dp->priv_addr;
5107 	pub_addr = dp->pub_addr;
5108 
5109 	/* public section may be null but private should never be */
5110 	ASSERT(priv_addr != NULL);
5111 
5112 	/*
5113 	 * Allocate the region of memory which will be used to hold
5114 	 * the data the descriptors will refer to.
5115 	 */
5116 	data_sz = vswp->max_frame_size + VNET_IPALIGN + VNET_LDCALIGN;
5117 	data_sz = VNET_ROUNDUP_2K(data_sz);
5118 	dp->desc_data_sz = data_sz;
5119 	dp->data_sz = vsw_ntxds * data_sz;
5120 	dp->data_addr = kmem_alloc(dp->data_sz, KM_SLEEP);
5121 
5122 	D2(vswp, "%s: allocated %lld bytes at 0x%llx\n", name,
5123 	    dp->data_sz, dp->data_addr);
5124 
5125 	tmpp = (uint64_t *)dp->data_addr;
5126 	offset = dp->desc_data_sz/sizeof (tmpp);
5127 
5128 	/*
5129 	 * Initialise some of the private and public (if they exist)
5130 	 * descriptor fields.
5131 	 */
5132 	for (i = 0; i < vsw_ntxds; i++) {
5133 		mutex_init(&priv_addr->dstate_lock, NULL, MUTEX_DRIVER, NULL);
5134 
5135 		if ((ldc_mem_alloc_handle(ldcp->ldc_handle,
5136 		    &priv_addr->memhandle)) != 0) {
5137 			DERR(vswp, "%s: alloc mem handle failed", name);
5138 			goto setup_ring_cleanup;
5139 		}
5140 
5141 		priv_addr->datap = (void *)tmpp;
5142 
5143 		rv = ldc_mem_bind_handle(priv_addr->memhandle,
5144 		    (caddr_t)priv_addr->datap, dp->desc_data_sz,
5145 		    LDC_SHADOW_MAP, LDC_MEM_R|LDC_MEM_W,
5146 		    &(priv_addr->memcookie[0]), &ncookies);
5147 		if (rv != 0) {
5148 			DERR(vswp, "%s(%lld): ldc_mem_bind_handle failed "
5149 			    "(rv %d)", name, ldcp->ldc_id, rv);
5150 			goto setup_ring_cleanup;
5151 		}
5152 		priv_addr->bound = 1;
5153 
5154 		D2(vswp, "%s: %d: memcookie 0 : addr 0x%llx : size 0x%llx",
5155 		    name, i, priv_addr->memcookie[0].addr,
5156 		    priv_addr->memcookie[0].size);
5157 
5158 		if (ncookies >= (uint32_t)(VSW_MAX_COOKIES + 1)) {
5159 			DERR(vswp, "%s(%lld) ldc_mem_bind_handle returned "
5160 			    "invalid num of cookies (%d) for size 0x%llx",
5161 			    name, ldcp->ldc_id, ncookies, VSW_RING_EL_DATA_SZ);
5162 
5163 			goto setup_ring_cleanup;
5164 		} else {
5165 			for (j = 1; j < ncookies; j++) {
5166 				rv = ldc_mem_nextcookie(priv_addr->memhandle,
5167 				    &(priv_addr->memcookie[j]));
5168 				if (rv != 0) {
5169 					DERR(vswp, "%s: ldc_mem_nextcookie "
5170 					    "failed rv (%d)", name, rv);
5171 					goto setup_ring_cleanup;
5172 				}
5173 				D3(vswp, "%s: memcookie %d : addr 0x%llx : "
5174 				    "size 0x%llx", name, j,
5175 				    priv_addr->memcookie[j].addr,
5176 				    priv_addr->memcookie[j].size);
5177 			}
5178 
5179 		}
5180 		priv_addr->ncookies = ncookies;
5181 		priv_addr->dstate = VIO_DESC_FREE;
5182 
5183 		if (pub_addr != NULL) {
5184 
5185 			/* link pub and private sides */
5186 			priv_addr->descp = pub_addr;
5187 
5188 			pub_addr->ncookies = priv_addr->ncookies;
5189 
5190 			for (nc = 0; nc < pub_addr->ncookies; nc++) {
5191 				bcopy(&priv_addr->memcookie[nc],
5192 				    &pub_addr->memcookie[nc],
5193 				    sizeof (ldc_mem_cookie_t));
5194 			}
5195 
5196 			pub_addr->hdr.dstate = VIO_DESC_FREE;
5197 			pub_addr++;
5198 		}
5199 
5200 		/*
5201 		 * move to next element in the dring and the next
5202 		 * position in the data buffer.
5203 		 */
5204 		priv_addr++;
5205 		tmpp += offset;
5206 	}
5207 
5208 	return (0);
5209 
5210 setup_ring_cleanup:
5211 	priv_addr = dp->priv_addr;
5212 
5213 	for (j = 0; j < i; j++) {
5214 		(void) ldc_mem_unbind_handle(priv_addr->memhandle);
5215 		(void) ldc_mem_free_handle(priv_addr->memhandle);
5216 
5217 		mutex_destroy(&priv_addr->dstate_lock);
5218 
5219 		priv_addr++;
5220 	}
5221 	kmem_free(dp->data_addr, dp->data_sz);
5222 
5223 	return (1);
5224 }
5225 
5226 /*
5227  * Searches the private section of a ring for a free descriptor,
5228  * starting at the location of the last free descriptor found
5229  * previously.
5230  *
5231  * Returns 0 if free descriptor is available, and updates state
5232  * of private descriptor to VIO_DESC_READY,  otherwise returns 1.
5233  *
5234  * FUTURE: might need to return contiguous range of descriptors
5235  * as dring info msg assumes all will be contiguous.
5236  */
5237 static int
5238 vsw_dring_find_free_desc(dring_info_t *dringp,
5239 		vsw_private_desc_t **priv_p, int *idx)
5240 {
5241 	vsw_private_desc_t	*addr = NULL;
5242 	int			num = vsw_ntxds;
5243 	int			ret = 1;
5244 
5245 	D1(NULL, "%s enter\n", __func__);
5246 
5247 	ASSERT(dringp->priv_addr != NULL);
5248 
5249 	D2(NULL, "%s: searching ring, dringp 0x%llx : start pos %lld",
5250 	    __func__, dringp, dringp->end_idx);
5251 
5252 	addr = (vsw_private_desc_t *)dringp->priv_addr + dringp->end_idx;
5253 
5254 	mutex_enter(&addr->dstate_lock);
5255 	if (addr->dstate == VIO_DESC_FREE) {
5256 		addr->dstate = VIO_DESC_READY;
5257 		*priv_p = addr;
5258 		*idx = dringp->end_idx;
5259 		dringp->end_idx = (dringp->end_idx + 1) % num;
5260 		ret = 0;
5261 
5262 	}
5263 	mutex_exit(&addr->dstate_lock);
5264 
5265 	/* ring full */
5266 	if (ret == 1) {
5267 		D2(NULL, "%s: no desp free: started at %d", __func__,
5268 		    dringp->end_idx);
5269 	}
5270 
5271 	D1(NULL, "%s: exit\n", __func__);
5272 
5273 	return (ret);
5274 }
5275 
5276 /*
5277  * Map from a dring identifier to the ring itself. Returns
5278  * pointer to ring or NULL if no match found.
5279  *
5280  * Should be called with dlistrw rwlock held as reader.
5281  */
5282 static dring_info_t *
5283 vsw_ident2dring(lane_t *lane, uint64_t ident)
5284 {
5285 	dring_info_t	*dp = NULL;
5286 
5287 	if ((dp = lane->dringp) == NULL) {
5288 		return (NULL);
5289 	} else {
5290 		if (dp->ident == ident)
5291 			return (dp);
5292 
5293 		while (dp != NULL) {
5294 			if (dp->ident == ident)
5295 				break;
5296 			dp = dp->next;
5297 		}
5298 	}
5299 
5300 	return (dp);
5301 }
5302 
5303 /*
5304  * Set the default lane attributes. These are copied into
5305  * the attr msg we send to our peer. If they are not acceptable
5306  * then (currently) the handshake ends.
5307  */
5308 static void
5309 vsw_set_lane_attr(vsw_t *vswp, lane_t *lp)
5310 {
5311 	bzero(lp, sizeof (lane_t));
5312 
5313 	READ_ENTER(&vswp->if_lockrw);
5314 	ether_copy(&(vswp->if_addr), &(lp->addr));
5315 	RW_EXIT(&vswp->if_lockrw);
5316 
5317 	lp->mtu = vswp->max_frame_size;
5318 	lp->addr_type = ADDR_TYPE_MAC;
5319 	lp->xfer_mode = VIO_DRING_MODE_V1_0;
5320 	lp->ack_freq = 0;	/* for shared mode */
5321 	lp->seq_num = VNET_ISS;
5322 }
5323 
5324 /*
5325  * Verify that the attributes are acceptable.
5326  *
5327  * FUTURE: If some attributes are not acceptable, change them
5328  * our desired values.
5329  */
5330 static int
5331 vsw_check_attr(vnet_attr_msg_t *pkt, vsw_ldc_t *ldcp)
5332 {
5333 	int			ret = 0;
5334 	struct ether_addr	ea;
5335 	vsw_port_t		*port = ldcp->ldc_port;
5336 	lane_t			*lp = &ldcp->lane_out;
5337 
5338 	D1(NULL, "vsw_check_attr enter\n");
5339 
5340 	if ((pkt->xfer_mode != VIO_DESC_MODE) &&
5341 	    (pkt->xfer_mode != lp->xfer_mode)) {
5342 		D2(NULL, "vsw_check_attr: unknown mode %x\n", pkt->xfer_mode);
5343 		ret = 1;
5344 	}
5345 
5346 	/* Only support MAC addresses at moment. */
5347 	if ((pkt->addr_type != ADDR_TYPE_MAC) || (pkt->addr == 0)) {
5348 		D2(NULL, "vsw_check_attr: invalid addr_type %x, "
5349 		    "or address 0x%llx\n", pkt->addr_type, pkt->addr);
5350 		ret = 1;
5351 	}
5352 
5353 	/*
5354 	 * MAC address supplied by device should match that stored
5355 	 * in the vsw-port OBP node. Need to decide what to do if they
5356 	 * don't match, for the moment just warn but don't fail.
5357 	 */
5358 	vnet_macaddr_ultostr(pkt->addr, ea.ether_addr_octet);
5359 	if (ether_cmp(&ea, &port->p_macaddr) != 0) {
5360 		DERR(NULL, "vsw_check_attr: device supplied address "
5361 		    "0x%llx doesn't match node address 0x%llx\n",
5362 		    pkt->addr, port->p_macaddr);
5363 	}
5364 
5365 	/*
5366 	 * Ack freq only makes sense in pkt mode, in shared
5367 	 * mode the ring descriptors say whether or not to
5368 	 * send back an ACK.
5369 	 */
5370 	if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
5371 	    (pkt->xfer_mode & VIO_DRING_MODE_V1_2)) ||
5372 	    (VSW_VER_LT(ldcp, 1, 2) &&
5373 	    (pkt->xfer_mode == VIO_DRING_MODE_V1_0))) {
5374 		if (pkt->ack_freq > 0) {
5375 			D2(NULL, "vsw_check_attr: non zero ack freq "
5376 			    " in SHM mode\n");
5377 			ret = 1;
5378 		}
5379 	}
5380 
5381 	/*
5382 	 * Note: for the moment we only support ETHER
5383 	 * frames. This may change in the future.
5384 	 */
5385 	if ((pkt->mtu > lp->mtu) || (pkt->mtu <= 0)) {
5386 		D2(NULL, "vsw_check_attr: invalid MTU (0x%llx)\n",
5387 		    pkt->mtu);
5388 		ret = 1;
5389 	}
5390 
5391 	D1(NULL, "vsw_check_attr exit\n");
5392 
5393 	return (ret);
5394 }
5395 
5396 /*
5397  * Returns 1 if there is a problem, 0 otherwise.
5398  */
5399 static int
5400 vsw_check_dring_info(vio_dring_reg_msg_t *pkt)
5401 {
5402 	_NOTE(ARGUNUSED(pkt))
5403 
5404 	int	ret = 0;
5405 
5406 	D1(NULL, "vsw_check_dring_info enter\n");
5407 
5408 	if ((pkt->num_descriptors == 0) ||
5409 	    (pkt->descriptor_size == 0) ||
5410 	    (pkt->ncookies != 1)) {
5411 		DERR(NULL, "vsw_check_dring_info: invalid dring msg");
5412 		ret = 1;
5413 	}
5414 
5415 	D1(NULL, "vsw_check_dring_info exit\n");
5416 
5417 	return (ret);
5418 }
5419 
5420 /*
5421  * Returns 1 if two memory cookies match. Otherwise returns 0.
5422  */
5423 static int
5424 vsw_mem_cookie_match(ldc_mem_cookie_t *m1, ldc_mem_cookie_t *m2)
5425 {
5426 	if ((m1->addr != m2->addr) ||
5427 	    (m2->size != m2->size)) {
5428 		return (0);
5429 	} else {
5430 		return (1);
5431 	}
5432 }
5433 
5434 /*
5435  * Returns 1 if ring described in reg message matches that
5436  * described by dring_info structure. Otherwise returns 0.
5437  */
5438 static int
5439 vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg)
5440 {
5441 	if ((msg->descriptor_size != dp->descriptor_size) ||
5442 	    (msg->num_descriptors != dp->num_descriptors) ||
5443 	    (msg->ncookies != dp->ncookies) ||
5444 	    !(vsw_mem_cookie_match(&msg->cookie[0], &dp->cookie[0]))) {
5445 		return (0);
5446 	} else {
5447 		return (1);
5448 	}
5449 
5450 }
5451 
5452 static caddr_t
5453 vsw_print_ethaddr(uint8_t *a, char *ebuf)
5454 {
5455 	(void) sprintf(ebuf, "%x:%x:%x:%x:%x:%x",
5456 	    a[0], a[1], a[2], a[3], a[4], a[5]);
5457 	return (ebuf);
5458 }
5459 
5460 /*
5461  * Reset and free all the resources associated with
5462  * the channel.
5463  */
5464 static void
5465 vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir)
5466 {
5467 	dring_info_t		*dp, *dpp;
5468 	lane_t			*lp = NULL;
5469 	int			rv = 0;
5470 
5471 	ASSERT(ldcp != NULL);
5472 
5473 	D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id);
5474 
5475 	if (dir == INBOUND) {
5476 		D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane"
5477 		    " of channel %lld", __func__, ldcp->ldc_id);
5478 		lp = &ldcp->lane_in;
5479 	} else {
5480 		D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane"
5481 		    " of channel %lld", __func__, ldcp->ldc_id);
5482 		lp = &ldcp->lane_out;
5483 	}
5484 
5485 	lp->lstate = VSW_LANE_INACTIV;
5486 	lp->seq_num = VNET_ISS;
5487 
5488 	if (lp->dringp) {
5489 		if (dir == INBOUND) {
5490 			WRITE_ENTER(&lp->dlistrw);
5491 			dp = lp->dringp;
5492 			while (dp != NULL) {
5493 				dpp = dp->next;
5494 				if (dp->handle != NULL)
5495 					(void) ldc_mem_dring_unmap(dp->handle);
5496 				kmem_free(dp, sizeof (dring_info_t));
5497 				dp = dpp;
5498 			}
5499 			RW_EXIT(&lp->dlistrw);
5500 		} else {
5501 			/*
5502 			 * unbind, destroy exported dring, free dring struct
5503 			 */
5504 			WRITE_ENTER(&lp->dlistrw);
5505 			dp = lp->dringp;
5506 			rv = vsw_free_ring(dp);
5507 			RW_EXIT(&lp->dlistrw);
5508 		}
5509 		if (rv == 0) {
5510 			lp->dringp = NULL;
5511 		}
5512 	}
5513 
5514 	D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id);
5515 }
5516 
5517 /*
5518  * Free ring and all associated resources.
5519  *
5520  * Should be called with dlistrw rwlock held as writer.
5521  */
5522 static int
5523 vsw_free_ring(dring_info_t *dp)
5524 {
5525 	vsw_private_desc_t	*paddr = NULL;
5526 	dring_info_t		*dpp;
5527 	int			i, rv = 1;
5528 
5529 	while (dp != NULL) {
5530 		mutex_enter(&dp->dlock);
5531 		dpp = dp->next;
5532 		if (dp->priv_addr != NULL) {
5533 			/*
5534 			 * First unbind and free the memory handles
5535 			 * stored in each descriptor within the ring.
5536 			 */
5537 			for (i = 0; i < vsw_ntxds; i++) {
5538 				paddr = (vsw_private_desc_t *)
5539 				    dp->priv_addr + i;
5540 				if (paddr->memhandle != NULL) {
5541 					if (paddr->bound == 1) {
5542 						rv = ldc_mem_unbind_handle(
5543 						    paddr->memhandle);
5544 
5545 						if (rv != 0) {
5546 							DERR(NULL, "error "
5547 							"unbinding handle for "
5548 							"ring 0x%llx at pos %d",
5549 							    dp, i);
5550 							mutex_exit(&dp->dlock);
5551 							return (rv);
5552 						}
5553 						paddr->bound = 0;
5554 					}
5555 
5556 					rv = ldc_mem_free_handle(
5557 					    paddr->memhandle);
5558 					if (rv != 0) {
5559 						DERR(NULL, "error freeing "
5560 						    "handle for ring 0x%llx "
5561 						    "at pos %d", dp, i);
5562 						mutex_exit(&dp->dlock);
5563 						return (rv);
5564 					}
5565 					paddr->memhandle = NULL;
5566 				}
5567 				mutex_destroy(&paddr->dstate_lock);
5568 			}
5569 			kmem_free(dp->priv_addr,
5570 			    (sizeof (vsw_private_desc_t) * vsw_ntxds));
5571 		}
5572 
5573 		/*
5574 		 * Now unbind and destroy the ring itself.
5575 		 */
5576 		if (dp->handle != NULL) {
5577 			(void) ldc_mem_dring_unbind(dp->handle);
5578 			(void) ldc_mem_dring_destroy(dp->handle);
5579 		}
5580 
5581 		if (dp->data_addr != NULL) {
5582 			kmem_free(dp->data_addr, dp->data_sz);
5583 		}
5584 
5585 		mutex_exit(&dp->dlock);
5586 		mutex_destroy(&dp->dlock);
5587 		mutex_destroy(&dp->restart_lock);
5588 		kmem_free(dp, sizeof (dring_info_t));
5589 
5590 		dp = dpp;
5591 	}
5592 	return (0);
5593 }
5594 
5595 /*
5596  * vsw_ldc_rx_worker -- A per LDC worker thread to receive data.
5597  * This thread is woken up by the LDC interrupt handler to process
5598  * LDC packets and receive data.
5599  */
5600 static void
5601 vsw_ldc_rx_worker(void *arg)
5602 {
5603 	callb_cpr_t	cprinfo;
5604 	vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
5605 	vsw_t *vswp = ldcp->ldc_vswp;
5606 
5607 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
5608 	CALLB_CPR_INIT(&cprinfo, &ldcp->rx_thr_lock, callb_generic_cpr,
5609 	    "vsw_rx_thread");
5610 	mutex_enter(&ldcp->rx_thr_lock);
5611 	ldcp->rx_thr_flags |= VSW_WTHR_RUNNING;
5612 	while (!(ldcp->rx_thr_flags & VSW_WTHR_STOP)) {
5613 
5614 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
5615 		/*
5616 		 * Wait until the data is received or a stop
5617 		 * request is received.
5618 		 */
5619 		while (!(ldcp->rx_thr_flags &
5620 		    (VSW_WTHR_DATARCVD | VSW_WTHR_STOP))) {
5621 			cv_wait(&ldcp->rx_thr_cv, &ldcp->rx_thr_lock);
5622 		}
5623 		CALLB_CPR_SAFE_END(&cprinfo, &ldcp->rx_thr_lock)
5624 
5625 		/*
5626 		 * First process the stop request.
5627 		 */
5628 		if (ldcp->rx_thr_flags & VSW_WTHR_STOP) {
5629 			D2(vswp, "%s(%lld):Rx thread stopped\n",
5630 			    __func__, ldcp->ldc_id);
5631 			break;
5632 		}
5633 		ldcp->rx_thr_flags &= ~VSW_WTHR_DATARCVD;
5634 		mutex_exit(&ldcp->rx_thr_lock);
5635 		D1(vswp, "%s(%lld):calling vsw_process_pkt\n",
5636 		    __func__, ldcp->ldc_id);
5637 		mutex_enter(&ldcp->ldc_cblock);
5638 		vsw_process_pkt(ldcp);
5639 		mutex_exit(&ldcp->ldc_cblock);
5640 		mutex_enter(&ldcp->rx_thr_lock);
5641 	}
5642 
5643 	/*
5644 	 * Update the run status and wakeup the thread that
5645 	 * has sent the stop request.
5646 	 */
5647 	ldcp->rx_thr_flags &= ~VSW_WTHR_RUNNING;
5648 	cv_signal(&ldcp->rx_thr_cv);
5649 	CALLB_CPR_EXIT(&cprinfo);
5650 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
5651 	thread_exit();
5652 }
5653 
5654 /* vsw_stop_rx_thread -- Co-ordinate with receive thread to stop it */
5655 static void
5656 vsw_stop_rx_thread(vsw_ldc_t *ldcp)
5657 {
5658 	vsw_t *vswp = ldcp->ldc_vswp;
5659 
5660 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
5661 	/*
5662 	 * Send a stop request by setting the stop flag and
5663 	 * wait until the receive thread stops.
5664 	 */
5665 	mutex_enter(&ldcp->rx_thr_lock);
5666 	if (ldcp->rx_thr_flags & VSW_WTHR_RUNNING) {
5667 		ldcp->rx_thr_flags |= VSW_WTHR_STOP;
5668 		cv_signal(&ldcp->rx_thr_cv);
5669 		while (ldcp->rx_thr_flags & VSW_WTHR_RUNNING) {
5670 			cv_wait(&ldcp->rx_thr_cv, &ldcp->rx_thr_lock);
5671 		}
5672 	}
5673 	mutex_exit(&ldcp->rx_thr_lock);
5674 	ldcp->rx_thread = NULL;
5675 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
5676 }
5677 
5678 /*
5679  * vsw_ldc_tx_worker -- A per LDC worker thread to transmit data.
5680  * This thread is woken up by the vsw_portsend to transmit
5681  * packets.
5682  */
5683 static void
5684 vsw_ldc_tx_worker(void *arg)
5685 {
5686 	callb_cpr_t	cprinfo;
5687 	vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
5688 	vsw_t *vswp = ldcp->ldc_vswp;
5689 	mblk_t *mp;
5690 	mblk_t *tmp;
5691 
5692 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
5693 	CALLB_CPR_INIT(&cprinfo, &ldcp->tx_thr_lock, callb_generic_cpr,
5694 	    "vnet_tx_thread");
5695 	mutex_enter(&ldcp->tx_thr_lock);
5696 	ldcp->tx_thr_flags |= VSW_WTHR_RUNNING;
5697 	while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP)) {
5698 
5699 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
5700 		/*
5701 		 * Wait until the data is received or a stop
5702 		 * request is received.
5703 		 */
5704 		while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP) &&
5705 		    (ldcp->tx_mhead == NULL)) {
5706 			cv_wait(&ldcp->tx_thr_cv, &ldcp->tx_thr_lock);
5707 		}
5708 		CALLB_CPR_SAFE_END(&cprinfo, &ldcp->tx_thr_lock)
5709 
5710 		/*
5711 		 * First process the stop request.
5712 		 */
5713 		if (ldcp->tx_thr_flags & VSW_WTHR_STOP) {
5714 			D2(vswp, "%s(%lld):tx thread stopped\n",
5715 			    __func__, ldcp->ldc_id);
5716 			break;
5717 		}
5718 		mp = ldcp->tx_mhead;
5719 		ldcp->tx_mhead = ldcp->tx_mtail = NULL;
5720 		ldcp->tx_cnt = 0;
5721 		mutex_exit(&ldcp->tx_thr_lock);
5722 		D2(vswp, "%s(%lld):calling vsw_ldcsend\n",
5723 		    __func__, ldcp->ldc_id);
5724 		while (mp != NULL) {
5725 			tmp = mp->b_next;
5726 			mp->b_next = mp->b_prev = NULL;
5727 			(void) vsw_ldcsend(ldcp, mp, vsw_ldc_tx_retries);
5728 			mp = tmp;
5729 		}
5730 		mutex_enter(&ldcp->tx_thr_lock);
5731 	}
5732 
5733 	/*
5734 	 * Update the run status and wakeup the thread that
5735 	 * has sent the stop request.
5736 	 */
5737 	ldcp->tx_thr_flags &= ~VSW_WTHR_RUNNING;
5738 	cv_signal(&ldcp->tx_thr_cv);
5739 	CALLB_CPR_EXIT(&cprinfo);
5740 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
5741 	thread_exit();
5742 }
5743 
5744 /* vsw_stop_tx_thread -- Co-ordinate with receive thread to stop it */
5745 static void
5746 vsw_stop_tx_thread(vsw_ldc_t *ldcp)
5747 {
5748 	vsw_t *vswp = ldcp->ldc_vswp;
5749 
5750 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
5751 	/*
5752 	 * Send a stop request by setting the stop flag and
5753 	 * wait until the receive thread stops.
5754 	 */
5755 	mutex_enter(&ldcp->tx_thr_lock);
5756 	if (ldcp->tx_thr_flags & VSW_WTHR_RUNNING) {
5757 		ldcp->tx_thr_flags |= VSW_WTHR_STOP;
5758 		cv_signal(&ldcp->tx_thr_cv);
5759 		while (ldcp->tx_thr_flags & VSW_WTHR_RUNNING) {
5760 			cv_wait(&ldcp->tx_thr_cv, &ldcp->tx_thr_lock);
5761 		}
5762 	}
5763 	mutex_exit(&ldcp->tx_thr_lock);
5764 	ldcp->tx_thread = NULL;
5765 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
5766 }
5767 
5768 /* vsw_reclaim_dring -- reclaim descriptors */
5769 static int
5770 vsw_reclaim_dring(dring_info_t *dp, int start)
5771 {
5772 	int i, j, len;
5773 	vsw_private_desc_t *priv_addr;
5774 	vnet_public_desc_t *pub_addr;
5775 
5776 	pub_addr = (vnet_public_desc_t *)dp->pub_addr;
5777 	priv_addr = (vsw_private_desc_t *)dp->priv_addr;
5778 	len = dp->num_descriptors;
5779 
5780 	D2(NULL, "%s: start index %ld\n", __func__, start);
5781 
5782 	j = 0;
5783 	for (i = start; j < len; i = (i + 1) % len, j++) {
5784 		pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
5785 		priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
5786 
5787 		mutex_enter(&priv_addr->dstate_lock);
5788 		if (pub_addr->hdr.dstate != VIO_DESC_DONE) {
5789 			mutex_exit(&priv_addr->dstate_lock);
5790 			break;
5791 		}
5792 		pub_addr->hdr.dstate = VIO_DESC_FREE;
5793 		priv_addr->dstate = VIO_DESC_FREE;
5794 		/* clear all the fields */
5795 		priv_addr->datalen = 0;
5796 		pub_addr->hdr.ack = 0;
5797 		mutex_exit(&priv_addr->dstate_lock);
5798 
5799 		D3(NULL, "claiming descp:%d pub state:0x%llx priv state 0x%llx",
5800 		    i, pub_addr->hdr.dstate, priv_addr->dstate);
5801 	}
5802 	return (j);
5803 }
5804 
5805 /*
5806  * Debugging routines
5807  */
5808 static void
5809 display_state(void)
5810 {
5811 	vsw_t		*vswp;
5812 	vsw_port_list_t	*plist;
5813 	vsw_port_t 	*port;
5814 	vsw_ldc_list_t	*ldcl;
5815 	vsw_ldc_t 	*ldcp;
5816 	extern vsw_t 	*vsw_head;
5817 
5818 	cmn_err(CE_NOTE, "***** system state *****");
5819 
5820 	for (vswp = vsw_head; vswp; vswp = vswp->next) {
5821 		plist = &vswp->plist;
5822 		READ_ENTER(&plist->lockrw);
5823 		cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n",
5824 		    vswp->instance, plist->num_ports);
5825 
5826 		for (port = plist->head; port != NULL; port = port->p_next) {
5827 			ldcl = &port->p_ldclist;
5828 			cmn_err(CE_CONT, "port %d : %d ldcs attached\n",
5829 			    port->p_instance, port->num_ldcs);
5830 			READ_ENTER(&ldcl->lockrw);
5831 			ldcp = ldcl->head;
5832 			for (; ldcp != NULL; ldcp = ldcp->ldc_next) {
5833 				cmn_err(CE_CONT, "chan %lu : dev %d : "
5834 				    "status %d : phase %u\n",
5835 				    ldcp->ldc_id, ldcp->dev_class,
5836 				    ldcp->ldc_status, ldcp->hphase);
5837 				cmn_err(CE_CONT, "chan %lu : lsession %lu : "
5838 				    "psession %lu\n", ldcp->ldc_id,
5839 				    ldcp->local_session, ldcp->peer_session);
5840 
5841 				cmn_err(CE_CONT, "Inbound lane:\n");
5842 				display_lane(&ldcp->lane_in);
5843 				cmn_err(CE_CONT, "Outbound lane:\n");
5844 				display_lane(&ldcp->lane_out);
5845 			}
5846 			RW_EXIT(&ldcl->lockrw);
5847 		}
5848 		RW_EXIT(&plist->lockrw);
5849 	}
5850 	cmn_err(CE_NOTE, "***** system state *****");
5851 }
5852 
5853 static void
5854 display_lane(lane_t *lp)
5855 {
5856 	dring_info_t	*drp;
5857 
5858 	cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n",
5859 	    lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu);
5860 	cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n",
5861 	    lp->addr_type, lp->addr, lp->xfer_mode);
5862 	cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp);
5863 
5864 	cmn_err(CE_CONT, "Dring info:\n");
5865 	for (drp = lp->dringp; drp != NULL; drp = drp->next) {
5866 		cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n",
5867 		    drp->num_descriptors, drp->descriptor_size);
5868 		cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->handle);
5869 		cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n",
5870 		    (uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr);
5871 		cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n",
5872 		    drp->ident, drp->end_idx);
5873 		display_ring(drp);
5874 	}
5875 }
5876 
5877 static void
5878 display_ring(dring_info_t *dringp)
5879 {
5880 	uint64_t		i;
5881 	uint64_t		priv_count = 0;
5882 	uint64_t		pub_count = 0;
5883 	vnet_public_desc_t	*pub_addr = NULL;
5884 	vsw_private_desc_t	*priv_addr = NULL;
5885 
5886 	for (i = 0; i < vsw_ntxds; i++) {
5887 		if (dringp->pub_addr != NULL) {
5888 			pub_addr = (vnet_public_desc_t *)dringp->pub_addr + i;
5889 
5890 			if (pub_addr->hdr.dstate == VIO_DESC_FREE)
5891 				pub_count++;
5892 		}
5893 
5894 		if (dringp->priv_addr != NULL) {
5895 			priv_addr = (vsw_private_desc_t *)dringp->priv_addr + i;
5896 
5897 			if (priv_addr->dstate == VIO_DESC_FREE)
5898 				priv_count++;
5899 		}
5900 	}
5901 	cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n",
5902 	    i, priv_count, pub_count);
5903 }
5904 
5905 static void
5906 dump_flags(uint64_t state)
5907 {
5908 	int	i;
5909 
5910 	typedef struct flag_name {
5911 		int	flag_val;
5912 		char	*flag_name;
5913 	} flag_name_t;
5914 
5915 	flag_name_t	flags[] = {
5916 		VSW_VER_INFO_SENT, "VSW_VER_INFO_SENT",
5917 		VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV",
5918 		VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV",
5919 		VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT",
5920 		VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV",
5921 		VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT",
5922 		VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT",
5923 		VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV",
5924 		VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT",
5925 		VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV",
5926 		VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT",
5927 		VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV",
5928 		VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT",
5929 		VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV",
5930 		VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT",
5931 		VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV",
5932 		VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT",
5933 		VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV",
5934 		VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT",
5935 		VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV",
5936 		VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT",
5937 		VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV",
5938 		VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT",
5939 		VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV",
5940 		VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT",
5941 		VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV",
5942 		VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT",
5943 		VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV",
5944 		VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT",
5945 		VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV",
5946 		VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE"};
5947 
5948 	DERR(NULL, "DUMP_FLAGS: %llx\n", state);
5949 	for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) {
5950 		if (state & flags[i].flag_val)
5951 			DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name);
5952 	}
5953 }
5954