xref: /titanic_52/usr/src/uts/sun4v/io/vsw_ldc.c (revision d91236fe104c7ea63142e053b22a39c8a30d304b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/errno.h>
31 #include <sys/debug.h>
32 #include <sys/time.h>
33 #include <sys/sysmacros.h>
34 #include <sys/systm.h>
35 #include <sys/user.h>
36 #include <sys/stropts.h>
37 #include <sys/stream.h>
38 #include <sys/strlog.h>
39 #include <sys/strsubr.h>
40 #include <sys/cmn_err.h>
41 #include <sys/cpu.h>
42 #include <sys/kmem.h>
43 #include <sys/conf.h>
44 #include <sys/ddi.h>
45 #include <sys/sunddi.h>
46 #include <sys/ksynch.h>
47 #include <sys/stat.h>
48 #include <sys/kstat.h>
49 #include <sys/vtrace.h>
50 #include <sys/strsun.h>
51 #include <sys/dlpi.h>
52 #include <sys/ethernet.h>
53 #include <net/if.h>
54 #include <sys/varargs.h>
55 #include <sys/machsystm.h>
56 #include <sys/modctl.h>
57 #include <sys/modhash.h>
58 #include <sys/mac.h>
59 #include <sys/mac_ether.h>
60 #include <sys/taskq.h>
61 #include <sys/note.h>
62 #include <sys/mach_descrip.h>
63 #include <sys/mac.h>
64 #include <sys/mdeg.h>
65 #include <sys/ldc.h>
66 #include <sys/vsw_fdb.h>
67 #include <sys/vsw.h>
68 #include <sys/vio_mailbox.h>
69 #include <sys/vnet_mailbox.h>
70 #include <sys/vnet_common.h>
71 #include <sys/vio_util.h>
72 #include <sys/sdt.h>
73 #include <sys/atomic.h>
74 #include <sys/callb.h>
75 #include <sys/vlan.h>
76 
77 /* Port add/deletion/etc routines */
78 static	int vsw_port_delete(vsw_port_t *port);
79 static	int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id);
80 static	int vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id);
81 static	int vsw_init_ldcs(vsw_port_t *port);
82 static	int vsw_uninit_ldcs(vsw_port_t *port);
83 static	int vsw_ldc_init(vsw_ldc_t *ldcp);
84 static	int vsw_ldc_uninit(vsw_ldc_t *ldcp);
85 static	int vsw_drain_ldcs(vsw_port_t *port);
86 static	int vsw_drain_port_taskq(vsw_port_t *port);
87 static	void vsw_marker_task(void *);
88 static	int vsw_plist_del_node(vsw_t *, vsw_port_t *port);
89 int vsw_detach_ports(vsw_t *vswp);
90 int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
91 mcst_addr_t *vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr);
92 int vsw_port_detach(vsw_t *vswp, int p_instance);
93 int vsw_portsend(vsw_port_t *port, mblk_t *mp, mblk_t *mpt, uint32_t count);
94 int vsw_port_attach(vsw_port_t *portp);
95 vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);
96 void vsw_vlan_unaware_port_reset(vsw_port_t *portp);
97 int vsw_send_msg(vsw_ldc_t *, void *, int, boolean_t);
98 void vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate);
99 
100 /* Interrupt routines */
101 static	uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg);
102 
103 /* Handshake routines */
104 static	void vsw_ldc_reinit(vsw_ldc_t *);
105 static	void vsw_process_conn_evt(vsw_ldc_t *, uint16_t);
106 static	void vsw_conn_task(void *);
107 static	int vsw_check_flag(vsw_ldc_t *, int, uint64_t);
108 static	void vsw_next_milestone(vsw_ldc_t *);
109 static	int vsw_supported_version(vio_ver_msg_t *);
110 static	void vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp);
111 static	void vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp);
112 
113 /* Data processing routines */
114 static void vsw_process_pkt(void *);
115 static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t *);
116 static void vsw_process_ctrl_pkt(void *);
117 static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *);
118 static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *);
119 static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *);
120 static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *);
121 static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *);
122 static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *);
123 static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *,
124 	uint32_t);
125 static void vsw_process_data_dring_pkt(vsw_ldc_t *, void *);
126 static void vsw_process_pkt_data_nop(void *, void *, uint32_t);
127 static void vsw_process_pkt_data(void *, void *, uint32_t);
128 static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *);
129 static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *);
130 
131 /* Switching/data transmit routines */
132 static	int vsw_dringsend(vsw_ldc_t *, mblk_t *);
133 static	int vsw_descrsend(vsw_ldc_t *, mblk_t *);
134 static void vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp);
135 static int vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries);
136 static int vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count);
137 static int vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count);
138 
139 /* Packet creation routines */
140 static void vsw_send_ver(void *);
141 static void vsw_send_attr(vsw_ldc_t *);
142 static vio_dring_reg_msg_t *vsw_create_dring_info_pkt(vsw_ldc_t *);
143 static void vsw_send_dring_info(vsw_ldc_t *);
144 static void vsw_send_rdx(vsw_ldc_t *);
145 
146 /* Dring routines */
147 static dring_info_t *vsw_create_dring(vsw_ldc_t *);
148 static void vsw_create_privring(vsw_ldc_t *);
149 static int vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp);
150 static int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **,
151     int *);
152 static dring_info_t *vsw_ident2dring(lane_t *, uint64_t);
153 static int vsw_reclaim_dring(dring_info_t *dp, int start);
154 
155 static void vsw_set_lane_attr(vsw_t *, lane_t *);
156 static int vsw_check_attr(vnet_attr_msg_t *, vsw_ldc_t *);
157 static int vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg);
158 static int vsw_mem_cookie_match(ldc_mem_cookie_t *, ldc_mem_cookie_t *);
159 static int vsw_check_dring_info(vio_dring_reg_msg_t *);
160 
161 /* Rcv/Tx thread routines */
162 static void vsw_stop_tx_thread(vsw_ldc_t *ldcp);
163 static void vsw_ldc_tx_worker(void *arg);
164 static void vsw_stop_rx_thread(vsw_ldc_t *ldcp);
165 static void vsw_ldc_rx_worker(void *arg);
166 
167 /* Misc support routines */
168 static	caddr_t vsw_print_ethaddr(uint8_t *addr, char *ebuf);
169 static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t);
170 static int vsw_free_ring(dring_info_t *);
171 static void vsw_save_lmacaddr(vsw_t *vswp, uint64_t macaddr);
172 static int vsw_get_same_dest_list(struct ether_header *ehp,
173     mblk_t **rhead, mblk_t **rtail, mblk_t **mpp);
174 static mblk_t *vsw_dupmsgchain(mblk_t *mp);
175 
176 /* Debugging routines */
177 static void dump_flags(uint64_t);
178 static void display_state(void);
179 static void display_lane(lane_t *);
180 static void display_ring(dring_info_t *);
181 
182 /*
183  * Functions imported from other files.
184  */
185 extern int vsw_set_hw(vsw_t *, vsw_port_t *, int);
186 extern int vsw_unset_hw(vsw_t *, vsw_port_t *, int);
187 extern void vsw_reconfig_hw(vsw_t *);
188 extern int vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port);
189 extern void vsw_del_mcst_port(vsw_port_t *port);
190 extern int vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
191 extern int vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
192 extern void vsw_fdbe_add(vsw_t *vswp, void *port);
193 extern void vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr);
194 extern void vsw_create_vlans(void *arg, int type);
195 extern void vsw_destroy_vlans(void *arg, int type);
196 extern void vsw_vlan_add_ids(void *arg, int type);
197 extern void vsw_vlan_remove_ids(void *arg, int type);
198 extern boolean_t vsw_frame_lookup_vid(void *arg, int caller,
199 	struct ether_header *ehp, uint16_t *vidp);
200 extern mblk_t *vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp);
201 extern uint32_t vsw_vlan_frame_untag(void *arg, int type, mblk_t **np,
202 	mblk_t **npt);
203 extern boolean_t vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid);
204 extern void vsw_hio_start(vsw_t *vswp, vsw_ldc_t *ldcp);
205 extern void vsw_hio_stop(vsw_t *vswp, vsw_ldc_t *ldcp);
206 extern void vsw_process_dds_msg(vsw_t *vswp, vsw_ldc_t *ldcp, void *msg);
207 extern void vsw_hio_stop_port(vsw_port_t *portp);
208 extern void vsw_publish_macaddr(vsw_t *vswp, uint8_t *addr);
209 
/*
 * Number of vio mblk pools created for each channel. vsw_ldc_attach()
 * hands this count to vio_init_multipools() together with three mblk
 * sizes (vsw_mblk_size1..3) and three mblk counts (vsw_num_mblks1..3), so
 * the value presumably has to stay in step with that argument list.
 */
#define	VSW_NUM_VMPOOLS		3	/* number of vio mblk pools */
211 
212 /*
213  * Tunables used in this file.
214  */
215 extern int vsw_num_handshakes;
216 extern int vsw_wretries;
217 extern int vsw_desc_delay;
218 extern int vsw_read_attempts;
219 extern int vsw_ldc_tx_delay;
220 extern int vsw_ldc_tx_retries;
221 extern boolean_t vsw_ldc_rxthr_enabled;
222 extern boolean_t vsw_ldc_txthr_enabled;
223 extern uint32_t vsw_ntxds;
224 extern uint32_t vsw_max_tx_qcount;
225 extern uint32_t vsw_chain_len;
226 extern uint32_t vsw_mblk_size1;
227 extern uint32_t vsw_mblk_size2;
228 extern uint32_t vsw_mblk_size3;
229 extern uint32_t vsw_num_mblks1;
230 extern uint32_t vsw_num_mblks2;
231 extern uint32_t vsw_num_mblks3;
232 extern boolean_t vsw_obp_ver_proto_workaround;
233 extern uint32_t vsw_publish_macaddr_count;
234 
/*
 * Take / drop all three per-channel locks as a unit.
 *
 * The locks are acquired in the order ldc_cblock, ldc_rxlock, ldc_txlock
 * and released in exactly the reverse order.
 *
 * Each macro expands to a single statement (do { } while (0)) so that it
 * can be used safely as the body of an unbraced if/else; the caller
 * supplies the terminating semicolon.
 */
#define	LDC_ENTER_LOCK(ldcp)	\
	do {	\
		mutex_enter(&((ldcp)->ldc_cblock));	\
		mutex_enter(&((ldcp)->ldc_rxlock));	\
		mutex_enter(&((ldcp)->ldc_txlock));	\
	_NOTE(CONSTCOND) } while (0)
#define	LDC_EXIT_LOCK(ldcp)	\
	do {	\
		mutex_exit(&((ldcp)->ldc_txlock));	\
		mutex_exit(&((ldcp)->ldc_rxlock));	\
		mutex_exit(&((ldcp)->ldc_cblock));	\
	_NOTE(CONSTCOND) } while (0)
243 
/*
 * Version predicates. Each one compares the version stored in the
 * channel's outbound lane (lane_out.ver_major / lane_out.ver_minor)
 * against the given major.minor pair; the major number is the more
 * significant component and the minor number only breaks ties.
 */

/* true when the lane version is exactly major.minor */
#define	VSW_VER_EQ(ldcp, major, minor)	\
	(((ldcp)->lane_out.ver_minor == (minor)) &&	\
	    ((ldcp)->lane_out.ver_major == (major)))

/* true when the lane version is strictly older than major.minor */
#define	VSW_VER_LT(ldcp, major, minor)	\
	(((ldcp)->lane_out.ver_major != (major)) ?	\
	    ((ldcp)->lane_out.ver_major < (major)) :	\
	    ((ldcp)->lane_out.ver_minor < (minor)))

/* true when the lane version is major.minor or newer */
#define	VSW_VER_GTEQ(ldcp, major, minor)	\
	(((ldcp)->lane_out.ver_major != (major)) ?	\
	    ((ldcp)->lane_out.ver_major > (major)) :	\
	    ((ldcp)->lane_out.ver_minor >= (minor)))
257 
/*
 * Supported versions.
 *
 * NOTE(review): each entry is presumably a {major, minor} pair and the
 * table is presumably consulted by vsw_supported_version() when the peer
 * proposes a version - neither ver_sup_t nor that routine is visible
 * here, so confirm before relying on this.
 */
static	ver_sup_t	vsw_versions[] = { {1, 3} };
260 
/*
 * For the moment the state dump routines have their own
 * private flag.
 *
 * DUMP_STATE is a compile-time switch: when it is non-zero the DUMP_*
 * and DISPLAY_STATE macros below expand to real debug output, otherwise
 * they all expand to nothing.
 */
#define	DUMP_STATE	0

#if DUMP_STATE

/* Print the msgtype/subtype/subtype_env fields of a tag passed by value. */
#define	DUMP_TAG(tag) \
{			\
	D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype); \
	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype);	\
	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env);	\
}

/* Same as DUMP_TAG, but for a tag passed by pointer. */
#define	DUMP_TAG_PTR(tag) \
{			\
	D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype); \
	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype);	\
	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env);	\
}

/* Thin wrappers around the dump_flags() and display_state() routines. */
#define	DUMP_FLAGS(flags) dump_flags(flags);
#define	DISPLAY_STATE()	display_state()

#else

/* State dumping disabled: every macro compiles away to nothing. */
#define	DUMP_TAG(tag)
#define	DUMP_TAG_PTR(tag)
#define	DUMP_FLAGS(state)
#define	DISPLAY_STATE()

#endif	/* DUMP_STATE */
294 
295 /*
296  * Attach the specified port.
297  *
298  * Returns 0 on success, 1 on failure.
299  */
300 int
301 vsw_port_attach(vsw_port_t *port)
302 {
303 	vsw_t			*vswp = port->p_vswp;
304 	vsw_port_list_t		*plist = &vswp->plist;
305 	vsw_port_t		*p, **pp;
306 	int			i;
307 	int			nids = port->num_ldcs;
308 	uint64_t		*ldcids;
309 
310 	D1(vswp, "%s: enter : port %d", __func__, port->p_instance);
311 
312 	/* port already exists? */
313 	READ_ENTER(&plist->lockrw);
314 	for (p = plist->head; p != NULL; p = p->p_next) {
315 		if (p->p_instance == port->p_instance) {
316 			DWARN(vswp, "%s: port instance %d already attached",
317 			    __func__, p->p_instance);
318 			RW_EXIT(&plist->lockrw);
319 			return (1);
320 		}
321 	}
322 	RW_EXIT(&plist->lockrw);
323 
324 	rw_init(&port->p_ldclist.lockrw, NULL, RW_DRIVER, NULL);
325 
326 	mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL);
327 	mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL);
328 
329 	mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL);
330 	cv_init(&port->state_cv, NULL, CV_DRIVER, NULL);
331 	port->state = VSW_PORT_INIT;
332 
333 	D2(vswp, "%s: %d nids", __func__, nids);
334 	ldcids = port->ldc_ids;
335 	for (i = 0; i < nids; i++) {
336 		D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[i]);
337 		if (vsw_ldc_attach(port, (uint64_t)ldcids[i]) != 0) {
338 			DERR(vswp, "%s: ldc_attach failed", __func__);
339 
340 			rw_destroy(&port->p_ldclist.lockrw);
341 
342 			cv_destroy(&port->state_cv);
343 			mutex_destroy(&port->state_lock);
344 
345 			mutex_destroy(&port->tx_lock);
346 			mutex_destroy(&port->mca_lock);
347 			kmem_free(port, sizeof (vsw_port_t));
348 			return (1);
349 		}
350 	}
351 
352 	if (vswp->switching_setup_done == B_TRUE) {
353 		/*
354 		 * If the underlying physical device has been setup,
355 		 * program the mac address of this port in it.
356 		 * Otherwise, port macaddr will be set after the physical
357 		 * device is successfully setup by the timeout handler.
358 		 */
359 		mutex_enter(&vswp->hw_lock);
360 		(void) vsw_set_hw(vswp, port, VSW_VNETPORT);
361 		mutex_exit(&vswp->hw_lock);
362 	}
363 
364 	/* create the fdb entry for this port/mac address */
365 	vsw_fdbe_add(vswp, port);
366 
367 	vsw_create_vlans(port, VSW_VNETPORT);
368 
369 	WRITE_ENTER(&plist->lockrw);
370 
371 	/* link it into the list of ports for this vsw instance */
372 	pp = (vsw_port_t **)(&plist->head);
373 	port->p_next = *pp;
374 	*pp = port;
375 	plist->num_ports++;
376 
377 	RW_EXIT(&plist->lockrw);
378 
379 	/*
380 	 * Initialise the port and any ldc's under it.
381 	 */
382 	(void) vsw_init_ldcs(port);
383 
384 	/* announce macaddr of vnet to the physical switch */
385 	if (vsw_publish_macaddr_count != 0) {	/* enabled */
386 		vsw_publish_macaddr(vswp, (uint8_t *)&(port->p_macaddr));
387 	}
388 
389 	D1(vswp, "%s: exit", __func__);
390 	return (0);
391 }
392 
393 /*
394  * Detach the specified port.
395  *
396  * Returns 0 on success, 1 on failure.
397  */
398 int
399 vsw_port_detach(vsw_t *vswp, int p_instance)
400 {
401 	vsw_port_t	*port = NULL;
402 	vsw_port_list_t	*plist = &vswp->plist;
403 
404 	D1(vswp, "%s: enter: port id %d", __func__, p_instance);
405 
406 	WRITE_ENTER(&plist->lockrw);
407 
408 	if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) {
409 		RW_EXIT(&plist->lockrw);
410 		return (1);
411 	}
412 
413 	if (vsw_plist_del_node(vswp, port)) {
414 		RW_EXIT(&plist->lockrw);
415 		return (1);
416 	}
417 
418 	/* cleanup any HybridIO for this port */
419 	vsw_hio_stop_port(port);
420 
421 	/*
422 	 * No longer need to hold writer lock on port list now
423 	 * that we have unlinked the target port from the list.
424 	 */
425 	RW_EXIT(&plist->lockrw);
426 
427 	/* Remove the fdb entry for this port/mac address */
428 	vsw_fdbe_del(vswp, &(port->p_macaddr));
429 	vsw_destroy_vlans(port, VSW_VNETPORT);
430 
431 	/* Remove any multicast addresses.. */
432 	vsw_del_mcst_port(port);
433 
434 	/* Remove address if was programmed into HW. */
435 	mutex_enter(&vswp->hw_lock);
436 
437 	/*
438 	 * Port's address may not have been set in hardware. This could
439 	 * happen if the underlying physical device is not yet available and
440 	 * vsw_setup_switching_timeout() may be in progress.
441 	 * We remove its addr from hardware only if it has been set before.
442 	 */
443 	if (port->addr_set != VSW_ADDR_UNSET)
444 		(void) vsw_unset_hw(vswp, port, VSW_VNETPORT);
445 
446 	if (vswp->recfg_reqd)
447 		vsw_reconfig_hw(vswp);
448 
449 	mutex_exit(&vswp->hw_lock);
450 
451 	if (vsw_port_delete(port)) {
452 		return (1);
453 	}
454 
455 	D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance);
456 	return (0);
457 }
458 
459 /*
460  * Detach all active ports.
461  *
462  * Returns 0 on success, 1 on failure.
463  */
464 int
465 vsw_detach_ports(vsw_t *vswp)
466 {
467 	vsw_port_list_t 	*plist = &vswp->plist;
468 	vsw_port_t		*port = NULL;
469 
470 	D1(vswp, "%s: enter", __func__);
471 
472 	WRITE_ENTER(&plist->lockrw);
473 
474 	while ((port = plist->head) != NULL) {
475 		if (vsw_plist_del_node(vswp, port)) {
476 			DERR(vswp, "%s: Error deleting port %d"
477 			    " from port list", __func__, port->p_instance);
478 			RW_EXIT(&plist->lockrw);
479 			return (1);
480 		}
481 
482 		/* Remove address if was programmed into HW. */
483 		mutex_enter(&vswp->hw_lock);
484 		(void) vsw_unset_hw(vswp, port, VSW_VNETPORT);
485 		mutex_exit(&vswp->hw_lock);
486 
487 		/* Remove the fdb entry for this port/mac address */
488 		vsw_fdbe_del(vswp, &(port->p_macaddr));
489 		vsw_destroy_vlans(port, VSW_VNETPORT);
490 
491 		/* Remove any multicast addresses.. */
492 		vsw_del_mcst_port(port);
493 
494 		/*
495 		 * No longer need to hold the lock on the port list
496 		 * now that we have unlinked the target port from the
497 		 * list.
498 		 */
499 		RW_EXIT(&plist->lockrw);
500 		if (vsw_port_delete(port)) {
501 			DERR(vswp, "%s: Error deleting port %d",
502 			    __func__, port->p_instance);
503 			return (1);
504 		}
505 		WRITE_ENTER(&plist->lockrw);
506 	}
507 	RW_EXIT(&plist->lockrw);
508 
509 	D1(vswp, "%s: exit", __func__);
510 
511 	return (0);
512 }
513 
514 /*
515  * Delete the specified port.
516  *
517  * Returns 0 on success, 1 on failure.
518  */
519 static int
520 vsw_port_delete(vsw_port_t *port)
521 {
522 	vsw_ldc_list_t 		*ldcl;
523 	vsw_t			*vswp = port->p_vswp;
524 	int			num_ldcs;
525 
526 	D1(vswp, "%s: enter : port id %d", __func__, port->p_instance);
527 
528 	(void) vsw_uninit_ldcs(port);
529 
530 	/*
531 	 * Wait for any pending ctrl msg tasks which reference this
532 	 * port to finish.
533 	 */
534 	if (vsw_drain_port_taskq(port))
535 		return (1);
536 
537 	/*
538 	 * Wait for any active callbacks to finish
539 	 */
540 	if (vsw_drain_ldcs(port))
541 		return (1);
542 
543 	ldcl = &port->p_ldclist;
544 	num_ldcs = port->num_ldcs;
545 	WRITE_ENTER(&ldcl->lockrw);
546 	while (num_ldcs > 0) {
547 		if (vsw_ldc_detach(port, ldcl->head->ldc_id) != 0) {
548 			cmn_err(CE_WARN, "!vsw%d: unable to detach ldc %ld",
549 			    vswp->instance, ldcl->head->ldc_id);
550 			RW_EXIT(&ldcl->lockrw);
551 			port->num_ldcs = num_ldcs;
552 			return (1);
553 		}
554 		num_ldcs--;
555 	}
556 	RW_EXIT(&ldcl->lockrw);
557 
558 	rw_destroy(&port->p_ldclist.lockrw);
559 
560 	mutex_destroy(&port->mca_lock);
561 	mutex_destroy(&port->tx_lock);
562 
563 	cv_destroy(&port->state_cv);
564 	mutex_destroy(&port->state_lock);
565 
566 	if (port->num_ldcs != 0) {
567 		kmem_free(port->ldc_ids, port->num_ldcs * sizeof (uint64_t));
568 		port->num_ldcs = 0;
569 	}
570 	kmem_free(port, sizeof (vsw_port_t));
571 
572 	D1(vswp, "%s: exit", __func__);
573 
574 	return (0);
575 }
576 
577 /*
578  * Attach a logical domain channel (ldc) under a specified port.
579  *
580  * Returns 0 on success, 1 on failure.
581  */
582 static int
583 vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id)
584 {
585 	vsw_t 		*vswp = port->p_vswp;
586 	vsw_ldc_list_t *ldcl = &port->p_ldclist;
587 	vsw_ldc_t 	*ldcp = NULL;
588 	ldc_attr_t 	attr;
589 	ldc_status_t	istatus;
590 	int 		status = DDI_FAILURE;
591 	int		rv;
592 	char		kname[MAXNAMELEN];
593 	enum		{ PROG_init = 0x0, PROG_mblks = 0x1,
594 			    PROG_callback = 0x2, PROG_rx_thread = 0x4,
595 			    PROG_tx_thread = 0x8}
596 			progress;
597 
598 	progress = PROG_init;
599 
600 	D1(vswp, "%s: enter", __func__);
601 
602 	ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP);
603 	if (ldcp == NULL) {
604 		DERR(vswp, "%s: kmem_zalloc failed", __func__);
605 		return (1);
606 	}
607 	ldcp->ldc_id = ldc_id;
608 
609 	/* Allocate pools of receive mblks */
610 	rv = vio_init_multipools(&ldcp->vmp, VSW_NUM_VMPOOLS,
611 	    vsw_mblk_size1, vsw_mblk_size2, vsw_mblk_size3,
612 	    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3);
613 	if (rv) {
614 		DWARN(vswp, "%s: unable to create free mblk pools for"
615 		    " channel %ld (rv %d)", __func__, ldc_id, rv);
616 		kmem_free(ldcp, sizeof (vsw_ldc_t));
617 		return (1);
618 	}
619 
620 	progress |= PROG_mblks;
621 
622 	mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL);
623 	mutex_init(&ldcp->ldc_rxlock, NULL, MUTEX_DRIVER, NULL);
624 	mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL);
625 	mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL);
626 	cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL);
627 	rw_init(&ldcp->lane_in.dlistrw, NULL, RW_DRIVER, NULL);
628 	rw_init(&ldcp->lane_out.dlistrw, NULL, RW_DRIVER, NULL);
629 
630 	/* required for handshake with peer */
631 	ldcp->local_session = (uint64_t)ddi_get_lbolt();
632 	ldcp->peer_session = 0;
633 	ldcp->session_status = 0;
634 	ldcp->hss_id = 1;	/* Initial handshake session id */
635 
636 	(void) atomic_swap_32(&port->p_hio_capable, B_FALSE);
637 
638 	/* only set for outbound lane, inbound set by peer */
639 	vsw_set_lane_attr(vswp, &ldcp->lane_out);
640 
641 	attr.devclass = LDC_DEV_NT_SVC;
642 	attr.instance = ddi_get_instance(vswp->dip);
643 	attr.mode = LDC_MODE_UNRELIABLE;
644 	attr.mtu = VSW_LDC_MTU;
645 	status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle);
646 	if (status != 0) {
647 		DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)",
648 		    __func__, ldc_id, status);
649 		goto ldc_attach_fail;
650 	}
651 
652 	if (vsw_ldc_rxthr_enabled) {
653 		ldcp->rx_thr_flags = 0;
654 
655 		mutex_init(&ldcp->rx_thr_lock, NULL, MUTEX_DRIVER, NULL);
656 		cv_init(&ldcp->rx_thr_cv, NULL, CV_DRIVER, NULL);
657 		ldcp->rx_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
658 		    vsw_ldc_rx_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
659 
660 		progress |= PROG_rx_thread;
661 		if (ldcp->rx_thread == NULL) {
662 			DWARN(vswp, "%s(%lld): Failed to create worker thread",
663 			    __func__, ldc_id);
664 			goto ldc_attach_fail;
665 		}
666 	}
667 
668 	if (vsw_ldc_txthr_enabled) {
669 		ldcp->tx_thr_flags = 0;
670 		ldcp->tx_mhead = ldcp->tx_mtail = NULL;
671 
672 		mutex_init(&ldcp->tx_thr_lock, NULL, MUTEX_DRIVER, NULL);
673 		cv_init(&ldcp->tx_thr_cv, NULL, CV_DRIVER, NULL);
674 		ldcp->tx_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
675 		    vsw_ldc_tx_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
676 
677 		progress |= PROG_tx_thread;
678 		if (ldcp->tx_thread == NULL) {
679 			DWARN(vswp, "%s(%lld): Failed to create worker thread",
680 			    __func__, ldc_id);
681 			goto ldc_attach_fail;
682 		}
683 	}
684 
685 	status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp);
686 	if (status != 0) {
687 		DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)",
688 		    __func__, ldc_id, status);
689 		(void) ldc_fini(ldcp->ldc_handle);
690 		goto ldc_attach_fail;
691 	}
692 	/*
693 	 * allocate a message for ldc_read()s, big enough to hold ctrl and
694 	 * data msgs, including raw data msgs used to recv priority frames.
695 	 */
696 	ldcp->msglen = VIO_PKT_DATA_HDRSIZE + vswp->max_frame_size;
697 	ldcp->ldcmsg = kmem_alloc(ldcp->msglen, KM_SLEEP);
698 
699 	progress |= PROG_callback;
700 
701 	mutex_init(&ldcp->status_lock, NULL, MUTEX_DRIVER, NULL);
702 
703 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
704 		DERR(vswp, "%s: ldc_status failed", __func__);
705 		mutex_destroy(&ldcp->status_lock);
706 		goto ldc_attach_fail;
707 	}
708 
709 	ldcp->ldc_status = istatus;
710 	ldcp->ldc_port = port;
711 	ldcp->ldc_vswp = vswp;
712 
713 	vsw_reset_vnet_proto_ops(ldcp);
714 
715 	(void) sprintf(kname, "%sldc0x%lx", DRV_NAME, ldcp->ldc_id);
716 	ldcp->ksp = vgen_setup_kstats(DRV_NAME, vswp->instance,
717 	    kname, &ldcp->ldc_stats);
718 	if (ldcp->ksp == NULL) {
719 		DERR(vswp, "%s: kstats setup failed", __func__);
720 		goto ldc_attach_fail;
721 	}
722 
723 	/* link it into the list of channels for this port */
724 	WRITE_ENTER(&ldcl->lockrw);
725 	ldcp->ldc_next = ldcl->head;
726 	ldcl->head = ldcp;
727 	RW_EXIT(&ldcl->lockrw);
728 
729 	D1(vswp, "%s: exit", __func__);
730 	return (0);
731 
732 ldc_attach_fail:
733 
734 	if (progress & PROG_callback) {
735 		(void) ldc_unreg_callback(ldcp->ldc_handle);
736 		kmem_free(ldcp->ldcmsg, ldcp->msglen);
737 	}
738 
739 	if (progress & PROG_rx_thread) {
740 		if (ldcp->rx_thread != NULL) {
741 			vsw_stop_rx_thread(ldcp);
742 		}
743 		mutex_destroy(&ldcp->rx_thr_lock);
744 		cv_destroy(&ldcp->rx_thr_cv);
745 	}
746 
747 	if (progress & PROG_tx_thread) {
748 		if (ldcp->tx_thread != NULL) {
749 			vsw_stop_tx_thread(ldcp);
750 		}
751 		mutex_destroy(&ldcp->tx_thr_lock);
752 		cv_destroy(&ldcp->tx_thr_cv);
753 	}
754 	if (ldcp->ksp != NULL) {
755 		vgen_destroy_kstats(ldcp->ksp);
756 	}
757 	mutex_destroy(&ldcp->ldc_txlock);
758 	mutex_destroy(&ldcp->ldc_rxlock);
759 	mutex_destroy(&ldcp->ldc_cblock);
760 	mutex_destroy(&ldcp->drain_cv_lock);
761 
762 	cv_destroy(&ldcp->drain_cv);
763 
764 	rw_destroy(&ldcp->lane_in.dlistrw);
765 	rw_destroy(&ldcp->lane_out.dlistrw);
766 
767 	if (progress & PROG_mblks) {
768 		vio_destroy_multipools(&ldcp->vmp, &vswp->rxh);
769 	}
770 	kmem_free(ldcp, sizeof (vsw_ldc_t));
771 
772 	return (1);
773 }
774 
775 /*
776  * Detach a logical domain channel (ldc) belonging to a
777  * particular port.
778  *
779  * Returns 0 on success, 1 on failure.
780  */
781 static int
782 vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id)
783 {
784 	vsw_t 		*vswp = port->p_vswp;
785 	vsw_ldc_t 	*ldcp, *prev_ldcp;
786 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
787 	int 		rv;
788 
789 	prev_ldcp = ldcl->head;
790 	for (; (ldcp = prev_ldcp) != NULL; prev_ldcp = ldcp->ldc_next) {
791 		if (ldcp->ldc_id == ldc_id) {
792 			break;
793 		}
794 	}
795 
796 	/* specified ldc id not found */
797 	if (ldcp == NULL) {
798 		DERR(vswp, "%s: ldcp = NULL", __func__);
799 		return (1);
800 	}
801 
802 	D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id);
803 
804 	/* Stop the receive thread */
805 	if (ldcp->rx_thread != NULL) {
806 		vsw_stop_rx_thread(ldcp);
807 		mutex_destroy(&ldcp->rx_thr_lock);
808 		cv_destroy(&ldcp->rx_thr_cv);
809 	}
810 	kmem_free(ldcp->ldcmsg, ldcp->msglen);
811 
812 	/* Stop the tx thread */
813 	if (ldcp->tx_thread != NULL) {
814 		vsw_stop_tx_thread(ldcp);
815 		mutex_destroy(&ldcp->tx_thr_lock);
816 		cv_destroy(&ldcp->tx_thr_cv);
817 		if (ldcp->tx_mhead != NULL) {
818 			freemsgchain(ldcp->tx_mhead);
819 			ldcp->tx_mhead = ldcp->tx_mtail = NULL;
820 			ldcp->tx_cnt = 0;
821 		}
822 	}
823 
824 	/* Destory kstats */
825 	vgen_destroy_kstats(ldcp->ksp);
826 
827 	/*
828 	 * Before we can close the channel we must release any mapped
829 	 * resources (e.g. drings).
830 	 */
831 	vsw_free_lane_resources(ldcp, INBOUND);
832 	vsw_free_lane_resources(ldcp, OUTBOUND);
833 
834 	/*
835 	 * If the close fails we are in serious trouble, as won't
836 	 * be able to delete the parent port.
837 	 */
838 	if ((rv = ldc_close(ldcp->ldc_handle)) != 0) {
839 		DERR(vswp, "%s: error %d closing channel %lld",
840 		    __func__, rv, ldcp->ldc_id);
841 		return (1);
842 	}
843 
844 	(void) ldc_fini(ldcp->ldc_handle);
845 
846 	ldcp->ldc_status = LDC_INIT;
847 	ldcp->ldc_handle = NULL;
848 	ldcp->ldc_vswp = NULL;
849 
850 
851 	/*
852 	 * Most likely some mblks are still in use and
853 	 * have not been returned to the pool. These mblks are
854 	 * added to the pool that is maintained in the device instance.
855 	 * Another attempt will be made to destroy the pool
856 	 * when the device detaches.
857 	 */
858 	vio_destroy_multipools(&ldcp->vmp, &vswp->rxh);
859 
860 	/* unlink it from the list */
861 	prev_ldcp = ldcp->ldc_next;
862 
863 	mutex_destroy(&ldcp->ldc_txlock);
864 	mutex_destroy(&ldcp->ldc_rxlock);
865 	mutex_destroy(&ldcp->ldc_cblock);
866 	cv_destroy(&ldcp->drain_cv);
867 	mutex_destroy(&ldcp->drain_cv_lock);
868 	mutex_destroy(&ldcp->status_lock);
869 	rw_destroy(&ldcp->lane_in.dlistrw);
870 	rw_destroy(&ldcp->lane_out.dlistrw);
871 
872 	kmem_free(ldcp, sizeof (vsw_ldc_t));
873 
874 	return (0);
875 }
876 
877 /*
878  * Open and attempt to bring up the channel. Note that channel
879  * can only be brought up if peer has also opened channel.
880  *
881  * Returns 0 if can open and bring up channel, otherwise
882  * returns 1.
883  */
884 static int
885 vsw_ldc_init(vsw_ldc_t *ldcp)
886 {
887 	vsw_t 		*vswp = ldcp->ldc_vswp;
888 	ldc_status_t	istatus = 0;
889 	int		rv;
890 
891 	D1(vswp, "%s: enter", __func__);
892 
893 	LDC_ENTER_LOCK(ldcp);
894 
895 	/* don't start at 0 in case clients don't like that */
896 	ldcp->next_ident = 1;
897 
898 	rv = ldc_open(ldcp->ldc_handle);
899 	if (rv != 0) {
900 		DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)",
901 		    __func__, ldcp->ldc_id, rv);
902 		LDC_EXIT_LOCK(ldcp);
903 		return (1);
904 	}
905 
906 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
907 		DERR(vswp, "%s: unable to get status", __func__);
908 		LDC_EXIT_LOCK(ldcp);
909 		return (1);
910 
911 	} else if (istatus != LDC_OPEN && istatus != LDC_READY) {
912 		DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY",
913 		    __func__, ldcp->ldc_id, istatus);
914 		LDC_EXIT_LOCK(ldcp);
915 		return (1);
916 	}
917 
918 	mutex_enter(&ldcp->status_lock);
919 	ldcp->ldc_status = istatus;
920 	mutex_exit(&ldcp->status_lock);
921 
922 	rv = ldc_up(ldcp->ldc_handle);
923 	if (rv != 0) {
924 		/*
925 		 * Not a fatal error for ldc_up() to fail, as peer
926 		 * end point may simply not be ready yet.
927 		 */
928 		D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__,
929 		    ldcp->ldc_id, rv);
930 		LDC_EXIT_LOCK(ldcp);
931 		return (1);
932 	}
933 
934 	/*
935 	 * ldc_up() call is non-blocking so need to explicitly
936 	 * check channel status to see if in fact the channel
937 	 * is UP.
938 	 */
939 	mutex_enter(&ldcp->status_lock);
940 	if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) {
941 		DERR(vswp, "%s: unable to get status", __func__);
942 		mutex_exit(&ldcp->status_lock);
943 		LDC_EXIT_LOCK(ldcp);
944 		return (1);
945 
946 	}
947 
948 	if (ldcp->ldc_status == LDC_UP) {
949 		D2(vswp, "%s: channel %ld now UP (%ld)", __func__,
950 		    ldcp->ldc_id, istatus);
951 		mutex_exit(&ldcp->status_lock);
952 		LDC_EXIT_LOCK(ldcp);
953 
954 		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
955 		return (0);
956 	}
957 
958 	mutex_exit(&ldcp->status_lock);
959 	LDC_EXIT_LOCK(ldcp);
960 
961 	D1(vswp, "%s: exit", __func__);
962 	return (0);
963 }
964 
965 /* disable callbacks on the channel */
966 static int
967 vsw_ldc_uninit(vsw_ldc_t *ldcp)
968 {
969 	vsw_t	*vswp = ldcp->ldc_vswp;
970 	int	rv;
971 
972 	D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id);
973 
974 	LDC_ENTER_LOCK(ldcp);
975 
976 	rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE);
977 	if (rv != 0) {
978 		DERR(vswp, "vsw_ldc_uninit(%lld): error disabling "
979 		    "interrupts (rv = %d)\n", ldcp->ldc_id, rv);
980 		LDC_EXIT_LOCK(ldcp);
981 		return (1);
982 	}
983 
984 	mutex_enter(&ldcp->status_lock);
985 	ldcp->ldc_status = LDC_INIT;
986 	mutex_exit(&ldcp->status_lock);
987 
988 	LDC_EXIT_LOCK(ldcp);
989 
990 	D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id);
991 
992 	return (0);
993 }
994 
995 static int
996 vsw_init_ldcs(vsw_port_t *port)
997 {
998 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
999 	vsw_ldc_t	*ldcp;
1000 
1001 	READ_ENTER(&ldcl->lockrw);
1002 	ldcp =  ldcl->head;
1003 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
1004 		(void) vsw_ldc_init(ldcp);
1005 	}
1006 	RW_EXIT(&ldcl->lockrw);
1007 
1008 	return (0);
1009 }
1010 
1011 static int
1012 vsw_uninit_ldcs(vsw_port_t *port)
1013 {
1014 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
1015 	vsw_ldc_t	*ldcp;
1016 
1017 	D1(NULL, "vsw_uninit_ldcs: enter\n");
1018 
1019 	READ_ENTER(&ldcl->lockrw);
1020 	ldcp =  ldcl->head;
1021 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
1022 		(void) vsw_ldc_uninit(ldcp);
1023 	}
1024 	RW_EXIT(&ldcl->lockrw);
1025 
1026 	D1(NULL, "vsw_uninit_ldcs: exit\n");
1027 
1028 	return (0);
1029 }
1030 
1031 /*
1032  * Wait until the callback(s) associated with the ldcs under the specified
1033  * port have completed.
1034  *
1035  * Prior to this function being invoked each channel under this port
1036  * should have been quiesced via ldc_set_cb_mode(DISABLE).
1037  *
1038  * A short explaination of what we are doing below..
1039  *
1040  * The simplest approach would be to have a reference counter in
1041  * the ldc structure which is increment/decremented by the callbacks as
1042  * they use the channel. The drain function could then simply disable any
1043  * further callbacks and do a cv_wait for the ref to hit zero. Unfortunately
1044  * there is a tiny window here - before the callback is able to get the lock
1045  * on the channel it is interrupted and this function gets to execute. It
1046  * sees that the ref count is zero and believes its free to delete the
1047  * associated data structures.
1048  *
1049  * We get around this by taking advantage of the fact that before the ldc
1050  * framework invokes a callback it sets a flag to indicate that there is a
1051  * callback active (or about to become active). If when we attempt to
1052  * unregister a callback when this active flag is set then the unregister
1053  * will fail with EWOULDBLOCK.
1054  *
1055  * If the unregister fails we do a cv_timedwait. We will either be signaled
1056  * by the callback as it is exiting (note we have to wait a short period to
1057  * allow the callback to return fully to the ldc framework and it to clear
1058  * the active flag), or by the timer expiring. In either case we again attempt
1059  * the unregister. We repeat this until we can succesfully unregister the
1060  * callback.
1061  *
1062  * The reason we use a cv_timedwait rather than a simple cv_wait is to catch
1063  * the case where the callback has finished but the ldc framework has not yet
1064  * cleared the active flag. In this case we would never get a cv_signal.
1065  */
1066 static int
1067 vsw_drain_ldcs(vsw_port_t *port)
1068 {
1069 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
1070 	vsw_ldc_t	*ldcp;
1071 	vsw_t		*vswp = port->p_vswp;
1072 
1073 	D1(vswp, "%s: enter", __func__);
1074 
1075 	READ_ENTER(&ldcl->lockrw);
1076 
1077 	ldcp = ldcl->head;
1078 
1079 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
1080 		/*
1081 		 * If we can unregister the channel callback then we
1082 		 * know that there is no callback either running or
1083 		 * scheduled to run for this channel so move on to next
1084 		 * channel in the list.
1085 		 */
1086 		mutex_enter(&ldcp->drain_cv_lock);
1087 
1088 		/* prompt active callbacks to quit */
1089 		ldcp->drain_state = VSW_LDC_DRAINING;
1090 
1091 		if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) {
1092 			D2(vswp, "%s: unreg callback for chan %ld", __func__,
1093 			    ldcp->ldc_id);
1094 			mutex_exit(&ldcp->drain_cv_lock);
1095 			continue;
1096 		} else {
1097 			/*
1098 			 * If we end up here we know that either 1) a callback
1099 			 * is currently executing, 2) is about to start (i.e.
1100 			 * the ldc framework has set the active flag but
1101 			 * has not actually invoked the callback yet, or 3)
1102 			 * has finished and has returned to the ldc framework
1103 			 * but the ldc framework has not yet cleared the
1104 			 * active bit.
1105 			 *
1106 			 * Wait for it to finish.
1107 			 */
1108 			while (ldc_unreg_callback(ldcp->ldc_handle)
1109 			    == EWOULDBLOCK)
1110 				(void) cv_timedwait(&ldcp->drain_cv,
1111 				    &ldcp->drain_cv_lock, lbolt + hz);
1112 
1113 			mutex_exit(&ldcp->drain_cv_lock);
1114 			D2(vswp, "%s: unreg callback for chan %ld after "
1115 			    "timeout", __func__, ldcp->ldc_id);
1116 		}
1117 	}
1118 	RW_EXIT(&ldcl->lockrw);
1119 
1120 	D1(vswp, "%s: exit", __func__);
1121 	return (0);
1122 }
1123 
1124 /*
1125  * Wait until all tasks which reference this port have completed.
1126  *
1127  * Prior to this function being invoked each channel under this port
1128  * should have been quiesced via ldc_set_cb_mode(DISABLE).
1129  */
1130 static int
1131 vsw_drain_port_taskq(vsw_port_t *port)
1132 {
1133 	vsw_t		*vswp = port->p_vswp;
1134 
1135 	D1(vswp, "%s: enter", __func__);
1136 
1137 	/*
1138 	 * Mark the port as in the process of being detached, and
1139 	 * dispatch a marker task to the queue so we know when all
1140 	 * relevant tasks have completed.
1141 	 */
1142 	mutex_enter(&port->state_lock);
1143 	port->state = VSW_PORT_DETACHING;
1144 
1145 	if ((vswp->taskq_p == NULL) ||
1146 	    (ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task,
1147 	    port, DDI_NOSLEEP) != DDI_SUCCESS)) {
1148 		DERR(vswp, "%s: unable to dispatch marker task",
1149 		    __func__);
1150 		mutex_exit(&port->state_lock);
1151 		return (1);
1152 	}
1153 
1154 	/*
1155 	 * Wait for the marker task to finish.
1156 	 */
1157 	while (port->state != VSW_PORT_DETACHABLE)
1158 		cv_wait(&port->state_cv, &port->state_lock);
1159 
1160 	mutex_exit(&port->state_lock);
1161 
1162 	D1(vswp, "%s: exit", __func__);
1163 
1164 	return (0);
1165 }
1166 
1167 static void
1168 vsw_marker_task(void *arg)
1169 {
1170 	vsw_port_t	*port = arg;
1171 	vsw_t		*vswp = port->p_vswp;
1172 
1173 	D1(vswp, "%s: enter", __func__);
1174 
1175 	mutex_enter(&port->state_lock);
1176 
1177 	/*
1178 	 * No further tasks should be dispatched which reference
1179 	 * this port so ok to mark it as safe to detach.
1180 	 */
1181 	port->state = VSW_PORT_DETACHABLE;
1182 
1183 	cv_signal(&port->state_cv);
1184 
1185 	mutex_exit(&port->state_lock);
1186 
1187 	D1(vswp, "%s: exit", __func__);
1188 }
1189 
1190 vsw_port_t *
1191 vsw_lookup_port(vsw_t *vswp, int p_instance)
1192 {
1193 	vsw_port_list_t *plist = &vswp->plist;
1194 	vsw_port_t	*port;
1195 
1196 	for (port = plist->head; port != NULL; port = port->p_next) {
1197 		if (port->p_instance == p_instance) {
1198 			D2(vswp, "vsw_lookup_port: found p_instance\n");
1199 			return (port);
1200 		}
1201 	}
1202 
1203 	return (NULL);
1204 }
1205 
1206 void
1207 vsw_vlan_unaware_port_reset(vsw_port_t *portp)
1208 {
1209 	vsw_ldc_list_t 	*ldclp;
1210 	vsw_ldc_t	*ldcp;
1211 
1212 	ldclp = &portp->p_ldclist;
1213 
1214 	READ_ENTER(&ldclp->lockrw);
1215 
1216 	/*
1217 	 * NOTE: for now, we will assume we have a single channel.
1218 	 */
1219 	if (ldclp->head == NULL) {
1220 		RW_EXIT(&ldclp->lockrw);
1221 		return;
1222 	}
1223 	ldcp = ldclp->head;
1224 
1225 	mutex_enter(&ldcp->ldc_cblock);
1226 
1227 	/*
1228 	 * If the peer is vlan_unaware(ver < 1.3), reset channel and terminate
1229 	 * the connection. See comments in vsw_set_vnet_proto_ops().
1230 	 */
1231 	if (ldcp->hphase == VSW_MILESTONE4 && VSW_VER_LT(ldcp, 1, 3) &&
1232 	    portp->nvids != 0) {
1233 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1234 	}
1235 
1236 	mutex_exit(&ldcp->ldc_cblock);
1237 
1238 	RW_EXIT(&ldclp->lockrw);
1239 }
1240 
1241 void
1242 vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate)
1243 {
1244 	vsw_ldc_list_t	*ldclp;
1245 	vsw_ldc_t	*ldcp;
1246 
1247 	ldclp = &portp->p_ldclist;
1248 
1249 	READ_ENTER(&ldclp->lockrw);
1250 
1251 	/*
1252 	 * NOTE: for now, we will assume we have a single channel.
1253 	 */
1254 	if (ldclp->head == NULL) {
1255 		RW_EXIT(&ldclp->lockrw);
1256 		return;
1257 	}
1258 	ldcp = ldclp->head;
1259 
1260 	mutex_enter(&ldcp->ldc_cblock);
1261 
1262 	/*
1263 	 * If the peer is HybridIO capable (ver >= 1.3), reset channel
1264 	 * to trigger re-negotiation, which inturn trigger HybridIO
1265 	 * setup/cleanup.
1266 	 */
1267 	if ((ldcp->hphase == VSW_MILESTONE4) &&
1268 	    (portp->p_hio_capable == B_TRUE)) {
1269 		if (immediate == B_TRUE) {
1270 			(void) ldc_down(ldcp->ldc_handle);
1271 		} else {
1272 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1273 		}
1274 	}
1275 
1276 	mutex_exit(&ldcp->ldc_cblock);
1277 
1278 	RW_EXIT(&ldclp->lockrw);
1279 }
1280 
1281 /*
1282  * Search for and remove the specified port from the port
1283  * list. Returns 0 if able to locate and remove port, otherwise
1284  * returns 1.
1285  */
1286 static int
1287 vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port)
1288 {
1289 	vsw_port_list_t *plist = &vswp->plist;
1290 	vsw_port_t	*curr_p, *prev_p;
1291 
1292 	if (plist->head == NULL)
1293 		return (1);
1294 
1295 	curr_p = prev_p = plist->head;
1296 
1297 	while (curr_p != NULL) {
1298 		if (curr_p == port) {
1299 			if (prev_p == curr_p) {
1300 				plist->head = curr_p->p_next;
1301 			} else {
1302 				prev_p->p_next = curr_p->p_next;
1303 			}
1304 			plist->num_ports--;
1305 			break;
1306 		} else {
1307 			prev_p = curr_p;
1308 			curr_p = curr_p->p_next;
1309 		}
1310 	}
1311 	return (0);
1312 }
1313 
1314 /*
1315  * Interrupt handler for ldc messages.
1316  */
1317 static uint_t
1318 vsw_ldc_cb(uint64_t event, caddr_t arg)
1319 {
1320 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
1321 	vsw_t 		*vswp = ldcp->ldc_vswp;
1322 
1323 	D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
1324 
1325 	mutex_enter(&ldcp->ldc_cblock);
1326 	ldcp->ldc_stats.callbacks++;
1327 
1328 	mutex_enter(&ldcp->status_lock);
1329 	if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) {
1330 		mutex_exit(&ldcp->status_lock);
1331 		mutex_exit(&ldcp->ldc_cblock);
1332 		return (LDC_SUCCESS);
1333 	}
1334 	mutex_exit(&ldcp->status_lock);
1335 
1336 	if (event & LDC_EVT_UP) {
1337 		/*
1338 		 * Channel has come up.
1339 		 */
1340 		D2(vswp, "%s: id(%ld) event(%llx) UP: status(%ld)",
1341 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1342 
1343 		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
1344 
1345 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
1346 	}
1347 
1348 	if (event & LDC_EVT_READ) {
1349 		/*
1350 		 * Data available for reading.
1351 		 */
1352 		D2(vswp, "%s: id(ld) event(%llx) data READ",
1353 		    __func__, ldcp->ldc_id, event);
1354 
1355 		if (ldcp->rx_thread != NULL) {
1356 			/*
1357 			 * If the receive thread is enabled, then
1358 			 * wakeup the receive thread to process the
1359 			 * LDC messages.
1360 			 */
1361 			mutex_exit(&ldcp->ldc_cblock);
1362 			mutex_enter(&ldcp->rx_thr_lock);
1363 			if (!(ldcp->rx_thr_flags & VSW_WTHR_DATARCVD)) {
1364 				ldcp->rx_thr_flags |= VSW_WTHR_DATARCVD;
1365 				cv_signal(&ldcp->rx_thr_cv);
1366 			}
1367 			mutex_exit(&ldcp->rx_thr_lock);
1368 			mutex_enter(&ldcp->ldc_cblock);
1369 		} else {
1370 			vsw_process_pkt(ldcp);
1371 		}
1372 
1373 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
1374 
1375 		goto vsw_cb_exit;
1376 	}
1377 
1378 	if (event & (LDC_EVT_DOWN | LDC_EVT_RESET)) {
1379 		D2(vswp, "%s: id(%ld) event (%lx) DOWN/RESET: status(%ld)",
1380 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1381 
1382 		vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
1383 	}
1384 
1385 	/*
1386 	 * Catch either LDC_EVT_WRITE which we don't support or any
1387 	 * unknown event.
1388 	 */
1389 	if (event &
1390 	    ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ)) {
1391 		DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)",
1392 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1393 	}
1394 
1395 vsw_cb_exit:
1396 	mutex_exit(&ldcp->ldc_cblock);
1397 
1398 	/*
1399 	 * Let the drain function know we are finishing if it
1400 	 * is waiting.
1401 	 */
1402 	mutex_enter(&ldcp->drain_cv_lock);
1403 	if (ldcp->drain_state == VSW_LDC_DRAINING)
1404 		cv_signal(&ldcp->drain_cv);
1405 	mutex_exit(&ldcp->drain_cv_lock);
1406 
1407 	return (LDC_SUCCESS);
1408 }
1409 
1410 /*
1411  * Reinitialise data structures associated with the channel.
1412  */
1413 static void
1414 vsw_ldc_reinit(vsw_ldc_t *ldcp)
1415 {
1416 	vsw_t		*vswp = ldcp->ldc_vswp;
1417 	vsw_port_t	*port;
1418 	vsw_ldc_list_t	*ldcl;
1419 
1420 	D1(vswp, "%s: enter", __func__);
1421 
1422 	port = ldcp->ldc_port;
1423 	ldcl = &port->p_ldclist;
1424 
1425 	READ_ENTER(&ldcl->lockrw);
1426 
1427 	D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__,
1428 	    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
1429 
1430 	vsw_free_lane_resources(ldcp, INBOUND);
1431 	vsw_free_lane_resources(ldcp, OUTBOUND);
1432 	RW_EXIT(&ldcl->lockrw);
1433 
1434 	ldcp->lane_in.lstate = 0;
1435 	ldcp->lane_out.lstate = 0;
1436 
1437 	/* Remove the fdb entry for this port/mac address */
1438 	vsw_fdbe_del(vswp, &(port->p_macaddr));
1439 
1440 	/* remove the port from vlans it has been assigned to */
1441 	vsw_vlan_remove_ids(port, VSW_VNETPORT);
1442 
1443 	/*
1444 	 * Remove parent port from any multicast groups
1445 	 * it may have registered with. Client must resend
1446 	 * multicast add command after handshake completes.
1447 	 */
1448 	vsw_del_mcst_port(port);
1449 
1450 	ldcp->peer_session = 0;
1451 	ldcp->session_status = 0;
1452 	ldcp->hcnt = 0;
1453 	ldcp->hphase = VSW_MILESTONE0;
1454 
1455 	vsw_reset_vnet_proto_ops(ldcp);
1456 
1457 	D1(vswp, "%s: exit", __func__);
1458 }
1459 
/*
 * Process a connection event.
 *
 * 'evt' is one of VSW_CONN_UP, VSW_CONN_RESET or VSW_CONN_RESTART. The
 * event is not handled here; a vsw_conn_evt_t is allocated and handed to
 * vsw_conn_task() via the vsw taskq. Both the allocation and the dispatch
 * are non-sleeping, and on failure of either the event is dropped with a
 * warning.
 *
 * Note - care must be taken to ensure that this function is
 * not called with the dlistrw lock held.
 */
static void
vsw_process_conn_evt(vsw_ldc_t *ldcp, uint16_t evt)
{
	vsw_t		*vswp = ldcp->ldc_vswp;
	vsw_conn_evt_t	*conn = NULL;

	D1(vswp, "%s: enter", __func__);

	/*
	 * Check if either a reset or restart event is pending
	 * or in progress. If so just return.
	 *
	 * A VSW_CONN_RESET event originates either with a LDC_EVT_RESET
	 * being received by the callback handler, or a ECONNRESET error
	 * code being returned from a ldc_read() or ldc_write() call.
	 *
	 * A VSW_CONN_RESTART event occurs when some error checking code
	 * decides that there is a problem with data from the channel,
	 * and that the handshake should be restarted.
	 *
	 * ldstub() is used as a test-and-set on reset_active: a non-zero
	 * return means the flag was already set by an earlier request
	 * (presumably atomically - confirm against the ldstub definition).
	 */
	if (((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART)) &&
	    (ldstub((uint8_t *)&ldcp->reset_active)))
		return;

	/*
	 * If it is an LDC_UP event we first check the recorded
	 * state of the channel. If this is UP then we know that
	 * the channel moving to the UP state has already been dealt
	 * with and don't need to dispatch a  new task.
	 *
	 * The reason for this check is that when we do a ldc_up(),
	 * depending on the state of the peer, we may or may not get
	 * a LDC_UP event. As we can't depend on getting a LDC_UP evt
	 * every time we do ldc_up() we explicitly check the channel
	 * status to see has it come up (ldc_up() is asynch and will
	 * complete at some undefined time), and take the appropriate
	 * action.
	 *
	 * The flip side of this is that we may get a LDC_UP event
	 * when we have already seen that the channel is up and have
	 * dealt with that.
	 *
	 * An UP event is also dropped while a reset/restart is active,
	 * since that is the other half of the condition below.
	 */
	mutex_enter(&ldcp->status_lock);
	if (evt == VSW_CONN_UP) {
		if ((ldcp->ldc_status == LDC_UP) || (ldcp->reset_active != 0)) {
			mutex_exit(&ldcp->status_lock);
			return;
		}
	}
	mutex_exit(&ldcp->status_lock);

	/*
	 * The transaction group id allows us to identify and discard
	 * any tasks which are still pending on the taskq and refer
	 * to the handshake session we are about to restart or reset.
	 * These stale messages no longer have any real meaning.
	 */
	(void) atomic_inc_32(&ldcp->hss_id);

	ASSERT(vswp->taskq_p != NULL);

	if ((conn = kmem_zalloc(sizeof (vsw_conn_evt_t), KM_NOSLEEP)) == NULL) {
		cmn_err(CE_WARN, "!vsw%d: unable to allocate memory for"
		    " connection event", vswp->instance);
		goto err_exit;
	}

	conn->evt = evt;
	conn->ldcp = ldcp;

	/* on success ownership of 'conn' passes to vsw_conn_task() */
	if (ddi_taskq_dispatch(vswp->taskq_p, vsw_conn_task, conn,
	    DDI_NOSLEEP) != DDI_SUCCESS) {
		cmn_err(CE_WARN, "!vsw%d: Can't dispatch connection task",
		    vswp->instance);

		kmem_free(conn, sizeof (vsw_conn_evt_t));
		goto err_exit;
	}

	D1(vswp, "%s: exit", __func__);
	return;

err_exit:
	/*
	 * Have most likely failed due to memory shortage. Clear the flag so
	 * that future requests will at least be attempted and will hopefully
	 * succeed.
	 */
	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
		ldcp->reset_active = 0;
}
1557 
/*
 * Deal with events relating to a connection. Invoked from a taskq.
 *
 * 'arg' is the vsw_conn_evt_t allocated by vsw_process_conn_evt(); it is
 * freed here once its contents have been copied out.
 *
 * Whatever the event type, the channel's data structures are
 * reinitialised and ldc_up() is called. If the channel is then found to
 * be UP a vsw_send_ver task is dispatched to restart the handshake
 * (unless vsw_obp_ver_proto_workaround is set). status_lock is held from
 * the first status read until return.
 */
static void
vsw_conn_task(void *arg)
{
	vsw_conn_evt_t	*conn = (vsw_conn_evt_t *)arg;
	vsw_ldc_t	*ldcp = NULL;
	vsw_port_t	*portp;
	vsw_t		*vswp = NULL;
	uint16_t	evt;
	ldc_status_t	curr_status;

	ldcp = conn->ldcp;
	evt = conn->evt;
	vswp = ldcp->ldc_vswp;
	portp = ldcp->ldc_port;

	D1(vswp, "%s: enter", __func__);

	/* can safely free now have copied out data */
	kmem_free(conn, sizeof (vsw_conn_evt_t));

	mutex_enter(&ldcp->status_lock);
	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
		    "channel %ld", vswp->instance, ldcp->ldc_id);
		/*
		 * NOTE(review): this return leaves reset_active set for a
		 * RESET/RESTART event, which makes vsw_process_conn_evt()
		 * ignore later RESET/RESTART requests - confirm intended.
		 */
		mutex_exit(&ldcp->status_lock);
		return;
	}

	/*
	 * If we wish to restart the handshake on this channel, then if
	 * the channel is UP we bring it DOWN to flush the underlying
	 * ldc queue.
	 */
	if ((evt == VSW_CONN_RESTART) && (curr_status == LDC_UP))
		(void) ldc_down(ldcp->ldc_handle);

	/* stop HybridIO on this channel before tearing down its state */
	if ((portp->p_hio_capable) && (portp->p_hio_enabled)) {
		vsw_hio_stop(vswp, ldcp);
	}

	/*
	 * re-init all the associated data structures.
	 */
	vsw_ldc_reinit(ldcp);

	/*
	 * Bring the channel back up (note it does no harm to
	 * do this even if the channel is already UP, Just
	 * becomes effectively a no-op).
	 */
	(void) ldc_up(ldcp->ldc_handle);

	/*
	 * Check if channel is now UP. This will only happen if
	 * peer has also done a ldc_up().
	 */
	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
		    "channel %ld", vswp->instance, ldcp->ldc_id);
		/* NOTE(review): reset_active is not cleared here either */
		mutex_exit(&ldcp->status_lock);
		return;
	}

	/* record the freshly read status for other status_lock holders */
	ldcp->ldc_status = curr_status;

	/* channel UP so restart handshake by sending version info */
	if (curr_status == LDC_UP) {
		/*
		 * Give up once too many handshake attempts have been made.
		 * NOTE(review): reset_active stays set on this path for a
		 * RESET/RESTART event - possibly deliberate, confirm.
		 */
		if (ldcp->hcnt++ > vsw_num_handshakes) {
			cmn_err(CE_WARN, "!vsw%d: exceeded number of permitted"
			    " handshake attempts (%d) on channel %ld",
			    vswp->instance, ldcp->hcnt, ldcp->ldc_id);
			mutex_exit(&ldcp->status_lock);
			return;
		}

		if (vsw_obp_ver_proto_workaround == B_FALSE &&
		    (ddi_taskq_dispatch(vswp->taskq_p, vsw_send_ver, ldcp,
		    DDI_NOSLEEP) != DDI_SUCCESS)) {
			cmn_err(CE_WARN, "!vsw%d: Can't dispatch version task",
			    vswp->instance);

			/*
			 * Don't count as valid restart attempt if couldn't
			 * send version msg.
			 */
			if (ldcp->hcnt > 0)
				ldcp->hcnt--;
		}
	}

	/*
	 * Mark that the process is complete by clearing the flag.
	 *
	 * Note it is possible that the taskq dispatch above may have failed,
	 * most likely due to memory shortage. We still clear the flag so
	 * future attempts will at least be attempted and will hopefully
	 * succeed.
	 */
	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
		ldcp->reset_active = 0;

	mutex_exit(&ldcp->status_lock);

	D1(vswp, "%s: exit", __func__);
}
1666 
1667 /*
1668  * returns 0 if legal for event signified by flag to have
1669  * occured at the time it did. Otherwise returns 1.
1670  */
1671 int
1672 vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag)
1673 {
1674 	vsw_t		*vswp = ldcp->ldc_vswp;
1675 	uint64_t	state;
1676 	uint64_t	phase;
1677 
1678 	if (dir == INBOUND)
1679 		state = ldcp->lane_in.lstate;
1680 	else
1681 		state = ldcp->lane_out.lstate;
1682 
1683 	phase = ldcp->hphase;
1684 
1685 	switch (flag) {
1686 	case VSW_VER_INFO_RECV:
1687 		if (phase > VSW_MILESTONE0) {
1688 			DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV"
1689 			    " when in state %d\n", ldcp->ldc_id, phase);
1690 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1691 			return (1);
1692 		}
1693 		break;
1694 
1695 	case VSW_VER_ACK_RECV:
1696 	case VSW_VER_NACK_RECV:
1697 		if (!(state & VSW_VER_INFO_SENT)) {
1698 			DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK or "
1699 			    "VER_NACK when in state %d\n", ldcp->ldc_id, phase);
1700 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1701 			return (1);
1702 		} else
1703 			state &= ~VSW_VER_INFO_SENT;
1704 		break;
1705 
1706 	case VSW_ATTR_INFO_RECV:
1707 		if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) {
1708 			DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV"
1709 			    " when in state %d\n", ldcp->ldc_id, phase);
1710 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1711 			return (1);
1712 		}
1713 		break;
1714 
1715 	case VSW_ATTR_ACK_RECV:
1716 	case VSW_ATTR_NACK_RECV:
1717 		if (!(state & VSW_ATTR_INFO_SENT)) {
1718 			DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK"
1719 			    " or ATTR_NACK when in state %d\n",
1720 			    ldcp->ldc_id, phase);
1721 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1722 			return (1);
1723 		} else
1724 			state &= ~VSW_ATTR_INFO_SENT;
1725 		break;
1726 
1727 	case VSW_DRING_INFO_RECV:
1728 		if (phase < VSW_MILESTONE1) {
1729 			DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV"
1730 			    " when in state %d\n", ldcp->ldc_id, phase);
1731 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1732 			return (1);
1733 		}
1734 		break;
1735 
1736 	case VSW_DRING_ACK_RECV:
1737 	case VSW_DRING_NACK_RECV:
1738 		if (!(state & VSW_DRING_INFO_SENT)) {
1739 			DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK "
1740 			    " or DRING_NACK when in state %d\n",
1741 			    ldcp->ldc_id, phase);
1742 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1743 			return (1);
1744 		} else
1745 			state &= ~VSW_DRING_INFO_SENT;
1746 		break;
1747 
1748 	case VSW_RDX_INFO_RECV:
1749 		if (phase < VSW_MILESTONE3) {
1750 			DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV"
1751 			    " when in state %d\n", ldcp->ldc_id, phase);
1752 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1753 			return (1);
1754 		}
1755 		break;
1756 
1757 	case VSW_RDX_ACK_RECV:
1758 	case VSW_RDX_NACK_RECV:
1759 		if (!(state & VSW_RDX_INFO_SENT)) {
1760 			DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK or "
1761 			    "RDX_NACK when in state %d\n", ldcp->ldc_id, phase);
1762 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1763 			return (1);
1764 		} else
1765 			state &= ~VSW_RDX_INFO_SENT;
1766 		break;
1767 
1768 	case VSW_MCST_INFO_RECV:
1769 		if (phase < VSW_MILESTONE3) {
1770 			DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV"
1771 			    " when in state %d\n", ldcp->ldc_id, phase);
1772 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1773 			return (1);
1774 		}
1775 		break;
1776 
1777 	default:
1778 		DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)",
1779 		    ldcp->ldc_id, flag);
1780 		return (1);
1781 	}
1782 
1783 	if (dir == INBOUND)
1784 		ldcp->lane_in.lstate = state;
1785 	else
1786 		ldcp->lane_out.lstate = state;
1787 
1788 	D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id);
1789 
1790 	return (0);
1791 }
1792 
/*
 * Advance the handshake state machine for the channel.
 *
 * Examines the lane state flags against the requirements of the current
 * milestone (ldcp->hphase) and, if they are met, moves to the next
 * milestone and sends the message which starts that stage (attributes,
 * dring info, RDX). At most one milestone transition is made per call.
 *
 * On leaving milestone 3 the outbound lane is marked active, the
 * handshake attempt counter is cleared and HybridIO is started if the
 * port is both enabled for and capable of it.
 */
void
vsw_next_milestone(vsw_ldc_t *ldcp)
{
	vsw_t		*vswp = ldcp->ldc_vswp;
	vsw_port_t	*portp = ldcp->ldc_port;

	D1(vswp, "%s (chan %lld): enter (phase %ld)", __func__,
	    ldcp->ldc_id, ldcp->hphase);

	DUMP_FLAGS(ldcp->lane_in.lstate);
	DUMP_FLAGS(ldcp->lane_out.lstate);

	switch (ldcp->hphase) {

	case VSW_MILESTONE0:
		/*
		 * If we haven't started to handshake with our peer,
		 * start to do so now.
		 */
		if (ldcp->lane_out.lstate == 0) {
			D2(vswp, "%s: (chan %lld) starting handshake "
			    "with peer", __func__, ldcp->ldc_id);
			vsw_process_conn_evt(ldcp, VSW_CONN_UP);
		}

		/*
		 * Only way to pass this milestone is to have successfully
		 * negotiated version info.
		 */
		if ((ldcp->lane_in.lstate & VSW_VER_ACK_SENT) &&
		    (ldcp->lane_out.lstate & VSW_VER_ACK_RECV)) {

			D2(vswp, "%s: (chan %lld) leaving milestone 0",
			    __func__, ldcp->ldc_id);

			/* version is now known, pick matching tx/rx ops */
			vsw_set_vnet_proto_ops(ldcp);

			/*
			 * Next milestone is passed when attribute
			 * information has been successfully exchanged.
			 */
			ldcp->hphase = VSW_MILESTONE1;
			vsw_send_attr(ldcp);

		}
		break;

	case VSW_MILESTONE1:
		/*
		 * Only way to pass this milestone is to have successfully
		 * negotiated attribute information.
		 */
		if (ldcp->lane_in.lstate & VSW_ATTR_ACK_SENT) {

			ldcp->hphase = VSW_MILESTONE2;

			/*
			 * If the peer device has said it wishes to
			 * use descriptor rings then we send it our ring
			 * info, otherwise we just set up a private ring
			 * which we use an internal buffer
			 *
			 * For version >= 1.2 xfer_mode is tested as a bit
			 * mask; for earlier versions it must equal the
			 * v1.0 dring mode exactly.
			 */
			if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
			    (ldcp->lane_in.xfer_mode & VIO_DRING_MODE_V1_2)) ||
			    (VSW_VER_LT(ldcp, 1, 2) &&
			    (ldcp->lane_in.xfer_mode ==
			    VIO_DRING_MODE_V1_0))) {
				vsw_send_dring_info(ldcp);
			}
		}
		break;

	case VSW_MILESTONE2:
		/*
		 * If peer has indicated in its attribute message that
		 * it wishes to use descriptor rings then the only way
		 * to pass this milestone is for us to have received
		 * valid dring info.
		 *
		 * If peer is not using descriptor rings then just fall
		 * through.
		 */
		if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
		    (ldcp->lane_in.xfer_mode & VIO_DRING_MODE_V1_2)) ||
		    (VSW_VER_LT(ldcp, 1, 2) &&
		    (ldcp->lane_in.xfer_mode ==
		    VIO_DRING_MODE_V1_0))) {
			/* dring peer: stay here until we have ACKed its info */
			if (!(ldcp->lane_in.lstate & VSW_DRING_ACK_SENT))
				break;
		}

		D2(vswp, "%s: (chan %lld) leaving milestone 2",
		    __func__, ldcp->ldc_id);

		ldcp->hphase = VSW_MILESTONE3;
		vsw_send_rdx(ldcp);
		break;

	case VSW_MILESTONE3:
		/*
		 * Pass this milestone when all parameters have been
		 * successfully exchanged and RDX sent in both directions.
		 *
		 * Mark outbound lane as available to transmit data.
		 */
		if ((ldcp->lane_out.lstate & VSW_RDX_ACK_SENT) &&
		    (ldcp->lane_in.lstate & VSW_RDX_ACK_RECV)) {

			D2(vswp, "%s: (chan %lld) leaving milestone 3",
			    __func__, ldcp->ldc_id);
			D2(vswp, "%s: ** handshake complete (0x%llx : "
			    "0x%llx) **", __func__, ldcp->lane_in.lstate,
			    ldcp->lane_out.lstate);
			ldcp->lane_out.lstate |= VSW_LANE_ACTIVE;
			ldcp->hphase = VSW_MILESTONE4;
			/* handshake succeeded, reset the attempt counter */
			ldcp->hcnt = 0;
			DISPLAY_STATE();
			/* Start HIO if enabled and capable */
			if ((portp->p_hio_enabled) && (portp->p_hio_capable)) {
				D2(vswp, "%s: start HybridIO setup", __func__);
				vsw_hio_start(vswp, ldcp);
			}
		} else {
			D2(vswp, "%s: still in milestone 3 (0x%llx : 0x%llx)",
			    __func__, ldcp->lane_in.lstate,
			    ldcp->lane_out.lstate);
		}
		break;

	case VSW_MILESTONE4:
		/* handshake already complete, nothing further to do */
		D2(vswp, "%s: (chan %lld) in milestone 4", __func__,
		    ldcp->ldc_id);
		break;

	default:
		DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__,
		    ldcp->ldc_id, ldcp->hphase);
	}

	D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id,
	    ldcp->hphase);
}
1935 
1936 /*
1937  * Check if major version is supported.
1938  *
1939  * Returns 0 if finds supported major number, and if necessary
1940  * adjusts the minor field.
1941  *
1942  * Returns 1 if can't match major number exactly. Sets mjor/minor
1943  * to next lowest support values, or to zero if no other values possible.
1944  */
1945 static int
1946 vsw_supported_version(vio_ver_msg_t *vp)
1947 {
1948 	int	i;
1949 
1950 	D1(NULL, "vsw_supported_version: enter");
1951 
1952 	for (i = 0; i < VSW_NUM_VER; i++) {
1953 		if (vsw_versions[i].ver_major == vp->ver_major) {
1954 			/*
1955 			 * Matching or lower major version found. Update
1956 			 * minor number if necessary.
1957 			 */
1958 			if (vp->ver_minor > vsw_versions[i].ver_minor) {
1959 				D2(NULL, "%s: adjusting minor value from %d "
1960 				    "to %d", __func__, vp->ver_minor,
1961 				    vsw_versions[i].ver_minor);
1962 				vp->ver_minor = vsw_versions[i].ver_minor;
1963 			}
1964 
1965 			return (0);
1966 		}
1967 
1968 		/*
1969 		 * If the message contains a higher major version number, set
1970 		 * the message's major/minor versions to the current values
1971 		 * and return false, so this message will get resent with
1972 		 * these values.
1973 		 */
1974 		if (vsw_versions[i].ver_major < vp->ver_major) {
1975 			D2(NULL, "%s: adjusting major and minor "
1976 			    "values to %d, %d\n",
1977 			    __func__, vsw_versions[i].ver_major,
1978 			    vsw_versions[i].ver_minor);
1979 			vp->ver_major = vsw_versions[i].ver_major;
1980 			vp->ver_minor = vsw_versions[i].ver_minor;
1981 			return (1);
1982 		}
1983 	}
1984 
1985 	/* No match was possible, zero out fields */
1986 	vp->ver_major = 0;
1987 	vp->ver_minor = 0;
1988 
1989 	D1(NULL, "vsw_supported_version: exit");
1990 
1991 	return (1);
1992 }
1993 
1994 /*
1995  * Set vnet-protocol-version dependent functions based on version.
1996  */
1997 static void
1998 vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp)
1999 {
2000 	vsw_t	*vswp = ldcp->ldc_vswp;
2001 	lane_t	*lp = &ldcp->lane_out;
2002 
2003 	if (VSW_VER_GTEQ(ldcp, 1, 3)) {
2004 		/*
2005 		 * If the version negotiated with peer is >= 1.3,
2006 		 * set the mtu in our attributes to max_frame_size.
2007 		 */
2008 		lp->mtu = vswp->max_frame_size;
2009 	} else {
2010 		vsw_port_t	*portp = ldcp->ldc_port;
2011 		/*
2012 		 * Pre-1.3 peers expect max frame size of ETHERMAX.
2013 		 * We can negotiate that size with those peers provided the
2014 		 * following conditions are true:
2015 		 * - Our max_frame_size is greater only by VLAN_TAGSZ (4).
2016 		 * - Only pvid is defined for our peer and there are no vids.
2017 		 * If the above conditions are true, then we can send/recv only
2018 		 * untagged frames of max size ETHERMAX. Note that pvid of the
2019 		 * peer can be different, as vsw has to serve the vnet in that
2020 		 * vlan even if itself is not assigned to that vlan.
2021 		 */
2022 		if ((vswp->max_frame_size == ETHERMAX + VLAN_TAGSZ) &&
2023 		    portp->nvids == 0) {
2024 			lp->mtu = ETHERMAX;
2025 		}
2026 	}
2027 
2028 	if (VSW_VER_GTEQ(ldcp, 1, 2)) {
2029 		/* Versions >= 1.2 */
2030 
2031 		if (VSW_PRI_ETH_DEFINED(vswp)) {
2032 			/*
2033 			 * enable priority routines and pkt mode only if
2034 			 * at least one pri-eth-type is specified in MD.
2035 			 */
2036 			ldcp->tx = vsw_ldctx_pri;
2037 			ldcp->rx_pktdata = vsw_process_pkt_data;
2038 
2039 			/* set xfer mode for vsw_send_attr() */
2040 			lp->xfer_mode = VIO_PKT_MODE | VIO_DRING_MODE_V1_2;
2041 		} else {
2042 			/* no priority eth types defined in MD */
2043 
2044 			ldcp->tx = vsw_ldctx;
2045 			ldcp->rx_pktdata = vsw_process_pkt_data_nop;
2046 
2047 			/* set xfer mode for vsw_send_attr() */
2048 			lp->xfer_mode = VIO_DRING_MODE_V1_2;
2049 		}
2050 
2051 	} else {
2052 		/* Versions prior to 1.2  */
2053 
2054 		vsw_reset_vnet_proto_ops(ldcp);
2055 	}
2056 }
2057 
2058 /*
2059  * Reset vnet-protocol-version dependent functions to v1.0.
2060  */
2061 static void
2062 vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp)
2063 {
2064 	lane_t	*lp = &ldcp->lane_out;
2065 
2066 	ldcp->tx = vsw_ldctx;
2067 	ldcp->rx_pktdata = vsw_process_pkt_data_nop;
2068 
2069 	/* set xfer mode for vsw_send_attr() */
2070 	lp->xfer_mode = VIO_DRING_MODE_V1_0;
2071 }
2072 
2073 /*
2074  * Main routine for processing messages received over LDC.
2075  */
2076 static void
2077 vsw_process_pkt(void *arg)
2078 {
2079 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
2080 	vsw_t 		*vswp = ldcp->ldc_vswp;
2081 	size_t		msglen;
2082 	vio_msg_tag_t	*tagp;
2083 	uint64_t	*ldcmsg;
2084 	int 		rv = 0;
2085 
2086 
2087 	D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
2088 
2089 	ASSERT(MUTEX_HELD(&ldcp->ldc_cblock));
2090 
2091 	ldcmsg = ldcp->ldcmsg;
2092 	/*
2093 	 * If channel is up read messages until channel is empty.
2094 	 */
2095 	do {
2096 		msglen = ldcp->msglen;
2097 		rv = ldc_read(ldcp->ldc_handle, (caddr_t)ldcmsg, &msglen);
2098 
2099 		if (rv != 0) {
2100 			DERR(vswp, "%s :ldc_read err id(%lld) rv(%d) len(%d)\n",
2101 			    __func__, ldcp->ldc_id, rv, msglen);
2102 		}
2103 
2104 		/* channel has been reset */
2105 		if (rv == ECONNRESET) {
2106 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
2107 			break;
2108 		}
2109 
2110 		if (msglen == 0) {
2111 			D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__,
2112 			    ldcp->ldc_id);
2113 			break;
2114 		}
2115 
2116 		D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__,
2117 		    ldcp->ldc_id, msglen);
2118 
2119 		/*
2120 		 * Figure out what sort of packet we have gotten by
2121 		 * examining the msg tag, and then switch it appropriately.
2122 		 */
2123 		tagp = (vio_msg_tag_t *)ldcmsg;
2124 
2125 		switch (tagp->vio_msgtype) {
2126 		case VIO_TYPE_CTRL:
2127 			vsw_dispatch_ctrl_task(ldcp, ldcmsg, tagp);
2128 			break;
2129 		case VIO_TYPE_DATA:
2130 			vsw_process_data_pkt(ldcp, ldcmsg, tagp, msglen);
2131 			break;
2132 		case VIO_TYPE_ERR:
2133 			vsw_process_err_pkt(ldcp, ldcmsg, tagp);
2134 			break;
2135 		default:
2136 			DERR(vswp, "%s: Unknown tag(%lx) ", __func__,
2137 			    "id(%lx)\n", tagp->vio_msgtype, ldcp->ldc_id);
2138 			break;
2139 		}
2140 	} while (msglen);
2141 
2142 	D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id);
2143 }
2144 
2145 /*
2146  * Dispatch a task to process a VIO control message.
2147  */
2148 static void
2149 vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t *tagp)
2150 {
2151 	vsw_ctrl_task_t		*ctaskp = NULL;
2152 	vsw_port_t		*port = ldcp->ldc_port;
2153 	vsw_t			*vswp = port->p_vswp;
2154 
2155 	D1(vswp, "%s: enter", __func__);
2156 
2157 	/*
2158 	 * We need to handle RDX ACK messages in-band as once they
2159 	 * are exchanged it is possible that we will get an
2160 	 * immediate (legitimate) data packet.
2161 	 */
2162 	if ((tagp->vio_subtype_env == VIO_RDX) &&
2163 	    (tagp->vio_subtype == VIO_SUBTYPE_ACK)) {
2164 
2165 		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_ACK_RECV))
2166 			return;
2167 
2168 		ldcp->lane_in.lstate |= VSW_RDX_ACK_RECV;
2169 		D2(vswp, "%s (%ld) handling RDX_ACK in place "
2170 		    "(ostate 0x%llx : hphase %d)", __func__,
2171 		    ldcp->ldc_id, ldcp->lane_in.lstate, ldcp->hphase);
2172 		vsw_next_milestone(ldcp);
2173 		return;
2174 	}
2175 
2176 	ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP);
2177 
2178 	if (ctaskp == NULL) {
2179 		DERR(vswp, "%s: unable to alloc space for ctrl msg", __func__);
2180 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2181 		return;
2182 	}
2183 
2184 	ctaskp->ldcp = ldcp;
2185 	bcopy((def_msg_t *)cpkt, &ctaskp->pktp, sizeof (def_msg_t));
2186 	ctaskp->hss_id = ldcp->hss_id;
2187 
2188 	/*
2189 	 * Dispatch task to processing taskq if port is not in
2190 	 * the process of being detached.
2191 	 */
2192 	mutex_enter(&port->state_lock);
2193 	if (port->state == VSW_PORT_INIT) {
2194 		if ((vswp->taskq_p == NULL) ||
2195 		    (ddi_taskq_dispatch(vswp->taskq_p, vsw_process_ctrl_pkt,
2196 		    ctaskp, DDI_NOSLEEP) != DDI_SUCCESS)) {
2197 			DERR(vswp, "%s: unable to dispatch task to taskq",
2198 			    __func__);
2199 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2200 			mutex_exit(&port->state_lock);
2201 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2202 			return;
2203 		}
2204 	} else {
2205 		DWARN(vswp, "%s: port %d detaching, not dispatching "
2206 		    "task", __func__, port->p_instance);
2207 	}
2208 
2209 	mutex_exit(&port->state_lock);
2210 
2211 	D2(vswp, "%s: dispatched task to taskq for chan %d", __func__,
2212 	    ldcp->ldc_id);
2213 	D1(vswp, "%s: exit", __func__);
2214 }
2215 
2216 /*
2217  * Process a VIO ctrl message. Invoked from taskq.
2218  */
2219 static void
2220 vsw_process_ctrl_pkt(void *arg)
2221 {
2222 	vsw_ctrl_task_t	*ctaskp = (vsw_ctrl_task_t *)arg;
2223 	vsw_ldc_t	*ldcp = ctaskp->ldcp;
2224 	vsw_t 		*vswp = ldcp->ldc_vswp;
2225 	vio_msg_tag_t	tag;
2226 	uint16_t	env;
2227 
2228 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2229 
2230 	bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t));
2231 	env = tag.vio_subtype_env;
2232 
2233 	/* stale pkt check */
2234 	if (ctaskp->hss_id < ldcp->hss_id) {
2235 		DWARN(vswp, "%s: discarding stale packet belonging to earlier"
2236 		    " (%ld) handshake session", __func__, ctaskp->hss_id);
2237 		return;
2238 	}
2239 
2240 	/* session id check */
2241 	if (ldcp->session_status & VSW_PEER_SESSION) {
2242 		if (ldcp->peer_session != tag.vio_sid) {
2243 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
2244 			    __func__, ldcp->ldc_id, tag.vio_sid);
2245 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2246 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2247 			return;
2248 		}
2249 	}
2250 
2251 	/*
2252 	 * Switch on vio_subtype envelope, then let lower routines
2253 	 * decide if its an INFO, ACK or NACK packet.
2254 	 */
2255 	switch (env) {
2256 	case VIO_VER_INFO:
2257 		vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp);
2258 		break;
2259 	case VIO_DRING_REG:
2260 		vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp);
2261 		break;
2262 	case VIO_DRING_UNREG:
2263 		vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp);
2264 		break;
2265 	case VIO_ATTR_INFO:
2266 		vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp);
2267 		break;
2268 	case VNET_MCAST_INFO:
2269 		vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp);
2270 		break;
2271 	case VIO_RDX:
2272 		vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp);
2273 		break;
2274 	case VIO_DDS_INFO:
2275 		vsw_process_dds_msg(vswp, ldcp, &ctaskp->pktp);
2276 		break;
2277 	default:
2278 		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env);
2279 	}
2280 
2281 	kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2282 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
2283 }
2284 
2285 /*
2286  * Version negotiation. We can end up here either because our peer
2287  * has responded to a handshake message we have sent it, or our peer
2288  * has initiated a handshake with us. If its the former then can only
2289  * be ACK or NACK, if its the later can only be INFO.
2290  *
2291  * If its an ACK we move to the next stage of the handshake, namely
2292  * attribute exchange. If its a NACK we see if we can specify another
2293  * version, if we can't we stop.
2294  *
2295  * If it is an INFO we reset all params associated with communication
2296  * in that direction over this channel (remember connection is
2297  * essentially 2 independent simplex channels).
2298  */
2299 void
2300 vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt)
2301 {
2302 	vio_ver_msg_t	*ver_pkt;
2303 	vsw_t 		*vswp = ldcp->ldc_vswp;
2304 
2305 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2306 
2307 	/*
2308 	 * We know this is a ctrl/version packet so
2309 	 * cast it into the correct structure.
2310 	 */
2311 	ver_pkt = (vio_ver_msg_t *)pkt;
2312 
2313 	switch (ver_pkt->tag.vio_subtype) {
2314 	case VIO_SUBTYPE_INFO:
2315 		D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n");
2316 
2317 		/*
2318 		 * Record the session id, which we will use from now
2319 		 * until we see another VER_INFO msg. Even then the
2320 		 * session id in most cases will be unchanged, execpt
2321 		 * if channel was reset.
2322 		 */
2323 		if ((ldcp->session_status & VSW_PEER_SESSION) &&
2324 		    (ldcp->peer_session != ver_pkt->tag.vio_sid)) {
2325 			DERR(vswp, "%s: updating session id for chan %lld "
2326 			    "from %llx to %llx", __func__, ldcp->ldc_id,
2327 			    ldcp->peer_session, ver_pkt->tag.vio_sid);
2328 		}
2329 
2330 		ldcp->peer_session = ver_pkt->tag.vio_sid;
2331 		ldcp->session_status |= VSW_PEER_SESSION;
2332 
2333 		/* Legal message at this time ? */
2334 		if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV))
2335 			return;
2336 
2337 		/*
2338 		 * First check the device class. Currently only expect
2339 		 * to be talking to a network device. In the future may
2340 		 * also talk to another switch.
2341 		 */
2342 		if (ver_pkt->dev_class != VDEV_NETWORK) {
2343 			DERR(vswp, "%s: illegal device class %d", __func__,
2344 			    ver_pkt->dev_class);
2345 
2346 			ver_pkt->tag.vio_sid = ldcp->local_session;
2347 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2348 
2349 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2350 
2351 			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2352 			    sizeof (vio_ver_msg_t), B_TRUE);
2353 
2354 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
2355 			vsw_next_milestone(ldcp);
2356 			return;
2357 		} else {
2358 			ldcp->dev_class = ver_pkt->dev_class;
2359 		}
2360 
2361 		/*
2362 		 * Now check the version.
2363 		 */
2364 		if (vsw_supported_version(ver_pkt) == 0) {
2365 			/*
2366 			 * Support this major version and possibly
2367 			 * adjusted minor version.
2368 			 */
2369 
2370 			D2(vswp, "%s: accepted ver %d:%d", __func__,
2371 			    ver_pkt->ver_major, ver_pkt->ver_minor);
2372 
2373 			/* Store accepted values */
2374 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
2375 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
2376 
2377 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2378 
2379 			ldcp->lane_in.lstate |= VSW_VER_ACK_SENT;
2380 
2381 			if (vsw_obp_ver_proto_workaround == B_TRUE) {
2382 				/*
2383 				 * Send a version info message
2384 				 * using the accepted version that
2385 				 * we are about to ack. Also note that
2386 				 * we send our ver info before we ack.
2387 				 * Otherwise, as soon as receiving the
2388 				 * ack, obp sends attr info msg, which
2389 				 * breaks vsw_check_flag() invoked
2390 				 * from vsw_process_ctrl_attr_pkt();
2391 				 * as we also need VSW_VER_ACK_RECV to
2392 				 * be set in lane_out.lstate, before
2393 				 * we can receive attr info.
2394 				 */
2395 				vsw_send_ver(ldcp);
2396 			}
2397 		} else {
2398 			/*
2399 			 * NACK back with the next lower major/minor
2400 			 * pairing we support (if don't suuport any more
2401 			 * versions then they will be set to zero.
2402 			 */
2403 
2404 			D2(vswp, "%s: replying with ver %d:%d", __func__,
2405 			    ver_pkt->ver_major, ver_pkt->ver_minor);
2406 
2407 			/* Store updated values */
2408 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
2409 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
2410 
2411 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2412 
2413 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
2414 		}
2415 
2416 		DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2417 		ver_pkt->tag.vio_sid = ldcp->local_session;
2418 		(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2419 		    sizeof (vio_ver_msg_t), B_TRUE);
2420 
2421 		vsw_next_milestone(ldcp);
2422 		break;
2423 
2424 	case VIO_SUBTYPE_ACK:
2425 		D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__);
2426 
2427 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV))
2428 			return;
2429 
2430 		/* Store updated values */
2431 		ldcp->lane_out.ver_major = ver_pkt->ver_major;
2432 		ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
2433 
2434 		ldcp->lane_out.lstate |= VSW_VER_ACK_RECV;
2435 		vsw_next_milestone(ldcp);
2436 
2437 		break;
2438 
2439 	case VIO_SUBTYPE_NACK:
2440 		D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__);
2441 
2442 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV))
2443 			return;
2444 
2445 		/*
2446 		 * If our peer sent us a NACK with the ver fields set to
2447 		 * zero then there is nothing more we can do. Otherwise see
2448 		 * if we support either the version suggested, or a lesser
2449 		 * one.
2450 		 */
2451 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
2452 			DERR(vswp, "%s: peer unable to negotiate any "
2453 			    "further.", __func__);
2454 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
2455 			vsw_next_milestone(ldcp);
2456 			return;
2457 		}
2458 
2459 		/*
2460 		 * Check to see if we support this major version or
2461 		 * a lower one. If we don't then maj/min will be set
2462 		 * to zero.
2463 		 */
2464 		(void) vsw_supported_version(ver_pkt);
2465 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
2466 			/* Nothing more we can do */
2467 			DERR(vswp, "%s: version negotiation failed.\n",
2468 			    __func__);
2469 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
2470 			vsw_next_milestone(ldcp);
2471 		} else {
2472 			/* found a supported major version */
2473 			ldcp->lane_out.ver_major = ver_pkt->ver_major;
2474 			ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
2475 
2476 			D2(vswp, "%s: resending with updated values (%x, %x)",
2477 			    __func__, ver_pkt->ver_major, ver_pkt->ver_minor);
2478 
2479 			ldcp->lane_out.lstate |= VSW_VER_INFO_SENT;
2480 			ver_pkt->tag.vio_sid = ldcp->local_session;
2481 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
2482 
2483 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2484 
2485 			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2486 			    sizeof (vio_ver_msg_t), B_TRUE);
2487 
2488 			vsw_next_milestone(ldcp);
2489 
2490 		}
2491 		break;
2492 
2493 	default:
2494 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
2495 		    ver_pkt->tag.vio_subtype);
2496 	}
2497 
2498 	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
2499 }
2500 
2501 /*
2502  * Process an attribute packet. We can end up here either because our peer
2503  * has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or our
2504  * peer has sent us an attribute INFO message
2505  *
2506  * If its an ACK we then move to the next stage of the handshake which
2507  * is to send our descriptor ring info to our peer. If its a NACK then
2508  * there is nothing more we can (currently) do.
2509  *
2510  * If we get a valid/acceptable INFO packet (and we have already negotiated
2511  * a version) we ACK back and set channel state to ATTR_RECV, otherwise we
2512  * NACK back and reset channel state to INACTIV.
2513  *
2514  * FUTURE: in time we will probably negotiate over attributes, but for
2515  * the moment unacceptable attributes are regarded as a fatal error.
2516  *
2517  */
2518 void
2519 vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt)
2520 {
2521 	vnet_attr_msg_t		*attr_pkt;
2522 	vsw_t			*vswp = ldcp->ldc_vswp;
2523 	vsw_port_t		*port = ldcp->ldc_port;
2524 	uint64_t		macaddr = 0;
2525 	int			i;
2526 
2527 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
2528 
2529 	/*
2530 	 * We know this is a ctrl/attr packet so
2531 	 * cast it into the correct structure.
2532 	 */
2533 	attr_pkt = (vnet_attr_msg_t *)pkt;
2534 
2535 	switch (attr_pkt->tag.vio_subtype) {
2536 	case VIO_SUBTYPE_INFO:
2537 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2538 
2539 		if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV))
2540 			return;
2541 
2542 		/*
2543 		 * If the attributes are unacceptable then we NACK back.
2544 		 */
2545 		if (vsw_check_attr(attr_pkt, ldcp)) {
2546 
2547 			DERR(vswp, "%s (chan %d): invalid attributes",
2548 			    __func__, ldcp->ldc_id);
2549 
2550 			vsw_free_lane_resources(ldcp, INBOUND);
2551 
2552 			attr_pkt->tag.vio_sid = ldcp->local_session;
2553 			attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2554 
2555 			DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
2556 			ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT;
2557 			(void) vsw_send_msg(ldcp, (void *)attr_pkt,
2558 			    sizeof (vnet_attr_msg_t), B_TRUE);
2559 
2560 			vsw_next_milestone(ldcp);
2561 			return;
2562 		}
2563 
2564 		/*
2565 		 * Otherwise store attributes for this lane and update
2566 		 * lane state.
2567 		 */
2568 		ldcp->lane_in.mtu = attr_pkt->mtu;
2569 		ldcp->lane_in.addr = attr_pkt->addr;
2570 		ldcp->lane_in.addr_type = attr_pkt->addr_type;
2571 		ldcp->lane_in.xfer_mode = attr_pkt->xfer_mode;
2572 		ldcp->lane_in.ack_freq = attr_pkt->ack_freq;
2573 
2574 		macaddr = ldcp->lane_in.addr;
2575 		for (i = ETHERADDRL - 1; i >= 0; i--) {
2576 			port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF;
2577 			macaddr >>= 8;
2578 		}
2579 
2580 		/* create the fdb entry for this port/mac address */
2581 		vsw_fdbe_add(vswp, port);
2582 
2583 		/* add the port to the specified vlans */
2584 		vsw_vlan_add_ids(port, VSW_VNETPORT);
2585 
2586 		/* setup device specifc xmit routines */
2587 		mutex_enter(&port->tx_lock);
2588 		if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
2589 		    (ldcp->lane_in.xfer_mode & VIO_DRING_MODE_V1_2)) ||
2590 		    (VSW_VER_LT(ldcp, 1, 2) &&
2591 		    (ldcp->lane_in.xfer_mode == VIO_DRING_MODE_V1_0))) {
2592 			D2(vswp, "%s: mode = VIO_DRING_MODE", __func__);
2593 			port->transmit = vsw_dringsend;
2594 		} else if (ldcp->lane_in.xfer_mode == VIO_DESC_MODE) {
2595 			D2(vswp, "%s: mode = VIO_DESC_MODE", __func__);
2596 			vsw_create_privring(ldcp);
2597 			port->transmit = vsw_descrsend;
2598 			ldcp->lane_out.xfer_mode = VIO_DESC_MODE;
2599 		}
2600 
2601 		/*
2602 		 * HybridIO is supported only vnet, not by OBP.
2603 		 * So, set hio_capable to true only when in DRING mode.
2604 		 */
2605 		if (VSW_VER_GTEQ(ldcp, 1, 3) &&
2606 		    (ldcp->lane_in.xfer_mode != VIO_DESC_MODE)) {
2607 			(void) atomic_swap_32(&port->p_hio_capable, B_TRUE);
2608 		} else {
2609 			(void) atomic_swap_32(&port->p_hio_capable, B_FALSE);
2610 		}
2611 
2612 		mutex_exit(&port->tx_lock);
2613 
2614 		attr_pkt->tag.vio_sid = ldcp->local_session;
2615 		attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2616 
2617 		DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
2618 
2619 		ldcp->lane_in.lstate |= VSW_ATTR_ACK_SENT;
2620 
2621 		(void) vsw_send_msg(ldcp, (void *)attr_pkt,
2622 		    sizeof (vnet_attr_msg_t), B_TRUE);
2623 
2624 		vsw_next_milestone(ldcp);
2625 		break;
2626 
2627 	case VIO_SUBTYPE_ACK:
2628 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
2629 
2630 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV))
2631 			return;
2632 
2633 		ldcp->lane_out.lstate |= VSW_ATTR_ACK_RECV;
2634 		vsw_next_milestone(ldcp);
2635 		break;
2636 
2637 	case VIO_SUBTYPE_NACK:
2638 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
2639 
2640 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV))
2641 			return;
2642 
2643 		ldcp->lane_out.lstate |= VSW_ATTR_NACK_RECV;
2644 		vsw_next_milestone(ldcp);
2645 		break;
2646 
2647 	default:
2648 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
2649 		    attr_pkt->tag.vio_subtype);
2650 	}
2651 
2652 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
2653 }
2654 
2655 /*
2656  * Process a dring info packet. We can end up here either because our peer
2657  * has ACK/NACK'ed back to an earlier DRING msg we had sent it, or our
2658  * peer has sent us a dring INFO message.
2659  *
2660  * If we get a valid/acceptable INFO packet (and we have already negotiated
2661  * a version) we ACK back and update the lane state, otherwise we NACK back.
2662  *
2663  * FUTURE: nothing to stop client from sending us info on multiple dring's
2664  * but for the moment we will just use the first one we are given.
2665  *
2666  */
2667 void
2668 vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt)
2669 {
2670 	vio_dring_reg_msg_t	*dring_pkt;
2671 	vsw_t			*vswp = ldcp->ldc_vswp;
2672 	ldc_mem_info_t		minfo;
2673 	dring_info_t		*dp, *dbp;
2674 	int			dring_found = 0;
2675 
2676 	/*
2677 	 * We know this is a ctrl/dring packet so
2678 	 * cast it into the correct structure.
2679 	 */
2680 	dring_pkt = (vio_dring_reg_msg_t *)pkt;
2681 
2682 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
2683 
2684 	switch (dring_pkt->tag.vio_subtype) {
2685 	case VIO_SUBTYPE_INFO:
2686 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2687 
2688 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
2689 			return;
2690 
2691 		/*
2692 		 * If the dring params are unacceptable then we NACK back.
2693 		 */
2694 		if (vsw_check_dring_info(dring_pkt)) {
2695 
2696 			DERR(vswp, "%s (%lld): invalid dring info",
2697 			    __func__, ldcp->ldc_id);
2698 
2699 			vsw_free_lane_resources(ldcp, INBOUND);
2700 
2701 			dring_pkt->tag.vio_sid = ldcp->local_session;
2702 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2703 
2704 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
2705 
2706 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
2707 
2708 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
2709 			    sizeof (vio_dring_reg_msg_t), B_TRUE);
2710 
2711 			vsw_next_milestone(ldcp);
2712 			return;
2713 		}
2714 
2715 		/*
2716 		 * Otherwise, attempt to map in the dring using the
2717 		 * cookie. If that succeeds we send back a unique dring
2718 		 * identifier that the sending side will use in future
2719 		 * to refer to this descriptor ring.
2720 		 */
2721 		dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
2722 
2723 		dp->num_descriptors = dring_pkt->num_descriptors;
2724 		dp->descriptor_size = dring_pkt->descriptor_size;
2725 		dp->options = dring_pkt->options;
2726 		dp->ncookies = dring_pkt->ncookies;
2727 
2728 		/*
2729 		 * Note: should only get one cookie. Enforced in
2730 		 * the ldc layer.
2731 		 */
2732 		bcopy(&dring_pkt->cookie[0], &dp->cookie[0],
2733 		    sizeof (ldc_mem_cookie_t));
2734 
2735 		D2(vswp, "%s: num_desc %ld : desc_size %ld", __func__,
2736 		    dp->num_descriptors, dp->descriptor_size);
2737 		D2(vswp, "%s: options 0x%lx: ncookies %ld", __func__,
2738 		    dp->options, dp->ncookies);
2739 
2740 		if ((ldc_mem_dring_map(ldcp->ldc_handle, &dp->cookie[0],
2741 		    dp->ncookies, dp->num_descriptors, dp->descriptor_size,
2742 		    LDC_DIRECT_MAP, &(dp->handle))) != 0) {
2743 
2744 			DERR(vswp, "%s: dring_map failed\n", __func__);
2745 
2746 			kmem_free(dp, sizeof (dring_info_t));
2747 			vsw_free_lane_resources(ldcp, INBOUND);
2748 
2749 			dring_pkt->tag.vio_sid = ldcp->local_session;
2750 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2751 
2752 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
2753 
2754 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
2755 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
2756 			    sizeof (vio_dring_reg_msg_t), B_TRUE);
2757 
2758 			vsw_next_milestone(ldcp);
2759 			return;
2760 		}
2761 
2762 		if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
2763 
2764 			DERR(vswp, "%s: dring_addr failed\n", __func__);
2765 
2766 			kmem_free(dp, sizeof (dring_info_t));
2767 			vsw_free_lane_resources(ldcp, INBOUND);
2768 
2769 			dring_pkt->tag.vio_sid = ldcp->local_session;
2770 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2771 
2772 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
2773 
2774 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
2775 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
2776 			    sizeof (vio_dring_reg_msg_t), B_TRUE);
2777 
2778 			vsw_next_milestone(ldcp);
2779 			return;
2780 		} else {
2781 			/* store the address of the pub part of ring */
2782 			dp->pub_addr = minfo.vaddr;
2783 
2784 			/* cache the dring mtype */
2785 			dp->dring_mtype = minfo.mtype;
2786 		}
2787 
2788 		/* no private section as we are importing */
2789 		dp->priv_addr = NULL;
2790 
2791 		/*
2792 		 * Using simple mono increasing int for ident at
2793 		 * the moment.
2794 		 */
2795 		dp->ident = ldcp->next_ident;
2796 		ldcp->next_ident++;
2797 
2798 		dp->end_idx = 0;
2799 		dp->next = NULL;
2800 
2801 		/*
2802 		 * Link it onto the end of the list of drings
2803 		 * for this lane.
2804 		 */
2805 		if (ldcp->lane_in.dringp == NULL) {
2806 			D2(vswp, "%s: adding first INBOUND dring", __func__);
2807 			ldcp->lane_in.dringp = dp;
2808 		} else {
2809 			dbp = ldcp->lane_in.dringp;
2810 
2811 			while (dbp->next != NULL)
2812 				dbp = dbp->next;
2813 
2814 			dbp->next = dp;
2815 		}
2816 
2817 		/* acknowledge it */
2818 		dring_pkt->tag.vio_sid = ldcp->local_session;
2819 		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2820 		dring_pkt->dring_ident = dp->ident;
2821 
2822 		(void) vsw_send_msg(ldcp, (void *)dring_pkt,
2823 		    sizeof (vio_dring_reg_msg_t), B_TRUE);
2824 
2825 		ldcp->lane_in.lstate |= VSW_DRING_ACK_SENT;
2826 		vsw_next_milestone(ldcp);
2827 		break;
2828 
2829 	case VIO_SUBTYPE_ACK:
2830 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
2831 
2832 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV))
2833 			return;
2834 
2835 		/*
2836 		 * Peer is acknowledging our dring info and will have
2837 		 * sent us a dring identifier which we will use to
2838 		 * refer to this ring w.r.t. our peer.
2839 		 */
2840 		dp = ldcp->lane_out.dringp;
2841 		if (dp != NULL) {
2842 			/*
2843 			 * Find the ring this ident should be associated
2844 			 * with.
2845 			 */
2846 			if (vsw_dring_match(dp, dring_pkt)) {
2847 				dring_found = 1;
2848 
2849 			} else while (dp != NULL) {
2850 				if (vsw_dring_match(dp, dring_pkt)) {
2851 					dring_found = 1;
2852 					break;
2853 				}
2854 				dp = dp->next;
2855 			}
2856 
2857 			if (dring_found == 0) {
2858 				DERR(NULL, "%s: unrecognised ring cookie",
2859 				    __func__);
2860 				vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2861 				return;
2862 			}
2863 
2864 		} else {
2865 			DERR(vswp, "%s: DRING ACK received but no drings "
2866 			    "allocated", __func__);
2867 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2868 			return;
2869 		}
2870 
2871 		/* store ident */
2872 		dp->ident = dring_pkt->dring_ident;
2873 		ldcp->lane_out.lstate |= VSW_DRING_ACK_RECV;
2874 		vsw_next_milestone(ldcp);
2875 		break;
2876 
2877 	case VIO_SUBTYPE_NACK:
2878 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
2879 
2880 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV))
2881 			return;
2882 
2883 		ldcp->lane_out.lstate |= VSW_DRING_NACK_RECV;
2884 		vsw_next_milestone(ldcp);
2885 		break;
2886 
2887 	default:
2888 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
2889 		    dring_pkt->tag.vio_subtype);
2890 	}
2891 
2892 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
2893 }
2894 
2895 /*
2896  * Process a request from peer to unregister a dring.
2897  *
2898  * For the moment we just restart the handshake if our
2899  * peer endpoint attempts to unregister a dring.
2900  */
2901 void
2902 vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt)
2903 {
2904 	vsw_t			*vswp = ldcp->ldc_vswp;
2905 	vio_dring_unreg_msg_t	*dring_pkt;
2906 
2907 	/*
2908 	 * We know this is a ctrl/dring packet so
2909 	 * cast it into the correct structure.
2910 	 */
2911 	dring_pkt = (vio_dring_unreg_msg_t *)pkt;
2912 
2913 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2914 
2915 	switch (dring_pkt->tag.vio_subtype) {
2916 	case VIO_SUBTYPE_INFO:
2917 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2918 
2919 		DWARN(vswp, "%s: restarting handshake..", __func__);
2920 		break;
2921 
2922 	case VIO_SUBTYPE_ACK:
2923 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
2924 
2925 		DWARN(vswp, "%s: restarting handshake..", __func__);
2926 		break;
2927 
2928 	case VIO_SUBTYPE_NACK:
2929 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
2930 
2931 		DWARN(vswp, "%s: restarting handshake..", __func__);
2932 		break;
2933 
2934 	default:
2935 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
2936 		    dring_pkt->tag.vio_subtype);
2937 	}
2938 
2939 	vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2940 
2941 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
2942 }
2943 
/*
 * Turn the multicast message pkt into a NACK carrying our local session
 * id and send it back over ldcp.
 *
 * The body is wrapped in do { } while (0) and the arguments are
 * parenthesized so that the macro expands to exactly one statement and
 * can be used safely as the body of an unbraced if/else. Invoke it with a
 * trailing semicolon.
 */
#define	SND_MCST_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		(void) vsw_send_msg((ldcp), (void *)(pkt), \
		    sizeof (vnet_mcast_msg_t), B_TRUE); \
	} while (0)
2949 
2950 /*
2951  * Process a multicast request from a vnet.
2952  *
2953  * Vnet's specify a multicast address that they are interested in. This
2954  * address is used as a key into the hash table which forms the multicast
2955  * forwarding database (mFDB).
2956  *
2957  * The table keys are the multicast addresses, while the table entries
2958  * are pointers to lists of ports which wish to receive packets for the
2959  * specified multicast address.
2960  *
2961  * When a multicast packet is being switched we use the address as a key
2962  * into the hash table, and then walk the appropriate port list forwarding
2963  * the pkt to each port in turn.
2964  *
2965  * If a vnet is no longer interested in a particular multicast grouping
2966  * we simply find the correct location in the hash table and then delete
2967  * the relevant port from the port list.
2968  *
2969  * To deal with the case whereby a port is being deleted without first
2970  * removing itself from the lists in the hash table, we maintain a list
2971  * of multicast addresses the port has registered an interest in, within
2972  * the port structure itself. We then simply walk that list of addresses
2973  * using them as keys into the hash table and remove the port from the
2974  * appropriate lists.
2975  */
2976 static void
2977 vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt)
2978 {
2979 	vnet_mcast_msg_t	*mcst_pkt;
2980 	vsw_port_t		*port = ldcp->ldc_port;
2981 	vsw_t			*vswp = ldcp->ldc_vswp;
2982 	int			i;
2983 
2984 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2985 
2986 	/*
2987 	 * We know this is a ctrl/mcast packet so
2988 	 * cast it into the correct structure.
2989 	 */
2990 	mcst_pkt = (vnet_mcast_msg_t *)pkt;
2991 
2992 	switch (mcst_pkt->tag.vio_subtype) {
2993 	case VIO_SUBTYPE_INFO:
2994 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2995 
2996 		/*
2997 		 * Check if in correct state to receive a multicast
2998 		 * message (i.e. handshake complete). If not reset
2999 		 * the handshake.
3000 		 */
3001 		if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV))
3002 			return;
3003 
3004 		/*
3005 		 * Before attempting to add or remove address check
3006 		 * that they are valid multicast addresses.
3007 		 * If not, then NACK back.
3008 		 */
3009 		for (i = 0; i < mcst_pkt->count; i++) {
3010 			if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) {
3011 				DERR(vswp, "%s: invalid multicast address",
3012 				    __func__);
3013 				SND_MCST_NACK(ldcp, mcst_pkt);
3014 				return;
3015 			}
3016 		}
3017 
3018 		/*
3019 		 * Now add/remove the addresses. If this fails we
3020 		 * NACK back.
3021 		 */
3022 		if (vsw_add_rem_mcst(mcst_pkt, port) != 0) {
3023 			SND_MCST_NACK(ldcp, mcst_pkt);
3024 			return;
3025 		}
3026 
3027 		mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3028 		mcst_pkt->tag.vio_sid = ldcp->local_session;
3029 
3030 		DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt);
3031 
3032 		(void) vsw_send_msg(ldcp, (void *)mcst_pkt,
3033 		    sizeof (vnet_mcast_msg_t), B_TRUE);
3034 		break;
3035 
3036 	case VIO_SUBTYPE_ACK:
3037 		DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3038 
3039 		/*
3040 		 * We shouldn't ever get a multicast ACK message as
3041 		 * at the moment we never request multicast addresses
3042 		 * to be set on some other device. This may change in
3043 		 * the future if we have cascading switches.
3044 		 */
3045 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV))
3046 			return;
3047 
3048 				/* Do nothing */
3049 		break;
3050 
3051 	case VIO_SUBTYPE_NACK:
3052 		DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3053 
3054 		/*
3055 		 * We shouldn't get a multicast NACK packet for the
3056 		 * same reasons as we shouldn't get a ACK packet.
3057 		 */
3058 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV))
3059 			return;
3060 
3061 				/* Do nothing */
3062 		break;
3063 
3064 	default:
3065 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
3066 		    mcst_pkt->tag.vio_subtype);
3067 	}
3068 
3069 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3070 }
3071 
3072 static void
3073 vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt)
3074 {
3075 	vio_rdx_msg_t	*rdx_pkt;
3076 	vsw_t		*vswp = ldcp->ldc_vswp;
3077 
3078 	/*
3079 	 * We know this is a ctrl/rdx packet so
3080 	 * cast it into the correct structure.
3081 	 */
3082 	rdx_pkt = (vio_rdx_msg_t *)pkt;
3083 
3084 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
3085 
3086 	switch (rdx_pkt->tag.vio_subtype) {
3087 	case VIO_SUBTYPE_INFO:
3088 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3089 
3090 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_INFO_RECV))
3091 			return;
3092 
3093 		rdx_pkt->tag.vio_sid = ldcp->local_session;
3094 		rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3095 
3096 		DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt);
3097 
3098 		ldcp->lane_out.lstate |= VSW_RDX_ACK_SENT;
3099 
3100 		(void) vsw_send_msg(ldcp, (void *)rdx_pkt,
3101 		    sizeof (vio_rdx_msg_t), B_TRUE);
3102 
3103 		vsw_next_milestone(ldcp);
3104 		break;
3105 
3106 	case VIO_SUBTYPE_ACK:
3107 		/*
3108 		 * Should be handled in-band by callback handler.
3109 		 */
3110 		DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__);
3111 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3112 		break;
3113 
3114 	case VIO_SUBTYPE_NACK:
3115 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3116 
3117 		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_NACK_RECV))
3118 			return;
3119 
3120 		ldcp->lane_in.lstate |= VSW_RDX_NACK_RECV;
3121 		vsw_next_milestone(ldcp);
3122 		break;
3123 
3124 	default:
3125 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3126 		    rdx_pkt->tag.vio_subtype);
3127 	}
3128 
3129 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3130 }
3131 
3132 static void
3133 vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t *tagp,
3134 	uint32_t msglen)
3135 {
3136 	uint16_t	env = tagp->vio_subtype_env;
3137 	vsw_t		*vswp = ldcp->ldc_vswp;
3138 
3139 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3140 
3141 	/* session id check */
3142 	if (ldcp->session_status & VSW_PEER_SESSION) {
3143 		if (ldcp->peer_session != tagp->vio_sid) {
3144 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
3145 			    __func__, ldcp->ldc_id, tagp->vio_sid);
3146 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3147 			return;
3148 		}
3149 	}
3150 
3151 	/*
3152 	 * It is an error for us to be getting data packets
3153 	 * before the handshake has completed.
3154 	 */
3155 	if (ldcp->hphase != VSW_MILESTONE4) {
3156 		DERR(vswp, "%s: got data packet before handshake complete "
3157 		    "hphase %d (%x: %x)", __func__, ldcp->hphase,
3158 		    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
3159 		DUMP_FLAGS(ldcp->lane_in.lstate);
3160 		DUMP_FLAGS(ldcp->lane_out.lstate);
3161 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3162 		return;
3163 	}
3164 
3165 	/*
3166 	 * To reduce the locking contention, release the
3167 	 * ldc_cblock here and re-acquire it once we are done
3168 	 * receiving packets.
3169 	 */
3170 	mutex_exit(&ldcp->ldc_cblock);
3171 	mutex_enter(&ldcp->ldc_rxlock);
3172 
3173 	/*
3174 	 * Switch on vio_subtype envelope, then let lower routines
3175 	 * decide if its an INFO, ACK or NACK packet.
3176 	 */
3177 	if (env == VIO_DRING_DATA) {
3178 		vsw_process_data_dring_pkt(ldcp, dpkt);
3179 	} else if (env == VIO_PKT_DATA) {
3180 		ldcp->rx_pktdata(ldcp, dpkt, msglen);
3181 	} else if (env == VIO_DESC_DATA) {
3182 		vsw_process_data_ibnd_pkt(ldcp, dpkt);
3183 	} else {
3184 		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env);
3185 	}
3186 
3187 	mutex_exit(&ldcp->ldc_rxlock);
3188 	mutex_enter(&ldcp->ldc_cblock);
3189 
3190 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3191 }
3192 
/*
 * Turn the dring message 'pkt' into a NACK carrying our local session id
 * and send it back over 'ldcp'. The result of the send is ignored.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and is safe under an unbraced if/else; invoke it with
 * a trailing semicolon. Note that both arguments are evaluated more than
 * once.
 */
#define	SND_DRING_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		(void) vsw_send_msg((ldcp), (void *)(pkt), \
		    sizeof (vio_dring_msg_t), B_TRUE); \
		_NOTE(CONSTCOND) \
	} while (0)
3198 
3199 static void
3200 vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt)
3201 {
3202 	vio_dring_msg_t		*dring_pkt;
3203 	vnet_public_desc_t	desc, *pub_addr = NULL;
3204 	vsw_private_desc_t	*priv_addr = NULL;
3205 	dring_info_t		*dp = NULL;
3206 	vsw_t			*vswp = ldcp->ldc_vswp;
3207 	mblk_t			*mp = NULL;
3208 	mblk_t			*bp = NULL;
3209 	mblk_t			*bpt = NULL;
3210 	size_t			nbytes = 0;
3211 	uint64_t		chain = 0;
3212 	uint64_t		len;
3213 	uint32_t		pos, start;
3214 	uint32_t		range_start, range_end;
3215 	int32_t			end, num, cnt = 0;
3216 	int			i, rv, rng_rv = 0, msg_rv = 0;
3217 	boolean_t		prev_desc_ack = B_FALSE;
3218 	int			read_attempts = 0;
3219 	struct ether_header	*ehp;
3220 
3221 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3222 
3223 	/*
3224 	 * We know this is a data/dring packet so
3225 	 * cast it into the correct structure.
3226 	 */
3227 	dring_pkt = (vio_dring_msg_t *)dpkt;
3228 
3229 	/*
3230 	 * Switch on the vio_subtype. If its INFO then we need to
3231 	 * process the data. If its an ACK we need to make sure
3232 	 * it makes sense (i.e did we send an earlier data/info),
3233 	 * and if its a NACK then we maybe attempt a retry.
3234 	 */
3235 	switch (dring_pkt->tag.vio_subtype) {
3236 	case VIO_SUBTYPE_INFO:
3237 		D2(vswp, "%s(%lld): VIO_SUBTYPE_INFO", __func__, ldcp->ldc_id);
3238 
3239 		READ_ENTER(&ldcp->lane_in.dlistrw);
3240 		if ((dp = vsw_ident2dring(&ldcp->lane_in,
3241 		    dring_pkt->dring_ident)) == NULL) {
3242 			RW_EXIT(&ldcp->lane_in.dlistrw);
3243 
3244 			DERR(vswp, "%s(%lld): unable to find dring from "
3245 			    "ident 0x%llx", __func__, ldcp->ldc_id,
3246 			    dring_pkt->dring_ident);
3247 
3248 			SND_DRING_NACK(ldcp, dring_pkt);
3249 			return;
3250 		}
3251 
3252 		start = pos = dring_pkt->start_idx;
3253 		end = dring_pkt->end_idx;
3254 		len = dp->num_descriptors;
3255 
3256 		range_start = range_end = pos;
3257 
3258 		D2(vswp, "%s(%lld): start index %ld : end %ld\n",
3259 		    __func__, ldcp->ldc_id, start, end);
3260 
3261 		if (end == -1) {
3262 			num = -1;
3263 		} else if (end >= 0) {
3264 			num = end >= pos ? end - pos + 1: (len - pos + 1) + end;
3265 
3266 			/* basic sanity check */
3267 			if (end > len) {
3268 				RW_EXIT(&ldcp->lane_in.dlistrw);
3269 				DERR(vswp, "%s(%lld): endpoint %lld outside "
3270 				    "ring length %lld", __func__,
3271 				    ldcp->ldc_id, end, len);
3272 
3273 				SND_DRING_NACK(ldcp, dring_pkt);
3274 				return;
3275 			}
3276 		} else {
3277 			RW_EXIT(&ldcp->lane_in.dlistrw);
3278 			DERR(vswp, "%s(%lld): invalid endpoint %lld",
3279 			    __func__, ldcp->ldc_id, end);
3280 			SND_DRING_NACK(ldcp, dring_pkt);
3281 			return;
3282 		}
3283 
3284 		while (cnt != num) {
3285 vsw_recheck_desc:
3286 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + pos;
3287 
3288 			if ((rng_rv = vnet_dring_entry_copy(pub_addr,
3289 			    &desc, dp->dring_mtype, dp->handle,
3290 			    pos, pos)) != 0) {
3291 				DERR(vswp, "%s(%lld): unable to copy "
3292 				    "descriptor at pos %d: err %d",
3293 				    __func__, pos, ldcp->ldc_id, rng_rv);
3294 				ldcp->ldc_stats.ierrors++;
3295 				break;
3296 			}
3297 
3298 			/*
3299 			 * When given a bounded range of descriptors
3300 			 * to process, its an error to hit a descriptor
3301 			 * which is not ready. In the non-bounded case
3302 			 * (end_idx == -1) this simply indicates we have
3303 			 * reached the end of the current active range.
3304 			 */
3305 			if (desc.hdr.dstate != VIO_DESC_READY) {
3306 				/* unbound - no error */
3307 				if (end == -1) {
3308 					if (read_attempts == vsw_read_attempts)
3309 						break;
3310 
3311 					delay(drv_usectohz(vsw_desc_delay));
3312 					read_attempts++;
3313 					goto vsw_recheck_desc;
3314 				}
3315 
3316 				/* bounded - error - so NACK back */
3317 				RW_EXIT(&ldcp->lane_in.dlistrw);
3318 				DERR(vswp, "%s(%lld): descriptor not READY "
3319 				    "(%d)", __func__, ldcp->ldc_id,
3320 				    desc.hdr.dstate);
3321 				SND_DRING_NACK(ldcp, dring_pkt);
3322 				return;
3323 			}
3324 
3325 			DTRACE_PROBE1(read_attempts, int, read_attempts);
3326 
3327 			range_end = pos;
3328 
3329 			/*
3330 			 * If we ACK'd the previous descriptor then now
3331 			 * record the new range start position for later
3332 			 * ACK's.
3333 			 */
3334 			if (prev_desc_ack) {
3335 				range_start = pos;
3336 
3337 				D2(vswp, "%s(%lld): updating range start to be "
3338 				    "%d", __func__, ldcp->ldc_id, range_start);
3339 
3340 				prev_desc_ack = B_FALSE;
3341 			}
3342 
3343 			D2(vswp, "%s(%lld): processing desc %lld at pos"
3344 			    " 0x%llx : dstate 0x%lx : datalen 0x%lx",
3345 			    __func__, ldcp->ldc_id, pos, &desc,
3346 			    desc.hdr.dstate, desc.nbytes);
3347 
3348 			/*
3349 			 * Ensure that we ask ldc for an aligned
3350 			 * number of bytes. Data is padded to align on 8
3351 			 * byte boundary, desc.nbytes is actual data length,
3352 			 * i.e. minus that padding.
3353 			 */
3354 			nbytes = (desc.nbytes + VNET_IPALIGN + 7) & ~7;
3355 
3356 			mp = vio_multipool_allocb(&ldcp->vmp, nbytes);
3357 			if (mp == NULL) {
3358 				ldcp->ldc_stats.rx_vio_allocb_fail++;
3359 				/*
3360 				 * No free receive buffers available, so
3361 				 * fallback onto allocb(9F). Make sure that
3362 				 * we get a data buffer which is a multiple
3363 				 * of 8 as this is required by ldc_mem_copy.
3364 				 */
3365 				DTRACE_PROBE(allocb);
3366 				if ((mp = allocb(desc.nbytes + VNET_IPALIGN + 8,
3367 				    BPRI_MED)) == NULL) {
3368 					DERR(vswp, "%s(%ld): allocb failed",
3369 					    __func__, ldcp->ldc_id);
3370 					rng_rv = vnet_dring_entry_set_dstate(
3371 					    pub_addr, dp->dring_mtype,
3372 					    dp->handle, pos, pos,
3373 					    VIO_DESC_DONE);
3374 					ldcp->ldc_stats.ierrors++;
3375 					ldcp->ldc_stats.rx_allocb_fail++;
3376 					break;
3377 				}
3378 			}
3379 
3380 			rv = ldc_mem_copy(ldcp->ldc_handle,
3381 			    (caddr_t)mp->b_rptr, 0, &nbytes,
3382 			    desc.memcookie, desc.ncookies, LDC_COPY_IN);
3383 			if (rv != 0) {
3384 				DERR(vswp, "%s(%d): unable to copy in data "
3385 				    "from %d cookies in desc %d (rv %d)",
3386 				    __func__, ldcp->ldc_id, desc.ncookies,
3387 				    pos, rv);
3388 				freemsg(mp);
3389 
3390 				rng_rv = vnet_dring_entry_set_dstate(pub_addr,
3391 				    dp->dring_mtype, dp->handle, pos, pos,
3392 				    VIO_DESC_DONE);
3393 				ldcp->ldc_stats.ierrors++;
3394 				break;
3395 			} else {
3396 				D2(vswp, "%s(%d): copied in %ld bytes"
3397 				    " using %d cookies", __func__,
3398 				    ldcp->ldc_id, nbytes, desc.ncookies);
3399 			}
3400 
3401 			/* adjust the read pointer to skip over the padding */
3402 			mp->b_rptr += VNET_IPALIGN;
3403 
3404 			/* point to the actual end of data */
3405 			mp->b_wptr = mp->b_rptr + desc.nbytes;
3406 
3407 			/* update statistics */
3408 			ehp = (struct ether_header *)mp->b_rptr;
3409 			if (IS_BROADCAST(ehp))
3410 				ldcp->ldc_stats.brdcstrcv++;
3411 			else if (IS_MULTICAST(ehp))
3412 				ldcp->ldc_stats.multircv++;
3413 
3414 			ldcp->ldc_stats.ipackets++;
3415 			ldcp->ldc_stats.rbytes += desc.nbytes;
3416 
3417 			/*
3418 			 * IPALIGN space can be used for VLAN_TAG
3419 			 */
3420 			(void) vsw_vlan_frame_pretag(ldcp->ldc_port,
3421 			    VSW_VNETPORT, mp);
3422 
3423 			/* build a chain of received packets */
3424 			if (bp == NULL) {
3425 				/* first pkt */
3426 				bp = mp;
3427 				bp->b_next = bp->b_prev = NULL;
3428 				bpt = bp;
3429 				chain = 1;
3430 			} else {
3431 				mp->b_next = mp->b_prev = NULL;
3432 				bpt->b_next = mp;
3433 				bpt = mp;
3434 				chain++;
3435 			}
3436 
3437 			/* mark we are finished with this descriptor */
3438 			if ((rng_rv = vnet_dring_entry_set_dstate(pub_addr,
3439 			    dp->dring_mtype, dp->handle, pos, pos,
3440 			    VIO_DESC_DONE)) != 0) {
3441 				DERR(vswp, "%s(%lld): unable to update "
3442 				    "dstate at pos %d: err %d",
3443 				    __func__, pos, ldcp->ldc_id, rng_rv);
3444 				ldcp->ldc_stats.ierrors++;
3445 				break;
3446 			}
3447 
3448 			/*
3449 			 * Send an ACK back to peer if requested.
3450 			 */
3451 			if (desc.hdr.ack) {
3452 				dring_pkt->start_idx = range_start;
3453 				dring_pkt->end_idx = range_end;
3454 
3455 				DERR(vswp, "%s(%lld): processed %d %d, ACK"
3456 				    " requested", __func__, ldcp->ldc_id,
3457 				    dring_pkt->start_idx, dring_pkt->end_idx);
3458 
3459 				dring_pkt->dring_process_state = VIO_DP_ACTIVE;
3460 				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3461 				dring_pkt->tag.vio_sid = ldcp->local_session;
3462 
3463 				msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
3464 				    sizeof (vio_dring_msg_t), B_FALSE);
3465 
3466 				/*
3467 				 * Check if ACK was successfully sent. If not
3468 				 * we break and deal with that below.
3469 				 */
3470 				if (msg_rv != 0)
3471 					break;
3472 
3473 				prev_desc_ack = B_TRUE;
3474 				range_start = pos;
3475 			}
3476 
3477 			/* next descriptor */
3478 			pos = (pos + 1) % len;
3479 			cnt++;
3480 
3481 			/*
3482 			 * Break out of loop here and stop processing to
3483 			 * allow some other network device (or disk) to
3484 			 * get access to the cpu.
3485 			 */
3486 			if (chain > vsw_chain_len) {
3487 				D3(vswp, "%s(%lld): switching chain of %d "
3488 				    "msgs", __func__, ldcp->ldc_id, chain);
3489 				break;
3490 			}
3491 		}
3492 		RW_EXIT(&ldcp->lane_in.dlistrw);
3493 
3494 		/* send the chain of packets to be switched */
3495 		if (bp != NULL) {
3496 			DTRACE_PROBE1(vsw_rcv_msgs, int, chain);
3497 			D3(vswp, "%s(%lld): switching chain of %d msgs",
3498 			    __func__, ldcp->ldc_id, chain);
3499 			vswp->vsw_switch_frame(vswp, bp, VSW_VNETPORT,
3500 			    ldcp->ldc_port, NULL);
3501 		}
3502 
3503 		/*
3504 		 * If when we encountered an error when attempting to
3505 		 * access an imported dring, initiate a connection reset.
3506 		 */
3507 		if (rng_rv != 0) {
3508 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3509 			break;
3510 		}
3511 
3512 		/*
3513 		 * If when we attempted to send the ACK we found that the
3514 		 * channel had been reset then now handle this. We deal with
3515 		 * it here as we cannot reset the channel while holding the
3516 		 * dlistrw lock, and we don't want to acquire/release it
3517 		 * continuously in the above loop, as a channel reset should
3518 		 * be a rare event.
3519 		 */
3520 		if (msg_rv == ECONNRESET) {
3521 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
3522 			break;
3523 		}
3524 
3525 		DTRACE_PROBE1(msg_cnt, int, cnt);
3526 
3527 		/*
3528 		 * We are now finished so ACK back with the state
3529 		 * set to STOPPING so our peer knows we are finished
3530 		 */
3531 		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3532 		dring_pkt->tag.vio_sid = ldcp->local_session;
3533 
3534 		dring_pkt->dring_process_state = VIO_DP_STOPPED;
3535 
3536 		DTRACE_PROBE(stop_process_sent);
3537 
3538 		/*
3539 		 * We have not processed any more descriptors beyond
3540 		 * the last one we ACK'd.
3541 		 */
3542 		if (prev_desc_ack)
3543 			range_start = range_end;
3544 
3545 		dring_pkt->start_idx = range_start;
3546 		dring_pkt->end_idx = range_end;
3547 
3548 		D2(vswp, "%s(%lld) processed : %d : %d, now stopping",
3549 		    __func__, ldcp->ldc_id, dring_pkt->start_idx,
3550 		    dring_pkt->end_idx);
3551 
3552 		(void) vsw_send_msg(ldcp, (void *)dring_pkt,
3553 		    sizeof (vio_dring_msg_t), B_TRUE);
3554 		break;
3555 
3556 	case VIO_SUBTYPE_ACK:
3557 		D2(vswp, "%s(%lld): VIO_SUBTYPE_ACK", __func__, ldcp->ldc_id);
3558 		/*
3559 		 * Verify that the relevant descriptors are all
3560 		 * marked as DONE
3561 		 */
3562 		READ_ENTER(&ldcp->lane_out.dlistrw);
3563 		if ((dp = vsw_ident2dring(&ldcp->lane_out,
3564 		    dring_pkt->dring_ident)) == NULL) {
3565 			RW_EXIT(&ldcp->lane_out.dlistrw);
3566 			DERR(vswp, "%s: unknown ident in ACK", __func__);
3567 			return;
3568 		}
3569 
3570 		start = end = 0;
3571 		start = dring_pkt->start_idx;
3572 		end = dring_pkt->end_idx;
3573 		len = dp->num_descriptors;
3574 
3575 
3576 		mutex_enter(&dp->dlock);
3577 		dp->last_ack_recv = end;
3578 		ldcp->ldc_stats.dring_data_acks++;
3579 		mutex_exit(&dp->dlock);
3580 
3581 		(void) vsw_reclaim_dring(dp, start);
3582 
3583 		/*
3584 		 * If our peer is stopping processing descriptors then
3585 		 * we check to make sure it has processed all the descriptors
3586 		 * we have updated. If not then we send it a new message
3587 		 * to prompt it to restart.
3588 		 */
3589 		if (dring_pkt->dring_process_state == VIO_DP_STOPPED) {
3590 			DTRACE_PROBE(stop_process_recv);
3591 			D2(vswp, "%s(%lld): got stopping msg : %d : %d",
3592 			    __func__, ldcp->ldc_id, dring_pkt->start_idx,
3593 			    dring_pkt->end_idx);
3594 
3595 			/*
3596 			 * Check next descriptor in public section of ring.
3597 			 * If its marked as READY then we need to prompt our
3598 			 * peer to start processing the ring again.
3599 			 */
3600 			i = (end + 1) % len;
3601 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
3602 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
3603 
3604 			/*
3605 			 * Hold the restart lock across all of this to
3606 			 * make sure that its not possible for us to
3607 			 * decide that a msg needs to be sent in the future
3608 			 * but the sending code having already checked is
3609 			 * about to exit.
3610 			 */
3611 			mutex_enter(&dp->restart_lock);
3612 			ldcp->ldc_stats.dring_stopped_acks++;
3613 			mutex_enter(&priv_addr->dstate_lock);
3614 			if (pub_addr->hdr.dstate == VIO_DESC_READY) {
3615 
3616 				mutex_exit(&priv_addr->dstate_lock);
3617 
3618 				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
3619 				dring_pkt->tag.vio_sid = ldcp->local_session;
3620 
3621 				dring_pkt->start_idx = (end + 1) % len;
3622 				dring_pkt->end_idx = -1;
3623 
3624 				D2(vswp, "%s(%lld) : sending restart msg:"
3625 				    " %d : %d", __func__, ldcp->ldc_id,
3626 				    dring_pkt->start_idx, dring_pkt->end_idx);
3627 
3628 				msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
3629 				    sizeof (vio_dring_msg_t), B_FALSE);
3630 				ldcp->ldc_stats.dring_data_msgs++;
3631 
3632 			} else {
3633 				mutex_exit(&priv_addr->dstate_lock);
3634 				dp->restart_reqd = B_TRUE;
3635 			}
3636 			mutex_exit(&dp->restart_lock);
3637 		}
3638 		RW_EXIT(&ldcp->lane_out.dlistrw);
3639 
3640 		/* only do channel reset after dropping dlistrw lock */
3641 		if (msg_rv == ECONNRESET)
3642 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
3643 
3644 		break;
3645 
3646 	case VIO_SUBTYPE_NACK:
3647 		DWARN(vswp, "%s(%lld): VIO_SUBTYPE_NACK",
3648 		    __func__, ldcp->ldc_id);
3649 		/*
3650 		 * Something is badly wrong if we are getting NACK's
3651 		 * for our data pkts. So reset the channel.
3652 		 */
3653 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3654 
3655 		break;
3656 
3657 	default:
3658 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
3659 		    ldcp->ldc_id, dring_pkt->tag.vio_subtype);
3660 	}
3661 
3662 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
3663 }
3664 
3665 /*
3666  * dummy pkt data handler function for vnet protocol version 1.0
3667  */
3668 static void
3669 vsw_process_pkt_data_nop(void *arg1, void *arg2, uint32_t msglen)
3670 {
3671 	_NOTE(ARGUNUSED(arg1, arg2, msglen))
3672 }
3673 
3674 /*
3675  * This function handles raw pkt data messages received over the channel.
3676  * Currently, only priority-eth-type frames are received through this mechanism.
3677  * In this case, the frame(data) is present within the message itself which
3678  * is copied into an mblk before switching it.
3679  */
3680 static void
3681 vsw_process_pkt_data(void *arg1, void *arg2, uint32_t msglen)
3682 {
3683 	vsw_ldc_t		*ldcp = (vsw_ldc_t *)arg1;
3684 	vio_raw_data_msg_t	*dpkt = (vio_raw_data_msg_t *)arg2;
3685 	uint32_t		size;
3686 	mblk_t			*mp;
3687 	vsw_t			*vswp = ldcp->ldc_vswp;
3688 	vgen_stats_t		*statsp = &ldcp->ldc_stats;
3689 	lane_t			*lp = &ldcp->lane_out;
3690 
3691 	size = msglen - VIO_PKT_DATA_HDRSIZE;
3692 	if (size < ETHERMIN || size > lp->mtu) {
3693 		(void) atomic_inc_32(&statsp->rx_pri_fail);
3694 		DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__,
3695 		    ldcp->ldc_id, size);
3696 		return;
3697 	}
3698 
3699 	mp = vio_multipool_allocb(&ldcp->vmp, size + VLAN_TAGSZ);
3700 	if (mp == NULL) {
3701 		mp = allocb(size + VLAN_TAGSZ, BPRI_MED);
3702 		if (mp == NULL) {
3703 			(void) atomic_inc_32(&statsp->rx_pri_fail);
3704 			DWARN(vswp, "%s(%lld) allocb failure, "
3705 			    "unable to process priority frame\n", __func__,
3706 			    ldcp->ldc_id);
3707 			return;
3708 		}
3709 	}
3710 
3711 	/* skip over the extra space for vlan tag */
3712 	mp->b_rptr += VLAN_TAGSZ;
3713 
3714 	/* copy the frame from the payload of raw data msg into the mblk */
3715 	bcopy(dpkt->data, mp->b_rptr, size);
3716 	mp->b_wptr = mp->b_rptr + size;
3717 
3718 	/* update stats */
3719 	(void) atomic_inc_64(&statsp->rx_pri_packets);
3720 	(void) atomic_add_64(&statsp->rx_pri_bytes, size);
3721 
3722 	/*
3723 	 * VLAN_TAGSZ of extra space has been pre-alloc'd if tag is needed.
3724 	 */
3725 	(void) vsw_vlan_frame_pretag(ldcp->ldc_port, VSW_VNETPORT, mp);
3726 
3727 	/* switch the frame to destination */
3728 	vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT, ldcp->ldc_port, NULL);
3729 }
3730 
3731 /*
3732  * Process an in-band descriptor message (most likely from
3733  * OBP).
3734  */
3735 static void
3736 vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt)
3737 {
3738 	vnet_ibnd_desc_t	*ibnd_desc;
3739 	dring_info_t		*dp = NULL;
3740 	vsw_private_desc_t	*priv_addr = NULL;
3741 	vsw_t			*vswp = ldcp->ldc_vswp;
3742 	mblk_t			*mp = NULL;
3743 	size_t			nbytes = 0;
3744 	size_t			off = 0;
3745 	uint64_t		idx = 0;
3746 	uint32_t		num = 1, len, datalen = 0;
3747 	uint64_t		ncookies = 0;
3748 	int			i, rv;
3749 	int			j = 0;
3750 
3751 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3752 
3753 	ibnd_desc = (vnet_ibnd_desc_t *)pkt;
3754 
3755 	switch (ibnd_desc->hdr.tag.vio_subtype) {
3756 	case VIO_SUBTYPE_INFO:
3757 		D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3758 
3759 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
3760 			return;
3761 
3762 		/*
3763 		 * Data is padded to align on a 8 byte boundary,
3764 		 * nbytes is actual data length, i.e. minus that
3765 		 * padding.
3766 		 */
3767 		datalen = ibnd_desc->nbytes;
3768 
3769 		D2(vswp, "%s(%lld): processing inband desc : "
3770 		    ": datalen 0x%lx", __func__, ldcp->ldc_id, datalen);
3771 
3772 		ncookies = ibnd_desc->ncookies;
3773 
3774 		/*
3775 		 * allocb(9F) returns an aligned data block. We
3776 		 * need to ensure that we ask ldc for an aligned
3777 		 * number of bytes also.
3778 		 */
3779 		nbytes = datalen;
3780 		if (nbytes & 0x7) {
3781 			off = 8 - (nbytes & 0x7);
3782 			nbytes += off;
3783 		}
3784 
3785 		/* alloc extra space for VLAN_TAG */
3786 		mp = allocb(datalen + 8, BPRI_MED);
3787 		if (mp == NULL) {
3788 			DERR(vswp, "%s(%lld): allocb failed",
3789 			    __func__, ldcp->ldc_id);
3790 			ldcp->ldc_stats.rx_allocb_fail++;
3791 			return;
3792 		}
3793 
3794 		/* skip over the extra space for VLAN_TAG */
3795 		mp->b_rptr += 8;
3796 
3797 		rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr,
3798 		    0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies,
3799 		    LDC_COPY_IN);
3800 
3801 		if (rv != 0) {
3802 			DERR(vswp, "%s(%d): unable to copy in data from "
3803 			    "%d cookie(s)", __func__, ldcp->ldc_id, ncookies);
3804 			freemsg(mp);
3805 			ldcp->ldc_stats.ierrors++;
3806 			return;
3807 		}
3808 
3809 		D2(vswp, "%s(%d): copied in %ld bytes using %d cookies",
3810 		    __func__, ldcp->ldc_id, nbytes, ncookies);
3811 
3812 		/* point to the actual end of data */
3813 		mp->b_wptr = mp->b_rptr + datalen;
3814 		ldcp->ldc_stats.ipackets++;
3815 		ldcp->ldc_stats.rbytes += datalen;
3816 
3817 		/*
3818 		 * We ACK back every in-band descriptor message we process
3819 		 */
3820 		ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK;
3821 		ibnd_desc->hdr.tag.vio_sid = ldcp->local_session;
3822 		(void) vsw_send_msg(ldcp, (void *)ibnd_desc,
3823 		    sizeof (vnet_ibnd_desc_t), B_TRUE);
3824 
3825 		/*
3826 		 * there is extra space alloc'd for VLAN_TAG
3827 		 */
3828 		(void) vsw_vlan_frame_pretag(ldcp->ldc_port, VSW_VNETPORT, mp);
3829 
3830 		/* send the packet to be switched */
3831 		vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT,
3832 		    ldcp->ldc_port, NULL);
3833 
3834 		break;
3835 
3836 	case VIO_SUBTYPE_ACK:
3837 		D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3838 
3839 		/* Verify the ACK is valid */
3840 		idx = ibnd_desc->hdr.desc_handle;
3841 
3842 		if (idx >= vsw_ntxds) {
3843 			cmn_err(CE_WARN, "!vsw%d: corrupted ACK received "
3844 			    "(idx %ld)", vswp->instance, idx);
3845 			return;
3846 		}
3847 
3848 		if ((dp = ldcp->lane_out.dringp) == NULL) {
3849 			DERR(vswp, "%s: no dring found", __func__);
3850 			return;
3851 		}
3852 
3853 		len = dp->num_descriptors;
3854 		/*
3855 		 * If the descriptor we are being ACK'ed for is not the
3856 		 * one we expected, then pkts were lost somwhere, either
3857 		 * when we tried to send a msg, or a previous ACK msg from
3858 		 * our peer. In either case we now reclaim the descriptors
3859 		 * in the range from the last ACK we received up to the
3860 		 * current ACK.
3861 		 */
3862 		if (idx != dp->last_ack_recv) {
3863 			DWARN(vswp, "%s: dropped pkts detected, (%ld, %ld)",
3864 			    __func__, dp->last_ack_recv, idx);
3865 			num = idx >= dp->last_ack_recv ?
3866 			    idx - dp->last_ack_recv + 1:
3867 			    (len - dp->last_ack_recv + 1) + idx;
3868 		}
3869 
3870 		/*
3871 		 * When we sent the in-band message to our peer we
3872 		 * marked the copy in our private ring as READY. We now
3873 		 * check that the descriptor we are being ACK'ed for is in
3874 		 * fact READY, i.e. it is one we have shared with our peer.
3875 		 *
3876 		 * If its not we flag an error, but still reset the descr
3877 		 * back to FREE.
3878 		 */
3879 		for (i = dp->last_ack_recv; j < num; i = (i + 1) % len, j++) {
3880 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
3881 			mutex_enter(&priv_addr->dstate_lock);
3882 			if (priv_addr->dstate != VIO_DESC_READY) {
3883 				DERR(vswp, "%s: (%ld) desc at index %ld not "
3884 				    "READY (0x%lx)", __func__,
3885 				    ldcp->ldc_id, idx, priv_addr->dstate);
3886 				DERR(vswp, "%s: bound %d: ncookies %ld : "
3887 				    "datalen %ld", __func__,
3888 				    priv_addr->bound, priv_addr->ncookies,
3889 				    priv_addr->datalen);
3890 			}
3891 			D2(vswp, "%s: (%lld) freeing descp at %lld", __func__,
3892 			    ldcp->ldc_id, idx);
3893 			/* release resources associated with sent msg */
3894 			priv_addr->datalen = 0;
3895 			priv_addr->dstate = VIO_DESC_FREE;
3896 			mutex_exit(&priv_addr->dstate_lock);
3897 		}
3898 		/* update to next expected value */
3899 		dp->last_ack_recv = (idx + 1) % dp->num_descriptors;
3900 
3901 		break;
3902 
3903 	case VIO_SUBTYPE_NACK:
3904 		DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3905 
3906 		/*
3907 		 * We should only get a NACK if our peer doesn't like
3908 		 * something about a message we have sent it. If this
3909 		 * happens we just release the resources associated with
3910 		 * the message. (We are relying on higher layers to decide
3911 		 * whether or not to resend.
3912 		 */
3913 
3914 		/* limit check */
3915 		idx = ibnd_desc->hdr.desc_handle;
3916 
3917 		if (idx >= vsw_ntxds) {
3918 			DERR(vswp, "%s: corrupted NACK received (idx %lld)",
3919 			    __func__, idx);
3920 			return;
3921 		}
3922 
3923 		if ((dp = ldcp->lane_out.dringp) == NULL) {
3924 			DERR(vswp, "%s: no dring found", __func__);
3925 			return;
3926 		}
3927 
3928 		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
3929 
3930 		/* move to correct location in ring */
3931 		priv_addr += idx;
3932 
3933 		/* release resources associated with sent msg */
3934 		mutex_enter(&priv_addr->dstate_lock);
3935 		priv_addr->datalen = 0;
3936 		priv_addr->dstate = VIO_DESC_FREE;
3937 		mutex_exit(&priv_addr->dstate_lock);
3938 
3939 		break;
3940 
3941 	default:
3942 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
3943 		    ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype);
3944 	}
3945 
3946 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
3947 }
3948 
3949 static void
3950 vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t *tagp)
3951 {
3952 	_NOTE(ARGUNUSED(epkt))
3953 
3954 	vsw_t		*vswp = ldcp->ldc_vswp;
3955 	uint16_t	env = tagp->vio_subtype_env;
3956 
3957 	D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
3958 
3959 	/*
3960 	 * Error vio_subtypes have yet to be defined. So for
3961 	 * the moment we can't do anything.
3962 	 */
3963 	D2(vswp, "%s: (%x) vio_subtype env", __func__, env);
3964 
3965 	D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
3966 }
3967 
3968 /* transmit the packet over the given port */
3969 int
3970 vsw_portsend(vsw_port_t *port, mblk_t *mp, mblk_t *mpt, uint32_t count)
3971 {
3972 	vsw_ldc_list_t 	*ldcl = &port->p_ldclist;
3973 	vsw_ldc_t 	*ldcp;
3974 	int		status = 0;
3975 	uint32_t	n;
3976 
3977 	READ_ENTER(&ldcl->lockrw);
3978 	/*
3979 	 * Note for now, we have a single channel.
3980 	 */
3981 	ldcp = ldcl->head;
3982 	if (ldcp == NULL) {
3983 		DERR(port->p_vswp, "vsw_portsend: no ldc: dropping packet\n");
3984 		freemsgchain(mp);
3985 		RW_EXIT(&ldcl->lockrw);
3986 		return (1);
3987 	}
3988 
3989 	n = vsw_vlan_frame_untag(port, VSW_VNETPORT, &mp, &mpt);
3990 
3991 	count -= n;
3992 	if (count == 0) {
3993 		goto vsw_portsend_exit;
3994 	}
3995 
3996 	status = ldcp->tx(ldcp, mp, mpt, count);
3997 
3998 vsw_portsend_exit:
3999 	RW_EXIT(&ldcl->lockrw);
4000 
4001 	return (status);
4002 }
4003 
4004 /*
4005  * Break up frames into 2 seperate chains: normal and
4006  * priority, based on the frame type. The number of
4007  * priority frames is also counted and returned.
4008  *
4009  * Params:
4010  * 	vswp:	pointer to the instance of vsw
4011  *	np:	head of packet chain to be broken
4012  *	npt:	tail of packet chain to be broken
4013  *
4014  * Returns:
4015  *	np:	head of normal data packets
4016  *	npt:	tail of normal data packets
4017  *	hp:	head of high priority packets
4018  *	hpt:	tail of high priority packets
4019  */
4020 static uint32_t
4021 vsw_get_pri_packets(vsw_t *vswp, mblk_t **np, mblk_t **npt,
4022 	mblk_t **hp, mblk_t **hpt)
4023 {
4024 	mblk_t			*tmp = NULL;
4025 	mblk_t			*smp = NULL;
4026 	mblk_t			*hmp = NULL;	/* high prio pkts head */
4027 	mblk_t			*hmpt = NULL;	/* high prio pkts tail */
4028 	mblk_t			*nmp = NULL;	/* normal pkts head */
4029 	mblk_t			*nmpt = NULL;	/* normal pkts tail */
4030 	uint32_t		count = 0;
4031 	int			i;
4032 	struct ether_header	*ehp;
4033 	uint32_t		num_types;
4034 	uint16_t		*types;
4035 
4036 	tmp = *np;
4037 	while (tmp != NULL) {
4038 
4039 		smp = tmp;
4040 		tmp = tmp->b_next;
4041 		smp->b_next = NULL;
4042 		smp->b_prev = NULL;
4043 
4044 		ehp = (struct ether_header *)smp->b_rptr;
4045 		num_types = vswp->pri_num_types;
4046 		types = vswp->pri_types;
4047 		for (i = 0; i < num_types; i++) {
4048 			if (ehp->ether_type == types[i]) {
4049 				/* high priority frame */
4050 
4051 				if (hmp != NULL) {
4052 					hmpt->b_next = smp;
4053 					hmpt = smp;
4054 				} else {
4055 					hmp = hmpt = smp;
4056 				}
4057 				count++;
4058 				break;
4059 			}
4060 		}
4061 		if (i == num_types) {
4062 			/* normal data frame */
4063 
4064 			if (nmp != NULL) {
4065 				nmpt->b_next = smp;
4066 				nmpt = smp;
4067 			} else {
4068 				nmp = nmpt = smp;
4069 			}
4070 		}
4071 	}
4072 
4073 	*hp = hmp;
4074 	*hpt = hmpt;
4075 	*np = nmp;
4076 	*npt = nmpt;
4077 
4078 	return (count);
4079 }
4080 
4081 /*
4082  * Wrapper function to transmit normal and/or priority frames over the channel.
4083  */
4084 static int
4085 vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count)
4086 {
4087 	vsw_ldc_t 		*ldcp = (vsw_ldc_t *)arg;
4088 	mblk_t			*tmp;
4089 	mblk_t			*smp;
4090 	mblk_t			*hmp;	/* high prio pkts head */
4091 	mblk_t			*hmpt;	/* high prio pkts tail */
4092 	mblk_t			*nmp;	/* normal pkts head */
4093 	mblk_t			*nmpt;	/* normal pkts tail */
4094 	uint32_t		n = 0;
4095 	vsw_t			*vswp = ldcp->ldc_vswp;
4096 
4097 	ASSERT(VSW_PRI_ETH_DEFINED(vswp));
4098 	ASSERT(count != 0);
4099 
4100 	nmp = mp;
4101 	nmpt = mpt;
4102 
4103 	/* gather any priority frames from the chain of packets */
4104 	n = vsw_get_pri_packets(vswp, &nmp, &nmpt, &hmp, &hmpt);
4105 
4106 	/* transmit priority frames */
4107 	tmp = hmp;
4108 	while (tmp != NULL) {
4109 		smp = tmp;
4110 		tmp = tmp->b_next;
4111 		smp->b_next = NULL;
4112 		vsw_ldcsend_pkt(ldcp, smp);
4113 	}
4114 
4115 	count -= n;
4116 
4117 	if (count == 0) {
4118 		/* no normal data frames to process */
4119 		return (0);
4120 	}
4121 
4122 	return (vsw_ldctx(ldcp, nmp, nmpt, count));
4123 }
4124 
4125 /*
4126  * Wrapper function to transmit normal frames over the channel.
4127  */
4128 static int
4129 vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count)
4130 {
4131 	vsw_ldc_t 	*ldcp = (vsw_ldc_t *)arg;
4132 	mblk_t		*tmp = NULL;
4133 
4134 	ASSERT(count != 0);
4135 	/*
4136 	 * If the TX thread is enabled, then queue the
4137 	 * ordinary frames and signal the tx thread.
4138 	 */
4139 	if (ldcp->tx_thread != NULL) {
4140 
4141 		mutex_enter(&ldcp->tx_thr_lock);
4142 
4143 		if ((ldcp->tx_cnt + count) >= vsw_max_tx_qcount) {
4144 			/*
4145 			 * If we reached queue limit,
4146 			 * do not queue new packets,
4147 			 * drop them.
4148 			 */
4149 			ldcp->ldc_stats.tx_qfull += count;
4150 			mutex_exit(&ldcp->tx_thr_lock);
4151 			freemsgchain(mp);
4152 			goto exit;
4153 		}
4154 		if (ldcp->tx_mhead == NULL) {
4155 			ldcp->tx_mhead = mp;
4156 			ldcp->tx_mtail = mpt;
4157 			cv_signal(&ldcp->tx_thr_cv);
4158 		} else {
4159 			ldcp->tx_mtail->b_next = mp;
4160 			ldcp->tx_mtail = mpt;
4161 		}
4162 		ldcp->tx_cnt += count;
4163 		mutex_exit(&ldcp->tx_thr_lock);
4164 	} else {
4165 		while (mp != NULL) {
4166 			tmp = mp->b_next;
4167 			mp->b_next = mp->b_prev = NULL;
4168 			(void) vsw_ldcsend(ldcp, mp, 1);
4169 			mp = tmp;
4170 		}
4171 	}
4172 
4173 exit:
4174 	return (0);
4175 }
4176 
4177 /*
4178  * This function transmits the frame in the payload of a raw data
4179  * (VIO_PKT_DATA) message. Thus, it provides an Out-Of-Band path to
4180  * send special frames with high priorities, without going through
4181  * the normal data path which uses descriptor ring mechanism.
4182  */
4183 static void
4184 vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp)
4185 {
4186 	vio_raw_data_msg_t	*pkt;
4187 	mblk_t			*bp;
4188 	mblk_t			*nmp = NULL;
4189 	caddr_t			dst;
4190 	uint32_t		mblksz;
4191 	uint32_t		size;
4192 	uint32_t		nbytes;
4193 	int			rv;
4194 	vsw_t			*vswp = ldcp->ldc_vswp;
4195 	vgen_stats_t		*statsp = &ldcp->ldc_stats;
4196 
4197 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
4198 	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
4199 		(void) atomic_inc_32(&statsp->tx_pri_fail);
4200 		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
4201 		    "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
4202 		    ldcp->lane_out.lstate);
4203 		goto send_pkt_exit;
4204 	}
4205 
4206 	size = msgsize(mp);
4207 
4208 	/* frame size bigger than available payload len of raw data msg ? */
4209 	if (size > (size_t)(ldcp->msglen - VIO_PKT_DATA_HDRSIZE)) {
4210 		(void) atomic_inc_32(&statsp->tx_pri_fail);
4211 		DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__,
4212 		    ldcp->ldc_id, size);
4213 		goto send_pkt_exit;
4214 	}
4215 
4216 	if (size < ETHERMIN)
4217 		size = ETHERMIN;
4218 
4219 	/* alloc space for a raw data message */
4220 	nmp = vio_allocb(vswp->pri_tx_vmp);
4221 	if (nmp == NULL) {
4222 		(void) atomic_inc_32(&statsp->tx_pri_fail);
4223 		DWARN(vswp, "vio_allocb failed\n");
4224 		goto send_pkt_exit;
4225 	}
4226 	pkt = (vio_raw_data_msg_t *)nmp->b_rptr;
4227 
4228 	/* copy frame into the payload of raw data message */
4229 	dst = (caddr_t)pkt->data;
4230 	for (bp = mp; bp != NULL; bp = bp->b_cont) {
4231 		mblksz = MBLKL(bp);
4232 		bcopy(bp->b_rptr, dst, mblksz);
4233 		dst += mblksz;
4234 	}
4235 
4236 	/* setup the raw data msg */
4237 	pkt->tag.vio_msgtype = VIO_TYPE_DATA;
4238 	pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
4239 	pkt->tag.vio_subtype_env = VIO_PKT_DATA;
4240 	pkt->tag.vio_sid = ldcp->local_session;
4241 	nbytes = VIO_PKT_DATA_HDRSIZE + size;
4242 
4243 	/* send the msg over ldc */
4244 	rv = vsw_send_msg(ldcp, (void *)pkt, nbytes, B_TRUE);
4245 	if (rv != 0) {
4246 		(void) atomic_inc_32(&statsp->tx_pri_fail);
4247 		DWARN(vswp, "%s(%lld) Error sending priority frame\n", __func__,
4248 		    ldcp->ldc_id);
4249 		goto send_pkt_exit;
4250 	}
4251 
4252 	/* update stats */
4253 	(void) atomic_inc_64(&statsp->tx_pri_packets);
4254 	(void) atomic_add_64(&statsp->tx_pri_packets, size);
4255 
4256 send_pkt_exit:
4257 	if (nmp != NULL)
4258 		freemsg(nmp);
4259 	freemsg(mp);
4260 }
4261 
4262 /*
4263  * Transmit the packet over the given LDC channel.
4264  *
4265  * The 'retries' argument indicates how many times a packet
4266  * is retried before it is dropped. Note, the retry is done
4267  * only for a resource related failure, for all other failures
4268  * the packet is dropped immediately.
4269  */
4270 static int
4271 vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries)
4272 {
4273 	int i;
4274 	int rc;
4275 	int status = 0;
4276 	vsw_port_t *port = ldcp->ldc_port;
4277 	dring_info_t *dp = NULL;
4278 
4279 
4280 	for (i = 0; i < retries; ) {
4281 		/*
4282 		 * Send the message out using the appropriate
4283 		 * transmit function which will free mblock when it
4284 		 * is finished with it.
4285 		 */
4286 		mutex_enter(&port->tx_lock);
4287 		if (port->transmit != NULL) {
4288 			status = (*port->transmit)(ldcp, mp);
4289 		}
4290 		if (status == LDC_TX_SUCCESS) {
4291 			mutex_exit(&port->tx_lock);
4292 			break;
4293 		}
4294 		i++;	/* increment the counter here */
4295 
4296 		/* If its the last retry, then update the oerror */
4297 		if ((i == retries) && (status == LDC_TX_NORESOURCES)) {
4298 			ldcp->ldc_stats.oerrors++;
4299 		}
4300 		mutex_exit(&port->tx_lock);
4301 
4302 		if (status != LDC_TX_NORESOURCES) {
4303 			/*
4304 			 * No retrying required for errors un-related
4305 			 * to resources.
4306 			 */
4307 			break;
4308 		}
4309 		READ_ENTER(&ldcp->lane_out.dlistrw);
4310 		if (((dp = ldcp->lane_out.dringp) != NULL) &&
4311 		    ((VSW_VER_GTEQ(ldcp, 1, 2) &&
4312 		    (ldcp->lane_out.xfer_mode & VIO_DRING_MODE_V1_2)) ||
4313 		    ((VSW_VER_LT(ldcp, 1, 2) &&
4314 		    (ldcp->lane_out.xfer_mode == VIO_DRING_MODE_V1_0))))) {
4315 			rc = vsw_reclaim_dring(dp, dp->end_idx);
4316 		} else {
4317 			/*
4318 			 * If there is no dring or the xfer_mode is
4319 			 * set to DESC_MODE(ie., OBP), then simply break here.
4320 			 */
4321 			RW_EXIT(&ldcp->lane_out.dlistrw);
4322 			break;
4323 		}
4324 		RW_EXIT(&ldcp->lane_out.dlistrw);
4325 
4326 		/*
4327 		 * Delay only if none were reclaimed
4328 		 * and its not the last retry.
4329 		 */
4330 		if ((rc == 0) && (i < retries)) {
4331 			delay(drv_usectohz(vsw_ldc_tx_delay));
4332 		}
4333 	}
4334 	freemsg(mp);
4335 	return (status);
4336 }
4337 
4338 /*
4339  * Send packet out via descriptor ring to a logical device.
4340  */
4341 static int
4342 vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp)
4343 {
4344 	vio_dring_msg_t		dring_pkt;
4345 	dring_info_t		*dp = NULL;
4346 	vsw_private_desc_t	*priv_desc = NULL;
4347 	vnet_public_desc_t	*pub = NULL;
4348 	vsw_t			*vswp = ldcp->ldc_vswp;
4349 	mblk_t			*bp;
4350 	size_t			n, size;
4351 	caddr_t			bufp;
4352 	int			idx;
4353 	int			status = LDC_TX_SUCCESS;
4354 	struct ether_header	*ehp = (struct ether_header *)mp->b_rptr;
4355 	lane_t			*lp = &ldcp->lane_out;
4356 
4357 	D1(vswp, "%s(%lld): enter\n", __func__, ldcp->ldc_id);
4358 
4359 	/* TODO: make test a macro */
4360 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
4361 	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
4362 		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
4363 		    "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
4364 		    ldcp->lane_out.lstate);
4365 		ldcp->ldc_stats.oerrors++;
4366 		return (LDC_TX_FAILURE);
4367 	}
4368 
4369 	/*
4370 	 * Note - using first ring only, this may change
4371 	 * in the future.
4372 	 */
4373 	READ_ENTER(&ldcp->lane_out.dlistrw);
4374 	if ((dp = ldcp->lane_out.dringp) == NULL) {
4375 		RW_EXIT(&ldcp->lane_out.dlistrw);
4376 		DERR(vswp, "%s(%lld): no dring for outbound lane on"
4377 		    " channel %d", __func__, ldcp->ldc_id, ldcp->ldc_id);
4378 		ldcp->ldc_stats.oerrors++;
4379 		return (LDC_TX_FAILURE);
4380 	}
4381 
4382 	size = msgsize(mp);
4383 	if (size > (size_t)lp->mtu) {
4384 		RW_EXIT(&ldcp->lane_out.dlistrw);
4385 		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
4386 		    ldcp->ldc_id, size);
4387 		ldcp->ldc_stats.oerrors++;
4388 		return (LDC_TX_FAILURE);
4389 	}
4390 
4391 	/*
4392 	 * Find a free descriptor
4393 	 *
4394 	 * Note: for the moment we are assuming that we will only
4395 	 * have one dring going from the switch to each of its
4396 	 * peers. This may change in the future.
4397 	 */
4398 	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
4399 		D2(vswp, "%s(%lld): no descriptor available for ring "
4400 		    "at 0x%llx", __func__, ldcp->ldc_id, dp);
4401 
4402 		/* nothing more we can do */
4403 		status = LDC_TX_NORESOURCES;
4404 		ldcp->ldc_stats.tx_no_desc++;
4405 		goto vsw_dringsend_free_exit;
4406 	} else {
4407 		D2(vswp, "%s(%lld): free private descriptor found at pos %ld "
4408 		    "addr 0x%llx\n", __func__, ldcp->ldc_id, idx, priv_desc);
4409 	}
4410 
4411 	/* copy data into the descriptor */
4412 	bufp = priv_desc->datap;
4413 	bufp += VNET_IPALIGN;
4414 	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
4415 		n = MBLKL(bp);
4416 		bcopy(bp->b_rptr, bufp, n);
4417 		bufp += n;
4418 	}
4419 
4420 	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
4421 
4422 	pub = priv_desc->descp;
4423 	pub->nbytes = priv_desc->datalen;
4424 
4425 	/* update statistics */
4426 	if (IS_BROADCAST(ehp))
4427 		ldcp->ldc_stats.brdcstxmt++;
4428 	else if (IS_MULTICAST(ehp))
4429 		ldcp->ldc_stats.multixmt++;
4430 	ldcp->ldc_stats.opackets++;
4431 	ldcp->ldc_stats.obytes += priv_desc->datalen;
4432 
4433 	mutex_enter(&priv_desc->dstate_lock);
4434 	pub->hdr.dstate = VIO_DESC_READY;
4435 	mutex_exit(&priv_desc->dstate_lock);
4436 
4437 	/*
4438 	 * Determine whether or not we need to send a message to our
4439 	 * peer prompting them to read our newly updated descriptor(s).
4440 	 */
4441 	mutex_enter(&dp->restart_lock);
4442 	if (dp->restart_reqd) {
4443 		dp->restart_reqd = B_FALSE;
4444 		ldcp->ldc_stats.dring_data_msgs++;
4445 		mutex_exit(&dp->restart_lock);
4446 
4447 		/*
4448 		 * Send a vio_dring_msg to peer to prompt them to read
4449 		 * the updated descriptor ring.
4450 		 */
4451 		dring_pkt.tag.vio_msgtype = VIO_TYPE_DATA;
4452 		dring_pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
4453 		dring_pkt.tag.vio_subtype_env = VIO_DRING_DATA;
4454 		dring_pkt.tag.vio_sid = ldcp->local_session;
4455 
4456 		/* Note - for now using first ring */
4457 		dring_pkt.dring_ident = dp->ident;
4458 
4459 		/*
4460 		 * If last_ack_recv is -1 then we know we've not
4461 		 * received any ack's yet, so this must be the first
4462 		 * msg sent, so set the start to the begining of the ring.
4463 		 */
4464 		mutex_enter(&dp->dlock);
4465 		if (dp->last_ack_recv == -1) {
4466 			dring_pkt.start_idx = 0;
4467 		} else {
4468 			dring_pkt.start_idx =
4469 			    (dp->last_ack_recv + 1) % dp->num_descriptors;
4470 		}
4471 		dring_pkt.end_idx = -1;
4472 		mutex_exit(&dp->dlock);
4473 
4474 		D3(vswp, "%s(%lld): dring 0x%llx : ident 0x%llx\n", __func__,
4475 		    ldcp->ldc_id, dp, dring_pkt.dring_ident);
4476 		D3(vswp, "%s(%lld): start %lld : end %lld :\n",
4477 		    __func__, ldcp->ldc_id, dring_pkt.start_idx,
4478 		    dring_pkt.end_idx);
4479 
4480 		RW_EXIT(&ldcp->lane_out.dlistrw);
4481 
4482 		(void) vsw_send_msg(ldcp, (void *)&dring_pkt,
4483 		    sizeof (vio_dring_msg_t), B_TRUE);
4484 
4485 		return (status);
4486 
4487 	} else {
4488 		mutex_exit(&dp->restart_lock);
4489 		D2(vswp, "%s(%lld): updating descp %d", __func__,
4490 		    ldcp->ldc_id, idx);
4491 	}
4492 
4493 vsw_dringsend_free_exit:
4494 
4495 	RW_EXIT(&ldcp->lane_out.dlistrw);
4496 
4497 	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
4498 	return (status);
4499 }
4500 
4501 /*
4502  * Send an in-band descriptor message over ldc.
4503  */
4504 static int
4505 vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp)
4506 {
4507 	vsw_t			*vswp = ldcp->ldc_vswp;
4508 	vnet_ibnd_desc_t	ibnd_msg;
4509 	vsw_private_desc_t	*priv_desc = NULL;
4510 	dring_info_t		*dp = NULL;
4511 	size_t			n, size = 0;
4512 	caddr_t			bufp;
4513 	mblk_t			*bp;
4514 	int			idx, i;
4515 	int			status = LDC_TX_SUCCESS;
4516 	static int		warn_msg = 1;
4517 	lane_t			*lp = &ldcp->lane_out;
4518 
4519 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4520 
4521 	ASSERT(mp != NULL);
4522 
4523 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
4524 	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
4525 		DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt",
4526 		    __func__, ldcp->ldc_id, ldcp->ldc_status,
4527 		    ldcp->lane_out.lstate);
4528 		ldcp->ldc_stats.oerrors++;
4529 		return (LDC_TX_FAILURE);
4530 	}
4531 
4532 	/*
4533 	 * only expect single dring to exist, which we use
4534 	 * as an internal buffer, rather than a transfer channel.
4535 	 */
4536 	READ_ENTER(&ldcp->lane_out.dlistrw);
4537 	if ((dp = ldcp->lane_out.dringp) == NULL) {
4538 		DERR(vswp, "%s(%lld): no dring for outbound lane",
4539 		    __func__, ldcp->ldc_id);
4540 		DERR(vswp, "%s(%lld) status(%d) state (0x%llx)", __func__,
4541 		    ldcp->ldc_id, ldcp->ldc_status, ldcp->lane_out.lstate);
4542 		RW_EXIT(&ldcp->lane_out.dlistrw);
4543 		ldcp->ldc_stats.oerrors++;
4544 		return (LDC_TX_FAILURE);
4545 	}
4546 
4547 	size = msgsize(mp);
4548 	if (size > (size_t)lp->mtu) {
4549 		RW_EXIT(&ldcp->lane_out.dlistrw);
4550 		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
4551 		    ldcp->ldc_id, size);
4552 		ldcp->ldc_stats.oerrors++;
4553 		return (LDC_TX_FAILURE);
4554 	}
4555 
4556 	/*
4557 	 * Find a free descriptor in our buffer ring
4558 	 */
4559 	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
4560 		RW_EXIT(&ldcp->lane_out.dlistrw);
4561 		if (warn_msg) {
4562 			DERR(vswp, "%s(%lld): no descriptor available for ring "
4563 			    "at 0x%llx", __func__, ldcp->ldc_id, dp);
4564 			warn_msg = 0;
4565 		}
4566 
4567 		/* nothing more we can do */
4568 		status = LDC_TX_NORESOURCES;
4569 		goto vsw_descrsend_free_exit;
4570 	} else {
4571 		D2(vswp, "%s(%lld): free private descriptor found at pos "
4572 		    "%ld addr 0x%x\n", __func__, ldcp->ldc_id, idx, priv_desc);
4573 		warn_msg = 1;
4574 	}
4575 
4576 	/* copy data into the descriptor */
4577 	bufp = priv_desc->datap;
4578 	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
4579 		n = MBLKL(bp);
4580 		bcopy(bp->b_rptr, bufp, n);
4581 		bufp += n;
4582 	}
4583 
4584 	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
4585 
4586 	/* create and send the in-band descp msg */
4587 	ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA;
4588 	ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO;
4589 	ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA;
4590 	ibnd_msg.hdr.tag.vio_sid = ldcp->local_session;
4591 
4592 	/*
4593 	 * Copy the mem cookies describing the data from the
4594 	 * private region of the descriptor ring into the inband
4595 	 * descriptor.
4596 	 */
4597 	for (i = 0; i < priv_desc->ncookies; i++) {
4598 		bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i],
4599 		    sizeof (ldc_mem_cookie_t));
4600 	}
4601 
4602 	ibnd_msg.hdr.desc_handle = idx;
4603 	ibnd_msg.ncookies = priv_desc->ncookies;
4604 	ibnd_msg.nbytes = size;
4605 
4606 	ldcp->ldc_stats.opackets++;
4607 	ldcp->ldc_stats.obytes += size;
4608 
4609 	RW_EXIT(&ldcp->lane_out.dlistrw);
4610 
4611 	(void) vsw_send_msg(ldcp, (void *)&ibnd_msg,
4612 	    sizeof (vnet_ibnd_desc_t), B_TRUE);
4613 
4614 vsw_descrsend_free_exit:
4615 
4616 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4617 	return (status);
4618 }
4619 
4620 static void
4621 vsw_send_ver(void *arg)
4622 {
4623 	vsw_ldc_t	*ldcp = (vsw_ldc_t *)arg;
4624 	vsw_t		*vswp = ldcp->ldc_vswp;
4625 	lane_t		*lp = &ldcp->lane_out;
4626 	vio_ver_msg_t	ver_msg;
4627 
4628 	D1(vswp, "%s enter", __func__);
4629 
4630 	ver_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4631 	ver_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4632 	ver_msg.tag.vio_subtype_env = VIO_VER_INFO;
4633 	ver_msg.tag.vio_sid = ldcp->local_session;
4634 
4635 	if (vsw_obp_ver_proto_workaround == B_FALSE) {
4636 		ver_msg.ver_major = vsw_versions[0].ver_major;
4637 		ver_msg.ver_minor = vsw_versions[0].ver_minor;
4638 	} else {
4639 		/* use the major,minor that we've ack'd */
4640 		lane_t	*lpi = &ldcp->lane_in;
4641 		ver_msg.ver_major = lpi->ver_major;
4642 		ver_msg.ver_minor = lpi->ver_minor;
4643 	}
4644 	ver_msg.dev_class = VDEV_NETWORK_SWITCH;
4645 
4646 	lp->lstate |= VSW_VER_INFO_SENT;
4647 	lp->ver_major = ver_msg.ver_major;
4648 	lp->ver_minor = ver_msg.ver_minor;
4649 
4650 	DUMP_TAG(ver_msg.tag);
4651 
4652 	(void) vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t), B_TRUE);
4653 
4654 	D1(vswp, "%s (%d): exit", __func__, ldcp->ldc_id);
4655 }
4656 
4657 static void
4658 vsw_send_attr(vsw_ldc_t *ldcp)
4659 {
4660 	vsw_t			*vswp = ldcp->ldc_vswp;
4661 	lane_t			*lp = &ldcp->lane_out;
4662 	vnet_attr_msg_t		attr_msg;
4663 
4664 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
4665 
4666 	/*
4667 	 * Subtype is set to INFO by default
4668 	 */
4669 	attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4670 	attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4671 	attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO;
4672 	attr_msg.tag.vio_sid = ldcp->local_session;
4673 
4674 	/* payload copied from default settings for lane */
4675 	attr_msg.mtu = lp->mtu;
4676 	attr_msg.addr_type = lp->addr_type;
4677 	attr_msg.xfer_mode = lp->xfer_mode;
4678 	attr_msg.ack_freq = lp->xfer_mode;
4679 
4680 	READ_ENTER(&vswp->if_lockrw);
4681 	attr_msg.addr = vnet_macaddr_strtoul((vswp->if_addr).ether_addr_octet);
4682 	RW_EXIT(&vswp->if_lockrw);
4683 
4684 	ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT;
4685 
4686 	DUMP_TAG(attr_msg.tag);
4687 
4688 	(void) vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t), B_TRUE);
4689 
4690 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
4691 }
4692 
4693 /*
4694  * Create dring info msg (which also results in the creation of
4695  * a dring).
4696  */
4697 static vio_dring_reg_msg_t *
4698 vsw_create_dring_info_pkt(vsw_ldc_t *ldcp)
4699 {
4700 	vio_dring_reg_msg_t	*mp;
4701 	dring_info_t		*dp;
4702 	vsw_t			*vswp = ldcp->ldc_vswp;
4703 
4704 	D1(vswp, "vsw_create_dring_info_pkt enter\n");
4705 
4706 	/*
4707 	 * If we can't create a dring, obviously no point sending
4708 	 * a message.
4709 	 */
4710 	if ((dp = vsw_create_dring(ldcp)) == NULL)
4711 		return (NULL);
4712 
4713 	mp = kmem_zalloc(sizeof (vio_dring_reg_msg_t), KM_SLEEP);
4714 
4715 	mp->tag.vio_msgtype = VIO_TYPE_CTRL;
4716 	mp->tag.vio_subtype = VIO_SUBTYPE_INFO;
4717 	mp->tag.vio_subtype_env = VIO_DRING_REG;
4718 	mp->tag.vio_sid = ldcp->local_session;
4719 
4720 	/* payload */
4721 	mp->num_descriptors = dp->num_descriptors;
4722 	mp->descriptor_size = dp->descriptor_size;
4723 	mp->options = dp->options;
4724 	mp->ncookies = dp->ncookies;
4725 	bcopy(&dp->cookie[0], &mp->cookie[0], sizeof (ldc_mem_cookie_t));
4726 
4727 	mp->dring_ident = 0;
4728 
4729 	D1(vswp, "vsw_create_dring_info_pkt exit\n");
4730 
4731 	return (mp);
4732 }
4733 
4734 static void
4735 vsw_send_dring_info(vsw_ldc_t *ldcp)
4736 {
4737 	vio_dring_reg_msg_t	*dring_msg;
4738 	vsw_t			*vswp = ldcp->ldc_vswp;
4739 
4740 	D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id);
4741 
4742 	dring_msg = vsw_create_dring_info_pkt(ldcp);
4743 	if (dring_msg == NULL) {
4744 		cmn_err(CE_WARN, "!vsw%d: %s: error creating msg",
4745 		    vswp->instance, __func__);
4746 		return;
4747 	}
4748 
4749 	ldcp->lane_out.lstate |= VSW_DRING_INFO_SENT;
4750 
4751 	DUMP_TAG_PTR((vio_msg_tag_t *)dring_msg);
4752 
4753 	(void) vsw_send_msg(ldcp, dring_msg,
4754 	    sizeof (vio_dring_reg_msg_t), B_TRUE);
4755 
4756 	kmem_free(dring_msg, sizeof (vio_dring_reg_msg_t));
4757 
4758 	D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id);
4759 }
4760 
4761 static void
4762 vsw_send_rdx(vsw_ldc_t *ldcp)
4763 {
4764 	vsw_t		*vswp = ldcp->ldc_vswp;
4765 	vio_rdx_msg_t	rdx_msg;
4766 
4767 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
4768 
4769 	rdx_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4770 	rdx_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4771 	rdx_msg.tag.vio_subtype_env = VIO_RDX;
4772 	rdx_msg.tag.vio_sid = ldcp->local_session;
4773 
4774 	ldcp->lane_in.lstate |= VSW_RDX_INFO_SENT;
4775 
4776 	DUMP_TAG(rdx_msg.tag);
4777 
4778 	(void) vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t), B_TRUE);
4779 
4780 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
4781 }
4782 
4783 /*
4784  * Generic routine to send message out over ldc channel.
4785  *
4786  * It is possible that when we attempt to write over the ldc channel
4787  * that we get notified that it has been reset. Depending on the value
4788  * of the handle_reset flag we either handle that event here or simply
4789  * notify the caller that the channel was reset.
4790  */
4791 int
4792 vsw_send_msg(vsw_ldc_t *ldcp, void *msgp, int size, boolean_t handle_reset)
4793 {
4794 	int			rv;
4795 	size_t			msglen = size;
4796 	vio_msg_tag_t		*tag = (vio_msg_tag_t *)msgp;
4797 	vsw_t			*vswp = ldcp->ldc_vswp;
4798 	vio_dring_msg_t		*dmsg;
4799 	vio_raw_data_msg_t	*rmsg;
4800 	vnet_ibnd_desc_t	*imsg;
4801 	boolean_t		data_msg = B_FALSE;
4802 
4803 	D1(vswp, "vsw_send_msg (%lld) enter : sending %d bytes",
4804 	    ldcp->ldc_id, size);
4805 
4806 	D2(vswp, "send_msg: type 0x%llx", tag->vio_msgtype);
4807 	D2(vswp, "send_msg: stype 0x%llx", tag->vio_subtype);
4808 	D2(vswp, "send_msg: senv 0x%llx", tag->vio_subtype_env);
4809 
4810 	mutex_enter(&ldcp->ldc_txlock);
4811 
4812 	if (tag->vio_subtype == VIO_SUBTYPE_INFO) {
4813 		if (tag->vio_subtype_env == VIO_DRING_DATA) {
4814 			dmsg = (vio_dring_msg_t *)tag;
4815 			dmsg->seq_num = ldcp->lane_out.seq_num;
4816 			data_msg = B_TRUE;
4817 		} else if (tag->vio_subtype_env == VIO_PKT_DATA) {
4818 			rmsg = (vio_raw_data_msg_t *)tag;
4819 			rmsg->seq_num = ldcp->lane_out.seq_num;
4820 			data_msg = B_TRUE;
4821 		} else if (tag->vio_subtype_env == VIO_DESC_DATA) {
4822 			imsg = (vnet_ibnd_desc_t *)tag;
4823 			imsg->hdr.seq_num = ldcp->lane_out.seq_num;
4824 			data_msg = B_TRUE;
4825 		}
4826 	}
4827 
4828 	do {
4829 		msglen = size;
4830 		rv = ldc_write(ldcp->ldc_handle, (caddr_t)msgp, &msglen);
4831 	} while (rv == EWOULDBLOCK && --vsw_wretries > 0);
4832 
4833 	if (rv == 0 && data_msg == B_TRUE) {
4834 		ldcp->lane_out.seq_num++;
4835 	}
4836 
4837 	if ((rv != 0) || (msglen != size)) {
4838 		DERR(vswp, "vsw_send_msg:ldc_write failed: chan(%lld) rv(%d) "
4839 		    "size (%d) msglen(%d)\n", ldcp->ldc_id, rv, size, msglen);
4840 		ldcp->ldc_stats.oerrors++;
4841 	}
4842 
4843 	mutex_exit(&ldcp->ldc_txlock);
4844 
4845 	/*
4846 	 * If channel has been reset we either handle it here or
4847 	 * simply report back that it has been reset and let caller
4848 	 * decide what to do.
4849 	 */
4850 	if (rv == ECONNRESET) {
4851 		DWARN(vswp, "%s (%lld) channel reset", __func__, ldcp->ldc_id);
4852 
4853 		/*
4854 		 * N.B - must never be holding the dlistrw lock when
4855 		 * we do a reset of the channel.
4856 		 */
4857 		if (handle_reset) {
4858 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
4859 		}
4860 	}
4861 
4862 	return (rv);
4863 }
4864 
4865 /*
4866  * Remove the specified address from the list of address maintained
4867  * in this port node.
4868  */
4869 mcst_addr_t *
4870 vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr)
4871 {
4872 	vsw_t		*vswp = NULL;
4873 	vsw_port_t	*port = NULL;
4874 	mcst_addr_t	*prev_p = NULL;
4875 	mcst_addr_t	*curr_p = NULL;
4876 
4877 	D1(NULL, "%s: enter : devtype %d : addr 0x%llx",
4878 	    __func__, devtype, addr);
4879 
4880 	if (devtype == VSW_VNETPORT) {
4881 		port = (vsw_port_t *)arg;
4882 		mutex_enter(&port->mca_lock);
4883 		prev_p = curr_p = port->mcap;
4884 	} else {
4885 		vswp = (vsw_t *)arg;
4886 		mutex_enter(&vswp->mca_lock);
4887 		prev_p = curr_p = vswp->mcap;
4888 	}
4889 
4890 	while (curr_p != NULL) {
4891 		if (curr_p->addr == addr) {
4892 			D2(NULL, "%s: address found", __func__);
4893 			/* match found */
4894 			if (prev_p == curr_p) {
4895 				/* list head */
4896 				if (devtype == VSW_VNETPORT)
4897 					port->mcap = curr_p->nextp;
4898 				else
4899 					vswp->mcap = curr_p->nextp;
4900 			} else {
4901 				prev_p->nextp = curr_p->nextp;
4902 			}
4903 			break;
4904 		} else {
4905 			prev_p = curr_p;
4906 			curr_p = curr_p->nextp;
4907 		}
4908 	}
4909 
4910 	if (devtype == VSW_VNETPORT)
4911 		mutex_exit(&port->mca_lock);
4912 	else
4913 		mutex_exit(&vswp->mca_lock);
4914 
4915 	D1(NULL, "%s: exit", __func__);
4916 
4917 	return (curr_p);
4918 }
4919 
/*
 * Creates a descriptor ring (dring) and links it into the
 * list of outbound drings for this channel.
 *
 * The ring has vsw_ntxds descriptors: a public section created with
 * ldc_mem_dring_create() and a private section allocated here and
 * initialised by vsw_setup_ring(). The ring is then bound to the
 * channel and appended to ldcp->lane_out.dringp under the dlistrw
 * write lock.
 *
 * Returns the new dring, or NULL if creation failed.
 *
 * NOTE(review): on the bind-failure path the ring has already been
 * set up by vsw_setup_ring(), which allocates dp->data_addr and binds
 * each descriptor's mem handle; the cleanup below neither frees
 * dp->data_addr nor unbinds those handles before freeing them. This
 * looks like a leak - confirm against the rest of vsw_setup_ring()
 * and the ring teardown code, which are not visible from here.
 */
static dring_info_t *
vsw_create_dring(vsw_ldc_t *ldcp)
{
	vsw_private_desc_t	*priv_addr = NULL;
	vsw_t			*vswp = ldcp->ldc_vswp;
	ldc_mem_info_t		minfo;
	dring_info_t		*dp, *tp;
	int			i;

	dp = (dring_info_t *)kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);

	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);

	/* create public section of ring */
	if ((ldc_mem_dring_create(vsw_ntxds,
	    VSW_PUB_SIZE, &dp->handle)) != 0) {

		DERR(vswp, "vsw_create_dring(%lld): ldc dring create "
		    "failed", ldcp->ldc_id);
		/* no dring exists yet, so skip the dring destroy step */
		goto create_fail_exit;
	}

	ASSERT(dp->handle != NULL);

	/*
	 * Get the base address of the public section of the ring.
	 */
	if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
		DERR(vswp, "vsw_create_dring(%lld): dring info failed\n",
		    ldcp->ldc_id);
		goto dring_fail_exit;
	} else {
		ASSERT(minfo.vaddr != 0);
		dp->pub_addr = minfo.vaddr;
	}

	dp->num_descriptors = vsw_ntxds;
	dp->descriptor_size = VSW_PUB_SIZE;
	dp->options = VIO_TX_DRING;
	dp->ncookies = 1;	/* guaranteed by ldc */

	/*
	 * create private portion of ring
	 */
	dp->priv_addr = (vsw_private_desc_t *)kmem_zalloc(
	    (sizeof (vsw_private_desc_t) * vsw_ntxds), KM_SLEEP);

	if (vsw_setup_ring(ldcp, dp)) {
		DERR(vswp, "%s: unable to setup ring", __func__);
		goto dring_fail_exit;
	}

	/* haven't used any descriptors yet */
	dp->end_idx = 0;
	dp->last_ack_recv = -1;	/* -1 means no ack has been received yet */

	/* bind dring to the channel */
	if ((ldc_mem_dring_bind(ldcp->ldc_handle, dp->handle,
	    LDC_DIRECT_MAP | LDC_SHADOW_MAP, LDC_MEM_RW,
	    &dp->cookie[0], &dp->ncookies)) != 0) {
		DERR(vswp, "vsw_create_dring: unable to bind to channel "
		    "%lld", ldcp->ldc_id);
		goto dring_fail_exit;
	}

	/*
	 * restart_reqd starts out true, so the first frame placed in
	 * the ring causes a dring data message to be sent to the peer.
	 */
	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
	dp->restart_reqd = B_TRUE;

	/*
	 * Only ever create rings for outgoing lane. Link it onto
	 * end of list.
	 */
	WRITE_ENTER(&ldcp->lane_out.dlistrw);
	if (ldcp->lane_out.dringp == NULL) {
		D2(vswp, "vsw_create_dring: adding first outbound ring");
		ldcp->lane_out.dringp = dp;
	} else {
		tp = ldcp->lane_out.dringp;
		while (tp->next != NULL)
			tp = tp->next;

		tp->next = dp;
	}
	RW_EXIT(&ldcp->lane_out.dlistrw);

	return (dp);

	/* failure after the public section of the ring was created */
dring_fail_exit:
	(void) ldc_mem_dring_destroy(dp->handle);

	/* common cleanup: private descriptors, dlock and the dring itself */
create_fail_exit:
	if (dp->priv_addr != NULL) {
		priv_addr = dp->priv_addr;
		for (i = 0; i < vsw_ntxds; i++) {
			/* only descriptors that got a mem handle need it freed */
			if (priv_addr->memhandle != NULL)
				(void) ldc_mem_free_handle(
				    priv_addr->memhandle);
			priv_addr++;
		}
		kmem_free(dp->priv_addr,
		    (sizeof (vsw_private_desc_t) * vsw_ntxds));
	}
	mutex_destroy(&dp->dlock);

	kmem_free(dp, sizeof (dring_info_t));
	return (NULL);
}
5033 
5034 /*
5035  * Create a ring consisting of just a private portion and link
5036  * it into the list of rings for the outbound lane.
5037  *
5038  * These type of rings are used primarily for temporary data
5039  * storage (i.e. as data buffers).
5040  */
5041 void
5042 vsw_create_privring(vsw_ldc_t *ldcp)
5043 {
5044 	dring_info_t		*dp, *tp;
5045 	vsw_t			*vswp = ldcp->ldc_vswp;
5046 
5047 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
5048 
5049 	dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
5050 
5051 	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
5052 
5053 	/* no public section */
5054 	dp->pub_addr = NULL;
5055 
5056 	dp->priv_addr = kmem_zalloc(
5057 	    (sizeof (vsw_private_desc_t) * vsw_ntxds), KM_SLEEP);
5058 
5059 	dp->num_descriptors = vsw_ntxds;
5060 
5061 	if (vsw_setup_ring(ldcp, dp)) {
5062 		DERR(vswp, "%s: setup of ring failed", __func__);
5063 		kmem_free(dp->priv_addr,
5064 		    (sizeof (vsw_private_desc_t) * vsw_ntxds));
5065 		mutex_destroy(&dp->dlock);
5066 		kmem_free(dp, sizeof (dring_info_t));
5067 		return;
5068 	}
5069 
5070 	/* haven't used any descriptors yet */
5071 	dp->end_idx = 0;
5072 
5073 	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
5074 	dp->restart_reqd = B_TRUE;
5075 
5076 	/*
5077 	 * Only ever create rings for outgoing lane. Link it onto
5078 	 * end of list.
5079 	 */
5080 	WRITE_ENTER(&ldcp->lane_out.dlistrw);
5081 	if (ldcp->lane_out.dringp == NULL) {
5082 		D2(vswp, "%s: adding first outbound privring", __func__);
5083 		ldcp->lane_out.dringp = dp;
5084 	} else {
5085 		tp = ldcp->lane_out.dringp;
5086 		while (tp->next != NULL)
5087 			tp = tp->next;
5088 
5089 		tp->next = dp;
5090 	}
5091 	RW_EXIT(&ldcp->lane_out.dlistrw);
5092 
5093 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
5094 }
5095 
5096 /*
5097  * Setup the descriptors in the dring. Returns 0 on success, 1 on
5098  * failure.
5099  */
5100 int
5101 vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp)
5102 {
5103 	vnet_public_desc_t	*pub_addr = NULL;
5104 	vsw_private_desc_t	*priv_addr = NULL;
5105 	vsw_t			*vswp = ldcp->ldc_vswp;
5106 	uint64_t		*tmpp;
5107 	uint64_t		offset = 0;
5108 	uint32_t		ncookies = 0;
5109 	static char		*name = "vsw_setup_ring";
5110 	int			i, j, nc, rv;
5111 	size_t			data_sz;
5112 
5113 	priv_addr = dp->priv_addr;
5114 	pub_addr = dp->pub_addr;
5115 
5116 	/* public section may be null but private should never be */
5117 	ASSERT(priv_addr != NULL);
5118 
5119 	/*
5120 	 * Allocate the region of memory which will be used to hold
5121 	 * the data the descriptors will refer to.
5122 	 */
5123 	data_sz = vswp->max_frame_size + VNET_IPALIGN + VNET_LDCALIGN;
5124 	data_sz = VNET_ROUNDUP_2K(data_sz);
5125 	dp->desc_data_sz = data_sz;
5126 	dp->data_sz = vsw_ntxds * data_sz;
5127 	dp->data_addr = kmem_alloc(dp->data_sz, KM_SLEEP);
5128 
5129 	D2(vswp, "%s: allocated %lld bytes at 0x%llx\n", name,
5130 	    dp->data_sz, dp->data_addr);
5131 
5132 	tmpp = (uint64_t *)dp->data_addr;
5133 	offset = dp->desc_data_sz/sizeof (tmpp);
5134 
5135 	/*
5136 	 * Initialise some of the private and public (if they exist)
5137 	 * descriptor fields.
5138 	 */
5139 	for (i = 0; i < vsw_ntxds; i++) {
5140 		mutex_init(&priv_addr->dstate_lock, NULL, MUTEX_DRIVER, NULL);
5141 
5142 		if ((ldc_mem_alloc_handle(ldcp->ldc_handle,
5143 		    &priv_addr->memhandle)) != 0) {
5144 			DERR(vswp, "%s: alloc mem handle failed", name);
5145 			goto setup_ring_cleanup;
5146 		}
5147 
5148 		priv_addr->datap = (void *)tmpp;
5149 
5150 		rv = ldc_mem_bind_handle(priv_addr->memhandle,
5151 		    (caddr_t)priv_addr->datap, dp->desc_data_sz,
5152 		    LDC_SHADOW_MAP, LDC_MEM_R|LDC_MEM_W,
5153 		    &(priv_addr->memcookie[0]), &ncookies);
5154 		if (rv != 0) {
5155 			DERR(vswp, "%s(%lld): ldc_mem_bind_handle failed "
5156 			    "(rv %d)", name, ldcp->ldc_id, rv);
5157 			goto setup_ring_cleanup;
5158 		}
5159 		priv_addr->bound = 1;
5160 
5161 		D2(vswp, "%s: %d: memcookie 0 : addr 0x%llx : size 0x%llx",
5162 		    name, i, priv_addr->memcookie[0].addr,
5163 		    priv_addr->memcookie[0].size);
5164 
5165 		if (ncookies >= (uint32_t)(VSW_MAX_COOKIES + 1)) {
5166 			DERR(vswp, "%s(%lld) ldc_mem_bind_handle returned "
5167 			    "invalid num of cookies (%d) for size 0x%llx",
5168 			    name, ldcp->ldc_id, ncookies, VSW_RING_EL_DATA_SZ);
5169 
5170 			goto setup_ring_cleanup;
5171 		} else {
5172 			for (j = 1; j < ncookies; j++) {
5173 				rv = ldc_mem_nextcookie(priv_addr->memhandle,
5174 				    &(priv_addr->memcookie[j]));
5175 				if (rv != 0) {
5176 					DERR(vswp, "%s: ldc_mem_nextcookie "
5177 					    "failed rv (%d)", name, rv);
5178 					goto setup_ring_cleanup;
5179 				}
5180 				D3(vswp, "%s: memcookie %d : addr 0x%llx : "
5181 				    "size 0x%llx", name, j,
5182 				    priv_addr->memcookie[j].addr,
5183 				    priv_addr->memcookie[j].size);
5184 			}
5185 
5186 		}
5187 		priv_addr->ncookies = ncookies;
5188 		priv_addr->dstate = VIO_DESC_FREE;
5189 
5190 		if (pub_addr != NULL) {
5191 
5192 			/* link pub and private sides */
5193 			priv_addr->descp = pub_addr;
5194 
5195 			pub_addr->ncookies = priv_addr->ncookies;
5196 
5197 			for (nc = 0; nc < pub_addr->ncookies; nc++) {
5198 				bcopy(&priv_addr->memcookie[nc],
5199 				    &pub_addr->memcookie[nc],
5200 				    sizeof (ldc_mem_cookie_t));
5201 			}
5202 
5203 			pub_addr->hdr.dstate = VIO_DESC_FREE;
5204 			pub_addr++;
5205 		}
5206 
5207 		/*
5208 		 * move to next element in the dring and the next
5209 		 * position in the data buffer.
5210 		 */
5211 		priv_addr++;
5212 		tmpp += offset;
5213 	}
5214 
5215 	return (0);
5216 
5217 setup_ring_cleanup:
5218 	priv_addr = dp->priv_addr;
5219 
5220 	for (j = 0; j < i; j++) {
5221 		(void) ldc_mem_unbind_handle(priv_addr->memhandle);
5222 		(void) ldc_mem_free_handle(priv_addr->memhandle);
5223 
5224 		mutex_destroy(&priv_addr->dstate_lock);
5225 
5226 		priv_addr++;
5227 	}
5228 	kmem_free(dp->data_addr, dp->data_sz);
5229 
5230 	return (1);
5231 }
5232 
5233 /*
5234  * Searches the private section of a ring for a free descriptor,
5235  * starting at the location of the last free descriptor found
5236  * previously.
5237  *
5238  * Returns 0 if free descriptor is available, and updates state
5239  * of private descriptor to VIO_DESC_READY,  otherwise returns 1.
5240  *
5241  * FUTURE: might need to return contiguous range of descriptors
5242  * as dring info msg assumes all will be contiguous.
5243  */
5244 static int
5245 vsw_dring_find_free_desc(dring_info_t *dringp,
5246 		vsw_private_desc_t **priv_p, int *idx)
5247 {
5248 	vsw_private_desc_t	*addr = NULL;
5249 	int			num = vsw_ntxds;
5250 	int			ret = 1;
5251 
5252 	D1(NULL, "%s enter\n", __func__);
5253 
5254 	ASSERT(dringp->priv_addr != NULL);
5255 
5256 	D2(NULL, "%s: searching ring, dringp 0x%llx : start pos %lld",
5257 	    __func__, dringp, dringp->end_idx);
5258 
5259 	addr = (vsw_private_desc_t *)dringp->priv_addr + dringp->end_idx;
5260 
5261 	mutex_enter(&addr->dstate_lock);
5262 	if (addr->dstate == VIO_DESC_FREE) {
5263 		addr->dstate = VIO_DESC_READY;
5264 		*priv_p = addr;
5265 		*idx = dringp->end_idx;
5266 		dringp->end_idx = (dringp->end_idx + 1) % num;
5267 		ret = 0;
5268 
5269 	}
5270 	mutex_exit(&addr->dstate_lock);
5271 
5272 	/* ring full */
5273 	if (ret == 1) {
5274 		D2(NULL, "%s: no desp free: started at %d", __func__,
5275 		    dringp->end_idx);
5276 	}
5277 
5278 	D1(NULL, "%s: exit\n", __func__);
5279 
5280 	return (ret);
5281 }
5282 
5283 /*
5284  * Map from a dring identifier to the ring itself. Returns
5285  * pointer to ring or NULL if no match found.
5286  *
5287  * Should be called with dlistrw rwlock held as reader.
5288  */
5289 static dring_info_t *
5290 vsw_ident2dring(lane_t *lane, uint64_t ident)
5291 {
5292 	dring_info_t	*dp = NULL;
5293 
5294 	if ((dp = lane->dringp) == NULL) {
5295 		return (NULL);
5296 	} else {
5297 		if (dp->ident == ident)
5298 			return (dp);
5299 
5300 		while (dp != NULL) {
5301 			if (dp->ident == ident)
5302 				break;
5303 			dp = dp->next;
5304 		}
5305 	}
5306 
5307 	return (dp);
5308 }
5309 
5310 /*
5311  * Set the default lane attributes. These are copied into
5312  * the attr msg we send to our peer. If they are not acceptable
5313  * then (currently) the handshake ends.
5314  */
5315 static void
5316 vsw_set_lane_attr(vsw_t *vswp, lane_t *lp)
5317 {
5318 	bzero(lp, sizeof (lane_t));
5319 
5320 	READ_ENTER(&vswp->if_lockrw);
5321 	ether_copy(&(vswp->if_addr), &(lp->addr));
5322 	RW_EXIT(&vswp->if_lockrw);
5323 
5324 	lp->mtu = vswp->max_frame_size;
5325 	lp->addr_type = ADDR_TYPE_MAC;
5326 	lp->xfer_mode = VIO_DRING_MODE_V1_0;
5327 	lp->ack_freq = 0;	/* for shared mode */
5328 	lp->seq_num = VNET_ISS;
5329 }
5330 
5331 /*
5332  * Verify that the attributes are acceptable.
5333  *
5334  * FUTURE: If some attributes are not acceptable, change them
5335  * our desired values.
5336  */
5337 static int
5338 vsw_check_attr(vnet_attr_msg_t *pkt, vsw_ldc_t *ldcp)
5339 {
5340 	int			ret = 0;
5341 	struct ether_addr	ea;
5342 	vsw_port_t		*port = ldcp->ldc_port;
5343 	lane_t			*lp = &ldcp->lane_out;
5344 
5345 	D1(NULL, "vsw_check_attr enter\n");
5346 
5347 	if ((pkt->xfer_mode != VIO_DESC_MODE) &&
5348 	    (pkt->xfer_mode != lp->xfer_mode)) {
5349 		D2(NULL, "vsw_check_attr: unknown mode %x\n", pkt->xfer_mode);
5350 		ret = 1;
5351 	}
5352 
5353 	/* Only support MAC addresses at moment. */
5354 	if ((pkt->addr_type != ADDR_TYPE_MAC) || (pkt->addr == 0)) {
5355 		D2(NULL, "vsw_check_attr: invalid addr_type %x, "
5356 		    "or address 0x%llx\n", pkt->addr_type, pkt->addr);
5357 		ret = 1;
5358 	}
5359 
5360 	/*
5361 	 * MAC address supplied by device should match that stored
5362 	 * in the vsw-port OBP node. Need to decide what to do if they
5363 	 * don't match, for the moment just warn but don't fail.
5364 	 */
5365 	vnet_macaddr_ultostr(pkt->addr, ea.ether_addr_octet);
5366 	if (ether_cmp(&ea, &port->p_macaddr) != 0) {
5367 		DERR(NULL, "vsw_check_attr: device supplied address "
5368 		    "0x%llx doesn't match node address 0x%llx\n",
5369 		    pkt->addr, port->p_macaddr);
5370 	}
5371 
5372 	/*
5373 	 * Ack freq only makes sense in pkt mode, in shared
5374 	 * mode the ring descriptors say whether or not to
5375 	 * send back an ACK.
5376 	 */
5377 	if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
5378 	    (pkt->xfer_mode & VIO_DRING_MODE_V1_2)) ||
5379 	    (VSW_VER_LT(ldcp, 1, 2) &&
5380 	    (pkt->xfer_mode == VIO_DRING_MODE_V1_0))) {
5381 		if (pkt->ack_freq > 0) {
5382 			D2(NULL, "vsw_check_attr: non zero ack freq "
5383 			    " in SHM mode\n");
5384 			ret = 1;
5385 		}
5386 	}
5387 
5388 	/*
5389 	 * Note: for the moment we only support ETHER
5390 	 * frames. This may change in the future.
5391 	 */
5392 	if ((pkt->mtu > lp->mtu) || (pkt->mtu <= 0)) {
5393 		D2(NULL, "vsw_check_attr: invalid MTU (0x%llx)\n",
5394 		    pkt->mtu);
5395 		ret = 1;
5396 	}
5397 
5398 	D1(NULL, "vsw_check_attr exit\n");
5399 
5400 	return (ret);
5401 }
5402 
5403 /*
5404  * Returns 1 if there is a problem, 0 otherwise.
5405  */
5406 static int
5407 vsw_check_dring_info(vio_dring_reg_msg_t *pkt)
5408 {
5409 	_NOTE(ARGUNUSED(pkt))
5410 
5411 	int	ret = 0;
5412 
5413 	D1(NULL, "vsw_check_dring_info enter\n");
5414 
5415 	if ((pkt->num_descriptors == 0) ||
5416 	    (pkt->descriptor_size == 0) ||
5417 	    (pkt->ncookies != 1)) {
5418 		DERR(NULL, "vsw_check_dring_info: invalid dring msg");
5419 		ret = 1;
5420 	}
5421 
5422 	D1(NULL, "vsw_check_dring_info exit\n");
5423 
5424 	return (ret);
5425 }
5426 
5427 /*
5428  * Returns 1 if two memory cookies match. Otherwise returns 0.
5429  */
5430 static int
5431 vsw_mem_cookie_match(ldc_mem_cookie_t *m1, ldc_mem_cookie_t *m2)
5432 {
5433 	if ((m1->addr != m2->addr) ||
5434 	    (m2->size != m2->size)) {
5435 		return (0);
5436 	} else {
5437 		return (1);
5438 	}
5439 }
5440 
5441 /*
5442  * Returns 1 if ring described in reg message matches that
5443  * described by dring_info structure. Otherwise returns 0.
5444  */
5445 static int
5446 vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg)
5447 {
5448 	if ((msg->descriptor_size != dp->descriptor_size) ||
5449 	    (msg->num_descriptors != dp->num_descriptors) ||
5450 	    (msg->ncookies != dp->ncookies) ||
5451 	    !(vsw_mem_cookie_match(&msg->cookie[0], &dp->cookie[0]))) {
5452 		return (0);
5453 	} else {
5454 		return (1);
5455 	}
5456 
5457 }
5458 
5459 static caddr_t
5460 vsw_print_ethaddr(uint8_t *a, char *ebuf)
5461 {
5462 	(void) sprintf(ebuf, "%x:%x:%x:%x:%x:%x",
5463 	    a[0], a[1], a[2], a[3], a[4], a[5]);
5464 	return (ebuf);
5465 }
5466 
5467 /*
5468  * Reset and free all the resources associated with
5469  * the channel.
5470  */
5471 static void
5472 vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir)
5473 {
5474 	dring_info_t		*dp, *dpp;
5475 	lane_t			*lp = NULL;
5476 	int			rv = 0;
5477 
5478 	ASSERT(ldcp != NULL);
5479 
5480 	D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id);
5481 
5482 	if (dir == INBOUND) {
5483 		D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane"
5484 		    " of channel %lld", __func__, ldcp->ldc_id);
5485 		lp = &ldcp->lane_in;
5486 	} else {
5487 		D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane"
5488 		    " of channel %lld", __func__, ldcp->ldc_id);
5489 		lp = &ldcp->lane_out;
5490 	}
5491 
5492 	lp->lstate = VSW_LANE_INACTIV;
5493 	lp->seq_num = VNET_ISS;
5494 
5495 	if (lp->dringp) {
5496 		if (dir == INBOUND) {
5497 			WRITE_ENTER(&lp->dlistrw);
5498 			dp = lp->dringp;
5499 			while (dp != NULL) {
5500 				dpp = dp->next;
5501 				if (dp->handle != NULL)
5502 					(void) ldc_mem_dring_unmap(dp->handle);
5503 				kmem_free(dp, sizeof (dring_info_t));
5504 				dp = dpp;
5505 			}
5506 			RW_EXIT(&lp->dlistrw);
5507 		} else {
5508 			/*
5509 			 * unbind, destroy exported dring, free dring struct
5510 			 */
5511 			WRITE_ENTER(&lp->dlistrw);
5512 			dp = lp->dringp;
5513 			rv = vsw_free_ring(dp);
5514 			RW_EXIT(&lp->dlistrw);
5515 		}
5516 		if (rv == 0) {
5517 			lp->dringp = NULL;
5518 		}
5519 	}
5520 
5521 	D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id);
5522 }
5523 
5524 /*
5525  * Free ring and all associated resources.
5526  *
5527  * Should be called with dlistrw rwlock held as writer.
5528  */
5529 static int
5530 vsw_free_ring(dring_info_t *dp)
5531 {
5532 	vsw_private_desc_t	*paddr = NULL;
5533 	dring_info_t		*dpp;
5534 	int			i, rv = 1;
5535 
5536 	while (dp != NULL) {
5537 		mutex_enter(&dp->dlock);
5538 		dpp = dp->next;
5539 		if (dp->priv_addr != NULL) {
5540 			/*
5541 			 * First unbind and free the memory handles
5542 			 * stored in each descriptor within the ring.
5543 			 */
5544 			for (i = 0; i < vsw_ntxds; i++) {
5545 				paddr = (vsw_private_desc_t *)
5546 				    dp->priv_addr + i;
5547 				if (paddr->memhandle != NULL) {
5548 					if (paddr->bound == 1) {
5549 						rv = ldc_mem_unbind_handle(
5550 						    paddr->memhandle);
5551 
5552 						if (rv != 0) {
5553 							DERR(NULL, "error "
5554 							"unbinding handle for "
5555 							"ring 0x%llx at pos %d",
5556 							    dp, i);
5557 							mutex_exit(&dp->dlock);
5558 							return (rv);
5559 						}
5560 						paddr->bound = 0;
5561 					}
5562 
5563 					rv = ldc_mem_free_handle(
5564 					    paddr->memhandle);
5565 					if (rv != 0) {
5566 						DERR(NULL, "error freeing "
5567 						    "handle for ring 0x%llx "
5568 						    "at pos %d", dp, i);
5569 						mutex_exit(&dp->dlock);
5570 						return (rv);
5571 					}
5572 					paddr->memhandle = NULL;
5573 				}
5574 				mutex_destroy(&paddr->dstate_lock);
5575 			}
5576 			kmem_free(dp->priv_addr,
5577 			    (sizeof (vsw_private_desc_t) * vsw_ntxds));
5578 		}
5579 
5580 		/*
5581 		 * Now unbind and destroy the ring itself.
5582 		 */
5583 		if (dp->handle != NULL) {
5584 			(void) ldc_mem_dring_unbind(dp->handle);
5585 			(void) ldc_mem_dring_destroy(dp->handle);
5586 		}
5587 
5588 		if (dp->data_addr != NULL) {
5589 			kmem_free(dp->data_addr, dp->data_sz);
5590 		}
5591 
5592 		mutex_exit(&dp->dlock);
5593 		mutex_destroy(&dp->dlock);
5594 		mutex_destroy(&dp->restart_lock);
5595 		kmem_free(dp, sizeof (dring_info_t));
5596 
5597 		dp = dpp;
5598 	}
5599 	return (0);
5600 }
5601 
5602 /*
5603  * vsw_ldc_rx_worker -- A per LDC worker thread to receive data.
5604  * This thread is woken up by the LDC interrupt handler to process
5605  * LDC packets and receive data.
5606  */
5607 static void
5608 vsw_ldc_rx_worker(void *arg)
5609 {
5610 	callb_cpr_t	cprinfo;
5611 	vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
5612 	vsw_t *vswp = ldcp->ldc_vswp;
5613 
5614 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
5615 	CALLB_CPR_INIT(&cprinfo, &ldcp->rx_thr_lock, callb_generic_cpr,
5616 	    "vsw_rx_thread");
5617 	mutex_enter(&ldcp->rx_thr_lock);
5618 	ldcp->rx_thr_flags |= VSW_WTHR_RUNNING;
5619 	while (!(ldcp->rx_thr_flags & VSW_WTHR_STOP)) {
5620 
5621 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
5622 		/*
5623 		 * Wait until the data is received or a stop
5624 		 * request is received.
5625 		 */
5626 		while (!(ldcp->rx_thr_flags &
5627 		    (VSW_WTHR_DATARCVD | VSW_WTHR_STOP))) {
5628 			cv_wait(&ldcp->rx_thr_cv, &ldcp->rx_thr_lock);
5629 		}
5630 		CALLB_CPR_SAFE_END(&cprinfo, &ldcp->rx_thr_lock)
5631 
5632 		/*
5633 		 * First process the stop request.
5634 		 */
5635 		if (ldcp->rx_thr_flags & VSW_WTHR_STOP) {
5636 			D2(vswp, "%s(%lld):Rx thread stopped\n",
5637 			    __func__, ldcp->ldc_id);
5638 			break;
5639 		}
5640 		ldcp->rx_thr_flags &= ~VSW_WTHR_DATARCVD;
5641 		mutex_exit(&ldcp->rx_thr_lock);
5642 		D1(vswp, "%s(%lld):calling vsw_process_pkt\n",
5643 		    __func__, ldcp->ldc_id);
5644 		mutex_enter(&ldcp->ldc_cblock);
5645 		vsw_process_pkt(ldcp);
5646 		mutex_exit(&ldcp->ldc_cblock);
5647 		mutex_enter(&ldcp->rx_thr_lock);
5648 	}
5649 
5650 	/*
5651 	 * Update the run status and wakeup the thread that
5652 	 * has sent the stop request.
5653 	 */
5654 	ldcp->rx_thr_flags &= ~VSW_WTHR_RUNNING;
5655 	cv_signal(&ldcp->rx_thr_cv);
5656 	CALLB_CPR_EXIT(&cprinfo);
5657 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
5658 	thread_exit();
5659 }
5660 
5661 /* vsw_stop_rx_thread -- Co-ordinate with receive thread to stop it */
5662 static void
5663 vsw_stop_rx_thread(vsw_ldc_t *ldcp)
5664 {
5665 	vsw_t *vswp = ldcp->ldc_vswp;
5666 
5667 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
5668 	/*
5669 	 * Send a stop request by setting the stop flag and
5670 	 * wait until the receive thread stops.
5671 	 */
5672 	mutex_enter(&ldcp->rx_thr_lock);
5673 	if (ldcp->rx_thr_flags & VSW_WTHR_RUNNING) {
5674 		ldcp->rx_thr_flags |= VSW_WTHR_STOP;
5675 		cv_signal(&ldcp->rx_thr_cv);
5676 		while (ldcp->rx_thr_flags & VSW_WTHR_RUNNING) {
5677 			cv_wait(&ldcp->rx_thr_cv, &ldcp->rx_thr_lock);
5678 		}
5679 	}
5680 	mutex_exit(&ldcp->rx_thr_lock);
5681 	ldcp->rx_thread = NULL;
5682 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
5683 }
5684 
5685 /*
5686  * vsw_ldc_tx_worker -- A per LDC worker thread to transmit data.
5687  * This thread is woken up by the vsw_portsend to transmit
5688  * packets.
5689  */
5690 static void
5691 vsw_ldc_tx_worker(void *arg)
5692 {
5693 	callb_cpr_t	cprinfo;
5694 	vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
5695 	vsw_t *vswp = ldcp->ldc_vswp;
5696 	mblk_t *mp;
5697 	mblk_t *tmp;
5698 
5699 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
5700 	CALLB_CPR_INIT(&cprinfo, &ldcp->tx_thr_lock, callb_generic_cpr,
5701 	    "vnet_tx_thread");
5702 	mutex_enter(&ldcp->tx_thr_lock);
5703 	ldcp->tx_thr_flags |= VSW_WTHR_RUNNING;
5704 	while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP)) {
5705 
5706 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
5707 		/*
5708 		 * Wait until the data is received or a stop
5709 		 * request is received.
5710 		 */
5711 		while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP) &&
5712 		    (ldcp->tx_mhead == NULL)) {
5713 			cv_wait(&ldcp->tx_thr_cv, &ldcp->tx_thr_lock);
5714 		}
5715 		CALLB_CPR_SAFE_END(&cprinfo, &ldcp->tx_thr_lock)
5716 
5717 		/*
5718 		 * First process the stop request.
5719 		 */
5720 		if (ldcp->tx_thr_flags & VSW_WTHR_STOP) {
5721 			D2(vswp, "%s(%lld):tx thread stopped\n",
5722 			    __func__, ldcp->ldc_id);
5723 			break;
5724 		}
5725 		mp = ldcp->tx_mhead;
5726 		ldcp->tx_mhead = ldcp->tx_mtail = NULL;
5727 		ldcp->tx_cnt = 0;
5728 		mutex_exit(&ldcp->tx_thr_lock);
5729 		D2(vswp, "%s(%lld):calling vsw_ldcsend\n",
5730 		    __func__, ldcp->ldc_id);
5731 		while (mp != NULL) {
5732 			tmp = mp->b_next;
5733 			mp->b_next = mp->b_prev = NULL;
5734 			(void) vsw_ldcsend(ldcp, mp, vsw_ldc_tx_retries);
5735 			mp = tmp;
5736 		}
5737 		mutex_enter(&ldcp->tx_thr_lock);
5738 	}
5739 
5740 	/*
5741 	 * Update the run status and wakeup the thread that
5742 	 * has sent the stop request.
5743 	 */
5744 	ldcp->tx_thr_flags &= ~VSW_WTHR_RUNNING;
5745 	cv_signal(&ldcp->tx_thr_cv);
5746 	CALLB_CPR_EXIT(&cprinfo);
5747 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
5748 	thread_exit();
5749 }
5750 
5751 /* vsw_stop_tx_thread -- Co-ordinate with receive thread to stop it */
5752 static void
5753 vsw_stop_tx_thread(vsw_ldc_t *ldcp)
5754 {
5755 	vsw_t *vswp = ldcp->ldc_vswp;
5756 
5757 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
5758 	/*
5759 	 * Send a stop request by setting the stop flag and
5760 	 * wait until the receive thread stops.
5761 	 */
5762 	mutex_enter(&ldcp->tx_thr_lock);
5763 	if (ldcp->tx_thr_flags & VSW_WTHR_RUNNING) {
5764 		ldcp->tx_thr_flags |= VSW_WTHR_STOP;
5765 		cv_signal(&ldcp->tx_thr_cv);
5766 		while (ldcp->tx_thr_flags & VSW_WTHR_RUNNING) {
5767 			cv_wait(&ldcp->tx_thr_cv, &ldcp->tx_thr_lock);
5768 		}
5769 	}
5770 	mutex_exit(&ldcp->tx_thr_lock);
5771 	ldcp->tx_thread = NULL;
5772 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
5773 }
5774 
5775 /* vsw_reclaim_dring -- reclaim descriptors */
5776 static int
5777 vsw_reclaim_dring(dring_info_t *dp, int start)
5778 {
5779 	int i, j, len;
5780 	vsw_private_desc_t *priv_addr;
5781 	vnet_public_desc_t *pub_addr;
5782 
5783 	pub_addr = (vnet_public_desc_t *)dp->pub_addr;
5784 	priv_addr = (vsw_private_desc_t *)dp->priv_addr;
5785 	len = dp->num_descriptors;
5786 
5787 	D2(NULL, "%s: start index %ld\n", __func__, start);
5788 
5789 	j = 0;
5790 	for (i = start; j < len; i = (i + 1) % len, j++) {
5791 		pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
5792 		priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
5793 
5794 		mutex_enter(&priv_addr->dstate_lock);
5795 		if (pub_addr->hdr.dstate != VIO_DESC_DONE) {
5796 			mutex_exit(&priv_addr->dstate_lock);
5797 			break;
5798 		}
5799 		pub_addr->hdr.dstate = VIO_DESC_FREE;
5800 		priv_addr->dstate = VIO_DESC_FREE;
5801 		/* clear all the fields */
5802 		priv_addr->datalen = 0;
5803 		pub_addr->hdr.ack = 0;
5804 		mutex_exit(&priv_addr->dstate_lock);
5805 
5806 		D3(NULL, "claiming descp:%d pub state:0x%llx priv state 0x%llx",
5807 		    i, pub_addr->hdr.dstate, priv_addr->dstate);
5808 	}
5809 	return (j);
5810 }
5811 
5812 /*
5813  * Debugging routines
5814  */
5815 static void
5816 display_state(void)
5817 {
5818 	vsw_t		*vswp;
5819 	vsw_port_list_t	*plist;
5820 	vsw_port_t 	*port;
5821 	vsw_ldc_list_t	*ldcl;
5822 	vsw_ldc_t 	*ldcp;
5823 	extern vsw_t 	*vsw_head;
5824 
5825 	cmn_err(CE_NOTE, "***** system state *****");
5826 
5827 	for (vswp = vsw_head; vswp; vswp = vswp->next) {
5828 		plist = &vswp->plist;
5829 		READ_ENTER(&plist->lockrw);
5830 		cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n",
5831 		    vswp->instance, plist->num_ports);
5832 
5833 		for (port = plist->head; port != NULL; port = port->p_next) {
5834 			ldcl = &port->p_ldclist;
5835 			cmn_err(CE_CONT, "port %d : %d ldcs attached\n",
5836 			    port->p_instance, port->num_ldcs);
5837 			READ_ENTER(&ldcl->lockrw);
5838 			ldcp = ldcl->head;
5839 			for (; ldcp != NULL; ldcp = ldcp->ldc_next) {
5840 				cmn_err(CE_CONT, "chan %lu : dev %d : "
5841 				    "status %d : phase %u\n",
5842 				    ldcp->ldc_id, ldcp->dev_class,
5843 				    ldcp->ldc_status, ldcp->hphase);
5844 				cmn_err(CE_CONT, "chan %lu : lsession %lu : "
5845 				    "psession %lu\n", ldcp->ldc_id,
5846 				    ldcp->local_session, ldcp->peer_session);
5847 
5848 				cmn_err(CE_CONT, "Inbound lane:\n");
5849 				display_lane(&ldcp->lane_in);
5850 				cmn_err(CE_CONT, "Outbound lane:\n");
5851 				display_lane(&ldcp->lane_out);
5852 			}
5853 			RW_EXIT(&ldcl->lockrw);
5854 		}
5855 		RW_EXIT(&plist->lockrw);
5856 	}
5857 	cmn_err(CE_NOTE, "***** system state *****");
5858 }
5859 
5860 static void
5861 display_lane(lane_t *lp)
5862 {
5863 	dring_info_t	*drp;
5864 
5865 	cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n",
5866 	    lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu);
5867 	cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n",
5868 	    lp->addr_type, lp->addr, lp->xfer_mode);
5869 	cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp);
5870 
5871 	cmn_err(CE_CONT, "Dring info:\n");
5872 	for (drp = lp->dringp; drp != NULL; drp = drp->next) {
5873 		cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n",
5874 		    drp->num_descriptors, drp->descriptor_size);
5875 		cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->handle);
5876 		cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n",
5877 		    (uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr);
5878 		cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n",
5879 		    drp->ident, drp->end_idx);
5880 		display_ring(drp);
5881 	}
5882 }
5883 
5884 static void
5885 display_ring(dring_info_t *dringp)
5886 {
5887 	uint64_t		i;
5888 	uint64_t		priv_count = 0;
5889 	uint64_t		pub_count = 0;
5890 	vnet_public_desc_t	*pub_addr = NULL;
5891 	vsw_private_desc_t	*priv_addr = NULL;
5892 
5893 	for (i = 0; i < vsw_ntxds; i++) {
5894 		if (dringp->pub_addr != NULL) {
5895 			pub_addr = (vnet_public_desc_t *)dringp->pub_addr + i;
5896 
5897 			if (pub_addr->hdr.dstate == VIO_DESC_FREE)
5898 				pub_count++;
5899 		}
5900 
5901 		if (dringp->priv_addr != NULL) {
5902 			priv_addr = (vsw_private_desc_t *)dringp->priv_addr + i;
5903 
5904 			if (priv_addr->dstate == VIO_DESC_FREE)
5905 				priv_count++;
5906 		}
5907 	}
5908 	cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n",
5909 	    i, priv_count, pub_count);
5910 }
5911 
5912 static void
5913 dump_flags(uint64_t state)
5914 {
5915 	int	i;
5916 
5917 	typedef struct flag_name {
5918 		int	flag_val;
5919 		char	*flag_name;
5920 	} flag_name_t;
5921 
5922 	flag_name_t	flags[] = {
5923 		VSW_VER_INFO_SENT, "VSW_VER_INFO_SENT",
5924 		VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV",
5925 		VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV",
5926 		VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT",
5927 		VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV",
5928 		VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT",
5929 		VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT",
5930 		VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV",
5931 		VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT",
5932 		VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV",
5933 		VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT",
5934 		VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV",
5935 		VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT",
5936 		VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV",
5937 		VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT",
5938 		VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV",
5939 		VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT",
5940 		VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV",
5941 		VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT",
5942 		VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV",
5943 		VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT",
5944 		VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV",
5945 		VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT",
5946 		VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV",
5947 		VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT",
5948 		VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV",
5949 		VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT",
5950 		VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV",
5951 		VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT",
5952 		VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV",
5953 		VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE"};
5954 
5955 	DERR(NULL, "DUMP_FLAGS: %llx\n", state);
5956 	for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) {
5957 		if (state & flags[i].flag_val)
5958 			DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name);
5959 	}
5960 }
5961