xref: /titanic_44/usr/src/uts/sun4v/io/vsw_ldc.c (revision a38ddfee9c8c6b6c5a2947ff52fd2338362a4444)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/errno.h>
29 #include <sys/debug.h>
30 #include <sys/time.h>
31 #include <sys/sysmacros.h>
32 #include <sys/systm.h>
33 #include <sys/user.h>
34 #include <sys/stropts.h>
35 #include <sys/stream.h>
36 #include <sys/strlog.h>
37 #include <sys/strsubr.h>
38 #include <sys/cmn_err.h>
39 #include <sys/cpu.h>
40 #include <sys/kmem.h>
41 #include <sys/conf.h>
42 #include <sys/ddi.h>
43 #include <sys/sunddi.h>
44 #include <sys/ksynch.h>
45 #include <sys/stat.h>
46 #include <sys/kstat.h>
47 #include <sys/vtrace.h>
48 #include <sys/strsun.h>
49 #include <sys/dlpi.h>
50 #include <sys/ethernet.h>
51 #include <net/if.h>
52 #include <sys/varargs.h>
53 #include <sys/machsystm.h>
54 #include <sys/modctl.h>
55 #include <sys/modhash.h>
56 #include <sys/mac.h>
57 #include <sys/mac_ether.h>
58 #include <sys/taskq.h>
59 #include <sys/note.h>
60 #include <sys/mach_descrip.h>
61 #include <sys/mac.h>
62 #include <sys/mdeg.h>
63 #include <sys/ldc.h>
64 #include <sys/vsw_fdb.h>
65 #include <sys/vsw.h>
66 #include <sys/vio_mailbox.h>
67 #include <sys/vnet_mailbox.h>
68 #include <sys/vnet_common.h>
69 #include <sys/vio_util.h>
70 #include <sys/sdt.h>
71 #include <sys/atomic.h>
72 #include <sys/callb.h>
73 #include <sys/vlan.h>
74 
75 /* Port add/deletion/etc routines */
76 static	int vsw_port_delete(vsw_port_t *port);
77 static	int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id);
78 static	int vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id);
79 static	int vsw_init_ldcs(vsw_port_t *port);
80 static	int vsw_uninit_ldcs(vsw_port_t *port);
81 static	int vsw_ldc_init(vsw_ldc_t *ldcp);
82 static	int vsw_ldc_uninit(vsw_ldc_t *ldcp);
83 static	int vsw_drain_ldcs(vsw_port_t *port);
84 static	int vsw_drain_port_taskq(vsw_port_t *port);
85 static	void vsw_marker_task(void *);
86 static	int vsw_plist_del_node(vsw_t *, vsw_port_t *port);
87 int vsw_detach_ports(vsw_t *vswp);
88 int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
89 mcst_addr_t *vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr);
90 int vsw_port_detach(vsw_t *vswp, int p_instance);
91 int vsw_portsend(vsw_port_t *port, mblk_t *mp, mblk_t *mpt, uint32_t count);
92 int vsw_port_attach(vsw_port_t *portp);
93 vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);
94 void vsw_vlan_unaware_port_reset(vsw_port_t *portp);
95 int vsw_send_msg(vsw_ldc_t *, void *, int, boolean_t);
96 void vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate);
97 void vsw_reset_ports(vsw_t *vswp);
98 void vsw_port_reset(vsw_port_t *portp);
99 
100 /* Interrupt routines */
101 static	uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg);
102 
103 /* Handshake routines */
104 static	void vsw_ldc_reinit(vsw_ldc_t *);
105 static	void vsw_process_conn_evt(vsw_ldc_t *, uint16_t);
106 static	void vsw_conn_task(void *);
107 static	int vsw_check_flag(vsw_ldc_t *, int, uint64_t);
108 static	void vsw_next_milestone(vsw_ldc_t *);
109 static	int vsw_supported_version(vio_ver_msg_t *);
110 static	void vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp);
111 static	void vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp);
112 
113 /* Data processing routines */
114 static void vsw_process_pkt(void *);
115 static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t *);
116 static void vsw_process_ctrl_pkt(void *);
117 static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *);
118 static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *);
119 static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *);
120 static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *);
121 static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *);
122 static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *);
123 static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *,
124 	uint32_t);
125 static void vsw_process_data_dring_pkt(vsw_ldc_t *, void *);
126 static void vsw_process_pkt_data_nop(void *, void *, uint32_t);
127 static void vsw_process_pkt_data(void *, void *, uint32_t);
128 static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *);
129 static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *);
130 
131 /* Switching/data transmit routines */
132 static	int vsw_dringsend(vsw_ldc_t *, mblk_t *);
133 static	int vsw_descrsend(vsw_ldc_t *, mblk_t *);
134 static void vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp);
135 static int vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries);
136 static int vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count);
137 static int vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count);
138 
139 /* Packet creation routines */
140 static void vsw_send_ver(void *);
141 static void vsw_send_attr(vsw_ldc_t *);
142 static vio_dring_reg_msg_t *vsw_create_dring_info_pkt(vsw_ldc_t *);
143 static void vsw_send_dring_info(vsw_ldc_t *);
144 static void vsw_send_rdx(vsw_ldc_t *);
145 
146 /* Dring routines */
147 static dring_info_t *vsw_create_dring(vsw_ldc_t *);
148 static void vsw_create_privring(vsw_ldc_t *);
149 static int vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp);
150 static int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **,
151     int *);
152 static dring_info_t *vsw_ident2dring(lane_t *, uint64_t);
153 static int vsw_reclaim_dring(dring_info_t *dp, int start);
154 
155 static void vsw_set_lane_attr(vsw_t *, lane_t *);
156 static int vsw_check_attr(vnet_attr_msg_t *, vsw_ldc_t *);
157 static int vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg);
158 static int vsw_mem_cookie_match(ldc_mem_cookie_t *, ldc_mem_cookie_t *);
159 static int vsw_check_dring_info(vio_dring_reg_msg_t *);
160 
161 /* Rcv/Tx thread routines */
162 static void vsw_stop_tx_thread(vsw_ldc_t *ldcp);
163 static void vsw_ldc_tx_worker(void *arg);
164 static void vsw_stop_rx_thread(vsw_ldc_t *ldcp);
165 static void vsw_ldc_rx_worker(void *arg);
166 
167 /* Misc support routines */
168 static	caddr_t vsw_print_ethaddr(uint8_t *addr, char *ebuf);
169 static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t);
170 static void vsw_free_ring(dring_info_t *);
171 static void vsw_save_lmacaddr(vsw_t *vswp, uint64_t macaddr);
172 static int vsw_get_same_dest_list(struct ether_header *ehp,
173     mblk_t **rhead, mblk_t **rtail, mblk_t **mpp);
174 static mblk_t *vsw_dupmsgchain(mblk_t *mp);
175 
176 /* Debugging routines */
177 static void dump_flags(uint64_t);
178 static void display_state(void);
179 static void display_lane(lane_t *);
180 static void display_ring(dring_info_t *);
181 
182 /*
183  * Functions imported from other files.
184  */
185 extern int vsw_set_hw(vsw_t *, vsw_port_t *, int);
186 extern int vsw_unset_hw(vsw_t *, vsw_port_t *, int);
187 extern void vsw_reconfig_hw(vsw_t *);
188 extern int vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port);
189 extern void vsw_del_mcst_port(vsw_port_t *port);
190 extern int vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
191 extern int vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
192 extern void vsw_fdbe_add(vsw_t *vswp, void *port);
193 extern void vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr);
194 extern void vsw_create_vlans(void *arg, int type);
195 extern void vsw_destroy_vlans(void *arg, int type);
196 extern void vsw_vlan_add_ids(void *arg, int type);
197 extern void vsw_vlan_remove_ids(void *arg, int type);
198 extern boolean_t vsw_frame_lookup_vid(void *arg, int caller,
199 	struct ether_header *ehp, uint16_t *vidp);
200 extern mblk_t *vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp);
201 extern uint32_t vsw_vlan_frame_untag(void *arg, int type, mblk_t **np,
202 	mblk_t **npt);
203 extern boolean_t vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid);
204 extern void vsw_hio_start(vsw_t *vswp, vsw_ldc_t *ldcp);
205 extern void vsw_hio_stop(vsw_t *vswp, vsw_ldc_t *ldcp);
206 extern void vsw_process_dds_msg(vsw_t *vswp, vsw_ldc_t *ldcp, void *msg);
207 extern void vsw_hio_stop_port(vsw_port_t *portp);
208 extern void vsw_publish_macaddr(vsw_t *vswp, uint8_t *addr);
209 
210 #define	VSW_NUM_VMPOOLS		3	/* number of vio mblk pools */
211 
212 /*
213  * Tunables used in this file.
214  */
215 extern int vsw_num_handshakes;
216 extern int vsw_wretries;
217 extern int vsw_desc_delay;
218 extern int vsw_read_attempts;
219 extern int vsw_ldc_tx_delay;
220 extern int vsw_ldc_tx_retries;
221 extern boolean_t vsw_ldc_rxthr_enabled;
222 extern boolean_t vsw_ldc_txthr_enabled;
223 extern uint32_t vsw_ntxds;
224 extern uint32_t vsw_max_tx_qcount;
225 extern uint32_t vsw_chain_len;
226 extern uint32_t vsw_mblk_size1;
227 extern uint32_t vsw_mblk_size2;
228 extern uint32_t vsw_mblk_size3;
229 extern uint32_t vsw_mblk_size4;
230 extern uint32_t vsw_num_mblks1;
231 extern uint32_t vsw_num_mblks2;
232 extern uint32_t vsw_num_mblks3;
233 extern uint32_t vsw_num_mblks4;
234 extern boolean_t vsw_obp_ver_proto_workaround;
235 extern uint32_t vsw_publish_macaddr_count;
236 extern boolean_t vsw_jumbo_rxpools;
237 
/*
 * Acquire/release all three per-channel locks. They are always taken in
 * the order ldc_cblock, ldc_rxlock, ldc_txlock and dropped in the reverse
 * order. Each macro expands to exactly one statement so that it remains
 * correct when used as the unbraced body of an if/else or a loop; the
 * caller supplies the terminating semicolon.
 */
#define	LDC_ENTER_LOCK(ldcp)	\
	do {	\
		mutex_enter(&((ldcp)->ldc_cblock));	\
		mutex_enter(&((ldcp)->ldc_rxlock));	\
		mutex_enter(&((ldcp)->ldc_txlock));	\
		_NOTE(CONSTCOND)	\
	} while (0)
#define	LDC_EXIT_LOCK(ldcp)	\
	do {	\
		mutex_exit(&((ldcp)->ldc_txlock));	\
		mutex_exit(&((ldcp)->ldc_rxlock));	\
		mutex_exit(&((ldcp)->ldc_cblock));	\
		_NOTE(CONSTCOND)	\
	} while (0)
246 
/*
 * Comparisons of the version negotiated on a channel's outbound lane
 * (lane_out.ver_major / lane_out.ver_minor) against a given major.minor.
 * Versions are ordered by major number first, then by minor number.
 */

/* negotiated version is exactly major.minor */
#define	VSW_VER_EQ(ldcp, major, minor)				\
	(((ldcp)->lane_out.ver_major == (major)) &&		\
	((ldcp)->lane_out.ver_minor == (minor)))

/* negotiated version is strictly older than major.minor */
#define	VSW_VER_LT(ldcp, major, minor)				\
	(((ldcp)->lane_out.ver_major == (major)) ?		\
	((ldcp)->lane_out.ver_minor < (minor)) :		\
	((ldcp)->lane_out.ver_major < (major)))

/* negotiated version is major.minor or newer, i.e. not older */
#define	VSW_VER_GTEQ(ldcp, major, minor)			\
	(!VSW_VER_LT(ldcp, major, minor))
260 
261 /* supported versions */
262 static	ver_sup_t	vsw_versions[] = { {1, 4} };
263 
264 /*
265  * For the moment the state dump routines have their own
266  * private flag.
267  */
268 #define	DUMP_STATE	0
269 
270 #if DUMP_STATE
271 
272 #define	DUMP_TAG(tag) \
273 {			\
274 	D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype); \
275 	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype);	\
276 	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env);	\
277 }
278 
279 #define	DUMP_TAG_PTR(tag) \
280 {			\
281 	D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype); \
282 	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype);	\
283 	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env);	\
284 }
285 
286 #define	DUMP_FLAGS(flags) dump_flags(flags);
287 #define	DISPLAY_STATE()	display_state()
288 
289 #else
290 
291 #define	DUMP_TAG(tag)
292 #define	DUMP_TAG_PTR(tag)
293 #define	DUMP_FLAGS(state)
294 #define	DISPLAY_STATE()
295 
296 #endif	/* DUMP_STATE */
297 
298 /*
299  * Attach the specified port.
300  *
301  * Returns 0 on success, 1 on failure.
302  */
303 int
304 vsw_port_attach(vsw_port_t *port)
305 {
306 	vsw_t			*vswp = port->p_vswp;
307 	vsw_port_list_t		*plist = &vswp->plist;
308 	vsw_port_t		*p, **pp;
309 	int			i;
310 	int			nids = port->num_ldcs;
311 	uint64_t		*ldcids;
312 
313 	D1(vswp, "%s: enter : port %d", __func__, port->p_instance);
314 
315 	/* port already exists? */
316 	READ_ENTER(&plist->lockrw);
317 	for (p = plist->head; p != NULL; p = p->p_next) {
318 		if (p->p_instance == port->p_instance) {
319 			DWARN(vswp, "%s: port instance %d already attached",
320 			    __func__, p->p_instance);
321 			RW_EXIT(&plist->lockrw);
322 			return (1);
323 		}
324 	}
325 	RW_EXIT(&plist->lockrw);
326 
327 	rw_init(&port->p_ldclist.lockrw, NULL, RW_DRIVER, NULL);
328 
329 	mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL);
330 	mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL);
331 
332 	mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL);
333 	cv_init(&port->state_cv, NULL, CV_DRIVER, NULL);
334 	port->state = VSW_PORT_INIT;
335 
336 	D2(vswp, "%s: %d nids", __func__, nids);
337 	ldcids = port->ldc_ids;
338 	for (i = 0; i < nids; i++) {
339 		D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[i]);
340 		if (vsw_ldc_attach(port, (uint64_t)ldcids[i]) != 0) {
341 			DERR(vswp, "%s: ldc_attach failed", __func__);
342 
343 			rw_destroy(&port->p_ldclist.lockrw);
344 
345 			cv_destroy(&port->state_cv);
346 			mutex_destroy(&port->state_lock);
347 
348 			mutex_destroy(&port->tx_lock);
349 			mutex_destroy(&port->mca_lock);
350 			kmem_free(port, sizeof (vsw_port_t));
351 			return (1);
352 		}
353 	}
354 
355 	if (vswp->switching_setup_done == B_TRUE) {
356 		/*
357 		 * If the underlying physical device has been setup,
358 		 * program the mac address of this port in it.
359 		 * Otherwise, port macaddr will be set after the physical
360 		 * device is successfully setup by the timeout handler.
361 		 */
362 		mutex_enter(&vswp->hw_lock);
363 		(void) vsw_set_hw(vswp, port, VSW_VNETPORT);
364 		mutex_exit(&vswp->hw_lock);
365 	}
366 
367 	/* create the fdb entry for this port/mac address */
368 	vsw_fdbe_add(vswp, port);
369 
370 	vsw_create_vlans(port, VSW_VNETPORT);
371 
372 	WRITE_ENTER(&plist->lockrw);
373 
374 	/* link it into the list of ports for this vsw instance */
375 	pp = (vsw_port_t **)(&plist->head);
376 	port->p_next = *pp;
377 	*pp = port;
378 	plist->num_ports++;
379 
380 	RW_EXIT(&plist->lockrw);
381 
382 	/*
383 	 * Initialise the port and any ldc's under it.
384 	 */
385 	(void) vsw_init_ldcs(port);
386 
387 	/* announce macaddr of vnet to the physical switch */
388 	if (vsw_publish_macaddr_count != 0) {	/* enabled */
389 		vsw_publish_macaddr(vswp, (uint8_t *)&(port->p_macaddr));
390 	}
391 
392 	D1(vswp, "%s: exit", __func__);
393 	return (0);
394 }
395 
396 /*
397  * Detach the specified port.
398  *
399  * Returns 0 on success, 1 on failure.
400  */
401 int
402 vsw_port_detach(vsw_t *vswp, int p_instance)
403 {
404 	vsw_port_t	*port = NULL;
405 	vsw_port_list_t	*plist = &vswp->plist;
406 
407 	D1(vswp, "%s: enter: port id %d", __func__, p_instance);
408 
409 	WRITE_ENTER(&plist->lockrw);
410 
411 	if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) {
412 		RW_EXIT(&plist->lockrw);
413 		return (1);
414 	}
415 
416 	if (vsw_plist_del_node(vswp, port)) {
417 		RW_EXIT(&plist->lockrw);
418 		return (1);
419 	}
420 
421 	/* cleanup any HybridIO for this port */
422 	vsw_hio_stop_port(port);
423 
424 	/*
425 	 * No longer need to hold writer lock on port list now
426 	 * that we have unlinked the target port from the list.
427 	 */
428 	RW_EXIT(&plist->lockrw);
429 
430 	/* Remove the fdb entry for this port/mac address */
431 	vsw_fdbe_del(vswp, &(port->p_macaddr));
432 	vsw_destroy_vlans(port, VSW_VNETPORT);
433 
434 	/* Remove any multicast addresses.. */
435 	vsw_del_mcst_port(port);
436 
437 	/* Remove address if was programmed into HW. */
438 	mutex_enter(&vswp->hw_lock);
439 
440 	/*
441 	 * Port's address may not have been set in hardware. This could
442 	 * happen if the underlying physical device is not yet available and
443 	 * vsw_setup_switching_timeout() may be in progress.
444 	 * We remove its addr from hardware only if it has been set before.
445 	 */
446 	if (port->addr_set != VSW_ADDR_UNSET)
447 		(void) vsw_unset_hw(vswp, port, VSW_VNETPORT);
448 
449 	if (vswp->recfg_reqd)
450 		vsw_reconfig_hw(vswp);
451 
452 	mutex_exit(&vswp->hw_lock);
453 
454 	if (vsw_port_delete(port)) {
455 		return (1);
456 	}
457 
458 	D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance);
459 	return (0);
460 }
461 
462 /*
463  * Detach all active ports.
464  *
465  * Returns 0 on success, 1 on failure.
466  */
467 int
468 vsw_detach_ports(vsw_t *vswp)
469 {
470 	vsw_port_list_t 	*plist = &vswp->plist;
471 	vsw_port_t		*port = NULL;
472 
473 	D1(vswp, "%s: enter", __func__);
474 
475 	WRITE_ENTER(&plist->lockrw);
476 
477 	while ((port = plist->head) != NULL) {
478 		if (vsw_plist_del_node(vswp, port)) {
479 			DERR(vswp, "%s: Error deleting port %d"
480 			    " from port list", __func__, port->p_instance);
481 			RW_EXIT(&plist->lockrw);
482 			return (1);
483 		}
484 
485 		/* Remove address if was programmed into HW. */
486 		mutex_enter(&vswp->hw_lock);
487 		(void) vsw_unset_hw(vswp, port, VSW_VNETPORT);
488 		mutex_exit(&vswp->hw_lock);
489 
490 		/* Remove the fdb entry for this port/mac address */
491 		vsw_fdbe_del(vswp, &(port->p_macaddr));
492 		vsw_destroy_vlans(port, VSW_VNETPORT);
493 
494 		/* Remove any multicast addresses.. */
495 		vsw_del_mcst_port(port);
496 
497 		/*
498 		 * No longer need to hold the lock on the port list
499 		 * now that we have unlinked the target port from the
500 		 * list.
501 		 */
502 		RW_EXIT(&plist->lockrw);
503 		if (vsw_port_delete(port)) {
504 			DERR(vswp, "%s: Error deleting port %d",
505 			    __func__, port->p_instance);
506 			return (1);
507 		}
508 		WRITE_ENTER(&plist->lockrw);
509 	}
510 	RW_EXIT(&plist->lockrw);
511 
512 	D1(vswp, "%s: exit", __func__);
513 
514 	return (0);
515 }
516 
517 /*
518  * Delete the specified port.
519  *
520  * Returns 0 on success, 1 on failure.
521  */
522 static int
523 vsw_port_delete(vsw_port_t *port)
524 {
525 	vsw_ldc_list_t 		*ldcl;
526 	vsw_t			*vswp = port->p_vswp;
527 	int			num_ldcs;
528 
529 	D1(vswp, "%s: enter : port id %d", __func__, port->p_instance);
530 
531 	(void) vsw_uninit_ldcs(port);
532 
533 	/*
534 	 * Wait for any pending ctrl msg tasks which reference this
535 	 * port to finish.
536 	 */
537 	if (vsw_drain_port_taskq(port))
538 		return (1);
539 
540 	/*
541 	 * Wait for any active callbacks to finish
542 	 */
543 	if (vsw_drain_ldcs(port))
544 		return (1);
545 
546 	ldcl = &port->p_ldclist;
547 	num_ldcs = port->num_ldcs;
548 	WRITE_ENTER(&ldcl->lockrw);
549 	while (num_ldcs > 0) {
550 		if (vsw_ldc_detach(port, ldcl->head->ldc_id) != 0) {
551 			cmn_err(CE_WARN, "!vsw%d: unable to detach ldc %ld",
552 			    vswp->instance, ldcl->head->ldc_id);
553 			RW_EXIT(&ldcl->lockrw);
554 			port->num_ldcs = num_ldcs;
555 			return (1);
556 		}
557 		num_ldcs--;
558 	}
559 	RW_EXIT(&ldcl->lockrw);
560 
561 	rw_destroy(&port->p_ldclist.lockrw);
562 
563 	mutex_destroy(&port->mca_lock);
564 	mutex_destroy(&port->tx_lock);
565 
566 	cv_destroy(&port->state_cv);
567 	mutex_destroy(&port->state_lock);
568 
569 	if (port->num_ldcs != 0) {
570 		kmem_free(port->ldc_ids, port->num_ldcs * sizeof (uint64_t));
571 		port->num_ldcs = 0;
572 	}
573 	kmem_free(port, sizeof (vsw_port_t));
574 
575 	D1(vswp, "%s: exit", __func__);
576 
577 	return (0);
578 }
579 
580 static int
581 vsw_init_multipools(vsw_ldc_t *ldcp, vsw_t *vswp)
582 {
583 	size_t		data_sz;
584 	int		rv;
585 	uint32_t	sz1 = 0;
586 	uint32_t	sz2 = 0;
587 	uint32_t	sz3 = 0;
588 	uint32_t	sz4 = 0;
589 
590 	/*
591 	 * We round up the mtu specified to be a multiple of 2K to limit the
592 	 * number of rx buffer pools created for a given mtu.
593 	 */
594 	data_sz = vswp->max_frame_size + VNET_IPALIGN + VNET_LDCALIGN;
595 	data_sz = VNET_ROUNDUP_2K(data_sz);
596 
597 	/*
598 	 * If pool sizes are specified, use them. Note that the presence of
599 	 * the first tunable will be used as a hint.
600 	 */
601 	if (vsw_mblk_size1 != 0) {
602 		sz1 = vsw_mblk_size1;
603 		sz2 = vsw_mblk_size2;
604 		sz3 = vsw_mblk_size3;
605 		sz4 = vsw_mblk_size4;
606 
607 		if (sz4 == 0) { /* need 3 pools */
608 
609 			ldcp->max_rxpool_size = sz3;
610 			rv = vio_init_multipools(&ldcp->vmp,
611 			    VSW_NUM_VMPOOLS, sz1, sz2, sz3,
612 			    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3);
613 
614 		} else {
615 
616 			ldcp->max_rxpool_size = sz4;
617 			rv = vio_init_multipools(&ldcp->vmp,
618 			    VSW_NUM_VMPOOLS + 1, sz1, sz2, sz3, sz4,
619 			    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3,
620 			    vsw_num_mblks4);
621 
622 		}
623 
624 		return (rv);
625 	}
626 
627 	/*
628 	 * Pool sizes are not specified. We select the pool sizes based on the
629 	 * mtu if vnet_jumbo_rxpools is enabled.
630 	 */
631 	if (vsw_jumbo_rxpools == B_FALSE || data_sz == VNET_2K) {
632 		/*
633 		 * Receive buffer pool allocation based on mtu is disabled.
634 		 * Use the default mechanism of standard size pool allocation.
635 		 */
636 		sz1 = VSW_MBLK_SZ_128;
637 		sz2 = VSW_MBLK_SZ_256;
638 		sz3 = VSW_MBLK_SZ_2048;
639 		ldcp->max_rxpool_size = sz3;
640 
641 		rv = vio_init_multipools(&ldcp->vmp, VSW_NUM_VMPOOLS,
642 		    sz1, sz2, sz3,
643 		    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3);
644 
645 		return (rv);
646 	}
647 
648 	switch (data_sz) {
649 
650 	case VNET_4K:
651 
652 		sz1 = VSW_MBLK_SZ_128;
653 		sz2 = VSW_MBLK_SZ_256;
654 		sz3 = VSW_MBLK_SZ_2048;
655 		sz4 = sz3 << 1;			/* 4K */
656 		ldcp->max_rxpool_size = sz4;
657 
658 		rv = vio_init_multipools(&ldcp->vmp, VSW_NUM_VMPOOLS + 1,
659 		    sz1, sz2, sz3, sz4,
660 		    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3,
661 		    vsw_num_mblks4);
662 		break;
663 
664 	default:	/* data_sz:  4K+ to 16K */
665 
666 		sz1 = VSW_MBLK_SZ_256;
667 		sz2 = VSW_MBLK_SZ_2048;
668 		sz3 = data_sz >> 1;	/* Jumbo-size/2 */
669 		sz4 = data_sz;	/* Jumbo-size */
670 		ldcp->max_rxpool_size = sz4;
671 
672 		rv = vio_init_multipools(&ldcp->vmp, VSW_NUM_VMPOOLS + 1,
673 		    sz1, sz2, sz3, sz4,
674 		    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3,
675 		    vsw_num_mblks4);
676 		break;
677 	}
678 
679 	return (rv);
680 
681 }
682 
683 /*
684  * Attach a logical domain channel (ldc) under a specified port.
685  *
686  * Returns 0 on success, 1 on failure.
687  */
688 static int
689 vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id)
690 {
691 	vsw_t 		*vswp = port->p_vswp;
692 	vsw_ldc_list_t *ldcl = &port->p_ldclist;
693 	vsw_ldc_t 	*ldcp = NULL;
694 	ldc_attr_t 	attr;
695 	ldc_status_t	istatus;
696 	int 		status = DDI_FAILURE;
697 	char		kname[MAXNAMELEN];
698 	enum		{ PROG_init = 0x0,
699 			    PROG_callback = 0x1, PROG_rx_thread = 0x2,
700 			    PROG_tx_thread = 0x4}
701 			progress;
702 
703 	progress = PROG_init;
704 
705 	D1(vswp, "%s: enter", __func__);
706 
707 	ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP);
708 	if (ldcp == NULL) {
709 		DERR(vswp, "%s: kmem_zalloc failed", __func__);
710 		return (1);
711 	}
712 	ldcp->ldc_id = ldc_id;
713 
714 	mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL);
715 	mutex_init(&ldcp->ldc_rxlock, NULL, MUTEX_DRIVER, NULL);
716 	mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL);
717 	mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL);
718 	cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL);
719 	rw_init(&ldcp->lane_in.dlistrw, NULL, RW_DRIVER, NULL);
720 	rw_init(&ldcp->lane_out.dlistrw, NULL, RW_DRIVER, NULL);
721 
722 	/* required for handshake with peer */
723 	ldcp->local_session = (uint64_t)ddi_get_lbolt();
724 	ldcp->peer_session = 0;
725 	ldcp->session_status = 0;
726 	ldcp->hss_id = 1;	/* Initial handshake session id */
727 
728 	(void) atomic_swap_32(&port->p_hio_capable, B_FALSE);
729 
730 	/* only set for outbound lane, inbound set by peer */
731 	vsw_set_lane_attr(vswp, &ldcp->lane_out);
732 
733 	attr.devclass = LDC_DEV_NT_SVC;
734 	attr.instance = ddi_get_instance(vswp->dip);
735 	attr.mode = LDC_MODE_UNRELIABLE;
736 	attr.mtu = VSW_LDC_MTU;
737 	status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle);
738 	if (status != 0) {
739 		DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)",
740 		    __func__, ldc_id, status);
741 		goto ldc_attach_fail;
742 	}
743 
744 	if (vsw_ldc_rxthr_enabled) {
745 		ldcp->rx_thr_flags = 0;
746 
747 		mutex_init(&ldcp->rx_thr_lock, NULL, MUTEX_DRIVER, NULL);
748 		cv_init(&ldcp->rx_thr_cv, NULL, CV_DRIVER, NULL);
749 		ldcp->rx_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
750 		    vsw_ldc_rx_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
751 
752 		progress |= PROG_rx_thread;
753 		if (ldcp->rx_thread == NULL) {
754 			DWARN(vswp, "%s(%lld): Failed to create worker thread",
755 			    __func__, ldc_id);
756 			goto ldc_attach_fail;
757 		}
758 	}
759 
760 	if (vsw_ldc_txthr_enabled) {
761 		ldcp->tx_thr_flags = 0;
762 		ldcp->tx_mhead = ldcp->tx_mtail = NULL;
763 
764 		mutex_init(&ldcp->tx_thr_lock, NULL, MUTEX_DRIVER, NULL);
765 		cv_init(&ldcp->tx_thr_cv, NULL, CV_DRIVER, NULL);
766 		ldcp->tx_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
767 		    vsw_ldc_tx_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
768 
769 		progress |= PROG_tx_thread;
770 		if (ldcp->tx_thread == NULL) {
771 			DWARN(vswp, "%s(%lld): Failed to create worker thread",
772 			    __func__, ldc_id);
773 			goto ldc_attach_fail;
774 		}
775 	}
776 
777 	status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp);
778 	if (status != 0) {
779 		DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)",
780 		    __func__, ldc_id, status);
781 		(void) ldc_fini(ldcp->ldc_handle);
782 		goto ldc_attach_fail;
783 	}
784 	/*
785 	 * allocate a message for ldc_read()s, big enough to hold ctrl and
786 	 * data msgs, including raw data msgs used to recv priority frames.
787 	 */
788 	ldcp->msglen = VIO_PKT_DATA_HDRSIZE + vswp->max_frame_size;
789 	ldcp->ldcmsg = kmem_alloc(ldcp->msglen, KM_SLEEP);
790 
791 	progress |= PROG_callback;
792 
793 	mutex_init(&ldcp->status_lock, NULL, MUTEX_DRIVER, NULL);
794 
795 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
796 		DERR(vswp, "%s: ldc_status failed", __func__);
797 		mutex_destroy(&ldcp->status_lock);
798 		goto ldc_attach_fail;
799 	}
800 
801 	ldcp->ldc_status = istatus;
802 	ldcp->ldc_port = port;
803 	ldcp->ldc_vswp = vswp;
804 
805 	vsw_reset_vnet_proto_ops(ldcp);
806 
807 	(void) sprintf(kname, "%sldc0x%lx", DRV_NAME, ldcp->ldc_id);
808 	ldcp->ksp = vgen_setup_kstats(DRV_NAME, vswp->instance,
809 	    kname, &ldcp->ldc_stats);
810 	if (ldcp->ksp == NULL) {
811 		DERR(vswp, "%s: kstats setup failed", __func__);
812 		goto ldc_attach_fail;
813 	}
814 
815 	/* link it into the list of channels for this port */
816 	WRITE_ENTER(&ldcl->lockrw);
817 	ldcp->ldc_next = ldcl->head;
818 	ldcl->head = ldcp;
819 	RW_EXIT(&ldcl->lockrw);
820 
821 	D1(vswp, "%s: exit", __func__);
822 	return (0);
823 
824 ldc_attach_fail:
825 
826 	if (progress & PROG_callback) {
827 		(void) ldc_unreg_callback(ldcp->ldc_handle);
828 		kmem_free(ldcp->ldcmsg, ldcp->msglen);
829 	}
830 
831 	if (progress & PROG_rx_thread) {
832 		if (ldcp->rx_thread != NULL) {
833 			vsw_stop_rx_thread(ldcp);
834 		}
835 		mutex_destroy(&ldcp->rx_thr_lock);
836 		cv_destroy(&ldcp->rx_thr_cv);
837 	}
838 
839 	if (progress & PROG_tx_thread) {
840 		if (ldcp->tx_thread != NULL) {
841 			vsw_stop_tx_thread(ldcp);
842 		}
843 		mutex_destroy(&ldcp->tx_thr_lock);
844 		cv_destroy(&ldcp->tx_thr_cv);
845 	}
846 	if (ldcp->ksp != NULL) {
847 		vgen_destroy_kstats(ldcp->ksp);
848 	}
849 	mutex_destroy(&ldcp->ldc_txlock);
850 	mutex_destroy(&ldcp->ldc_rxlock);
851 	mutex_destroy(&ldcp->ldc_cblock);
852 	mutex_destroy(&ldcp->drain_cv_lock);
853 
854 	cv_destroy(&ldcp->drain_cv);
855 
856 	rw_destroy(&ldcp->lane_in.dlistrw);
857 	rw_destroy(&ldcp->lane_out.dlistrw);
858 
859 	kmem_free(ldcp, sizeof (vsw_ldc_t));
860 
861 	return (1);
862 }
863 
864 /*
865  * Detach a logical domain channel (ldc) belonging to a
866  * particular port.
867  *
868  * Returns 0 on success, 1 on failure.
869  */
870 static int
871 vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id)
872 {
873 	vsw_t 		*vswp = port->p_vswp;
874 	vsw_ldc_t 	*ldcp, *prev_ldcp;
875 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
876 	int 		rv;
877 
878 	prev_ldcp = ldcl->head;
879 	for (; (ldcp = prev_ldcp) != NULL; prev_ldcp = ldcp->ldc_next) {
880 		if (ldcp->ldc_id == ldc_id) {
881 			break;
882 		}
883 	}
884 
885 	/* specified ldc id not found */
886 	if (ldcp == NULL) {
887 		DERR(vswp, "%s: ldcp = NULL", __func__);
888 		return (1);
889 	}
890 
891 	D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id);
892 
893 	/* Stop the receive thread */
894 	if (ldcp->rx_thread != NULL) {
895 		vsw_stop_rx_thread(ldcp);
896 		mutex_destroy(&ldcp->rx_thr_lock);
897 		cv_destroy(&ldcp->rx_thr_cv);
898 	}
899 	kmem_free(ldcp->ldcmsg, ldcp->msglen);
900 
901 	/* Stop the tx thread */
902 	if (ldcp->tx_thread != NULL) {
903 		vsw_stop_tx_thread(ldcp);
904 		mutex_destroy(&ldcp->tx_thr_lock);
905 		cv_destroy(&ldcp->tx_thr_cv);
906 		if (ldcp->tx_mhead != NULL) {
907 			freemsgchain(ldcp->tx_mhead);
908 			ldcp->tx_mhead = ldcp->tx_mtail = NULL;
909 			ldcp->tx_cnt = 0;
910 		}
911 	}
912 
913 	/* Destory kstats */
914 	vgen_destroy_kstats(ldcp->ksp);
915 
916 	/*
917 	 * Before we can close the channel we must release any mapped
918 	 * resources (e.g. drings).
919 	 */
920 	vsw_free_lane_resources(ldcp, INBOUND);
921 	vsw_free_lane_resources(ldcp, OUTBOUND);
922 
923 	/*
924 	 * If the close fails we are in serious trouble, as won't
925 	 * be able to delete the parent port.
926 	 */
927 	if ((rv = ldc_close(ldcp->ldc_handle)) != 0) {
928 		DERR(vswp, "%s: error %d closing channel %lld",
929 		    __func__, rv, ldcp->ldc_id);
930 		return (1);
931 	}
932 
933 	(void) ldc_fini(ldcp->ldc_handle);
934 
935 	ldcp->ldc_status = LDC_INIT;
936 	ldcp->ldc_handle = NULL;
937 	ldcp->ldc_vswp = NULL;
938 
939 
940 	/*
941 	 * Most likely some mblks are still in use and
942 	 * have not been returned to the pool. These mblks are
943 	 * added to the pool that is maintained in the device instance.
944 	 * Another attempt will be made to destroy the pool
945 	 * when the device detaches.
946 	 */
947 	vio_destroy_multipools(&ldcp->vmp, &vswp->rxh);
948 
949 	/* unlink it from the list */
950 	prev_ldcp = ldcp->ldc_next;
951 
952 	mutex_destroy(&ldcp->ldc_txlock);
953 	mutex_destroy(&ldcp->ldc_rxlock);
954 	mutex_destroy(&ldcp->ldc_cblock);
955 	cv_destroy(&ldcp->drain_cv);
956 	mutex_destroy(&ldcp->drain_cv_lock);
957 	mutex_destroy(&ldcp->status_lock);
958 	rw_destroy(&ldcp->lane_in.dlistrw);
959 	rw_destroy(&ldcp->lane_out.dlistrw);
960 
961 	kmem_free(ldcp, sizeof (vsw_ldc_t));
962 
963 	return (0);
964 }
965 
966 /*
967  * Open and attempt to bring up the channel. Note that channel
968  * can only be brought up if peer has also opened channel.
969  *
970  * Returns 0 if can open and bring up channel, otherwise
971  * returns 1.
972  */
973 static int
974 vsw_ldc_init(vsw_ldc_t *ldcp)
975 {
976 	vsw_t 		*vswp = ldcp->ldc_vswp;
977 	ldc_status_t	istatus = 0;
978 	int		rv;
979 
980 	D1(vswp, "%s: enter", __func__);
981 
982 	LDC_ENTER_LOCK(ldcp);
983 
984 	/* don't start at 0 in case clients don't like that */
985 	ldcp->next_ident = 1;
986 
987 	rv = ldc_open(ldcp->ldc_handle);
988 	if (rv != 0) {
989 		DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)",
990 		    __func__, ldcp->ldc_id, rv);
991 		LDC_EXIT_LOCK(ldcp);
992 		return (1);
993 	}
994 
995 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
996 		DERR(vswp, "%s: unable to get status", __func__);
997 		LDC_EXIT_LOCK(ldcp);
998 		return (1);
999 
1000 	} else if (istatus != LDC_OPEN && istatus != LDC_READY) {
1001 		DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY",
1002 		    __func__, ldcp->ldc_id, istatus);
1003 		LDC_EXIT_LOCK(ldcp);
1004 		return (1);
1005 	}
1006 
1007 	mutex_enter(&ldcp->status_lock);
1008 	ldcp->ldc_status = istatus;
1009 	mutex_exit(&ldcp->status_lock);
1010 
1011 	rv = ldc_up(ldcp->ldc_handle);
1012 	if (rv != 0) {
1013 		/*
1014 		 * Not a fatal error for ldc_up() to fail, as peer
1015 		 * end point may simply not be ready yet.
1016 		 */
1017 		D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__,
1018 		    ldcp->ldc_id, rv);
1019 		LDC_EXIT_LOCK(ldcp);
1020 		return (1);
1021 	}
1022 
1023 	/*
1024 	 * ldc_up() call is non-blocking so need to explicitly
1025 	 * check channel status to see if in fact the channel
1026 	 * is UP.
1027 	 */
1028 	mutex_enter(&ldcp->status_lock);
1029 	if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) {
1030 		DERR(vswp, "%s: unable to get status", __func__);
1031 		mutex_exit(&ldcp->status_lock);
1032 		LDC_EXIT_LOCK(ldcp);
1033 		return (1);
1034 
1035 	}
1036 
1037 	if (ldcp->ldc_status == LDC_UP) {
1038 		D2(vswp, "%s: channel %ld now UP (%ld)", __func__,
1039 		    ldcp->ldc_id, istatus);
1040 		mutex_exit(&ldcp->status_lock);
1041 		LDC_EXIT_LOCK(ldcp);
1042 
1043 		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
1044 		return (0);
1045 	}
1046 
1047 	mutex_exit(&ldcp->status_lock);
1048 	LDC_EXIT_LOCK(ldcp);
1049 
1050 	D1(vswp, "%s: exit", __func__);
1051 	return (0);
1052 }
1053 
1054 /* disable callbacks on the channel */
1055 static int
1056 vsw_ldc_uninit(vsw_ldc_t *ldcp)
1057 {
1058 	vsw_t	*vswp = ldcp->ldc_vswp;
1059 	int	rv;
1060 
1061 	D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id);
1062 
1063 	LDC_ENTER_LOCK(ldcp);
1064 
1065 	rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE);
1066 	if (rv != 0) {
1067 		DERR(vswp, "vsw_ldc_uninit(%lld): error disabling "
1068 		    "interrupts (rv = %d)\n", ldcp->ldc_id, rv);
1069 		LDC_EXIT_LOCK(ldcp);
1070 		return (1);
1071 	}
1072 
1073 	mutex_enter(&ldcp->status_lock);
1074 	ldcp->ldc_status = LDC_INIT;
1075 	mutex_exit(&ldcp->status_lock);
1076 
1077 	LDC_EXIT_LOCK(ldcp);
1078 
1079 	D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id);
1080 
1081 	return (0);
1082 }
1083 
1084 static int
1085 vsw_init_ldcs(vsw_port_t *port)
1086 {
1087 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
1088 	vsw_ldc_t	*ldcp;
1089 
1090 	READ_ENTER(&ldcl->lockrw);
1091 	ldcp =  ldcl->head;
1092 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
1093 		(void) vsw_ldc_init(ldcp);
1094 	}
1095 	RW_EXIT(&ldcl->lockrw);
1096 
1097 	return (0);
1098 }
1099 
1100 static int
1101 vsw_uninit_ldcs(vsw_port_t *port)
1102 {
1103 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
1104 	vsw_ldc_t	*ldcp;
1105 
1106 	D1(NULL, "vsw_uninit_ldcs: enter\n");
1107 
1108 	READ_ENTER(&ldcl->lockrw);
1109 	ldcp =  ldcl->head;
1110 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
1111 		(void) vsw_ldc_uninit(ldcp);
1112 	}
1113 	RW_EXIT(&ldcl->lockrw);
1114 
1115 	D1(NULL, "vsw_uninit_ldcs: exit\n");
1116 
1117 	return (0);
1118 }
1119 
1120 /*
1121  * Wait until the callback(s) associated with the ldcs under the specified
1122  * port have completed.
1123  *
1124  * Prior to this function being invoked each channel under this port
1125  * should have been quiesced via ldc_set_cb_mode(DISABLE).
1126  *
1127  * A short explaination of what we are doing below..
1128  *
1129  * The simplest approach would be to have a reference counter in
1130  * the ldc structure which is increment/decremented by the callbacks as
1131  * they use the channel. The drain function could then simply disable any
1132  * further callbacks and do a cv_wait for the ref to hit zero. Unfortunately
1133  * there is a tiny window here - before the callback is able to get the lock
1134  * on the channel it is interrupted and this function gets to execute. It
1135  * sees that the ref count is zero and believes its free to delete the
1136  * associated data structures.
1137  *
1138  * We get around this by taking advantage of the fact that before the ldc
1139  * framework invokes a callback it sets a flag to indicate that there is a
1140  * callback active (or about to become active). If when we attempt to
1141  * unregister a callback when this active flag is set then the unregister
1142  * will fail with EWOULDBLOCK.
1143  *
1144  * If the unregister fails we do a cv_timedwait. We will either be signaled
1145  * by the callback as it is exiting (note we have to wait a short period to
1146  * allow the callback to return fully to the ldc framework and it to clear
1147  * the active flag), or by the timer expiring. In either case we again attempt
1148  * the unregister. We repeat this until we can succesfully unregister the
1149  * callback.
1150  *
1151  * The reason we use a cv_timedwait rather than a simple cv_wait is to catch
1152  * the case where the callback has finished but the ldc framework has not yet
1153  * cleared the active flag. In this case we would never get a cv_signal.
1154  */
1155 static int
1156 vsw_drain_ldcs(vsw_port_t *port)
1157 {
1158 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
1159 	vsw_ldc_t	*ldcp;
1160 	vsw_t		*vswp = port->p_vswp;
1161 
1162 	D1(vswp, "%s: enter", __func__);
1163 
1164 	READ_ENTER(&ldcl->lockrw);
1165 
1166 	ldcp = ldcl->head;
1167 
1168 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
1169 		/*
1170 		 * If we can unregister the channel callback then we
1171 		 * know that there is no callback either running or
1172 		 * scheduled to run for this channel so move on to next
1173 		 * channel in the list.
1174 		 */
1175 		mutex_enter(&ldcp->drain_cv_lock);
1176 
1177 		/* prompt active callbacks to quit */
1178 		ldcp->drain_state = VSW_LDC_DRAINING;
1179 
1180 		if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) {
1181 			D2(vswp, "%s: unreg callback for chan %ld", __func__,
1182 			    ldcp->ldc_id);
1183 			mutex_exit(&ldcp->drain_cv_lock);
1184 			continue;
1185 		} else {
1186 			/*
1187 			 * If we end up here we know that either 1) a callback
1188 			 * is currently executing, 2) is about to start (i.e.
1189 			 * the ldc framework has set the active flag but
1190 			 * has not actually invoked the callback yet, or 3)
1191 			 * has finished and has returned to the ldc framework
1192 			 * but the ldc framework has not yet cleared the
1193 			 * active bit.
1194 			 *
1195 			 * Wait for it to finish.
1196 			 */
1197 			while (ldc_unreg_callback(ldcp->ldc_handle)
1198 			    == EWOULDBLOCK)
1199 				(void) cv_timedwait(&ldcp->drain_cv,
1200 				    &ldcp->drain_cv_lock, lbolt + hz);
1201 
1202 			mutex_exit(&ldcp->drain_cv_lock);
1203 			D2(vswp, "%s: unreg callback for chan %ld after "
1204 			    "timeout", __func__, ldcp->ldc_id);
1205 		}
1206 	}
1207 	RW_EXIT(&ldcl->lockrw);
1208 
1209 	D1(vswp, "%s: exit", __func__);
1210 	return (0);
1211 }
1212 
1213 /*
1214  * Wait until all tasks which reference this port have completed.
1215  *
1216  * Prior to this function being invoked each channel under this port
1217  * should have been quiesced via ldc_set_cb_mode(DISABLE).
1218  */
1219 static int
1220 vsw_drain_port_taskq(vsw_port_t *port)
1221 {
1222 	vsw_t		*vswp = port->p_vswp;
1223 
1224 	D1(vswp, "%s: enter", __func__);
1225 
1226 	/*
1227 	 * Mark the port as in the process of being detached, and
1228 	 * dispatch a marker task to the queue so we know when all
1229 	 * relevant tasks have completed.
1230 	 */
1231 	mutex_enter(&port->state_lock);
1232 	port->state = VSW_PORT_DETACHING;
1233 
1234 	if ((vswp->taskq_p == NULL) ||
1235 	    (ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task,
1236 	    port, DDI_NOSLEEP) != DDI_SUCCESS)) {
1237 		DERR(vswp, "%s: unable to dispatch marker task",
1238 		    __func__);
1239 		mutex_exit(&port->state_lock);
1240 		return (1);
1241 	}
1242 
1243 	/*
1244 	 * Wait for the marker task to finish.
1245 	 */
1246 	while (port->state != VSW_PORT_DETACHABLE)
1247 		cv_wait(&port->state_cv, &port->state_lock);
1248 
1249 	mutex_exit(&port->state_lock);
1250 
1251 	D1(vswp, "%s: exit", __func__);
1252 
1253 	return (0);
1254 }
1255 
1256 static void
1257 vsw_marker_task(void *arg)
1258 {
1259 	vsw_port_t	*port = arg;
1260 	vsw_t		*vswp = port->p_vswp;
1261 
1262 	D1(vswp, "%s: enter", __func__);
1263 
1264 	mutex_enter(&port->state_lock);
1265 
1266 	/*
1267 	 * No further tasks should be dispatched which reference
1268 	 * this port so ok to mark it as safe to detach.
1269 	 */
1270 	port->state = VSW_PORT_DETACHABLE;
1271 
1272 	cv_signal(&port->state_cv);
1273 
1274 	mutex_exit(&port->state_lock);
1275 
1276 	D1(vswp, "%s: exit", __func__);
1277 }
1278 
1279 vsw_port_t *
1280 vsw_lookup_port(vsw_t *vswp, int p_instance)
1281 {
1282 	vsw_port_list_t *plist = &vswp->plist;
1283 	vsw_port_t	*port;
1284 
1285 	for (port = plist->head; port != NULL; port = port->p_next) {
1286 		if (port->p_instance == p_instance) {
1287 			D2(vswp, "vsw_lookup_port: found p_instance\n");
1288 			return (port);
1289 		}
1290 	}
1291 
1292 	return (NULL);
1293 }
1294 
1295 void
1296 vsw_vlan_unaware_port_reset(vsw_port_t *portp)
1297 {
1298 	vsw_ldc_list_t 	*ldclp;
1299 	vsw_ldc_t	*ldcp;
1300 
1301 	ldclp = &portp->p_ldclist;
1302 
1303 	READ_ENTER(&ldclp->lockrw);
1304 
1305 	/*
1306 	 * NOTE: for now, we will assume we have a single channel.
1307 	 */
1308 	if (ldclp->head == NULL) {
1309 		RW_EXIT(&ldclp->lockrw);
1310 		return;
1311 	}
1312 	ldcp = ldclp->head;
1313 
1314 	mutex_enter(&ldcp->ldc_cblock);
1315 
1316 	/*
1317 	 * If the peer is vlan_unaware(ver < 1.3), reset channel and terminate
1318 	 * the connection. See comments in vsw_set_vnet_proto_ops().
1319 	 */
1320 	if (ldcp->hphase == VSW_MILESTONE4 && VSW_VER_LT(ldcp, 1, 3) &&
1321 	    portp->nvids != 0) {
1322 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1323 	}
1324 
1325 	mutex_exit(&ldcp->ldc_cblock);
1326 
1327 	RW_EXIT(&ldclp->lockrw);
1328 }
1329 
1330 void
1331 vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate)
1332 {
1333 	vsw_ldc_list_t	*ldclp;
1334 	vsw_ldc_t	*ldcp;
1335 
1336 	ldclp = &portp->p_ldclist;
1337 
1338 	READ_ENTER(&ldclp->lockrw);
1339 
1340 	/*
1341 	 * NOTE: for now, we will assume we have a single channel.
1342 	 */
1343 	if (ldclp->head == NULL) {
1344 		RW_EXIT(&ldclp->lockrw);
1345 		return;
1346 	}
1347 	ldcp = ldclp->head;
1348 
1349 	mutex_enter(&ldcp->ldc_cblock);
1350 
1351 	/*
1352 	 * If the peer is HybridIO capable (ver >= 1.3), reset channel
1353 	 * to trigger re-negotiation, which inturn trigger HybridIO
1354 	 * setup/cleanup.
1355 	 */
1356 	if ((ldcp->hphase == VSW_MILESTONE4) &&
1357 	    (portp->p_hio_capable == B_TRUE)) {
1358 		if (immediate == B_TRUE) {
1359 			(void) ldc_down(ldcp->ldc_handle);
1360 		} else {
1361 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1362 		}
1363 	}
1364 
1365 	mutex_exit(&ldcp->ldc_cblock);
1366 
1367 	RW_EXIT(&ldclp->lockrw);
1368 }
1369 
1370 void
1371 vsw_port_reset(vsw_port_t *portp)
1372 {
1373 	vsw_ldc_list_t 	*ldclp;
1374 	vsw_ldc_t	*ldcp;
1375 
1376 	ldclp = &portp->p_ldclist;
1377 
1378 	READ_ENTER(&ldclp->lockrw);
1379 
1380 	/*
1381 	 * NOTE: for now, we will assume we have a single channel.
1382 	 */
1383 	if (ldclp->head == NULL) {
1384 		RW_EXIT(&ldclp->lockrw);
1385 		return;
1386 	}
1387 	ldcp = ldclp->head;
1388 
1389 	mutex_enter(&ldcp->ldc_cblock);
1390 
1391 	/*
1392 	 * reset channel and terminate the connection.
1393 	 */
1394 	vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1395 
1396 	mutex_exit(&ldcp->ldc_cblock);
1397 
1398 	RW_EXIT(&ldclp->lockrw);
1399 }
1400 
1401 void
1402 vsw_reset_ports(vsw_t *vswp)
1403 {
1404 	vsw_port_list_t	*plist = &vswp->plist;
1405 	vsw_port_t	*portp;
1406 
1407 	READ_ENTER(&plist->lockrw);
1408 	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
1409 		if ((portp->p_hio_capable) && (portp->p_hio_enabled)) {
1410 			vsw_hio_stop_port(portp);
1411 		}
1412 		vsw_port_reset(portp);
1413 	}
1414 	RW_EXIT(&plist->lockrw);
1415 }
1416 
1417 
1418 /*
1419  * Search for and remove the specified port from the port
1420  * list. Returns 0 if able to locate and remove port, otherwise
1421  * returns 1.
1422  */
1423 static int
1424 vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port)
1425 {
1426 	vsw_port_list_t *plist = &vswp->plist;
1427 	vsw_port_t	*curr_p, *prev_p;
1428 
1429 	if (plist->head == NULL)
1430 		return (1);
1431 
1432 	curr_p = prev_p = plist->head;
1433 
1434 	while (curr_p != NULL) {
1435 		if (curr_p == port) {
1436 			if (prev_p == curr_p) {
1437 				plist->head = curr_p->p_next;
1438 			} else {
1439 				prev_p->p_next = curr_p->p_next;
1440 			}
1441 			plist->num_ports--;
1442 			break;
1443 		} else {
1444 			prev_p = curr_p;
1445 			curr_p = curr_p->p_next;
1446 		}
1447 	}
1448 	return (0);
1449 }
1450 
1451 /*
1452  * Interrupt handler for ldc messages.
1453  */
1454 static uint_t
1455 vsw_ldc_cb(uint64_t event, caddr_t arg)
1456 {
1457 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
1458 	vsw_t 		*vswp = ldcp->ldc_vswp;
1459 
1460 	D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
1461 
1462 	mutex_enter(&ldcp->ldc_cblock);
1463 	ldcp->ldc_stats.callbacks++;
1464 
1465 	mutex_enter(&ldcp->status_lock);
1466 	if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) {
1467 		mutex_exit(&ldcp->status_lock);
1468 		mutex_exit(&ldcp->ldc_cblock);
1469 		return (LDC_SUCCESS);
1470 	}
1471 	mutex_exit(&ldcp->status_lock);
1472 
1473 	if (event & LDC_EVT_UP) {
1474 		/*
1475 		 * Channel has come up.
1476 		 */
1477 		D2(vswp, "%s: id(%ld) event(%llx) UP: status(%ld)",
1478 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1479 
1480 		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
1481 
1482 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
1483 	}
1484 
1485 	if (event & LDC_EVT_READ) {
1486 		/*
1487 		 * Data available for reading.
1488 		 */
1489 		D2(vswp, "%s: id(ld) event(%llx) data READ",
1490 		    __func__, ldcp->ldc_id, event);
1491 
1492 		if (ldcp->rx_thread != NULL) {
1493 			/*
1494 			 * If the receive thread is enabled, then
1495 			 * wakeup the receive thread to process the
1496 			 * LDC messages.
1497 			 */
1498 			mutex_exit(&ldcp->ldc_cblock);
1499 			mutex_enter(&ldcp->rx_thr_lock);
1500 			if (!(ldcp->rx_thr_flags & VSW_WTHR_DATARCVD)) {
1501 				ldcp->rx_thr_flags |= VSW_WTHR_DATARCVD;
1502 				cv_signal(&ldcp->rx_thr_cv);
1503 			}
1504 			mutex_exit(&ldcp->rx_thr_lock);
1505 			mutex_enter(&ldcp->ldc_cblock);
1506 		} else {
1507 			vsw_process_pkt(ldcp);
1508 		}
1509 
1510 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
1511 
1512 		goto vsw_cb_exit;
1513 	}
1514 
1515 	if (event & (LDC_EVT_DOWN | LDC_EVT_RESET)) {
1516 		D2(vswp, "%s: id(%ld) event (%lx) DOWN/RESET: status(%ld)",
1517 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1518 
1519 		vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
1520 	}
1521 
1522 	/*
1523 	 * Catch either LDC_EVT_WRITE which we don't support or any
1524 	 * unknown event.
1525 	 */
1526 	if (event &
1527 	    ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ)) {
1528 		DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)",
1529 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1530 	}
1531 
1532 vsw_cb_exit:
1533 	mutex_exit(&ldcp->ldc_cblock);
1534 
1535 	/*
1536 	 * Let the drain function know we are finishing if it
1537 	 * is waiting.
1538 	 */
1539 	mutex_enter(&ldcp->drain_cv_lock);
1540 	if (ldcp->drain_state == VSW_LDC_DRAINING)
1541 		cv_signal(&ldcp->drain_cv);
1542 	mutex_exit(&ldcp->drain_cv_lock);
1543 
1544 	return (LDC_SUCCESS);
1545 }
1546 
1547 /*
1548  * Reinitialise data structures associated with the channel.
1549  */
1550 static void
1551 vsw_ldc_reinit(vsw_ldc_t *ldcp)
1552 {
1553 	vsw_t		*vswp = ldcp->ldc_vswp;
1554 	vsw_port_t	*port;
1555 	vsw_ldc_list_t	*ldcl;
1556 
1557 	D1(vswp, "%s: enter", __func__);
1558 
1559 	/* free receive mblk pools for the channel */
1560 	vio_destroy_multipools(&ldcp->vmp, &vswp->rxh);
1561 
1562 	port = ldcp->ldc_port;
1563 	ldcl = &port->p_ldclist;
1564 
1565 	READ_ENTER(&ldcl->lockrw);
1566 
1567 	D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__,
1568 	    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
1569 
1570 	vsw_free_lane_resources(ldcp, INBOUND);
1571 	vsw_free_lane_resources(ldcp, OUTBOUND);
1572 	RW_EXIT(&ldcl->lockrw);
1573 
1574 	ldcp->lane_in.lstate = 0;
1575 	ldcp->lane_out.lstate = 0;
1576 
1577 	/* Remove the fdb entry for this port/mac address */
1578 	vsw_fdbe_del(vswp, &(port->p_macaddr));
1579 
1580 	/* remove the port from vlans it has been assigned to */
1581 	vsw_vlan_remove_ids(port, VSW_VNETPORT);
1582 
1583 	/*
1584 	 * Remove parent port from any multicast groups
1585 	 * it may have registered with. Client must resend
1586 	 * multicast add command after handshake completes.
1587 	 */
1588 	vsw_del_mcst_port(port);
1589 
1590 	ldcp->peer_session = 0;
1591 	ldcp->session_status = 0;
1592 	ldcp->hcnt = 0;
1593 	ldcp->hphase = VSW_MILESTONE0;
1594 
1595 	vsw_reset_vnet_proto_ops(ldcp);
1596 
1597 	D1(vswp, "%s: exit", __func__);
1598 }
1599 
/*
 * Process a connection event.
 *
 * 'evt' is one of VSW_CONN_UP, VSW_CONN_RESET or VSW_CONN_RESTART. Unless
 * the event is filtered out as redundant (see below), the handshake
 * session id is bumped and a vsw_conn_task() is dispatched to the switch
 * taskq to handle the event asynchronously. Allocation and dispatch are
 * both non-sleeping; on failure a warning is logged and the event is
 * dropped.
 *
 * Note - care must be taken to ensure that this function is
 * not called with the dlistrw lock held.
 */
static void
vsw_process_conn_evt(vsw_ldc_t *ldcp, uint16_t evt)
{
	vsw_t		*vswp = ldcp->ldc_vswp;
	vsw_conn_evt_t	*conn = NULL;

	D1(vswp, "%s: enter", __func__);

	/*
	 * Check if either a reset or restart event is pending
	 * or in progress. If so just return.
	 *
	 * A VSW_CONN_RESET event originates either with a LDC_RESET_EVT
	 * being received by the callback handler, or a ECONNRESET error
	 * code being returned from a ldc_read() or ldc_write() call.
	 *
	 * A VSW_CONN_RESTART event occurs when some error checking code
	 * decides that there is a problem with data from the channel,
	 * and that the handshake should be restarted.
	 *
	 * The ldstub() result is only consulted for reset/restart events;
	 * a non-zero result is treated as "already pending".
	 */
	if (((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART)) &&
	    (ldstub((uint8_t *)&ldcp->reset_active)))
		return;

	/*
	 * If it is an LDC_UP event we first check the recorded
	 * state of the channel. If this is UP then we know that
	 * the channel moving to the UP state has already been dealt
	 * with and don't need to dispatch a  new task.
	 *
	 * The reason for this check is that when we do a ldc_up(),
	 * depending on the state of the peer, we may or may not get
	 * a LDC_UP event. As we can't depend on getting a LDC_UP evt
	 * every time we do ldc_up() we explicitly check the channel
	 * status to see has it come up (ldc_up() is asynch and will
	 * complete at some undefined time), and take the appropriate
	 * action.
	 *
	 * The flip side of this is that we may get a LDC_UP event
	 * when we have already seen that the channel is up and have
	 * dealt with that.
	 */
	mutex_enter(&ldcp->status_lock);
	if (evt == VSW_CONN_UP) {
		/* also skip the UP event while a reset/restart is pending */
		if ((ldcp->ldc_status == LDC_UP) || (ldcp->reset_active != 0)) {
			mutex_exit(&ldcp->status_lock);
			return;
		}
	}
	mutex_exit(&ldcp->status_lock);

	/*
	 * The transaction group id allows us to identify and discard
	 * any tasks which are still pending on the taskq and refer
	 * to the handshake session we are about to restart or reset.
	 * These stale messages no longer have any real meaning.
	 */
	(void) atomic_inc_32(&ldcp->hss_id);

	ASSERT(vswp->taskq_p != NULL);

	if ((conn = kmem_zalloc(sizeof (vsw_conn_evt_t), KM_NOSLEEP)) == NULL) {
		cmn_err(CE_WARN, "!vsw%d: unable to allocate memory for"
		    " connection event", vswp->instance);
		goto err_exit;
	}

	/* ownership of 'conn' passes to vsw_conn_task() once dispatched */
	conn->evt = evt;
	conn->ldcp = ldcp;

	if (ddi_taskq_dispatch(vswp->taskq_p, vsw_conn_task, conn,
	    DDI_NOSLEEP) != DDI_SUCCESS) {
		cmn_err(CE_WARN, "!vsw%d: Can't dispatch connection task",
		    vswp->instance);

		kmem_free(conn, sizeof (vsw_conn_evt_t));
		goto err_exit;
	}

	D1(vswp, "%s: exit", __func__);
	return;

err_exit:
	/*
	 * Have most likely failed due to memory shortage. Clear the flag so
	 * that future requests will at least be attempted and will hopefully
	 * succeed.
	 */
	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
		ldcp->reset_active = 0;
}
1697 
/*
 * Deal with events relating to a connection. Invoked from a taskq.
 *
 * 'arg' is the vsw_conn_evt_t allocated by vsw_process_conn_evt(); it is
 * freed here once its contents have been copied out.
 *
 * The task optionally takes the channel down (restart events on an UP
 * channel), stops HybridIO if it is in use, reinitialises the channel
 * state, brings the channel back up and, if it is then UP, dispatches
 * vsw_send_ver() to restart the handshake. The channel's status_lock is
 * held for the whole of this sequence.
 *
 * NOTE(review): the early-return paths (ldc_status() failure, handshake
 * attempt limit exceeded) leave reset_active untouched, so for
 * reset/restart events the flag stays set - confirm this is intended.
 */
static void
vsw_conn_task(void *arg)
{
	vsw_conn_evt_t	*conn = (vsw_conn_evt_t *)arg;
	vsw_ldc_t	*ldcp = NULL;
	vsw_port_t	*portp;
	vsw_t		*vswp = NULL;
	uint16_t	evt;
	ldc_status_t	curr_status;

	ldcp = conn->ldcp;
	evt = conn->evt;
	vswp = ldcp->ldc_vswp;
	portp = ldcp->ldc_port;

	D1(vswp, "%s: enter", __func__);

	/* can safely free now have copied out data */
	kmem_free(conn, sizeof (vsw_conn_evt_t));

	mutex_enter(&ldcp->status_lock);
	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
		    "channel %ld", vswp->instance, ldcp->ldc_id);
		mutex_exit(&ldcp->status_lock);
		return;
	}

	/*
	 * If we wish to restart the handshake on this channel, then if
	 * the channel is UP we bring it DOWN to flush the underlying
	 * ldc queue.
	 */
	if ((evt == VSW_CONN_RESTART) && (curr_status == LDC_UP))
		(void) ldc_down(ldcp->ldc_handle);

	/* tear down HybridIO before the channel state is reinitialised */
	if ((portp->p_hio_capable) && (portp->p_hio_enabled)) {
		vsw_hio_stop(vswp, ldcp);
	}

	/*
	 * re-init all the associated data structures.
	 */
	vsw_ldc_reinit(ldcp);

	/*
	 * Bring the channel back up (note it does no harm to
	 * do this even if the channel is already UP, Just
	 * becomes effectively a no-op).
	 */
	(void) ldc_up(ldcp->ldc_handle);

	/*
	 * Check if channel is now UP. This will only happen if
	 * peer has also done a ldc_up().
	 */
	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
		    "channel %ld", vswp->instance, ldcp->ldc_id);
		mutex_exit(&ldcp->status_lock);
		return;
	}

	/* record the freshly read status as the channel's current state */
	ldcp->ldc_status = curr_status;

	/* channel UP so restart handshake by sending version info */
	if (curr_status == LDC_UP) {
		/* hcnt is post-incremented: compared first, then bumped */
		if (ldcp->hcnt++ > vsw_num_handshakes) {
			cmn_err(CE_WARN, "!vsw%d: exceeded number of permitted"
			    " handshake attempts (%d) on channel %ld",
			    vswp->instance, ldcp->hcnt, ldcp->ldc_id);
			mutex_exit(&ldcp->status_lock);
			return;
		}

		/*
		 * The version message is only dispatched when the
		 * vsw_obp_ver_proto_workaround tunable is B_FALSE.
		 */
		if (vsw_obp_ver_proto_workaround == B_FALSE &&
		    (ddi_taskq_dispatch(vswp->taskq_p, vsw_send_ver, ldcp,
		    DDI_NOSLEEP) != DDI_SUCCESS)) {
			cmn_err(CE_WARN, "!vsw%d: Can't dispatch version task",
			    vswp->instance);

			/*
			 * Don't count as valid restart attempt if couldn't
			 * send version msg.
			 */
			if (ldcp->hcnt > 0)
				ldcp->hcnt--;
		}
	}

	/*
	 * Mark that the process is complete by clearing the flag.
	 *
	 * Note it is possible that the taskq dispatch above may have failed,
	 * most likely due to memory shortage. We still clear the flag so
	 * future attempts will at least be attempted and will hopefully
	 * succeed.
	 */
	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
		ldcp->reset_active = 0;

	mutex_exit(&ldcp->status_lock);

	D1(vswp, "%s: exit", __func__);
}
1806 
1807 /*
1808  * returns 0 if legal for event signified by flag to have
1809  * occured at the time it did. Otherwise returns 1.
1810  */
1811 int
1812 vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag)
1813 {
1814 	vsw_t		*vswp = ldcp->ldc_vswp;
1815 	uint64_t	state;
1816 	uint64_t	phase;
1817 
1818 	if (dir == INBOUND)
1819 		state = ldcp->lane_in.lstate;
1820 	else
1821 		state = ldcp->lane_out.lstate;
1822 
1823 	phase = ldcp->hphase;
1824 
1825 	switch (flag) {
1826 	case VSW_VER_INFO_RECV:
1827 		if (phase > VSW_MILESTONE0) {
1828 			DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV"
1829 			    " when in state %d\n", ldcp->ldc_id, phase);
1830 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1831 			return (1);
1832 		}
1833 		break;
1834 
1835 	case VSW_VER_ACK_RECV:
1836 	case VSW_VER_NACK_RECV:
1837 		if (!(state & VSW_VER_INFO_SENT)) {
1838 			DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK or "
1839 			    "VER_NACK when in state %d\n", ldcp->ldc_id, phase);
1840 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1841 			return (1);
1842 		} else
1843 			state &= ~VSW_VER_INFO_SENT;
1844 		break;
1845 
1846 	case VSW_ATTR_INFO_RECV:
1847 		if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) {
1848 			DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV"
1849 			    " when in state %d\n", ldcp->ldc_id, phase);
1850 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1851 			return (1);
1852 		}
1853 		break;
1854 
1855 	case VSW_ATTR_ACK_RECV:
1856 	case VSW_ATTR_NACK_RECV:
1857 		if (!(state & VSW_ATTR_INFO_SENT)) {
1858 			DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK"
1859 			    " or ATTR_NACK when in state %d\n",
1860 			    ldcp->ldc_id, phase);
1861 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1862 			return (1);
1863 		} else
1864 			state &= ~VSW_ATTR_INFO_SENT;
1865 		break;
1866 
1867 	case VSW_DRING_INFO_RECV:
1868 		if (phase < VSW_MILESTONE1) {
1869 			DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV"
1870 			    " when in state %d\n", ldcp->ldc_id, phase);
1871 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1872 			return (1);
1873 		}
1874 		break;
1875 
1876 	case VSW_DRING_ACK_RECV:
1877 	case VSW_DRING_NACK_RECV:
1878 		if (!(state & VSW_DRING_INFO_SENT)) {
1879 			DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK "
1880 			    " or DRING_NACK when in state %d\n",
1881 			    ldcp->ldc_id, phase);
1882 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1883 			return (1);
1884 		} else
1885 			state &= ~VSW_DRING_INFO_SENT;
1886 		break;
1887 
1888 	case VSW_RDX_INFO_RECV:
1889 		if (phase < VSW_MILESTONE3) {
1890 			DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV"
1891 			    " when in state %d\n", ldcp->ldc_id, phase);
1892 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1893 			return (1);
1894 		}
1895 		break;
1896 
1897 	case VSW_RDX_ACK_RECV:
1898 	case VSW_RDX_NACK_RECV:
1899 		if (!(state & VSW_RDX_INFO_SENT)) {
1900 			DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK or "
1901 			    "RDX_NACK when in state %d\n", ldcp->ldc_id, phase);
1902 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1903 			return (1);
1904 		} else
1905 			state &= ~VSW_RDX_INFO_SENT;
1906 		break;
1907 
1908 	case VSW_MCST_INFO_RECV:
1909 		if (phase < VSW_MILESTONE3) {
1910 			DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV"
1911 			    " when in state %d\n", ldcp->ldc_id, phase);
1912 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1913 			return (1);
1914 		}
1915 		break;
1916 
1917 	default:
1918 		DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)",
1919 		    ldcp->ldc_id, flag);
1920 		return (1);
1921 	}
1922 
1923 	if (dir == INBOUND)
1924 		ldcp->lane_in.lstate = state;
1925 	else
1926 		ldcp->lane_out.lstate = state;
1927 
1928 	D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id);
1929 
1930 	return (0);
1931 }
1932 
/*
 * Advance the channel's handshake state machine if the conditions for
 * leaving the current milestone (ldcp->hphase) have been met.
 *
 *   MILESTONE0 -> 1: version ACK sent on the inbound lane and received
 *                    on the outbound lane; attribute info is then sent.
 *   MILESTONE1 -> 2: attribute ACK sent on the inbound lane; dring info
 *                    is then sent if the peer selected dring mode.
 *   MILESTONE2 -> 3: dring ACK sent (only required when the peer
 *                    selected dring mode); RDX is then sent.
 *   MILESTONE3 -> 4: RDX ACK sent (outbound lane) and received (inbound
 *                    lane); the outbound lane is marked active, the
 *                    handshake attempt count is cleared and HybridIO is
 *                    started if enabled and capable.
 *
 * If the conditions are not met the phase is left unchanged.
 */
void
vsw_next_milestone(vsw_ldc_t *ldcp)
{
	vsw_t		*vswp = ldcp->ldc_vswp;
	vsw_port_t	*portp = ldcp->ldc_port;

	D1(vswp, "%s (chan %lld): enter (phase %ld)", __func__,
	    ldcp->ldc_id, ldcp->hphase);

	DUMP_FLAGS(ldcp->lane_in.lstate);
	DUMP_FLAGS(ldcp->lane_out.lstate);

	switch (ldcp->hphase) {

	case VSW_MILESTONE0:
		/*
		 * If we haven't started to handshake with our peer,
		 * start to do so now.
		 */
		if (ldcp->lane_out.lstate == 0) {
			D2(vswp, "%s: (chan %lld) starting handshake "
			    "with peer", __func__, ldcp->ldc_id);
			vsw_process_conn_evt(ldcp, VSW_CONN_UP);
		}

		/*
		 * Only way to pass this milestone is to have successfully
		 * negotiated version info.
		 */
		if ((ldcp->lane_in.lstate & VSW_VER_ACK_SENT) &&
		    (ldcp->lane_out.lstate & VSW_VER_ACK_RECV)) {

			D2(vswp, "%s: (chan %lld) leaving milestone 0",
			    __func__, ldcp->ldc_id);

			vsw_set_vnet_proto_ops(ldcp);

			/*
			 * Next milestone is passed when attribute
			 * information has been successfully exchanged.
			 */
			ldcp->hphase = VSW_MILESTONE1;
			vsw_send_attr(ldcp);

		}
		break;

	case VSW_MILESTONE1:
		/*
		 * Only way to pass this milestone is to have successfully
		 * negotiated attribute information.
		 */
		if (ldcp->lane_in.lstate & VSW_ATTR_ACK_SENT) {

			ldcp->hphase = VSW_MILESTONE2;

			/*
			 * If the peer device has said it wishes to
			 * use descriptor rings then we send it our ring
			 * info, otherwise we just set up a private ring
			 * which we use an internal buffer
			 *
			 * For version >= 1.2 the transfer mode is tested
			 * as a bit mask; for older versions it must equal
			 * the v1.0 dring mode exactly.
			 */
			if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
			    (ldcp->lane_in.xfer_mode & VIO_DRING_MODE_V1_2)) ||
			    (VSW_VER_LT(ldcp, 1, 2) &&
			    (ldcp->lane_in.xfer_mode ==
			    VIO_DRING_MODE_V1_0))) {
				vsw_send_dring_info(ldcp);
			}
		}
		break;

	case VSW_MILESTONE2:
		/*
		 * If peer has indicated in its attribute message that
		 * it wishes to use descriptor rings then the only way
		 * to pass this milestone is for us to have received
		 * valid dring info.
		 *
		 * If peer is not using descriptor rings then just fall
		 * through.
		 */
		if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
		    (ldcp->lane_in.xfer_mode & VIO_DRING_MODE_V1_2)) ||
		    (VSW_VER_LT(ldcp, 1, 2) &&
		    (ldcp->lane_in.xfer_mode ==
		    VIO_DRING_MODE_V1_0))) {
			/* dring mode: stay here until the dring ACK is sent */
			if (!(ldcp->lane_in.lstate & VSW_DRING_ACK_SENT))
				break;
		}

		D2(vswp, "%s: (chan %lld) leaving milestone 2",
		    __func__, ldcp->ldc_id);

		ldcp->hphase = VSW_MILESTONE3;
		vsw_send_rdx(ldcp);
		break;

	case VSW_MILESTONE3:
		/*
		 * Pass this milestone when all parameters have been
		 * successfully exchanged and RDX sent in both directions.
		 *
		 * Mark outbound lane as available to transmit data.
		 */
		if ((ldcp->lane_out.lstate & VSW_RDX_ACK_SENT) &&
		    (ldcp->lane_in.lstate & VSW_RDX_ACK_RECV)) {

			D2(vswp, "%s: (chan %lld) leaving milestone 3",
			    __func__, ldcp->ldc_id);
			D2(vswp, "%s: ** handshake complete (0x%llx : "
			    "0x%llx) **", __func__, ldcp->lane_in.lstate,
			    ldcp->lane_out.lstate);
			ldcp->lane_out.lstate |= VSW_LANE_ACTIVE;
			ldcp->hphase = VSW_MILESTONE4;
			/* handshake succeeded: reset the attempt counter */
			ldcp->hcnt = 0;
			DISPLAY_STATE();
			/* Start HIO if enabled and capable */
			if ((portp->p_hio_enabled) && (portp->p_hio_capable)) {
				D2(vswp, "%s: start HybridIO setup", __func__);
				vsw_hio_start(vswp, ldcp);
			}
		} else {
			D2(vswp, "%s: still in milestone 3 (0x%llx : 0x%llx)",
			    __func__, ldcp->lane_in.lstate,
			    ldcp->lane_out.lstate);
		}
		break;

	case VSW_MILESTONE4:
		/* handshake already complete; nothing further to advance */
		D2(vswp, "%s: (chan %lld) in milestone 4", __func__,
		    ldcp->ldc_id);
		break;

	default:
		DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__,
		    ldcp->ldc_id, ldcp->hphase);
	}

	D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id,
	    ldcp->hphase);
}
2075 
2076 /*
2077  * Check if major version is supported.
2078  *
2079  * Returns 0 if finds supported major number, and if necessary
2080  * adjusts the minor field.
2081  *
2082  * Returns 1 if can't match major number exactly. Sets mjor/minor
2083  * to next lowest support values, or to zero if no other values possible.
2084  */
2085 static int
2086 vsw_supported_version(vio_ver_msg_t *vp)
2087 {
2088 	int	i;
2089 
2090 	D1(NULL, "vsw_supported_version: enter");
2091 
2092 	for (i = 0; i < VSW_NUM_VER; i++) {
2093 		if (vsw_versions[i].ver_major == vp->ver_major) {
2094 			/*
2095 			 * Matching or lower major version found. Update
2096 			 * minor number if necessary.
2097 			 */
2098 			if (vp->ver_minor > vsw_versions[i].ver_minor) {
2099 				D2(NULL, "%s: adjusting minor value from %d "
2100 				    "to %d", __func__, vp->ver_minor,
2101 				    vsw_versions[i].ver_minor);
2102 				vp->ver_minor = vsw_versions[i].ver_minor;
2103 			}
2104 
2105 			return (0);
2106 		}
2107 
2108 		/*
2109 		 * If the message contains a higher major version number, set
2110 		 * the message's major/minor versions to the current values
2111 		 * and return false, so this message will get resent with
2112 		 * these values.
2113 		 */
2114 		if (vsw_versions[i].ver_major < vp->ver_major) {
2115 			D2(NULL, "%s: adjusting major and minor "
2116 			    "values to %d, %d\n",
2117 			    __func__, vsw_versions[i].ver_major,
2118 			    vsw_versions[i].ver_minor);
2119 			vp->ver_major = vsw_versions[i].ver_major;
2120 			vp->ver_minor = vsw_versions[i].ver_minor;
2121 			return (1);
2122 		}
2123 	}
2124 
2125 	/* No match was possible, zero out fields */
2126 	vp->ver_major = 0;
2127 	vp->ver_minor = 0;
2128 
2129 	D1(NULL, "vsw_supported_version: exit");
2130 
2131 	return (1);
2132 }
2133 
2134 /*
2135  * Set vnet-protocol-version dependent functions based on version.
2136  */
2137 static void
2138 vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp)
2139 {
2140 	vsw_t	*vswp = ldcp->ldc_vswp;
2141 	lane_t	*lp = &ldcp->lane_out;
2142 
2143 	if (VSW_VER_GTEQ(ldcp, 1, 4)) {
2144 		/*
2145 		 * If the version negotiated with peer is >= 1.4(Jumbo Frame
2146 		 * Support), set the mtu in our attributes to max_frame_size.
2147 		 */
2148 		lp->mtu = vswp->max_frame_size;
2149 	} else if (VSW_VER_EQ(ldcp, 1, 3)) {
2150 		/*
2151 		 * If the version negotiated with peer is == 1.3 (Vlan Tag
2152 		 * Support) set the attr.mtu to ETHERMAX + VLAN_TAGSZ.
2153 		 */
2154 		lp->mtu = ETHERMAX + VLAN_TAGSZ;
2155 	} else {
2156 		vsw_port_t	*portp = ldcp->ldc_port;
2157 		/*
2158 		 * Pre-1.3 peers expect max frame size of ETHERMAX.
2159 		 * We can negotiate that size with those peers provided only
2160 		 * pvid is defined for our peer and there are no vids. Then we
2161 		 * can send/recv only untagged frames of max size ETHERMAX.
2162 		 * Note that pvid of the peer can be different, as vsw has to
2163 		 * serve the vnet in that vlan even if itself is not assigned
2164 		 * to that vlan.
2165 		 */
2166 		if (portp->nvids == 0) {
2167 			lp->mtu = ETHERMAX;
2168 		}
2169 	}
2170 
2171 	if (VSW_VER_GTEQ(ldcp, 1, 2)) {
2172 		/* Versions >= 1.2 */
2173 
2174 		if (VSW_PRI_ETH_DEFINED(vswp)) {
2175 			/*
2176 			 * enable priority routines and pkt mode only if
2177 			 * at least one pri-eth-type is specified in MD.
2178 			 */
2179 			ldcp->tx = vsw_ldctx_pri;
2180 			ldcp->rx_pktdata = vsw_process_pkt_data;
2181 
2182 			/* set xfer mode for vsw_send_attr() */
2183 			lp->xfer_mode = VIO_PKT_MODE | VIO_DRING_MODE_V1_2;
2184 		} else {
2185 			/* no priority eth types defined in MD */
2186 
2187 			ldcp->tx = vsw_ldctx;
2188 			ldcp->rx_pktdata = vsw_process_pkt_data_nop;
2189 
2190 			/* set xfer mode for vsw_send_attr() */
2191 			lp->xfer_mode = VIO_DRING_MODE_V1_2;
2192 		}
2193 
2194 	} else {
2195 		/* Versions prior to 1.2  */
2196 
2197 		vsw_reset_vnet_proto_ops(ldcp);
2198 	}
2199 }
2200 
2201 /*
2202  * Reset vnet-protocol-version dependent functions to v1.0.
2203  */
2204 static void
2205 vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp)
2206 {
2207 	lane_t	*lp = &ldcp->lane_out;
2208 
2209 	ldcp->tx = vsw_ldctx;
2210 	ldcp->rx_pktdata = vsw_process_pkt_data_nop;
2211 
2212 	/* set xfer mode for vsw_send_attr() */
2213 	lp->xfer_mode = VIO_DRING_MODE_V1_0;
2214 }
2215 
2216 /*
2217  * Main routine for processing messages received over LDC.
2218  */
2219 static void
2220 vsw_process_pkt(void *arg)
2221 {
2222 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
2223 	vsw_t 		*vswp = ldcp->ldc_vswp;
2224 	size_t		msglen;
2225 	vio_msg_tag_t	*tagp;
2226 	uint64_t	*ldcmsg;
2227 	int 		rv = 0;
2228 
2229 
2230 	D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
2231 
2232 	ASSERT(MUTEX_HELD(&ldcp->ldc_cblock));
2233 
2234 	ldcmsg = ldcp->ldcmsg;
2235 	/*
2236 	 * If channel is up read messages until channel is empty.
2237 	 */
2238 	do {
2239 		msglen = ldcp->msglen;
2240 		rv = ldc_read(ldcp->ldc_handle, (caddr_t)ldcmsg, &msglen);
2241 
2242 		if (rv != 0) {
2243 			DERR(vswp, "%s :ldc_read err id(%lld) rv(%d) len(%d)\n",
2244 			    __func__, ldcp->ldc_id, rv, msglen);
2245 		}
2246 
2247 		/* channel has been reset */
2248 		if (rv == ECONNRESET) {
2249 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
2250 			break;
2251 		}
2252 
2253 		if (msglen == 0) {
2254 			D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__,
2255 			    ldcp->ldc_id);
2256 			break;
2257 		}
2258 
2259 		D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__,
2260 		    ldcp->ldc_id, msglen);
2261 
2262 		/*
2263 		 * Figure out what sort of packet we have gotten by
2264 		 * examining the msg tag, and then switch it appropriately.
2265 		 */
2266 		tagp = (vio_msg_tag_t *)ldcmsg;
2267 
2268 		switch (tagp->vio_msgtype) {
2269 		case VIO_TYPE_CTRL:
2270 			vsw_dispatch_ctrl_task(ldcp, ldcmsg, tagp);
2271 			break;
2272 		case VIO_TYPE_DATA:
2273 			vsw_process_data_pkt(ldcp, ldcmsg, tagp, msglen);
2274 			break;
2275 		case VIO_TYPE_ERR:
2276 			vsw_process_err_pkt(ldcp, ldcmsg, tagp);
2277 			break;
2278 		default:
2279 			DERR(vswp, "%s: Unknown tag(%lx) ", __func__,
2280 			    "id(%lx)\n", tagp->vio_msgtype, ldcp->ldc_id);
2281 			break;
2282 		}
2283 	} while (msglen);
2284 
2285 	D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id);
2286 }
2287 
2288 /*
2289  * Dispatch a task to process a VIO control message.
2290  */
2291 static void
2292 vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t *tagp)
2293 {
2294 	vsw_ctrl_task_t		*ctaskp = NULL;
2295 	vsw_port_t		*port = ldcp->ldc_port;
2296 	vsw_t			*vswp = port->p_vswp;
2297 
2298 	D1(vswp, "%s: enter", __func__);
2299 
2300 	/*
2301 	 * We need to handle RDX ACK messages in-band as once they
2302 	 * are exchanged it is possible that we will get an
2303 	 * immediate (legitimate) data packet.
2304 	 */
2305 	if ((tagp->vio_subtype_env == VIO_RDX) &&
2306 	    (tagp->vio_subtype == VIO_SUBTYPE_ACK)) {
2307 
2308 		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_ACK_RECV))
2309 			return;
2310 
2311 		ldcp->lane_in.lstate |= VSW_RDX_ACK_RECV;
2312 		D2(vswp, "%s (%ld) handling RDX_ACK in place "
2313 		    "(ostate 0x%llx : hphase %d)", __func__,
2314 		    ldcp->ldc_id, ldcp->lane_in.lstate, ldcp->hphase);
2315 		vsw_next_milestone(ldcp);
2316 		return;
2317 	}
2318 
2319 	ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP);
2320 
2321 	if (ctaskp == NULL) {
2322 		DERR(vswp, "%s: unable to alloc space for ctrl msg", __func__);
2323 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2324 		return;
2325 	}
2326 
2327 	ctaskp->ldcp = ldcp;
2328 	bcopy((def_msg_t *)cpkt, &ctaskp->pktp, sizeof (def_msg_t));
2329 	ctaskp->hss_id = ldcp->hss_id;
2330 
2331 	/*
2332 	 * Dispatch task to processing taskq if port is not in
2333 	 * the process of being detached.
2334 	 */
2335 	mutex_enter(&port->state_lock);
2336 	if (port->state == VSW_PORT_INIT) {
2337 		if ((vswp->taskq_p == NULL) ||
2338 		    (ddi_taskq_dispatch(vswp->taskq_p, vsw_process_ctrl_pkt,
2339 		    ctaskp, DDI_NOSLEEP) != DDI_SUCCESS)) {
2340 			mutex_exit(&port->state_lock);
2341 			DERR(vswp, "%s: unable to dispatch task to taskq",
2342 			    __func__);
2343 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2344 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2345 			return;
2346 		}
2347 	} else {
2348 		kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2349 		DWARN(vswp, "%s: port %d detaching, not dispatching "
2350 		    "task", __func__, port->p_instance);
2351 	}
2352 
2353 	mutex_exit(&port->state_lock);
2354 
2355 	D2(vswp, "%s: dispatched task to taskq for chan %d", __func__,
2356 	    ldcp->ldc_id);
2357 	D1(vswp, "%s: exit", __func__);
2358 }
2359 
2360 /*
2361  * Process a VIO ctrl message. Invoked from taskq.
2362  */
2363 static void
2364 vsw_process_ctrl_pkt(void *arg)
2365 {
2366 	vsw_ctrl_task_t	*ctaskp = (vsw_ctrl_task_t *)arg;
2367 	vsw_ldc_t	*ldcp = ctaskp->ldcp;
2368 	vsw_t 		*vswp = ldcp->ldc_vswp;
2369 	vio_msg_tag_t	tag;
2370 	uint16_t	env;
2371 
2372 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2373 
2374 	bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t));
2375 	env = tag.vio_subtype_env;
2376 
2377 	/* stale pkt check */
2378 	if (ctaskp->hss_id < ldcp->hss_id) {
2379 		DWARN(vswp, "%s: discarding stale packet belonging to earlier"
2380 		    " (%ld) handshake session", __func__, ctaskp->hss_id);
2381 		kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2382 		return;
2383 	}
2384 
2385 	/* session id check */
2386 	if (ldcp->session_status & VSW_PEER_SESSION) {
2387 		if (ldcp->peer_session != tag.vio_sid) {
2388 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
2389 			    __func__, ldcp->ldc_id, tag.vio_sid);
2390 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2391 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2392 			return;
2393 		}
2394 	}
2395 
2396 	/*
2397 	 * Switch on vio_subtype envelope, then let lower routines
2398 	 * decide if its an INFO, ACK or NACK packet.
2399 	 */
2400 	switch (env) {
2401 	case VIO_VER_INFO:
2402 		vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp);
2403 		break;
2404 	case VIO_DRING_REG:
2405 		vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp);
2406 		break;
2407 	case VIO_DRING_UNREG:
2408 		vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp);
2409 		break;
2410 	case VIO_ATTR_INFO:
2411 		vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp);
2412 		break;
2413 	case VNET_MCAST_INFO:
2414 		vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp);
2415 		break;
2416 	case VIO_RDX:
2417 		vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp);
2418 		break;
2419 	case VIO_DDS_INFO:
2420 		vsw_process_dds_msg(vswp, ldcp, &ctaskp->pktp);
2421 		break;
2422 	default:
2423 		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env);
2424 	}
2425 
2426 	kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2427 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
2428 }
2429 
2430 /*
2431  * Version negotiation. We can end up here either because our peer
2432  * has responded to a handshake message we have sent it, or our peer
2433  * has initiated a handshake with us. If its the former then can only
2434  * be ACK or NACK, if its the later can only be INFO.
2435  *
2436  * If its an ACK we move to the next stage of the handshake, namely
2437  * attribute exchange. If its a NACK we see if we can specify another
2438  * version, if we can't we stop.
2439  *
2440  * If it is an INFO we reset all params associated with communication
2441  * in that direction over this channel (remember connection is
2442  * essentially 2 independent simplex channels).
2443  */
2444 void
2445 vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt)
2446 {
2447 	vio_ver_msg_t	*ver_pkt;
2448 	vsw_t 		*vswp = ldcp->ldc_vswp;
2449 
2450 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2451 
2452 	/*
2453 	 * We know this is a ctrl/version packet so
2454 	 * cast it into the correct structure.
2455 	 */
2456 	ver_pkt = (vio_ver_msg_t *)pkt;
2457 
2458 	switch (ver_pkt->tag.vio_subtype) {
2459 	case VIO_SUBTYPE_INFO:
2460 		D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n");
2461 
2462 		/*
2463 		 * Record the session id, which we will use from now
2464 		 * until we see another VER_INFO msg. Even then the
2465 		 * session id in most cases will be unchanged, execpt
2466 		 * if channel was reset.
2467 		 */
2468 		if ((ldcp->session_status & VSW_PEER_SESSION) &&
2469 		    (ldcp->peer_session != ver_pkt->tag.vio_sid)) {
2470 			DERR(vswp, "%s: updating session id for chan %lld "
2471 			    "from %llx to %llx", __func__, ldcp->ldc_id,
2472 			    ldcp->peer_session, ver_pkt->tag.vio_sid);
2473 		}
2474 
2475 		ldcp->peer_session = ver_pkt->tag.vio_sid;
2476 		ldcp->session_status |= VSW_PEER_SESSION;
2477 
2478 		/* Legal message at this time ? */
2479 		if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV))
2480 			return;
2481 
2482 		/*
2483 		 * First check the device class. Currently only expect
2484 		 * to be talking to a network device. In the future may
2485 		 * also talk to another switch.
2486 		 */
2487 		if (ver_pkt->dev_class != VDEV_NETWORK) {
2488 			DERR(vswp, "%s: illegal device class %d", __func__,
2489 			    ver_pkt->dev_class);
2490 
2491 			ver_pkt->tag.vio_sid = ldcp->local_session;
2492 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2493 
2494 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2495 
2496 			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2497 			    sizeof (vio_ver_msg_t), B_TRUE);
2498 
2499 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
2500 			vsw_next_milestone(ldcp);
2501 			return;
2502 		} else {
2503 			ldcp->dev_class = ver_pkt->dev_class;
2504 		}
2505 
2506 		/*
2507 		 * Now check the version.
2508 		 */
2509 		if (vsw_supported_version(ver_pkt) == 0) {
2510 			/*
2511 			 * Support this major version and possibly
2512 			 * adjusted minor version.
2513 			 */
2514 
2515 			D2(vswp, "%s: accepted ver %d:%d", __func__,
2516 			    ver_pkt->ver_major, ver_pkt->ver_minor);
2517 
2518 			/* Store accepted values */
2519 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
2520 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
2521 
2522 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2523 
2524 			ldcp->lane_in.lstate |= VSW_VER_ACK_SENT;
2525 
2526 			if (vsw_obp_ver_proto_workaround == B_TRUE) {
2527 				/*
2528 				 * Send a version info message
2529 				 * using the accepted version that
2530 				 * we are about to ack. Also note that
2531 				 * we send our ver info before we ack.
2532 				 * Otherwise, as soon as receiving the
2533 				 * ack, obp sends attr info msg, which
2534 				 * breaks vsw_check_flag() invoked
2535 				 * from vsw_process_ctrl_attr_pkt();
2536 				 * as we also need VSW_VER_ACK_RECV to
2537 				 * be set in lane_out.lstate, before
2538 				 * we can receive attr info.
2539 				 */
2540 				vsw_send_ver(ldcp);
2541 			}
2542 		} else {
2543 			/*
2544 			 * NACK back with the next lower major/minor
2545 			 * pairing we support (if don't suuport any more
2546 			 * versions then they will be set to zero.
2547 			 */
2548 
2549 			D2(vswp, "%s: replying with ver %d:%d", __func__,
2550 			    ver_pkt->ver_major, ver_pkt->ver_minor);
2551 
2552 			/* Store updated values */
2553 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
2554 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
2555 
2556 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2557 
2558 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
2559 		}
2560 
2561 		DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2562 		ver_pkt->tag.vio_sid = ldcp->local_session;
2563 		(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2564 		    sizeof (vio_ver_msg_t), B_TRUE);
2565 
2566 		vsw_next_milestone(ldcp);
2567 		break;
2568 
2569 	case VIO_SUBTYPE_ACK:
2570 		D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__);
2571 
2572 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV))
2573 			return;
2574 
2575 		/* Store updated values */
2576 		ldcp->lane_out.ver_major = ver_pkt->ver_major;
2577 		ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
2578 
2579 		ldcp->lane_out.lstate |= VSW_VER_ACK_RECV;
2580 		vsw_next_milestone(ldcp);
2581 
2582 		break;
2583 
2584 	case VIO_SUBTYPE_NACK:
2585 		D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__);
2586 
2587 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV))
2588 			return;
2589 
2590 		/*
2591 		 * If our peer sent us a NACK with the ver fields set to
2592 		 * zero then there is nothing more we can do. Otherwise see
2593 		 * if we support either the version suggested, or a lesser
2594 		 * one.
2595 		 */
2596 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
2597 			DERR(vswp, "%s: peer unable to negotiate any "
2598 			    "further.", __func__);
2599 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
2600 			vsw_next_milestone(ldcp);
2601 			return;
2602 		}
2603 
2604 		/*
2605 		 * Check to see if we support this major version or
2606 		 * a lower one. If we don't then maj/min will be set
2607 		 * to zero.
2608 		 */
2609 		(void) vsw_supported_version(ver_pkt);
2610 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
2611 			/* Nothing more we can do */
2612 			DERR(vswp, "%s: version negotiation failed.\n",
2613 			    __func__);
2614 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
2615 			vsw_next_milestone(ldcp);
2616 		} else {
2617 			/* found a supported major version */
2618 			ldcp->lane_out.ver_major = ver_pkt->ver_major;
2619 			ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
2620 
2621 			D2(vswp, "%s: resending with updated values (%x, %x)",
2622 			    __func__, ver_pkt->ver_major, ver_pkt->ver_minor);
2623 
2624 			ldcp->lane_out.lstate |= VSW_VER_INFO_SENT;
2625 			ver_pkt->tag.vio_sid = ldcp->local_session;
2626 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
2627 
2628 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2629 
2630 			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2631 			    sizeof (vio_ver_msg_t), B_TRUE);
2632 
2633 			vsw_next_milestone(ldcp);
2634 
2635 		}
2636 		break;
2637 
2638 	default:
2639 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
2640 		    ver_pkt->tag.vio_subtype);
2641 	}
2642 
2643 	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
2644 }
2645 
2646 /*
2647  * Process an attribute packet. We can end up here either because our peer
2648  * has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or our
2649  * peer has sent us an attribute INFO message
2650  *
2651  * If its an ACK we then move to the next stage of the handshake which
2652  * is to send our descriptor ring info to our peer. If its a NACK then
2653  * there is nothing more we can (currently) do.
2654  *
2655  * If we get a valid/acceptable INFO packet (and we have already negotiated
2656  * a version) we ACK back and set channel state to ATTR_RECV, otherwise we
2657  * NACK back and reset channel state to INACTIV.
2658  *
2659  * FUTURE: in time we will probably negotiate over attributes, but for
2660  * the moment unacceptable attributes are regarded as a fatal error.
2661  *
2662  */
2663 void
2664 vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt)
2665 {
2666 	vnet_attr_msg_t		*attr_pkt;
2667 	vsw_t			*vswp = ldcp->ldc_vswp;
2668 	vsw_port_t		*port = ldcp->ldc_port;
2669 	uint64_t		macaddr = 0;
2670 	lane_t			*lane_out = &ldcp->lane_out;
2671 	lane_t			*lane_in = &ldcp->lane_in;
2672 	uint32_t		mtu;
2673 	boolean_t		ack = B_TRUE;
2674 	int			i;
2675 
2676 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
2677 
2678 	/*
2679 	 * We know this is a ctrl/attr packet so
2680 	 * cast it into the correct structure.
2681 	 */
2682 	attr_pkt = (vnet_attr_msg_t *)pkt;
2683 
2684 	switch (attr_pkt->tag.vio_subtype) {
2685 	case VIO_SUBTYPE_INFO:
2686 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2687 
2688 		if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV))
2689 			return;
2690 
2691 		/*
2692 		 * If the attributes are unacceptable then we NACK back.
2693 		 */
2694 		if (vsw_check_attr(attr_pkt, ldcp)) {
2695 			ack = B_FALSE;
2696 
2697 			DERR(vswp, "%s (chan %d): invalid attributes",
2698 			    __func__, ldcp->ldc_id);
2699 
2700 		} else {
2701 
2702 			if (VSW_VER_GTEQ(ldcp, 1, 4)) {
2703 				/*
2704 				 * Versions >= 1.4:
2705 				 * The mtu is negotiated down to the
2706 				 * minimum of our mtu and peer's mtu.
2707 				 */
2708 				mtu = MIN(attr_pkt->mtu, vswp->max_frame_size);
2709 
2710 				/*
2711 				 * If we have received an ack for the attr info
2712 				 * that we sent, then check if the mtu computed
2713 				 * above matches the mtu that the peer had ack'd
2714 				 * (saved in local hparams). If they don't
2715 				 * match, we fail the handshake.
2716 				 */
2717 				if (lane_out->lstate & VSW_ATTR_ACK_RECV) {
2718 					if (mtu != lane_out->mtu) {
2719 						/* send NACK */
2720 						ack = B_FALSE;
2721 					}
2722 				} else {
2723 					/*
2724 					 * Save the mtu computed above in our
2725 					 * attr parameters, so it gets sent in
2726 					 * the attr info from us to the peer.
2727 					 */
2728 					lane_out->mtu = mtu;
2729 				}
2730 			}
2731 
2732 		}
2733 
2734 		if (ack == B_FALSE) {
2735 
2736 			vsw_free_lane_resources(ldcp, INBOUND);
2737 
2738 			attr_pkt->tag.vio_sid = ldcp->local_session;
2739 			attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2740 
2741 			DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
2742 			ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT;
2743 			(void) vsw_send_msg(ldcp, (void *)attr_pkt,
2744 			    sizeof (vnet_attr_msg_t), B_TRUE);
2745 
2746 			vsw_next_milestone(ldcp);
2747 			return;
2748 		}
2749 
2750 		/*
2751 		 * Otherwise store attributes for this lane and update
2752 		 * lane state.
2753 		 */
2754 		lane_in->mtu = attr_pkt->mtu;
2755 		lane_in->addr = attr_pkt->addr;
2756 		lane_in->addr_type = attr_pkt->addr_type;
2757 		lane_in->xfer_mode = attr_pkt->xfer_mode;
2758 		lane_in->ack_freq = attr_pkt->ack_freq;
2759 
2760 		if (VSW_VER_GTEQ(ldcp, 1, 4)) {
2761 			/* save the MIN mtu in the msg to be replied */
2762 			attr_pkt->mtu = mtu;
2763 		}
2764 
2765 		macaddr = lane_in->addr;
2766 		for (i = ETHERADDRL - 1; i >= 0; i--) {
2767 			port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF;
2768 			macaddr >>= 8;
2769 		}
2770 
2771 		/* create the fdb entry for this port/mac address */
2772 		vsw_fdbe_add(vswp, port);
2773 
2774 		/* add the port to the specified vlans */
2775 		vsw_vlan_add_ids(port, VSW_VNETPORT);
2776 
2777 		/* setup device specifc xmit routines */
2778 		mutex_enter(&port->tx_lock);
2779 		if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
2780 		    (lane_in->xfer_mode & VIO_DRING_MODE_V1_2)) ||
2781 		    (VSW_VER_LT(ldcp, 1, 2) &&
2782 		    (lane_in->xfer_mode == VIO_DRING_MODE_V1_0))) {
2783 			D2(vswp, "%s: mode = VIO_DRING_MODE", __func__);
2784 			port->transmit = vsw_dringsend;
2785 		} else if (lane_in->xfer_mode == VIO_DESC_MODE) {
2786 			D2(vswp, "%s: mode = VIO_DESC_MODE", __func__);
2787 			vsw_create_privring(ldcp);
2788 			port->transmit = vsw_descrsend;
2789 			lane_out->xfer_mode = VIO_DESC_MODE;
2790 		}
2791 
2792 		/*
2793 		 * HybridIO is supported only vnet, not by OBP.
2794 		 * So, set hio_capable to true only when in DRING mode.
2795 		 */
2796 		if (VSW_VER_GTEQ(ldcp, 1, 3) &&
2797 		    (lane_in->xfer_mode != VIO_DESC_MODE)) {
2798 			(void) atomic_swap_32(&port->p_hio_capable, B_TRUE);
2799 		} else {
2800 			(void) atomic_swap_32(&port->p_hio_capable, B_FALSE);
2801 		}
2802 
2803 		mutex_exit(&port->tx_lock);
2804 
2805 		attr_pkt->tag.vio_sid = ldcp->local_session;
2806 		attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2807 
2808 		DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
2809 
2810 		lane_in->lstate |= VSW_ATTR_ACK_SENT;
2811 
2812 		(void) vsw_send_msg(ldcp, (void *)attr_pkt,
2813 		    sizeof (vnet_attr_msg_t), B_TRUE);
2814 
2815 		vsw_next_milestone(ldcp);
2816 		break;
2817 
2818 	case VIO_SUBTYPE_ACK:
2819 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
2820 
2821 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV))
2822 			return;
2823 
2824 		if (VSW_VER_GTEQ(ldcp, 1, 4)) {
2825 			/*
2826 			 * Versions >= 1.4:
2827 			 * The ack msg sent by the peer contains the minimum of
2828 			 * our mtu (that we had sent in our attr info) and the
2829 			 * peer's mtu.
2830 			 *
2831 			 * If we have sent an ack for the attr info msg from
2832 			 * the peer, check if the mtu that was computed then
2833 			 * (saved in lane_out params) matches the mtu that the
2834 			 * peer has ack'd. If they don't match, we fail the
2835 			 * handshake.
2836 			 */
2837 			if (lane_in->lstate & VSW_ATTR_ACK_SENT) {
2838 				if (lane_out->mtu != attr_pkt->mtu) {
2839 					return;
2840 				}
2841 			} else {
2842 				/*
2843 				 * If the mtu ack'd by the peer is > our mtu
2844 				 * fail handshake. Otherwise, save the mtu, so
2845 				 * we can validate it when we receive attr info
2846 				 * from our peer.
2847 				 */
2848 				if (attr_pkt->mtu > lane_out->mtu) {
2849 					return;
2850 				}
2851 				if (attr_pkt->mtu <= lane_out->mtu) {
2852 					lane_out->mtu = attr_pkt->mtu;
2853 				}
2854 			}
2855 		}
2856 
2857 		lane_out->lstate |= VSW_ATTR_ACK_RECV;
2858 		vsw_next_milestone(ldcp);
2859 		break;
2860 
2861 	case VIO_SUBTYPE_NACK:
2862 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
2863 
2864 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV))
2865 			return;
2866 
2867 		lane_out->lstate |= VSW_ATTR_NACK_RECV;
2868 		vsw_next_milestone(ldcp);
2869 		break;
2870 
2871 	default:
2872 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
2873 		    attr_pkt->tag.vio_subtype);
2874 	}
2875 
2876 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
2877 }
2878 
2879 /*
2880  * Process a dring info packet. We can end up here either because our peer
2881  * has ACK/NACK'ed back to an earlier DRING msg we had sent it, or our
2882  * peer has sent us a dring INFO message.
2883  *
2884  * If we get a valid/acceptable INFO packet (and we have already negotiated
2885  * a version) we ACK back and update the lane state, otherwise we NACK back.
2886  *
2887  * FUTURE: nothing to stop client from sending us info on multiple dring's
2888  * but for the moment we will just use the first one we are given.
2889  *
2890  */
2891 void
2892 vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt)
2893 {
2894 	vio_dring_reg_msg_t	*dring_pkt;
2895 	vsw_t			*vswp = ldcp->ldc_vswp;
2896 	ldc_mem_info_t		minfo;
2897 	dring_info_t		*dp, *dbp;
2898 	int			dring_found = 0;
2899 
2900 	/*
2901 	 * We know this is a ctrl/dring packet so
2902 	 * cast it into the correct structure.
2903 	 */
2904 	dring_pkt = (vio_dring_reg_msg_t *)pkt;
2905 
2906 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
2907 
2908 	switch (dring_pkt->tag.vio_subtype) {
2909 	case VIO_SUBTYPE_INFO:
2910 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2911 
2912 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
2913 			return;
2914 
2915 		/*
2916 		 * If the dring params are unacceptable then we NACK back.
2917 		 */
2918 		if (vsw_check_dring_info(dring_pkt)) {
2919 
2920 			DERR(vswp, "%s (%lld): invalid dring info",
2921 			    __func__, ldcp->ldc_id);
2922 
2923 			vsw_free_lane_resources(ldcp, INBOUND);
2924 
2925 			dring_pkt->tag.vio_sid = ldcp->local_session;
2926 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2927 
2928 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
2929 
2930 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
2931 
2932 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
2933 			    sizeof (vio_dring_reg_msg_t), B_TRUE);
2934 
2935 			vsw_next_milestone(ldcp);
2936 			return;
2937 		}
2938 
2939 		/*
2940 		 * Otherwise, attempt to map in the dring using the
2941 		 * cookie. If that succeeds we send back a unique dring
2942 		 * identifier that the sending side will use in future
2943 		 * to refer to this descriptor ring.
2944 		 */
2945 		dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
2946 
2947 		dp->num_descriptors = dring_pkt->num_descriptors;
2948 		dp->descriptor_size = dring_pkt->descriptor_size;
2949 		dp->options = dring_pkt->options;
2950 		dp->ncookies = dring_pkt->ncookies;
2951 
2952 		/*
2953 		 * Note: should only get one cookie. Enforced in
2954 		 * the ldc layer.
2955 		 */
2956 		bcopy(&dring_pkt->cookie[0], &dp->cookie[0],
2957 		    sizeof (ldc_mem_cookie_t));
2958 
2959 		D2(vswp, "%s: num_desc %ld : desc_size %ld", __func__,
2960 		    dp->num_descriptors, dp->descriptor_size);
2961 		D2(vswp, "%s: options 0x%lx: ncookies %ld", __func__,
2962 		    dp->options, dp->ncookies);
2963 
2964 		if ((ldc_mem_dring_map(ldcp->ldc_handle, &dp->cookie[0],
2965 		    dp->ncookies, dp->num_descriptors, dp->descriptor_size,
2966 		    LDC_DIRECT_MAP, &(dp->handle))) != 0) {
2967 
2968 			DERR(vswp, "%s: dring_map failed\n", __func__);
2969 
2970 			kmem_free(dp, sizeof (dring_info_t));
2971 			vsw_free_lane_resources(ldcp, INBOUND);
2972 
2973 			dring_pkt->tag.vio_sid = ldcp->local_session;
2974 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2975 
2976 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
2977 
2978 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
2979 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
2980 			    sizeof (vio_dring_reg_msg_t), B_TRUE);
2981 
2982 			vsw_next_milestone(ldcp);
2983 			return;
2984 		}
2985 
2986 		if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
2987 
2988 			DERR(vswp, "%s: dring_addr failed\n", __func__);
2989 
2990 			kmem_free(dp, sizeof (dring_info_t));
2991 			vsw_free_lane_resources(ldcp, INBOUND);
2992 
2993 			dring_pkt->tag.vio_sid = ldcp->local_session;
2994 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2995 
2996 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
2997 
2998 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
2999 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
3000 			    sizeof (vio_dring_reg_msg_t), B_TRUE);
3001 
3002 			vsw_next_milestone(ldcp);
3003 			return;
3004 		} else {
3005 			/* store the address of the pub part of ring */
3006 			dp->pub_addr = minfo.vaddr;
3007 
3008 			/* cache the dring mtype */
3009 			dp->dring_mtype = minfo.mtype;
3010 		}
3011 
3012 		/* no private section as we are importing */
3013 		dp->priv_addr = NULL;
3014 
3015 		/*
3016 		 * Using simple mono increasing int for ident at
3017 		 * the moment.
3018 		 */
3019 		dp->ident = ldcp->next_ident;
3020 		ldcp->next_ident++;
3021 
3022 		dp->end_idx = 0;
3023 		dp->next = NULL;
3024 
3025 		/*
3026 		 * Link it onto the end of the list of drings
3027 		 * for this lane.
3028 		 */
3029 		if (ldcp->lane_in.dringp == NULL) {
3030 			D2(vswp, "%s: adding first INBOUND dring", __func__);
3031 			ldcp->lane_in.dringp = dp;
3032 		} else {
3033 			dbp = ldcp->lane_in.dringp;
3034 
3035 			while (dbp->next != NULL)
3036 				dbp = dbp->next;
3037 
3038 			dbp->next = dp;
3039 		}
3040 
3041 		/* acknowledge it */
3042 		dring_pkt->tag.vio_sid = ldcp->local_session;
3043 		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3044 		dring_pkt->dring_ident = dp->ident;
3045 
3046 		(void) vsw_send_msg(ldcp, (void *)dring_pkt,
3047 		    sizeof (vio_dring_reg_msg_t), B_TRUE);
3048 
3049 		ldcp->lane_in.lstate |= VSW_DRING_ACK_SENT;
3050 		vsw_next_milestone(ldcp);
3051 		break;
3052 
3053 	case VIO_SUBTYPE_ACK:
3054 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3055 
3056 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV))
3057 			return;
3058 
3059 		/*
3060 		 * Peer is acknowledging our dring info and will have
3061 		 * sent us a dring identifier which we will use to
3062 		 * refer to this ring w.r.t. our peer.
3063 		 */
3064 		dp = ldcp->lane_out.dringp;
3065 		if (dp != NULL) {
3066 			/*
3067 			 * Find the ring this ident should be associated
3068 			 * with.
3069 			 */
3070 			if (vsw_dring_match(dp, dring_pkt)) {
3071 				dring_found = 1;
3072 
3073 			} else while (dp != NULL) {
3074 				if (vsw_dring_match(dp, dring_pkt)) {
3075 					dring_found = 1;
3076 					break;
3077 				}
3078 				dp = dp->next;
3079 			}
3080 
3081 			if (dring_found == 0) {
3082 				DERR(NULL, "%s: unrecognised ring cookie",
3083 				    __func__);
3084 				vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3085 				return;
3086 			}
3087 
3088 		} else {
3089 			DERR(vswp, "%s: DRING ACK received but no drings "
3090 			    "allocated", __func__);
3091 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3092 			return;
3093 		}
3094 
3095 		/* store ident */
3096 		dp->ident = dring_pkt->dring_ident;
3097 		ldcp->lane_out.lstate |= VSW_DRING_ACK_RECV;
3098 		vsw_next_milestone(ldcp);
3099 		break;
3100 
3101 	case VIO_SUBTYPE_NACK:
3102 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3103 
3104 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV))
3105 			return;
3106 
3107 		ldcp->lane_out.lstate |= VSW_DRING_NACK_RECV;
3108 		vsw_next_milestone(ldcp);
3109 		break;
3110 
3111 	default:
3112 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3113 		    dring_pkt->tag.vio_subtype);
3114 	}
3115 
3116 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
3117 }
3118 
3119 /*
3120  * Process a request from peer to unregister a dring.
3121  *
3122  * For the moment we just restart the handshake if our
3123  * peer endpoint attempts to unregister a dring.
3124  */
3125 void
3126 vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt)
3127 {
3128 	vsw_t			*vswp = ldcp->ldc_vswp;
3129 	vio_dring_unreg_msg_t	*dring_pkt;
3130 
3131 	/*
3132 	 * We know this is a ctrl/dring packet so
3133 	 * cast it into the correct structure.
3134 	 */
3135 	dring_pkt = (vio_dring_unreg_msg_t *)pkt;
3136 
3137 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3138 
3139 	switch (dring_pkt->tag.vio_subtype) {
3140 	case VIO_SUBTYPE_INFO:
3141 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3142 
3143 		DWARN(vswp, "%s: restarting handshake..", __func__);
3144 		break;
3145 
3146 	case VIO_SUBTYPE_ACK:
3147 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3148 
3149 		DWARN(vswp, "%s: restarting handshake..", __func__);
3150 		break;
3151 
3152 	case VIO_SUBTYPE_NACK:
3153 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3154 
3155 		DWARN(vswp, "%s: restarting handshake..", __func__);
3156 		break;
3157 
3158 	default:
3159 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3160 		    dring_pkt->tag.vio_subtype);
3161 	}
3162 
3163 	vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3164 
3165 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3166 }
3167 
/*
 * Turn the multicast message 'pkt' into a NACK carrying our local
 * session id and send it back over 'ldcp'.
 *
 * The body is wrapped in do { } while (0) and the arguments are
 * parenthesised so that the macro behaves as a single statement (e.g.
 * as the body of an unbraced if/else) and accepts arbitrary expressions.
 * Invoke it with a trailing semicolon.
 */
#define	SND_MCST_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		(void) vsw_send_msg((ldcp), (void *)(pkt), \
		    sizeof (vnet_mcast_msg_t), B_TRUE); \
	_NOTE(CONSTCOND) } while (0)
3173 
3174 /*
3175  * Process a multicast request from a vnet.
3176  *
3177  * Vnet's specify a multicast address that they are interested in. This
3178  * address is used as a key into the hash table which forms the multicast
3179  * forwarding database (mFDB).
3180  *
3181  * The table keys are the multicast addresses, while the table entries
3182  * are pointers to lists of ports which wish to receive packets for the
3183  * specified multicast address.
3184  *
3185  * When a multicast packet is being switched we use the address as a key
3186  * into the hash table, and then walk the appropriate port list forwarding
3187  * the pkt to each port in turn.
3188  *
3189  * If a vnet is no longer interested in a particular multicast grouping
3190  * we simply find the correct location in the hash table and then delete
3191  * the relevant port from the port list.
3192  *
3193  * To deal with the case whereby a port is being deleted without first
3194  * removing itself from the lists in the hash table, we maintain a list
3195  * of multicast addresses the port has registered an interest in, within
3196  * the port structure itself. We then simply walk that list of addresses
3197  * using them as keys into the hash table and remove the port from the
3198  * appropriate lists.
3199  */
3200 static void
3201 vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt)
3202 {
3203 	vnet_mcast_msg_t	*mcst_pkt;
3204 	vsw_port_t		*port = ldcp->ldc_port;
3205 	vsw_t			*vswp = ldcp->ldc_vswp;
3206 	int			i;
3207 
3208 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3209 
3210 	/*
3211 	 * We know this is a ctrl/mcast packet so
3212 	 * cast it into the correct structure.
3213 	 */
3214 	mcst_pkt = (vnet_mcast_msg_t *)pkt;
3215 
3216 	switch (mcst_pkt->tag.vio_subtype) {
3217 	case VIO_SUBTYPE_INFO:
3218 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3219 
3220 		/*
3221 		 * Check if in correct state to receive a multicast
3222 		 * message (i.e. handshake complete). If not reset
3223 		 * the handshake.
3224 		 */
3225 		if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV))
3226 			return;
3227 
3228 		/*
3229 		 * Before attempting to add or remove address check
3230 		 * that they are valid multicast addresses.
3231 		 * If not, then NACK back.
3232 		 */
3233 		for (i = 0; i < mcst_pkt->count; i++) {
3234 			if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) {
3235 				DERR(vswp, "%s: invalid multicast address",
3236 				    __func__);
3237 				SND_MCST_NACK(ldcp, mcst_pkt);
3238 				return;
3239 			}
3240 		}
3241 
3242 		/*
3243 		 * Now add/remove the addresses. If this fails we
3244 		 * NACK back.
3245 		 */
3246 		if (vsw_add_rem_mcst(mcst_pkt, port) != 0) {
3247 			SND_MCST_NACK(ldcp, mcst_pkt);
3248 			return;
3249 		}
3250 
3251 		mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3252 		mcst_pkt->tag.vio_sid = ldcp->local_session;
3253 
3254 		DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt);
3255 
3256 		(void) vsw_send_msg(ldcp, (void *)mcst_pkt,
3257 		    sizeof (vnet_mcast_msg_t), B_TRUE);
3258 		break;
3259 
3260 	case VIO_SUBTYPE_ACK:
3261 		DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3262 
3263 		/*
3264 		 * We shouldn't ever get a multicast ACK message as
3265 		 * at the moment we never request multicast addresses
3266 		 * to be set on some other device. This may change in
3267 		 * the future if we have cascading switches.
3268 		 */
3269 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV))
3270 			return;
3271 
3272 				/* Do nothing */
3273 		break;
3274 
3275 	case VIO_SUBTYPE_NACK:
3276 		DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3277 
3278 		/*
3279 		 * We shouldn't get a multicast NACK packet for the
3280 		 * same reasons as we shouldn't get a ACK packet.
3281 		 */
3282 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV))
3283 			return;
3284 
3285 				/* Do nothing */
3286 		break;
3287 
3288 	default:
3289 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
3290 		    mcst_pkt->tag.vio_subtype);
3291 	}
3292 
3293 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3294 }
3295 
3296 static void
3297 vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt)
3298 {
3299 	vio_rdx_msg_t	*rdx_pkt;
3300 	vsw_t		*vswp = ldcp->ldc_vswp;
3301 
3302 	/*
3303 	 * We know this is a ctrl/rdx packet so
3304 	 * cast it into the correct structure.
3305 	 */
3306 	rdx_pkt = (vio_rdx_msg_t *)pkt;
3307 
3308 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
3309 
3310 	switch (rdx_pkt->tag.vio_subtype) {
3311 	case VIO_SUBTYPE_INFO:
3312 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3313 
3314 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_INFO_RECV))
3315 			return;
3316 
3317 		rdx_pkt->tag.vio_sid = ldcp->local_session;
3318 		rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3319 
3320 		DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt);
3321 
3322 		ldcp->lane_out.lstate |= VSW_RDX_ACK_SENT;
3323 
3324 		(void) vsw_send_msg(ldcp, (void *)rdx_pkt,
3325 		    sizeof (vio_rdx_msg_t), B_TRUE);
3326 
3327 		vsw_next_milestone(ldcp);
3328 		break;
3329 
3330 	case VIO_SUBTYPE_ACK:
3331 		/*
3332 		 * Should be handled in-band by callback handler.
3333 		 */
3334 		DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__);
3335 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3336 		break;
3337 
3338 	case VIO_SUBTYPE_NACK:
3339 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3340 
3341 		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_NACK_RECV))
3342 			return;
3343 
3344 		ldcp->lane_in.lstate |= VSW_RDX_NACK_RECV;
3345 		vsw_next_milestone(ldcp);
3346 		break;
3347 
3348 	default:
3349 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3350 		    rdx_pkt->tag.vio_subtype);
3351 	}
3352 
3353 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3354 }
3355 
3356 static void
3357 vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t *tagp,
3358 	uint32_t msglen)
3359 {
3360 	uint16_t	env = tagp->vio_subtype_env;
3361 	vsw_t		*vswp = ldcp->ldc_vswp;
3362 
3363 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3364 
3365 	/* session id check */
3366 	if (ldcp->session_status & VSW_PEER_SESSION) {
3367 		if (ldcp->peer_session != tagp->vio_sid) {
3368 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
3369 			    __func__, ldcp->ldc_id, tagp->vio_sid);
3370 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3371 			return;
3372 		}
3373 	}
3374 
3375 	/*
3376 	 * It is an error for us to be getting data packets
3377 	 * before the handshake has completed.
3378 	 */
3379 	if (ldcp->hphase != VSW_MILESTONE4) {
3380 		DERR(vswp, "%s: got data packet before handshake complete "
3381 		    "hphase %d (%x: %x)", __func__, ldcp->hphase,
3382 		    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
3383 		DUMP_FLAGS(ldcp->lane_in.lstate);
3384 		DUMP_FLAGS(ldcp->lane_out.lstate);
3385 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3386 		return;
3387 	}
3388 
3389 	/*
3390 	 * To reduce the locking contention, release the
3391 	 * ldc_cblock here and re-acquire it once we are done
3392 	 * receiving packets.
3393 	 */
3394 	mutex_exit(&ldcp->ldc_cblock);
3395 	mutex_enter(&ldcp->ldc_rxlock);
3396 
3397 	/*
3398 	 * Switch on vio_subtype envelope, then let lower routines
3399 	 * decide if its an INFO, ACK or NACK packet.
3400 	 */
3401 	if (env == VIO_DRING_DATA) {
3402 		vsw_process_data_dring_pkt(ldcp, dpkt);
3403 	} else if (env == VIO_PKT_DATA) {
3404 		ldcp->rx_pktdata(ldcp, dpkt, msglen);
3405 	} else if (env == VIO_DESC_DATA) {
3406 		vsw_process_data_ibnd_pkt(ldcp, dpkt);
3407 	} else {
3408 		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env);
3409 	}
3410 
3411 	mutex_exit(&ldcp->ldc_rxlock);
3412 	mutex_enter(&ldcp->ldc_cblock);
3413 
3414 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3415 }
3416 
/*
 * Turn the dring data message 'pkt' into a NACK carrying our local
 * session id and send it back over 'ldcp'.
 *
 * The body is wrapped in do { } while (0) and the arguments are
 * parenthesised so that the macro behaves as a single statement (e.g.
 * as the body of an unbraced if/else) and accepts arbitrary expressions.
 * Invoke it with a trailing semicolon.
 */
#define	SND_DRING_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		(void) vsw_send_msg((ldcp), (void *)(pkt), \
		    sizeof (vio_dring_msg_t), B_TRUE); \
	_NOTE(CONSTCOND) } while (0)
3422 
3423 static void
3424 vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt)
3425 {
3426 	vio_dring_msg_t		*dring_pkt;
3427 	vnet_public_desc_t	desc, *pub_addr = NULL;
3428 	vsw_private_desc_t	*priv_addr = NULL;
3429 	dring_info_t		*dp = NULL;
3430 	vsw_t			*vswp = ldcp->ldc_vswp;
3431 	mblk_t			*mp = NULL;
3432 	mblk_t			*bp = NULL;
3433 	mblk_t			*bpt = NULL;
3434 	size_t			nbytes = 0;
3435 	uint64_t		chain = 0;
3436 	uint64_t		len;
3437 	uint32_t		pos, start;
3438 	uint32_t		range_start, range_end;
3439 	int32_t			end, num, cnt = 0;
3440 	int			i, rv, rng_rv = 0, msg_rv = 0;
3441 	boolean_t		prev_desc_ack = B_FALSE;
3442 	int			read_attempts = 0;
3443 	struct ether_header	*ehp;
3444 	lane_t			*lp = &ldcp->lane_out;
3445 
3446 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3447 
3448 	/*
3449 	 * We know this is a data/dring packet so
3450 	 * cast it into the correct structure.
3451 	 */
3452 	dring_pkt = (vio_dring_msg_t *)dpkt;
3453 
3454 	/*
3455 	 * Switch on the vio_subtype. If its INFO then we need to
3456 	 * process the data. If its an ACK we need to make sure
3457 	 * it makes sense (i.e did we send an earlier data/info),
3458 	 * and if its a NACK then we maybe attempt a retry.
3459 	 */
3460 	switch (dring_pkt->tag.vio_subtype) {
3461 	case VIO_SUBTYPE_INFO:
3462 		D2(vswp, "%s(%lld): VIO_SUBTYPE_INFO", __func__, ldcp->ldc_id);
3463 
3464 		READ_ENTER(&ldcp->lane_in.dlistrw);
3465 		if ((dp = vsw_ident2dring(&ldcp->lane_in,
3466 		    dring_pkt->dring_ident)) == NULL) {
3467 			RW_EXIT(&ldcp->lane_in.dlistrw);
3468 
3469 			DERR(vswp, "%s(%lld): unable to find dring from "
3470 			    "ident 0x%llx", __func__, ldcp->ldc_id,
3471 			    dring_pkt->dring_ident);
3472 
3473 			SND_DRING_NACK(ldcp, dring_pkt);
3474 			return;
3475 		}
3476 
3477 		start = pos = dring_pkt->start_idx;
3478 		end = dring_pkt->end_idx;
3479 		len = dp->num_descriptors;
3480 
3481 		range_start = range_end = pos;
3482 
3483 		D2(vswp, "%s(%lld): start index %ld : end %ld\n",
3484 		    __func__, ldcp->ldc_id, start, end);
3485 
3486 		if (end == -1) {
3487 			num = -1;
3488 		} else if (end >= 0) {
3489 			num = end >= pos ? end - pos + 1: (len - pos + 1) + end;
3490 
3491 			/* basic sanity check */
3492 			if (end > len) {
3493 				RW_EXIT(&ldcp->lane_in.dlistrw);
3494 				DERR(vswp, "%s(%lld): endpoint %lld outside "
3495 				    "ring length %lld", __func__,
3496 				    ldcp->ldc_id, end, len);
3497 
3498 				SND_DRING_NACK(ldcp, dring_pkt);
3499 				return;
3500 			}
3501 		} else {
3502 			RW_EXIT(&ldcp->lane_in.dlistrw);
3503 			DERR(vswp, "%s(%lld): invalid endpoint %lld",
3504 			    __func__, ldcp->ldc_id, end);
3505 			SND_DRING_NACK(ldcp, dring_pkt);
3506 			return;
3507 		}
3508 
3509 		while (cnt != num) {
3510 vsw_recheck_desc:
3511 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + pos;
3512 
3513 			if ((rng_rv = vnet_dring_entry_copy(pub_addr,
3514 			    &desc, dp->dring_mtype, dp->handle,
3515 			    pos, pos)) != 0) {
3516 				DERR(vswp, "%s(%lld): unable to copy "
3517 				    "descriptor at pos %d: err %d",
3518 				    __func__, pos, ldcp->ldc_id, rng_rv);
3519 				ldcp->ldc_stats.ierrors++;
3520 				break;
3521 			}
3522 
3523 			/*
3524 			 * When given a bounded range of descriptors
3525 			 * to process, its an error to hit a descriptor
3526 			 * which is not ready. In the non-bounded case
3527 			 * (end_idx == -1) this simply indicates we have
3528 			 * reached the end of the current active range.
3529 			 */
3530 			if (desc.hdr.dstate != VIO_DESC_READY) {
3531 				/* unbound - no error */
3532 				if (end == -1) {
3533 					if (read_attempts == vsw_read_attempts)
3534 						break;
3535 
3536 					delay(drv_usectohz(vsw_desc_delay));
3537 					read_attempts++;
3538 					goto vsw_recheck_desc;
3539 				}
3540 
3541 				/* bounded - error - so NACK back */
3542 				RW_EXIT(&ldcp->lane_in.dlistrw);
3543 				DERR(vswp, "%s(%lld): descriptor not READY "
3544 				    "(%d)", __func__, ldcp->ldc_id,
3545 				    desc.hdr.dstate);
3546 				SND_DRING_NACK(ldcp, dring_pkt);
3547 				return;
3548 			}
3549 
3550 			DTRACE_PROBE1(read_attempts, int, read_attempts);
3551 
3552 			range_end = pos;
3553 
3554 			/*
3555 			 * If we ACK'd the previous descriptor then now
3556 			 * record the new range start position for later
3557 			 * ACK's.
3558 			 */
3559 			if (prev_desc_ack) {
3560 				range_start = pos;
3561 
3562 				D2(vswp, "%s(%lld): updating range start to be "
3563 				    "%d", __func__, ldcp->ldc_id, range_start);
3564 
3565 				prev_desc_ack = B_FALSE;
3566 			}
3567 
3568 			D2(vswp, "%s(%lld): processing desc %lld at pos"
3569 			    " 0x%llx : dstate 0x%lx : datalen 0x%lx",
3570 			    __func__, ldcp->ldc_id, pos, &desc,
3571 			    desc.hdr.dstate, desc.nbytes);
3572 
3573 			if ((desc.nbytes < ETHERMIN) ||
3574 			    (desc.nbytes > lp->mtu)) {
3575 				/* invalid size; drop the packet */
3576 				ldcp->ldc_stats.ierrors++;
3577 				goto vsw_process_desc_done;
3578 			}
3579 
3580 			/*
3581 			 * Ensure that we ask ldc for an aligned
3582 			 * number of bytes. Data is padded to align on 8
3583 			 * byte boundary, desc.nbytes is actual data length,
3584 			 * i.e. minus that padding.
3585 			 */
3586 			nbytes = (desc.nbytes + VNET_IPALIGN + 7) & ~7;
3587 			if (nbytes > ldcp->max_rxpool_size) {
3588 				mp = allocb(desc.nbytes + VNET_IPALIGN + 8,
3589 				    BPRI_MED);
3590 			} else {
3591 				mp = vio_multipool_allocb(&ldcp->vmp, nbytes);
3592 				if (mp == NULL) {
3593 					ldcp->ldc_stats.rx_vio_allocb_fail++;
3594 					/*
3595 					 * No free receive buffers available,
3596 					 * so fallback onto allocb(9F). Make
3597 					 * sure that we get a data buffer which
3598 					 * is a multiple of 8 as this is
3599 					 * required by ldc_mem_copy.
3600 					 */
3601 					DTRACE_PROBE(allocb);
3602 					mp = allocb(desc.nbytes +
3603 					    VNET_IPALIGN + 8, BPRI_MED);
3604 				}
3605 			}
3606 			if (mp == NULL) {
3607 				DERR(vswp, "%s(%ld): allocb failed",
3608 				    __func__, ldcp->ldc_id);
3609 				rng_rv = vnet_dring_entry_set_dstate(pub_addr,
3610 				    dp->dring_mtype, dp->handle, pos, pos,
3611 				    VIO_DESC_DONE);
3612 				ldcp->ldc_stats.ierrors++;
3613 				ldcp->ldc_stats.rx_allocb_fail++;
3614 				break;
3615 			}
3616 
3617 			rv = ldc_mem_copy(ldcp->ldc_handle,
3618 			    (caddr_t)mp->b_rptr, 0, &nbytes,
3619 			    desc.memcookie, desc.ncookies, LDC_COPY_IN);
3620 			if (rv != 0) {
3621 				DERR(vswp, "%s(%d): unable to copy in data "
3622 				    "from %d cookies in desc %d (rv %d)",
3623 				    __func__, ldcp->ldc_id, desc.ncookies,
3624 				    pos, rv);
3625 				freemsg(mp);
3626 
3627 				rng_rv = vnet_dring_entry_set_dstate(pub_addr,
3628 				    dp->dring_mtype, dp->handle, pos, pos,
3629 				    VIO_DESC_DONE);
3630 				ldcp->ldc_stats.ierrors++;
3631 				break;
3632 			} else {
3633 				D2(vswp, "%s(%d): copied in %ld bytes"
3634 				    " using %d cookies", __func__,
3635 				    ldcp->ldc_id, nbytes, desc.ncookies);
3636 			}
3637 
3638 			/* adjust the read pointer to skip over the padding */
3639 			mp->b_rptr += VNET_IPALIGN;
3640 
3641 			/* point to the actual end of data */
3642 			mp->b_wptr = mp->b_rptr + desc.nbytes;
3643 
3644 			/* update statistics */
3645 			ehp = (struct ether_header *)mp->b_rptr;
3646 			if (IS_BROADCAST(ehp))
3647 				ldcp->ldc_stats.brdcstrcv++;
3648 			else if (IS_MULTICAST(ehp))
3649 				ldcp->ldc_stats.multircv++;
3650 
3651 			ldcp->ldc_stats.ipackets++;
3652 			ldcp->ldc_stats.rbytes += desc.nbytes;
3653 
3654 			/*
3655 			 * IPALIGN space can be used for VLAN_TAG
3656 			 */
3657 			(void) vsw_vlan_frame_pretag(ldcp->ldc_port,
3658 			    VSW_VNETPORT, mp);
3659 
3660 			/* build a chain of received packets */
3661 			if (bp == NULL) {
3662 				/* first pkt */
3663 				bp = mp;
3664 				bp->b_next = bp->b_prev = NULL;
3665 				bpt = bp;
3666 				chain = 1;
3667 			} else {
3668 				mp->b_next = mp->b_prev = NULL;
3669 				bpt->b_next = mp;
3670 				bpt = mp;
3671 				chain++;
3672 			}
3673 
3674 vsw_process_desc_done:
3675 			/* mark we are finished with this descriptor */
3676 			if ((rng_rv = vnet_dring_entry_set_dstate(pub_addr,
3677 			    dp->dring_mtype, dp->handle, pos, pos,
3678 			    VIO_DESC_DONE)) != 0) {
3679 				DERR(vswp, "%s(%lld): unable to update "
3680 				    "dstate at pos %d: err %d",
3681 				    __func__, pos, ldcp->ldc_id, rng_rv);
3682 				ldcp->ldc_stats.ierrors++;
3683 				break;
3684 			}
3685 
3686 			/*
3687 			 * Send an ACK back to peer if requested.
3688 			 */
3689 			if (desc.hdr.ack) {
3690 				dring_pkt->start_idx = range_start;
3691 				dring_pkt->end_idx = range_end;
3692 
3693 				DERR(vswp, "%s(%lld): processed %d %d, ACK"
3694 				    " requested", __func__, ldcp->ldc_id,
3695 				    dring_pkt->start_idx, dring_pkt->end_idx);
3696 
3697 				dring_pkt->dring_process_state = VIO_DP_ACTIVE;
3698 				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3699 				dring_pkt->tag.vio_sid = ldcp->local_session;
3700 
3701 				msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
3702 				    sizeof (vio_dring_msg_t), B_FALSE);
3703 
3704 				/*
3705 				 * Check if ACK was successfully sent. If not
3706 				 * we break and deal with that below.
3707 				 */
3708 				if (msg_rv != 0)
3709 					break;
3710 
3711 				prev_desc_ack = B_TRUE;
3712 				range_start = pos;
3713 			}
3714 
3715 			/* next descriptor */
3716 			pos = (pos + 1) % len;
3717 			cnt++;
3718 
3719 			/*
3720 			 * Break out of loop here and stop processing to
3721 			 * allow some other network device (or disk) to
3722 			 * get access to the cpu.
3723 			 */
3724 			if (chain > vsw_chain_len) {
3725 				D3(vswp, "%s(%lld): switching chain of %d "
3726 				    "msgs", __func__, ldcp->ldc_id, chain);
3727 				break;
3728 			}
3729 		}
3730 		RW_EXIT(&ldcp->lane_in.dlistrw);
3731 
3732 		/* send the chain of packets to be switched */
3733 		if (bp != NULL) {
3734 			DTRACE_PROBE1(vsw_rcv_msgs, int, chain);
3735 			D3(vswp, "%s(%lld): switching chain of %d msgs",
3736 			    __func__, ldcp->ldc_id, chain);
3737 			vswp->vsw_switch_frame(vswp, bp, VSW_VNETPORT,
3738 			    ldcp->ldc_port, NULL);
3739 		}
3740 
3741 		/*
3742 		 * If when we encountered an error when attempting to
3743 		 * access an imported dring, initiate a connection reset.
3744 		 */
3745 		if (rng_rv != 0) {
3746 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3747 			break;
3748 		}
3749 
3750 		/*
3751 		 * If when we attempted to send the ACK we found that the
3752 		 * channel had been reset then now handle this. We deal with
3753 		 * it here as we cannot reset the channel while holding the
3754 		 * dlistrw lock, and we don't want to acquire/release it
3755 		 * continuously in the above loop, as a channel reset should
3756 		 * be a rare event.
3757 		 */
3758 		if (msg_rv == ECONNRESET) {
3759 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
3760 			break;
3761 		}
3762 
3763 		DTRACE_PROBE1(msg_cnt, int, cnt);
3764 
3765 		/*
3766 		 * We are now finished so ACK back with the state
3767 		 * set to STOPPING so our peer knows we are finished
3768 		 */
3769 		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3770 		dring_pkt->tag.vio_sid = ldcp->local_session;
3771 
3772 		dring_pkt->dring_process_state = VIO_DP_STOPPED;
3773 
3774 		DTRACE_PROBE(stop_process_sent);
3775 
3776 		/*
3777 		 * We have not processed any more descriptors beyond
3778 		 * the last one we ACK'd.
3779 		 */
3780 		if (prev_desc_ack)
3781 			range_start = range_end;
3782 
3783 		dring_pkt->start_idx = range_start;
3784 		dring_pkt->end_idx = range_end;
3785 
3786 		D2(vswp, "%s(%lld) processed : %d : %d, now stopping",
3787 		    __func__, ldcp->ldc_id, dring_pkt->start_idx,
3788 		    dring_pkt->end_idx);
3789 
3790 		(void) vsw_send_msg(ldcp, (void *)dring_pkt,
3791 		    sizeof (vio_dring_msg_t), B_TRUE);
3792 		break;
3793 
3794 	case VIO_SUBTYPE_ACK:
3795 		D2(vswp, "%s(%lld): VIO_SUBTYPE_ACK", __func__, ldcp->ldc_id);
3796 		/*
3797 		 * Verify that the relevant descriptors are all
3798 		 * marked as DONE
3799 		 */
3800 		READ_ENTER(&ldcp->lane_out.dlistrw);
3801 		if ((dp = vsw_ident2dring(&ldcp->lane_out,
3802 		    dring_pkt->dring_ident)) == NULL) {
3803 			RW_EXIT(&ldcp->lane_out.dlistrw);
3804 			DERR(vswp, "%s: unknown ident in ACK", __func__);
3805 			return;
3806 		}
3807 
3808 		start = end = 0;
3809 		start = dring_pkt->start_idx;
3810 		end = dring_pkt->end_idx;
3811 		len = dp->num_descriptors;
3812 
3813 
3814 		mutex_enter(&dp->dlock);
3815 		dp->last_ack_recv = end;
3816 		ldcp->ldc_stats.dring_data_acks++;
3817 		mutex_exit(&dp->dlock);
3818 
3819 		(void) vsw_reclaim_dring(dp, start);
3820 
3821 		/*
3822 		 * If our peer is stopping processing descriptors then
3823 		 * we check to make sure it has processed all the descriptors
3824 		 * we have updated. If not then we send it a new message
3825 		 * to prompt it to restart.
3826 		 */
3827 		if (dring_pkt->dring_process_state == VIO_DP_STOPPED) {
3828 			DTRACE_PROBE(stop_process_recv);
3829 			D2(vswp, "%s(%lld): got stopping msg : %d : %d",
3830 			    __func__, ldcp->ldc_id, dring_pkt->start_idx,
3831 			    dring_pkt->end_idx);
3832 
3833 			/*
3834 			 * Check next descriptor in public section of ring.
3835 			 * If its marked as READY then we need to prompt our
3836 			 * peer to start processing the ring again.
3837 			 */
3838 			i = (end + 1) % len;
3839 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
3840 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
3841 
3842 			/*
3843 			 * Hold the restart lock across all of this to
3844 			 * make sure that its not possible for us to
3845 			 * decide that a msg needs to be sent in the future
3846 			 * but the sending code having already checked is
3847 			 * about to exit.
3848 			 */
3849 			mutex_enter(&dp->restart_lock);
3850 			ldcp->ldc_stats.dring_stopped_acks++;
3851 			mutex_enter(&priv_addr->dstate_lock);
3852 			if (pub_addr->hdr.dstate == VIO_DESC_READY) {
3853 
3854 				mutex_exit(&priv_addr->dstate_lock);
3855 
3856 				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
3857 				dring_pkt->tag.vio_sid = ldcp->local_session;
3858 
3859 				dring_pkt->start_idx = (end + 1) % len;
3860 				dring_pkt->end_idx = -1;
3861 
3862 				D2(vswp, "%s(%lld) : sending restart msg:"
3863 				    " %d : %d", __func__, ldcp->ldc_id,
3864 				    dring_pkt->start_idx, dring_pkt->end_idx);
3865 
3866 				msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
3867 				    sizeof (vio_dring_msg_t), B_FALSE);
3868 				ldcp->ldc_stats.dring_data_msgs++;
3869 
3870 			} else {
3871 				mutex_exit(&priv_addr->dstate_lock);
3872 				dp->restart_reqd = B_TRUE;
3873 			}
3874 			mutex_exit(&dp->restart_lock);
3875 		}
3876 		RW_EXIT(&ldcp->lane_out.dlistrw);
3877 
3878 		/* only do channel reset after dropping dlistrw lock */
3879 		if (msg_rv == ECONNRESET)
3880 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
3881 
3882 		break;
3883 
3884 	case VIO_SUBTYPE_NACK:
3885 		DWARN(vswp, "%s(%lld): VIO_SUBTYPE_NACK",
3886 		    __func__, ldcp->ldc_id);
3887 		/*
3888 		 * Something is badly wrong if we are getting NACK's
3889 		 * for our data pkts. So reset the channel.
3890 		 */
3891 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3892 
3893 		break;
3894 
3895 	default:
3896 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
3897 		    ldcp->ldc_id, dring_pkt->tag.vio_subtype);
3898 	}
3899 
3900 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
3901 }
3902 
3903 /*
3904  * dummy pkt data handler function for vnet protocol version 1.0
3905  */
3906 static void
3907 vsw_process_pkt_data_nop(void *arg1, void *arg2, uint32_t msglen)
3908 {
3909 	_NOTE(ARGUNUSED(arg1, arg2, msglen))
3910 }
3911 
3912 /*
3913  * This function handles raw pkt data messages received over the channel.
3914  * Currently, only priority-eth-type frames are received through this mechanism.
3915  * In this case, the frame(data) is present within the message itself which
3916  * is copied into an mblk before switching it.
3917  */
3918 static void
3919 vsw_process_pkt_data(void *arg1, void *arg2, uint32_t msglen)
3920 {
3921 	vsw_ldc_t		*ldcp = (vsw_ldc_t *)arg1;
3922 	vio_raw_data_msg_t	*dpkt = (vio_raw_data_msg_t *)arg2;
3923 	uint32_t		size;
3924 	mblk_t			*mp;
3925 	vsw_t			*vswp = ldcp->ldc_vswp;
3926 	vgen_stats_t		*statsp = &ldcp->ldc_stats;
3927 	lane_t			*lp = &ldcp->lane_out;
3928 
3929 	size = msglen - VIO_PKT_DATA_HDRSIZE;
3930 	if (size < ETHERMIN || size > lp->mtu) {
3931 		(void) atomic_inc_32(&statsp->rx_pri_fail);
3932 		DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__,
3933 		    ldcp->ldc_id, size);
3934 		return;
3935 	}
3936 
3937 	mp = vio_multipool_allocb(&ldcp->vmp, size + VLAN_TAGSZ);
3938 	if (mp == NULL) {
3939 		mp = allocb(size + VLAN_TAGSZ, BPRI_MED);
3940 		if (mp == NULL) {
3941 			(void) atomic_inc_32(&statsp->rx_pri_fail);
3942 			DWARN(vswp, "%s(%lld) allocb failure, "
3943 			    "unable to process priority frame\n", __func__,
3944 			    ldcp->ldc_id);
3945 			return;
3946 		}
3947 	}
3948 
3949 	/* skip over the extra space for vlan tag */
3950 	mp->b_rptr += VLAN_TAGSZ;
3951 
3952 	/* copy the frame from the payload of raw data msg into the mblk */
3953 	bcopy(dpkt->data, mp->b_rptr, size);
3954 	mp->b_wptr = mp->b_rptr + size;
3955 
3956 	/* update stats */
3957 	(void) atomic_inc_64(&statsp->rx_pri_packets);
3958 	(void) atomic_add_64(&statsp->rx_pri_bytes, size);
3959 
3960 	/*
3961 	 * VLAN_TAGSZ of extra space has been pre-alloc'd if tag is needed.
3962 	 */
3963 	(void) vsw_vlan_frame_pretag(ldcp->ldc_port, VSW_VNETPORT, mp);
3964 
3965 	/* switch the frame to destination */
3966 	vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT, ldcp->ldc_port, NULL);
3967 }
3968 
3969 /*
3970  * Process an in-band descriptor message (most likely from
3971  * OBP).
3972  */
3973 static void
3974 vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt)
3975 {
3976 	vnet_ibnd_desc_t	*ibnd_desc;
3977 	dring_info_t		*dp = NULL;
3978 	vsw_private_desc_t	*priv_addr = NULL;
3979 	vsw_t			*vswp = ldcp->ldc_vswp;
3980 	mblk_t			*mp = NULL;
3981 	size_t			nbytes = 0;
3982 	size_t			off = 0;
3983 	uint64_t		idx = 0;
3984 	uint32_t		num = 1, len, datalen = 0;
3985 	uint64_t		ncookies = 0;
3986 	int			i, rv;
3987 	int			j = 0;
3988 
3989 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3990 
3991 	ibnd_desc = (vnet_ibnd_desc_t *)pkt;
3992 
3993 	switch (ibnd_desc->hdr.tag.vio_subtype) {
3994 	case VIO_SUBTYPE_INFO:
3995 		D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3996 
3997 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
3998 			return;
3999 
4000 		/*
4001 		 * Data is padded to align on a 8 byte boundary,
4002 		 * nbytes is actual data length, i.e. minus that
4003 		 * padding.
4004 		 */
4005 		datalen = ibnd_desc->nbytes;
4006 
4007 		D2(vswp, "%s(%lld): processing inband desc : "
4008 		    ": datalen 0x%lx", __func__, ldcp->ldc_id, datalen);
4009 
4010 		ncookies = ibnd_desc->ncookies;
4011 
4012 		/*
4013 		 * allocb(9F) returns an aligned data block. We
4014 		 * need to ensure that we ask ldc for an aligned
4015 		 * number of bytes also.
4016 		 */
4017 		nbytes = datalen;
4018 		if (nbytes & 0x7) {
4019 			off = 8 - (nbytes & 0x7);
4020 			nbytes += off;
4021 		}
4022 
4023 		/* alloc extra space for VLAN_TAG */
4024 		mp = allocb(datalen + 8, BPRI_MED);
4025 		if (mp == NULL) {
4026 			DERR(vswp, "%s(%lld): allocb failed",
4027 			    __func__, ldcp->ldc_id);
4028 			ldcp->ldc_stats.rx_allocb_fail++;
4029 			return;
4030 		}
4031 
4032 		/* skip over the extra space for VLAN_TAG */
4033 		mp->b_rptr += 8;
4034 
4035 		rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr,
4036 		    0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies,
4037 		    LDC_COPY_IN);
4038 
4039 		if (rv != 0) {
4040 			DERR(vswp, "%s(%d): unable to copy in data from "
4041 			    "%d cookie(s)", __func__, ldcp->ldc_id, ncookies);
4042 			freemsg(mp);
4043 			ldcp->ldc_stats.ierrors++;
4044 			return;
4045 		}
4046 
4047 		D2(vswp, "%s(%d): copied in %ld bytes using %d cookies",
4048 		    __func__, ldcp->ldc_id, nbytes, ncookies);
4049 
4050 		/* point to the actual end of data */
4051 		mp->b_wptr = mp->b_rptr + datalen;
4052 		ldcp->ldc_stats.ipackets++;
4053 		ldcp->ldc_stats.rbytes += datalen;
4054 
4055 		/*
4056 		 * We ACK back every in-band descriptor message we process
4057 		 */
4058 		ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK;
4059 		ibnd_desc->hdr.tag.vio_sid = ldcp->local_session;
4060 		(void) vsw_send_msg(ldcp, (void *)ibnd_desc,
4061 		    sizeof (vnet_ibnd_desc_t), B_TRUE);
4062 
4063 		/*
4064 		 * there is extra space alloc'd for VLAN_TAG
4065 		 */
4066 		(void) vsw_vlan_frame_pretag(ldcp->ldc_port, VSW_VNETPORT, mp);
4067 
4068 		/* send the packet to be switched */
4069 		vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT,
4070 		    ldcp->ldc_port, NULL);
4071 
4072 		break;
4073 
4074 	case VIO_SUBTYPE_ACK:
4075 		D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
4076 
4077 		/* Verify the ACK is valid */
4078 		idx = ibnd_desc->hdr.desc_handle;
4079 
4080 		if (idx >= vsw_ntxds) {
4081 			cmn_err(CE_WARN, "!vsw%d: corrupted ACK received "
4082 			    "(idx %ld)", vswp->instance, idx);
4083 			return;
4084 		}
4085 
4086 		if ((dp = ldcp->lane_out.dringp) == NULL) {
4087 			DERR(vswp, "%s: no dring found", __func__);
4088 			return;
4089 		}
4090 
4091 		len = dp->num_descriptors;
4092 		/*
4093 		 * If the descriptor we are being ACK'ed for is not the
4094 		 * one we expected, then pkts were lost somwhere, either
4095 		 * when we tried to send a msg, or a previous ACK msg from
4096 		 * our peer. In either case we now reclaim the descriptors
4097 		 * in the range from the last ACK we received up to the
4098 		 * current ACK.
4099 		 */
4100 		if (idx != dp->last_ack_recv) {
4101 			DWARN(vswp, "%s: dropped pkts detected, (%ld, %ld)",
4102 			    __func__, dp->last_ack_recv, idx);
4103 			num = idx >= dp->last_ack_recv ?
4104 			    idx - dp->last_ack_recv + 1:
4105 			    (len - dp->last_ack_recv + 1) + idx;
4106 		}
4107 
4108 		/*
4109 		 * When we sent the in-band message to our peer we
4110 		 * marked the copy in our private ring as READY. We now
4111 		 * check that the descriptor we are being ACK'ed for is in
4112 		 * fact READY, i.e. it is one we have shared with our peer.
4113 		 *
4114 		 * If its not we flag an error, but still reset the descr
4115 		 * back to FREE.
4116 		 */
4117 		for (i = dp->last_ack_recv; j < num; i = (i + 1) % len, j++) {
4118 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
4119 			mutex_enter(&priv_addr->dstate_lock);
4120 			if (priv_addr->dstate != VIO_DESC_READY) {
4121 				DERR(vswp, "%s: (%ld) desc at index %ld not "
4122 				    "READY (0x%lx)", __func__,
4123 				    ldcp->ldc_id, idx, priv_addr->dstate);
4124 				DERR(vswp, "%s: bound %d: ncookies %ld : "
4125 				    "datalen %ld", __func__,
4126 				    priv_addr->bound, priv_addr->ncookies,
4127 				    priv_addr->datalen);
4128 			}
4129 			D2(vswp, "%s: (%lld) freeing descp at %lld", __func__,
4130 			    ldcp->ldc_id, idx);
4131 			/* release resources associated with sent msg */
4132 			priv_addr->datalen = 0;
4133 			priv_addr->dstate = VIO_DESC_FREE;
4134 			mutex_exit(&priv_addr->dstate_lock);
4135 		}
4136 		/* update to next expected value */
4137 		dp->last_ack_recv = (idx + 1) % dp->num_descriptors;
4138 
4139 		break;
4140 
4141 	case VIO_SUBTYPE_NACK:
4142 		DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
4143 
4144 		/*
4145 		 * We should only get a NACK if our peer doesn't like
4146 		 * something about a message we have sent it. If this
4147 		 * happens we just release the resources associated with
4148 		 * the message. (We are relying on higher layers to decide
4149 		 * whether or not to resend.
4150 		 */
4151 
4152 		/* limit check */
4153 		idx = ibnd_desc->hdr.desc_handle;
4154 
4155 		if (idx >= vsw_ntxds) {
4156 			DERR(vswp, "%s: corrupted NACK received (idx %lld)",
4157 			    __func__, idx);
4158 			return;
4159 		}
4160 
4161 		if ((dp = ldcp->lane_out.dringp) == NULL) {
4162 			DERR(vswp, "%s: no dring found", __func__);
4163 			return;
4164 		}
4165 
4166 		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
4167 
4168 		/* move to correct location in ring */
4169 		priv_addr += idx;
4170 
4171 		/* release resources associated with sent msg */
4172 		mutex_enter(&priv_addr->dstate_lock);
4173 		priv_addr->datalen = 0;
4174 		priv_addr->dstate = VIO_DESC_FREE;
4175 		mutex_exit(&priv_addr->dstate_lock);
4176 
4177 		break;
4178 
4179 	default:
4180 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
4181 		    ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype);
4182 	}
4183 
4184 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
4185 }
4186 
4187 static void
4188 vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t *tagp)
4189 {
4190 	_NOTE(ARGUNUSED(epkt))
4191 
4192 	vsw_t		*vswp = ldcp->ldc_vswp;
4193 	uint16_t	env = tagp->vio_subtype_env;
4194 
4195 	D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
4196 
4197 	/*
4198 	 * Error vio_subtypes have yet to be defined. So for
4199 	 * the moment we can't do anything.
4200 	 */
4201 	D2(vswp, "%s: (%x) vio_subtype env", __func__, env);
4202 
4203 	D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
4204 }
4205 
4206 /* transmit the packet over the given port */
4207 int
4208 vsw_portsend(vsw_port_t *port, mblk_t *mp, mblk_t *mpt, uint32_t count)
4209 {
4210 	vsw_ldc_list_t 	*ldcl = &port->p_ldclist;
4211 	vsw_ldc_t 	*ldcp;
4212 	int		status = 0;
4213 	uint32_t	n;
4214 
4215 	READ_ENTER(&ldcl->lockrw);
4216 	/*
4217 	 * Note for now, we have a single channel.
4218 	 */
4219 	ldcp = ldcl->head;
4220 	if (ldcp == NULL) {
4221 		DERR(port->p_vswp, "vsw_portsend: no ldc: dropping packet\n");
4222 		freemsgchain(mp);
4223 		RW_EXIT(&ldcl->lockrw);
4224 		return (1);
4225 	}
4226 
4227 	n = vsw_vlan_frame_untag(port, VSW_VNETPORT, &mp, &mpt);
4228 
4229 	count -= n;
4230 	if (count == 0) {
4231 		goto vsw_portsend_exit;
4232 	}
4233 
4234 	status = ldcp->tx(ldcp, mp, mpt, count);
4235 
4236 vsw_portsend_exit:
4237 	RW_EXIT(&ldcl->lockrw);
4238 
4239 	return (status);
4240 }
4241 
4242 /*
4243  * Break up frames into 2 seperate chains: normal and
4244  * priority, based on the frame type. The number of
4245  * priority frames is also counted and returned.
4246  *
4247  * Params:
4248  * 	vswp:	pointer to the instance of vsw
4249  *	np:	head of packet chain to be broken
4250  *	npt:	tail of packet chain to be broken
4251  *
4252  * Returns:
4253  *	np:	head of normal data packets
4254  *	npt:	tail of normal data packets
4255  *	hp:	head of high priority packets
4256  *	hpt:	tail of high priority packets
4257  */
4258 static uint32_t
4259 vsw_get_pri_packets(vsw_t *vswp, mblk_t **np, mblk_t **npt,
4260 	mblk_t **hp, mblk_t **hpt)
4261 {
4262 	mblk_t			*tmp = NULL;
4263 	mblk_t			*smp = NULL;
4264 	mblk_t			*hmp = NULL;	/* high prio pkts head */
4265 	mblk_t			*hmpt = NULL;	/* high prio pkts tail */
4266 	mblk_t			*nmp = NULL;	/* normal pkts head */
4267 	mblk_t			*nmpt = NULL;	/* normal pkts tail */
4268 	uint32_t		count = 0;
4269 	int			i;
4270 	struct ether_header	*ehp;
4271 	uint32_t		num_types;
4272 	uint16_t		*types;
4273 
4274 	tmp = *np;
4275 	while (tmp != NULL) {
4276 
4277 		smp = tmp;
4278 		tmp = tmp->b_next;
4279 		smp->b_next = NULL;
4280 		smp->b_prev = NULL;
4281 
4282 		ehp = (struct ether_header *)smp->b_rptr;
4283 		num_types = vswp->pri_num_types;
4284 		types = vswp->pri_types;
4285 		for (i = 0; i < num_types; i++) {
4286 			if (ehp->ether_type == types[i]) {
4287 				/* high priority frame */
4288 
4289 				if (hmp != NULL) {
4290 					hmpt->b_next = smp;
4291 					hmpt = smp;
4292 				} else {
4293 					hmp = hmpt = smp;
4294 				}
4295 				count++;
4296 				break;
4297 			}
4298 		}
4299 		if (i == num_types) {
4300 			/* normal data frame */
4301 
4302 			if (nmp != NULL) {
4303 				nmpt->b_next = smp;
4304 				nmpt = smp;
4305 			} else {
4306 				nmp = nmpt = smp;
4307 			}
4308 		}
4309 	}
4310 
4311 	*hp = hmp;
4312 	*hpt = hmpt;
4313 	*np = nmp;
4314 	*npt = nmpt;
4315 
4316 	return (count);
4317 }
4318 
4319 /*
4320  * Wrapper function to transmit normal and/or priority frames over the channel.
4321  */
4322 static int
4323 vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count)
4324 {
4325 	vsw_ldc_t 		*ldcp = (vsw_ldc_t *)arg;
4326 	mblk_t			*tmp;
4327 	mblk_t			*smp;
4328 	mblk_t			*hmp;	/* high prio pkts head */
4329 	mblk_t			*hmpt;	/* high prio pkts tail */
4330 	mblk_t			*nmp;	/* normal pkts head */
4331 	mblk_t			*nmpt;	/* normal pkts tail */
4332 	uint32_t		n = 0;
4333 	vsw_t			*vswp = ldcp->ldc_vswp;
4334 
4335 	ASSERT(VSW_PRI_ETH_DEFINED(vswp));
4336 	ASSERT(count != 0);
4337 
4338 	nmp = mp;
4339 	nmpt = mpt;
4340 
4341 	/* gather any priority frames from the chain of packets */
4342 	n = vsw_get_pri_packets(vswp, &nmp, &nmpt, &hmp, &hmpt);
4343 
4344 	/* transmit priority frames */
4345 	tmp = hmp;
4346 	while (tmp != NULL) {
4347 		smp = tmp;
4348 		tmp = tmp->b_next;
4349 		smp->b_next = NULL;
4350 		vsw_ldcsend_pkt(ldcp, smp);
4351 	}
4352 
4353 	count -= n;
4354 
4355 	if (count == 0) {
4356 		/* no normal data frames to process */
4357 		return (0);
4358 	}
4359 
4360 	return (vsw_ldctx(ldcp, nmp, nmpt, count));
4361 }
4362 
4363 /*
4364  * Wrapper function to transmit normal frames over the channel.
4365  */
4366 static int
4367 vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count)
4368 {
4369 	vsw_ldc_t 	*ldcp = (vsw_ldc_t *)arg;
4370 	mblk_t		*tmp = NULL;
4371 
4372 	ASSERT(count != 0);
4373 	/*
4374 	 * If the TX thread is enabled, then queue the
4375 	 * ordinary frames and signal the tx thread.
4376 	 */
4377 	if (ldcp->tx_thread != NULL) {
4378 
4379 		mutex_enter(&ldcp->tx_thr_lock);
4380 
4381 		if ((ldcp->tx_cnt + count) >= vsw_max_tx_qcount) {
4382 			/*
4383 			 * If we reached queue limit,
4384 			 * do not queue new packets,
4385 			 * drop them.
4386 			 */
4387 			ldcp->ldc_stats.tx_qfull += count;
4388 			mutex_exit(&ldcp->tx_thr_lock);
4389 			freemsgchain(mp);
4390 			goto exit;
4391 		}
4392 		if (ldcp->tx_mhead == NULL) {
4393 			ldcp->tx_mhead = mp;
4394 			ldcp->tx_mtail = mpt;
4395 			cv_signal(&ldcp->tx_thr_cv);
4396 		} else {
4397 			ldcp->tx_mtail->b_next = mp;
4398 			ldcp->tx_mtail = mpt;
4399 		}
4400 		ldcp->tx_cnt += count;
4401 		mutex_exit(&ldcp->tx_thr_lock);
4402 	} else {
4403 		while (mp != NULL) {
4404 			tmp = mp->b_next;
4405 			mp->b_next = mp->b_prev = NULL;
4406 			(void) vsw_ldcsend(ldcp, mp, 1);
4407 			mp = tmp;
4408 		}
4409 	}
4410 
4411 exit:
4412 	return (0);
4413 }
4414 
4415 /*
4416  * This function transmits the frame in the payload of a raw data
4417  * (VIO_PKT_DATA) message. Thus, it provides an Out-Of-Band path to
4418  * send special frames with high priorities, without going through
4419  * the normal data path which uses descriptor ring mechanism.
4420  */
4421 static void
4422 vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp)
4423 {
4424 	vio_raw_data_msg_t	*pkt;
4425 	mblk_t			*bp;
4426 	mblk_t			*nmp = NULL;
4427 	caddr_t			dst;
4428 	uint32_t		mblksz;
4429 	uint32_t		size;
4430 	uint32_t		nbytes;
4431 	int			rv;
4432 	vsw_t			*vswp = ldcp->ldc_vswp;
4433 	vgen_stats_t		*statsp = &ldcp->ldc_stats;
4434 
4435 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
4436 	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
4437 		(void) atomic_inc_32(&statsp->tx_pri_fail);
4438 		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
4439 		    "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
4440 		    ldcp->lane_out.lstate);
4441 		goto send_pkt_exit;
4442 	}
4443 
4444 	size = msgsize(mp);
4445 
4446 	/* frame size bigger than available payload len of raw data msg ? */
4447 	if (size > (size_t)(ldcp->msglen - VIO_PKT_DATA_HDRSIZE)) {
4448 		(void) atomic_inc_32(&statsp->tx_pri_fail);
4449 		DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__,
4450 		    ldcp->ldc_id, size);
4451 		goto send_pkt_exit;
4452 	}
4453 
4454 	if (size < ETHERMIN)
4455 		size = ETHERMIN;
4456 
4457 	/* alloc space for a raw data message */
4458 	nmp = vio_allocb(vswp->pri_tx_vmp);
4459 	if (nmp == NULL) {
4460 		(void) atomic_inc_32(&statsp->tx_pri_fail);
4461 		DWARN(vswp, "vio_allocb failed\n");
4462 		goto send_pkt_exit;
4463 	}
4464 	pkt = (vio_raw_data_msg_t *)nmp->b_rptr;
4465 
4466 	/* copy frame into the payload of raw data message */
4467 	dst = (caddr_t)pkt->data;
4468 	for (bp = mp; bp != NULL; bp = bp->b_cont) {
4469 		mblksz = MBLKL(bp);
4470 		bcopy(bp->b_rptr, dst, mblksz);
4471 		dst += mblksz;
4472 	}
4473 
4474 	/* setup the raw data msg */
4475 	pkt->tag.vio_msgtype = VIO_TYPE_DATA;
4476 	pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
4477 	pkt->tag.vio_subtype_env = VIO_PKT_DATA;
4478 	pkt->tag.vio_sid = ldcp->local_session;
4479 	nbytes = VIO_PKT_DATA_HDRSIZE + size;
4480 
4481 	/* send the msg over ldc */
4482 	rv = vsw_send_msg(ldcp, (void *)pkt, nbytes, B_TRUE);
4483 	if (rv != 0) {
4484 		(void) atomic_inc_32(&statsp->tx_pri_fail);
4485 		DWARN(vswp, "%s(%lld) Error sending priority frame\n", __func__,
4486 		    ldcp->ldc_id);
4487 		goto send_pkt_exit;
4488 	}
4489 
4490 	/* update stats */
4491 	(void) atomic_inc_64(&statsp->tx_pri_packets);
4492 	(void) atomic_add_64(&statsp->tx_pri_packets, size);
4493 
4494 send_pkt_exit:
4495 	if (nmp != NULL)
4496 		freemsg(nmp);
4497 	freemsg(mp);
4498 }
4499 
4500 /*
4501  * Transmit the packet over the given LDC channel.
4502  *
4503  * The 'retries' argument indicates how many times a packet
4504  * is retried before it is dropped. Note, the retry is done
4505  * only for a resource related failure, for all other failures
4506  * the packet is dropped immediately.
4507  */
4508 static int
4509 vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries)
4510 {
4511 	int i;
4512 	int rc;
4513 	int status = 0;
4514 	vsw_port_t *port = ldcp->ldc_port;
4515 	dring_info_t *dp = NULL;
4516 
4517 
4518 	for (i = 0; i < retries; ) {
4519 		/*
4520 		 * Send the message out using the appropriate
4521 		 * transmit function which will free mblock when it
4522 		 * is finished with it.
4523 		 */
4524 		mutex_enter(&port->tx_lock);
4525 		if (port->transmit != NULL) {
4526 			status = (*port->transmit)(ldcp, mp);
4527 		}
4528 		if (status == LDC_TX_SUCCESS) {
4529 			mutex_exit(&port->tx_lock);
4530 			break;
4531 		}
4532 		i++;	/* increment the counter here */
4533 
4534 		/* If its the last retry, then update the oerror */
4535 		if ((i == retries) && (status == LDC_TX_NORESOURCES)) {
4536 			ldcp->ldc_stats.oerrors++;
4537 		}
4538 		mutex_exit(&port->tx_lock);
4539 
4540 		if (status != LDC_TX_NORESOURCES) {
4541 			/*
4542 			 * No retrying required for errors un-related
4543 			 * to resources.
4544 			 */
4545 			break;
4546 		}
4547 		READ_ENTER(&ldcp->lane_out.dlistrw);
4548 		if (((dp = ldcp->lane_out.dringp) != NULL) &&
4549 		    ((VSW_VER_GTEQ(ldcp, 1, 2) &&
4550 		    (ldcp->lane_out.xfer_mode & VIO_DRING_MODE_V1_2)) ||
4551 		    ((VSW_VER_LT(ldcp, 1, 2) &&
4552 		    (ldcp->lane_out.xfer_mode == VIO_DRING_MODE_V1_0))))) {
4553 			rc = vsw_reclaim_dring(dp, dp->end_idx);
4554 		} else {
4555 			/*
4556 			 * If there is no dring or the xfer_mode is
4557 			 * set to DESC_MODE(ie., OBP), then simply break here.
4558 			 */
4559 			RW_EXIT(&ldcp->lane_out.dlistrw);
4560 			break;
4561 		}
4562 		RW_EXIT(&ldcp->lane_out.dlistrw);
4563 
4564 		/*
4565 		 * Delay only if none were reclaimed
4566 		 * and its not the last retry.
4567 		 */
4568 		if ((rc == 0) && (i < retries)) {
4569 			delay(drv_usectohz(vsw_ldc_tx_delay));
4570 		}
4571 	}
4572 	freemsg(mp);
4573 	return (status);
4574 }
4575 
/*
 * Send packet out via descriptor ring to a logical device.
 *
 * The frame is copied into the data buffer of a free private descriptor
 * and the corresponding public descriptor is marked VIO_DESC_READY. A
 * VIO_DRING_DATA message is sent to the peer only if the ring's
 * restart_reqd flag is set; the flag is cleared at that point.
 *
 * The mblk (mp) is only read here; it is not freed by this function.
 *
 * Returns:
 *	LDC_TX_SUCCESS		frame was placed in the ring (the result
 *				of sending the dring message is ignored)
 *	LDC_TX_NORESOURCES	no free descriptor was available
 *	LDC_TX_FAILURE		lane not active, channel not up, no
 *				outbound dring, or frame larger than the
 *				lane mtu (oerrors is incremented)
 */
static int
vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp)
{
	vio_dring_msg_t		dring_pkt;
	dring_info_t		*dp = NULL;
	vsw_private_desc_t	*priv_desc = NULL;
	vnet_public_desc_t	*pub = NULL;
	vsw_t			*vswp = ldcp->ldc_vswp;
	mblk_t			*bp;
	size_t			n, size;
	caddr_t			bufp;
	int			idx;
	int			status = LDC_TX_SUCCESS;
	struct ether_header	*ehp = (struct ether_header *)mp->b_rptr;
	lane_t			*lp = &ldcp->lane_out;

	D1(vswp, "%s(%lld): enter\n", __func__, ldcp->ldc_id);

	/* TODO: make test a macro */
	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
		    "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
		    ldcp->lane_out.lstate);
		ldcp->ldc_stats.oerrors++;
		return (LDC_TX_FAILURE);
	}

	/*
	 * Note - using first ring only, this may change
	 * in the future.
	 *
	 * The dring list lock is held as reader from here until just
	 * before any message is sent (or until return).
	 */
	READ_ENTER(&ldcp->lane_out.dlistrw);
	if ((dp = ldcp->lane_out.dringp) == NULL) {
		RW_EXIT(&ldcp->lane_out.dlistrw);
		DERR(vswp, "%s(%lld): no dring for outbound lane on"
		    " channel %d", __func__, ldcp->ldc_id, ldcp->ldc_id);
		ldcp->ldc_stats.oerrors++;
		return (LDC_TX_FAILURE);
	}

	/* reject frames larger than the mtu of the outbound lane */
	size = msgsize(mp);
	if (size > (size_t)lp->mtu) {
		RW_EXIT(&ldcp->lane_out.dlistrw);
		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
		    ldcp->ldc_id, size);
		ldcp->ldc_stats.oerrors++;
		return (LDC_TX_FAILURE);
	}

	/*
	 * Find a free descriptor
	 *
	 * Note: for the moment we are assuming that we will only
	 * have one dring going from the switch to each of its
	 * peers. This may change in the future.
	 */
	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
		D2(vswp, "%s(%lld): no descriptor available for ring "
		    "at 0x%llx", __func__, ldcp->ldc_id, dp);

		/* nothing more we can do */
		status = LDC_TX_NORESOURCES;
		ldcp->ldc_stats.tx_no_desc++;
		goto vsw_dringsend_free_exit;
	} else {
		D2(vswp, "%s(%lld): free private descriptor found at pos %ld "
		    "addr 0x%llx\n", __func__, ldcp->ldc_id, idx, priv_desc);
	}

	/*
	 * copy data into the descriptor; the frame is placed
	 * VNET_IPALIGN bytes into the descriptor's data buffer
	 */
	bufp = priv_desc->datap;
	bufp += VNET_IPALIGN;
	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
		n = MBLKL(bp);
		bcopy(bp->b_rptr, bufp, n);
		bufp += n;
	}

	/*
	 * Short frames are advertised as ETHERMIN bytes; the bytes
	 * beyond the copied frame are not written here.
	 */
	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;

	pub = priv_desc->descp;
	pub->nbytes = priv_desc->datalen;

	/* update statistics */
	if (IS_BROADCAST(ehp))
		ldcp->ldc_stats.brdcstxmt++;
	else if (IS_MULTICAST(ehp))
		ldcp->ldc_stats.multixmt++;
	ldcp->ldc_stats.opackets++;
	ldcp->ldc_stats.obytes += priv_desc->datalen;

	/* mark the public descriptor READY under the descriptor state lock */
	mutex_enter(&priv_desc->dstate_lock);
	pub->hdr.dstate = VIO_DESC_READY;
	mutex_exit(&priv_desc->dstate_lock);

	/*
	 * Determine whether or not we need to send a message to our
	 * peer prompting them to read our newly updated descriptor(s).
	 */
	mutex_enter(&dp->restart_lock);
	if (dp->restart_reqd) {
		dp->restart_reqd = B_FALSE;
		ldcp->ldc_stats.dring_data_msgs++;
		mutex_exit(&dp->restart_lock);

		/*
		 * Send a vio_dring_msg to peer to prompt them to read
		 * the updated descriptor ring.
		 */
		dring_pkt.tag.vio_msgtype = VIO_TYPE_DATA;
		dring_pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
		dring_pkt.tag.vio_subtype_env = VIO_DRING_DATA;
		dring_pkt.tag.vio_sid = ldcp->local_session;

		/* Note - for now using first ring */
		dring_pkt.dring_ident = dp->ident;

		/*
		 * If last_ack_recv is -1 then we know we've not
		 * received any ack's yet, so this must be the first
		 * msg sent, so set the start to the beginning of the ring.
		 */
		mutex_enter(&dp->dlock);
		if (dp->last_ack_recv == -1) {
			dring_pkt.start_idx = 0;
		} else {
			dring_pkt.start_idx =
			    (dp->last_ack_recv + 1) % dp->num_descriptors;
		}
		dring_pkt.end_idx = -1;
		mutex_exit(&dp->dlock);

		D3(vswp, "%s(%lld): dring 0x%llx : ident 0x%llx\n", __func__,
		    ldcp->ldc_id, dp, dring_pkt.dring_ident);
		D3(vswp, "%s(%lld): start %lld : end %lld :\n",
		    __func__, ldcp->ldc_id, dring_pkt.start_idx,
		    dring_pkt.end_idx);

		/*
		 * Drop the dring list lock before sending: vsw_send_msg()
		 * is asked to handle a channel reset itself (B_TRUE), and
		 * dlistrw must never be held when the channel is reset.
		 */
		RW_EXIT(&ldcp->lane_out.dlistrw);

		(void) vsw_send_msg(ldcp, (void *)&dring_pkt,
		    sizeof (vio_dring_msg_t), B_TRUE);

		return (status);

	} else {
		/* no prompt message is sent on this path */
		mutex_exit(&dp->restart_lock);
		D2(vswp, "%s(%lld): updating descp %d", __func__,
		    ldcp->ldc_id, idx);
	}

vsw_dringsend_free_exit:

	RW_EXIT(&ldcp->lane_out.dlistrw);

	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
	return (status);
}
4738 
/*
 * Send an in-band descriptor message over ldc.
 *
 * The frame is copied into the data buffer of a free private descriptor
 * of the outbound ring, and a VIO_DESC_DATA message carrying that
 * descriptor's memory cookies, its index (as desc_handle) and the frame
 * size is sent to the peer.
 *
 * The mblk (mp) is only read here; it is not freed by this function.
 *
 * Returns:
 *	LDC_TX_SUCCESS		message was built and handed to
 *				vsw_send_msg() (whose result is ignored)
 *	LDC_TX_NORESOURCES	no free descriptor was available
 *	LDC_TX_FAILURE		lane not active, channel not up, no
 *				outbound dring, or frame larger than the
 *				lane mtu (oerrors is incremented)
 */
static int
vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp)
{
	vsw_t			*vswp = ldcp->ldc_vswp;
	vnet_ibnd_desc_t	ibnd_msg;
	vsw_private_desc_t	*priv_desc = NULL;
	dring_info_t		*dp = NULL;
	size_t			n, size = 0;
	caddr_t			bufp;
	mblk_t			*bp;
	int			idx, i;
	int			status = LDC_TX_SUCCESS;
	/*
	 * Shared by all callers: limits the "no descriptor" error to one
	 * report until a descriptor is successfully found again.
	 */
	static int		warn_msg = 1;
	lane_t			*lp = &ldcp->lane_out;

	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

	ASSERT(mp != NULL);

	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
		DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt",
		    __func__, ldcp->ldc_id, ldcp->ldc_status,
		    ldcp->lane_out.lstate);
		ldcp->ldc_stats.oerrors++;
		return (LDC_TX_FAILURE);
	}

	/*
	 * only expect single dring to exist, which we use
	 * as an internal buffer, rather than a transfer channel.
	 */
	READ_ENTER(&ldcp->lane_out.dlistrw);
	if ((dp = ldcp->lane_out.dringp) == NULL) {
		DERR(vswp, "%s(%lld): no dring for outbound lane",
		    __func__, ldcp->ldc_id);
		DERR(vswp, "%s(%lld) status(%d) state (0x%llx)", __func__,
		    ldcp->ldc_id, ldcp->ldc_status, ldcp->lane_out.lstate);
		RW_EXIT(&ldcp->lane_out.dlistrw);
		ldcp->ldc_stats.oerrors++;
		return (LDC_TX_FAILURE);
	}

	/* reject frames larger than the mtu of the outbound lane */
	size = msgsize(mp);
	if (size > (size_t)lp->mtu) {
		RW_EXIT(&ldcp->lane_out.dlistrw);
		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
		    ldcp->ldc_id, size);
		ldcp->ldc_stats.oerrors++;
		return (LDC_TX_FAILURE);
	}

	/*
	 * Find a free descriptor in our buffer ring
	 */
	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
		RW_EXIT(&ldcp->lane_out.dlistrw);
		if (warn_msg) {
			DERR(vswp, "%s(%lld): no descriptor available for ring "
			    "at 0x%llx", __func__, ldcp->ldc_id, dp);
			warn_msg = 0;
		}

		/* nothing more we can do */
		status = LDC_TX_NORESOURCES;
		goto vsw_descrsend_free_exit;
	} else {
		D2(vswp, "%s(%lld): free private descriptor found at pos "
		    "%ld addr 0x%x\n", __func__, ldcp->ldc_id, idx, priv_desc);
		warn_msg = 1;
	}

	/* copy data into the descriptor */
	bufp = priv_desc->datap;
	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
		n = MBLKL(bp);
		bcopy(bp->b_rptr, bufp, n);
		bufp += n;
	}

	/*
	 * The private descriptor records at least ETHERMIN bytes; note
	 * that the message and the statistics below use the unpadded size.
	 */
	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;

	/* create and send the in-band descp msg */
	ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA;
	ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO;
	ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA;
	ibnd_msg.hdr.tag.vio_sid = ldcp->local_session;

	/*
	 * Copy the mem cookies describing the data from the
	 * private region of the descriptor ring into the inband
	 * descriptor.
	 */
	for (i = 0; i < priv_desc->ncookies; i++) {
		bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i],
		    sizeof (ldc_mem_cookie_t));
	}

	/* the ring index doubles as the handle echoed back by the peer */
	ibnd_msg.hdr.desc_handle = idx;
	ibnd_msg.ncookies = priv_desc->ncookies;
	ibnd_msg.nbytes = size;

	ldcp->ldc_stats.opackets++;
	ldcp->ldc_stats.obytes += size;

	/*
	 * Drop the dring list lock before sending: vsw_send_msg() is
	 * asked to handle a channel reset itself (B_TRUE), and dlistrw
	 * must never be held when the channel is reset.
	 */
	RW_EXIT(&ldcp->lane_out.dlistrw);

	(void) vsw_send_msg(ldcp, (void *)&ibnd_msg,
	    sizeof (vnet_ibnd_desc_t), B_TRUE);

vsw_descrsend_free_exit:

	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
	return (status);
}
4857 
4858 static void
4859 vsw_send_ver(void *arg)
4860 {
4861 	vsw_ldc_t	*ldcp = (vsw_ldc_t *)arg;
4862 	vsw_t		*vswp = ldcp->ldc_vswp;
4863 	lane_t		*lp = &ldcp->lane_out;
4864 	vio_ver_msg_t	ver_msg;
4865 
4866 	D1(vswp, "%s enter", __func__);
4867 
4868 	ver_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4869 	ver_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4870 	ver_msg.tag.vio_subtype_env = VIO_VER_INFO;
4871 	ver_msg.tag.vio_sid = ldcp->local_session;
4872 
4873 	if (vsw_obp_ver_proto_workaround == B_FALSE) {
4874 		ver_msg.ver_major = vsw_versions[0].ver_major;
4875 		ver_msg.ver_minor = vsw_versions[0].ver_minor;
4876 	} else {
4877 		/* use the major,minor that we've ack'd */
4878 		lane_t	*lpi = &ldcp->lane_in;
4879 		ver_msg.ver_major = lpi->ver_major;
4880 		ver_msg.ver_minor = lpi->ver_minor;
4881 	}
4882 	ver_msg.dev_class = VDEV_NETWORK_SWITCH;
4883 
4884 	lp->lstate |= VSW_VER_INFO_SENT;
4885 	lp->ver_major = ver_msg.ver_major;
4886 	lp->ver_minor = ver_msg.ver_minor;
4887 
4888 	DUMP_TAG(ver_msg.tag);
4889 
4890 	(void) vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t), B_TRUE);
4891 
4892 	D1(vswp, "%s (%d): exit", __func__, ldcp->ldc_id);
4893 }
4894 
4895 static void
4896 vsw_send_attr(vsw_ldc_t *ldcp)
4897 {
4898 	vsw_t			*vswp = ldcp->ldc_vswp;
4899 	lane_t			*lp = &ldcp->lane_out;
4900 	vnet_attr_msg_t		attr_msg;
4901 
4902 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
4903 
4904 	/*
4905 	 * Subtype is set to INFO by default
4906 	 */
4907 	attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4908 	attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4909 	attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO;
4910 	attr_msg.tag.vio_sid = ldcp->local_session;
4911 
4912 	/* payload copied from default settings for lane */
4913 	attr_msg.mtu = lp->mtu;
4914 	attr_msg.addr_type = lp->addr_type;
4915 	attr_msg.xfer_mode = lp->xfer_mode;
4916 	attr_msg.ack_freq = lp->xfer_mode;
4917 
4918 	READ_ENTER(&vswp->if_lockrw);
4919 	attr_msg.addr = vnet_macaddr_strtoul((vswp->if_addr).ether_addr_octet);
4920 	RW_EXIT(&vswp->if_lockrw);
4921 
4922 	ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT;
4923 
4924 	DUMP_TAG(attr_msg.tag);
4925 
4926 	(void) vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t), B_TRUE);
4927 
4928 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
4929 }
4930 
4931 /*
4932  * Create dring info msg (which also results in the creation of
4933  * a dring).
4934  */
4935 static vio_dring_reg_msg_t *
4936 vsw_create_dring_info_pkt(vsw_ldc_t *ldcp)
4937 {
4938 	vio_dring_reg_msg_t	*mp;
4939 	dring_info_t		*dp;
4940 	vsw_t			*vswp = ldcp->ldc_vswp;
4941 	int			rv;
4942 
4943 	D1(vswp, "vsw_create_dring_info_pkt enter\n");
4944 
4945 	/*
4946 	 * If we can't create a dring, obviously no point sending
4947 	 * a message.
4948 	 */
4949 	if ((dp = vsw_create_dring(ldcp)) == NULL)
4950 		return (NULL);
4951 
4952 	/* Allocate pools of receive mblks */
4953 	rv = vsw_init_multipools(ldcp, vswp);
4954 	if (rv) {
4955 		DWARN(vswp, "%s: unable to create free mblk pools for"
4956 		    " channel %ld (rv %d)", __func__, ldcp->ldc_id, rv);
4957 		vsw_free_lane_resources(ldcp, OUTBOUND);
4958 		return (NULL);
4959 	}
4960 
4961 	mp = kmem_zalloc(sizeof (vio_dring_reg_msg_t), KM_SLEEP);
4962 
4963 	mp->tag.vio_msgtype = VIO_TYPE_CTRL;
4964 	mp->tag.vio_subtype = VIO_SUBTYPE_INFO;
4965 	mp->tag.vio_subtype_env = VIO_DRING_REG;
4966 	mp->tag.vio_sid = ldcp->local_session;
4967 
4968 	/* payload */
4969 	mp->num_descriptors = dp->num_descriptors;
4970 	mp->descriptor_size = dp->descriptor_size;
4971 	mp->options = dp->options;
4972 	mp->ncookies = dp->ncookies;
4973 	bcopy(&dp->cookie[0], &mp->cookie[0], sizeof (ldc_mem_cookie_t));
4974 
4975 	mp->dring_ident = 0;
4976 
4977 	D1(vswp, "vsw_create_dring_info_pkt exit\n");
4978 
4979 	return (mp);
4980 }
4981 
4982 static void
4983 vsw_send_dring_info(vsw_ldc_t *ldcp)
4984 {
4985 	vio_dring_reg_msg_t	*dring_msg;
4986 	vsw_t			*vswp = ldcp->ldc_vswp;
4987 
4988 	D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id);
4989 
4990 	dring_msg = vsw_create_dring_info_pkt(ldcp);
4991 	if (dring_msg == NULL) {
4992 		cmn_err(CE_WARN, "!vsw%d: %s: error creating msg",
4993 		    vswp->instance, __func__);
4994 		return;
4995 	}
4996 
4997 	ldcp->lane_out.lstate |= VSW_DRING_INFO_SENT;
4998 
4999 	DUMP_TAG_PTR((vio_msg_tag_t *)dring_msg);
5000 
5001 	(void) vsw_send_msg(ldcp, dring_msg,
5002 	    sizeof (vio_dring_reg_msg_t), B_TRUE);
5003 
5004 	kmem_free(dring_msg, sizeof (vio_dring_reg_msg_t));
5005 
5006 	D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id);
5007 }
5008 
5009 static void
5010 vsw_send_rdx(vsw_ldc_t *ldcp)
5011 {
5012 	vsw_t		*vswp = ldcp->ldc_vswp;
5013 	vio_rdx_msg_t	rdx_msg;
5014 
5015 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
5016 
5017 	rdx_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
5018 	rdx_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
5019 	rdx_msg.tag.vio_subtype_env = VIO_RDX;
5020 	rdx_msg.tag.vio_sid = ldcp->local_session;
5021 
5022 	ldcp->lane_in.lstate |= VSW_RDX_INFO_SENT;
5023 
5024 	DUMP_TAG(rdx_msg.tag);
5025 
5026 	(void) vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t), B_TRUE);
5027 
5028 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
5029 }
5030 
5031 /*
5032  * Generic routine to send message out over ldc channel.
5033  *
5034  * It is possible that when we attempt to write over the ldc channel
5035  * that we get notified that it has been reset. Depending on the value
5036  * of the handle_reset flag we either handle that event here or simply
5037  * notify the caller that the channel was reset.
5038  */
5039 int
5040 vsw_send_msg(vsw_ldc_t *ldcp, void *msgp, int size, boolean_t handle_reset)
5041 {
5042 	int			rv;
5043 	size_t			msglen = size;
5044 	vio_msg_tag_t		*tag = (vio_msg_tag_t *)msgp;
5045 	vsw_t			*vswp = ldcp->ldc_vswp;
5046 	vio_dring_msg_t		*dmsg;
5047 	vio_raw_data_msg_t	*rmsg;
5048 	vnet_ibnd_desc_t	*imsg;
5049 	boolean_t		data_msg = B_FALSE;
5050 
5051 	D1(vswp, "vsw_send_msg (%lld) enter : sending %d bytes",
5052 	    ldcp->ldc_id, size);
5053 
5054 	D2(vswp, "send_msg: type 0x%llx", tag->vio_msgtype);
5055 	D2(vswp, "send_msg: stype 0x%llx", tag->vio_subtype);
5056 	D2(vswp, "send_msg: senv 0x%llx", tag->vio_subtype_env);
5057 
5058 	mutex_enter(&ldcp->ldc_txlock);
5059 
5060 	if (tag->vio_subtype == VIO_SUBTYPE_INFO) {
5061 		if (tag->vio_subtype_env == VIO_DRING_DATA) {
5062 			dmsg = (vio_dring_msg_t *)tag;
5063 			dmsg->seq_num = ldcp->lane_out.seq_num;
5064 			data_msg = B_TRUE;
5065 		} else if (tag->vio_subtype_env == VIO_PKT_DATA) {
5066 			rmsg = (vio_raw_data_msg_t *)tag;
5067 			rmsg->seq_num = ldcp->lane_out.seq_num;
5068 			data_msg = B_TRUE;
5069 		} else if (tag->vio_subtype_env == VIO_DESC_DATA) {
5070 			imsg = (vnet_ibnd_desc_t *)tag;
5071 			imsg->hdr.seq_num = ldcp->lane_out.seq_num;
5072 			data_msg = B_TRUE;
5073 		}
5074 	}
5075 
5076 	do {
5077 		msglen = size;
5078 		rv = ldc_write(ldcp->ldc_handle, (caddr_t)msgp, &msglen);
5079 	} while (rv == EWOULDBLOCK && --vsw_wretries > 0);
5080 
5081 	if (rv == 0 && data_msg == B_TRUE) {
5082 		ldcp->lane_out.seq_num++;
5083 	}
5084 
5085 	if ((rv != 0) || (msglen != size)) {
5086 		DERR(vswp, "vsw_send_msg:ldc_write failed: chan(%lld) rv(%d) "
5087 		    "size (%d) msglen(%d)\n", ldcp->ldc_id, rv, size, msglen);
5088 		ldcp->ldc_stats.oerrors++;
5089 	}
5090 
5091 	mutex_exit(&ldcp->ldc_txlock);
5092 
5093 	/*
5094 	 * If channel has been reset we either handle it here or
5095 	 * simply report back that it has been reset and let caller
5096 	 * decide what to do.
5097 	 */
5098 	if (rv == ECONNRESET) {
5099 		DWARN(vswp, "%s (%lld) channel reset", __func__, ldcp->ldc_id);
5100 
5101 		/*
5102 		 * N.B - must never be holding the dlistrw lock when
5103 		 * we do a reset of the channel.
5104 		 */
5105 		if (handle_reset) {
5106 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
5107 		}
5108 	}
5109 
5110 	return (rv);
5111 }
5112 
5113 /*
5114  * Remove the specified address from the list of address maintained
5115  * in this port node.
5116  */
5117 mcst_addr_t *
5118 vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr)
5119 {
5120 	vsw_t		*vswp = NULL;
5121 	vsw_port_t	*port = NULL;
5122 	mcst_addr_t	*prev_p = NULL;
5123 	mcst_addr_t	*curr_p = NULL;
5124 
5125 	D1(NULL, "%s: enter : devtype %d : addr 0x%llx",
5126 	    __func__, devtype, addr);
5127 
5128 	if (devtype == VSW_VNETPORT) {
5129 		port = (vsw_port_t *)arg;
5130 		mutex_enter(&port->mca_lock);
5131 		prev_p = curr_p = port->mcap;
5132 	} else {
5133 		vswp = (vsw_t *)arg;
5134 		mutex_enter(&vswp->mca_lock);
5135 		prev_p = curr_p = vswp->mcap;
5136 	}
5137 
5138 	while (curr_p != NULL) {
5139 		if (curr_p->addr == addr) {
5140 			D2(NULL, "%s: address found", __func__);
5141 			/* match found */
5142 			if (prev_p == curr_p) {
5143 				/* list head */
5144 				if (devtype == VSW_VNETPORT)
5145 					port->mcap = curr_p->nextp;
5146 				else
5147 					vswp->mcap = curr_p->nextp;
5148 			} else {
5149 				prev_p->nextp = curr_p->nextp;
5150 			}
5151 			break;
5152 		} else {
5153 			prev_p = curr_p;
5154 			curr_p = curr_p->nextp;
5155 		}
5156 	}
5157 
5158 	if (devtype == VSW_VNETPORT)
5159 		mutex_exit(&port->mca_lock);
5160 	else
5161 		mutex_exit(&vswp->mca_lock);
5162 
5163 	D1(NULL, "%s: exit", __func__);
5164 
5165 	return (curr_p);
5166 }
5167 
5168 /*
5169  * Creates a descriptor ring (dring) and links it into the
5170  * link of outbound drings for this channel.
5171  *
5172  * Returns NULL if creation failed.
5173  */
5174 static dring_info_t *
5175 vsw_create_dring(vsw_ldc_t *ldcp)
5176 {
5177 	vsw_private_desc_t	*priv_addr = NULL;
5178 	vsw_t			*vswp = ldcp->ldc_vswp;
5179 	ldc_mem_info_t		minfo;
5180 	dring_info_t		*dp, *tp;
5181 	int			i;
5182 
5183 	dp = (dring_info_t *)kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
5184 
5185 	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
5186 
5187 	/* create public section of ring */
5188 	if ((ldc_mem_dring_create(vsw_ntxds,
5189 	    VSW_PUB_SIZE, &dp->handle)) != 0) {
5190 
5191 		DERR(vswp, "vsw_create_dring(%lld): ldc dring create "
5192 		    "failed", ldcp->ldc_id);
5193 		goto create_fail_exit;
5194 	}
5195 
5196 	ASSERT(dp->handle != NULL);
5197 
5198 	/*
5199 	 * Get the base address of the public section of the ring.
5200 	 */
5201 	if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
5202 		DERR(vswp, "vsw_create_dring(%lld): dring info failed\n",
5203 		    ldcp->ldc_id);
5204 		goto dring_fail_exit;
5205 	} else {
5206 		ASSERT(minfo.vaddr != 0);
5207 		dp->pub_addr = minfo.vaddr;
5208 	}
5209 
5210 	dp->num_descriptors = vsw_ntxds;
5211 	dp->descriptor_size = VSW_PUB_SIZE;
5212 	dp->options = VIO_TX_DRING;
5213 	dp->ncookies = 1;	/* guaranteed by ldc */
5214 
5215 	/*
5216 	 * create private portion of ring
5217 	 */
5218 	dp->priv_addr = (vsw_private_desc_t *)kmem_zalloc(
5219 	    (sizeof (vsw_private_desc_t) * vsw_ntxds), KM_SLEEP);
5220 
5221 	if (vsw_setup_ring(ldcp, dp)) {
5222 		DERR(vswp, "%s: unable to setup ring", __func__);
5223 		goto dring_fail_exit;
5224 	}
5225 
5226 	/* haven't used any descriptors yet */
5227 	dp->end_idx = 0;
5228 	dp->last_ack_recv = -1;
5229 
5230 	/* bind dring to the channel */
5231 	if ((ldc_mem_dring_bind(ldcp->ldc_handle, dp->handle,
5232 	    LDC_DIRECT_MAP | LDC_SHADOW_MAP, LDC_MEM_RW,
5233 	    &dp->cookie[0], &dp->ncookies)) != 0) {
5234 		DERR(vswp, "vsw_create_dring: unable to bind to channel "
5235 		    "%lld", ldcp->ldc_id);
5236 		goto dring_fail_exit;
5237 	}
5238 
5239 	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
5240 	dp->restart_reqd = B_TRUE;
5241 
5242 	/*
5243 	 * Only ever create rings for outgoing lane. Link it onto
5244 	 * end of list.
5245 	 */
5246 	WRITE_ENTER(&ldcp->lane_out.dlistrw);
5247 	if (ldcp->lane_out.dringp == NULL) {
5248 		D2(vswp, "vsw_create_dring: adding first outbound ring");
5249 		ldcp->lane_out.dringp = dp;
5250 	} else {
5251 		tp = ldcp->lane_out.dringp;
5252 		while (tp->next != NULL)
5253 			tp = tp->next;
5254 
5255 		tp->next = dp;
5256 	}
5257 	RW_EXIT(&ldcp->lane_out.dlistrw);
5258 
5259 	return (dp);
5260 
5261 dring_fail_exit:
5262 	(void) ldc_mem_dring_destroy(dp->handle);
5263 
5264 create_fail_exit:
5265 	if (dp->priv_addr != NULL) {
5266 		priv_addr = dp->priv_addr;
5267 		for (i = 0; i < vsw_ntxds; i++) {
5268 			if (priv_addr->memhandle != NULL)
5269 				(void) ldc_mem_free_handle(
5270 				    priv_addr->memhandle);
5271 			priv_addr++;
5272 		}
5273 		kmem_free(dp->priv_addr,
5274 		    (sizeof (vsw_private_desc_t) * vsw_ntxds));
5275 	}
5276 	mutex_destroy(&dp->dlock);
5277 
5278 	kmem_free(dp, sizeof (dring_info_t));
5279 	return (NULL);
5280 }
5281 
5282 /*
5283  * Create a ring consisting of just a private portion and link
5284  * it into the list of rings for the outbound lane.
5285  *
5286  * These type of rings are used primarily for temporary data
5287  * storage (i.e. as data buffers).
5288  */
5289 void
5290 vsw_create_privring(vsw_ldc_t *ldcp)
5291 {
5292 	dring_info_t		*dp, *tp;
5293 	vsw_t			*vswp = ldcp->ldc_vswp;
5294 
5295 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
5296 
5297 	dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
5298 
5299 	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
5300 
5301 	/* no public section */
5302 	dp->pub_addr = NULL;
5303 
5304 	dp->priv_addr = kmem_zalloc(
5305 	    (sizeof (vsw_private_desc_t) * vsw_ntxds), KM_SLEEP);
5306 
5307 	dp->num_descriptors = vsw_ntxds;
5308 
5309 	if (vsw_setup_ring(ldcp, dp)) {
5310 		DERR(vswp, "%s: setup of ring failed", __func__);
5311 		kmem_free(dp->priv_addr,
5312 		    (sizeof (vsw_private_desc_t) * vsw_ntxds));
5313 		mutex_destroy(&dp->dlock);
5314 		kmem_free(dp, sizeof (dring_info_t));
5315 		return;
5316 	}
5317 
5318 	/* haven't used any descriptors yet */
5319 	dp->end_idx = 0;
5320 
5321 	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
5322 	dp->restart_reqd = B_TRUE;
5323 
5324 	/*
5325 	 * Only ever create rings for outgoing lane. Link it onto
5326 	 * end of list.
5327 	 */
5328 	WRITE_ENTER(&ldcp->lane_out.dlistrw);
5329 	if (ldcp->lane_out.dringp == NULL) {
5330 		D2(vswp, "%s: adding first outbound privring", __func__);
5331 		ldcp->lane_out.dringp = dp;
5332 	} else {
5333 		tp = ldcp->lane_out.dringp;
5334 		while (tp->next != NULL)
5335 			tp = tp->next;
5336 
5337 		tp->next = dp;
5338 	}
5339 	RW_EXIT(&ldcp->lane_out.dlistrw);
5340 
5341 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
5342 }
5343 
5344 /*
5345  * Setup the descriptors in the dring. Returns 0 on success, 1 on
5346  * failure.
5347  */
5348 int
5349 vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp)
5350 {
5351 	vnet_public_desc_t	*pub_addr = NULL;
5352 	vsw_private_desc_t	*priv_addr = NULL;
5353 	vsw_t			*vswp = ldcp->ldc_vswp;
5354 	uint64_t		*tmpp;
5355 	uint64_t		offset = 0;
5356 	uint32_t		ncookies = 0;
5357 	static char		*name = "vsw_setup_ring";
5358 	int			i, j, nc, rv;
5359 	size_t			data_sz;
5360 	void			*data_addr;
5361 
5362 	priv_addr = dp->priv_addr;
5363 	pub_addr = dp->pub_addr;
5364 
5365 	/* public section may be null but private should never be */
5366 	ASSERT(priv_addr != NULL);
5367 
5368 	/*
5369 	 * Allocate the region of memory which will be used to hold
5370 	 * the data the descriptors will refer to.
5371 	 */
5372 	data_sz = vswp->max_frame_size + VNET_IPALIGN + VNET_LDCALIGN;
5373 
5374 	/*
5375 	 * In order to ensure that the number of ldc cookies per descriptor is
5376 	 * limited to be within the default MAX_COOKIES (2), we take the steps
5377 	 * outlined below:
5378 	 *
5379 	 * Align the entire data buffer area to 8K and carve out per descriptor
5380 	 * data buffers starting from this 8K aligned base address.
5381 	 *
5382 	 * We round up the mtu specified to be a multiple of 2K or 4K.
5383 	 * For sizes up to 12K we round up the size to the next 2K.
5384 	 * For sizes > 12K we round up to the next 4K (otherwise sizes such as
5385 	 * 14K could end up needing 3 cookies, with the buffer spread across
5386 	 * 3 8K pages:  8K+6K, 2K+8K+2K, 6K+8K, ...).
5387 	 */
5388 	if (data_sz <= VNET_12K) {
5389 		data_sz = VNET_ROUNDUP_2K(data_sz);
5390 	} else {
5391 		data_sz = VNET_ROUNDUP_4K(data_sz);
5392 	}
5393 
5394 	dp->desc_data_sz = data_sz;
5395 
5396 	/* allocate extra 8K bytes for alignment */
5397 	dp->data_sz = (vsw_ntxds * data_sz) + VNET_8K;
5398 	data_addr = kmem_alloc(dp->data_sz, KM_SLEEP);
5399 	dp->data_addr = data_addr;
5400 
5401 	D2(vswp, "%s: allocated %lld bytes at 0x%llx\n", name,
5402 	    dp->data_sz, dp->data_addr);
5403 
5404 	/* align the starting address of the data area to 8K */
5405 	data_addr = (void *)VNET_ROUNDUP_8K((uintptr_t)data_addr);
5406 
5407 	tmpp = (uint64_t *)data_addr;
5408 	offset = dp->desc_data_sz/sizeof (tmpp);
5409 
5410 	/*
5411 	 * Initialise some of the private and public (if they exist)
5412 	 * descriptor fields.
5413 	 */
5414 	for (i = 0; i < vsw_ntxds; i++) {
5415 		mutex_init(&priv_addr->dstate_lock, NULL, MUTEX_DRIVER, NULL);
5416 
5417 		if ((ldc_mem_alloc_handle(ldcp->ldc_handle,
5418 		    &priv_addr->memhandle)) != 0) {
5419 			DERR(vswp, "%s: alloc mem handle failed", name);
5420 			goto setup_ring_cleanup;
5421 		}
5422 
5423 		priv_addr->datap = (void *)tmpp;
5424 
5425 		rv = ldc_mem_bind_handle(priv_addr->memhandle,
5426 		    (caddr_t)priv_addr->datap, dp->desc_data_sz,
5427 		    LDC_SHADOW_MAP, LDC_MEM_R|LDC_MEM_W,
5428 		    &(priv_addr->memcookie[0]), &ncookies);
5429 		if (rv != 0) {
5430 			DERR(vswp, "%s(%lld): ldc_mem_bind_handle failed "
5431 			    "(rv %d)", name, ldcp->ldc_id, rv);
5432 			goto setup_ring_cleanup;
5433 		}
5434 		priv_addr->bound = 1;
5435 
5436 		D2(vswp, "%s: %d: memcookie 0 : addr 0x%llx : size 0x%llx",
5437 		    name, i, priv_addr->memcookie[0].addr,
5438 		    priv_addr->memcookie[0].size);
5439 
5440 		if (ncookies >= (uint32_t)(VSW_MAX_COOKIES + 1)) {
5441 			DERR(vswp, "%s(%lld) ldc_mem_bind_handle returned "
5442 			    "invalid num of cookies (%d) for size 0x%llx",
5443 			    name, ldcp->ldc_id, ncookies, VSW_RING_EL_DATA_SZ);
5444 
5445 			goto setup_ring_cleanup;
5446 		} else {
5447 			for (j = 1; j < ncookies; j++) {
5448 				rv = ldc_mem_nextcookie(priv_addr->memhandle,
5449 				    &(priv_addr->memcookie[j]));
5450 				if (rv != 0) {
5451 					DERR(vswp, "%s: ldc_mem_nextcookie "
5452 					    "failed rv (%d)", name, rv);
5453 					goto setup_ring_cleanup;
5454 				}
5455 				D3(vswp, "%s: memcookie %d : addr 0x%llx : "
5456 				    "size 0x%llx", name, j,
5457 				    priv_addr->memcookie[j].addr,
5458 				    priv_addr->memcookie[j].size);
5459 			}
5460 
5461 		}
5462 		priv_addr->ncookies = ncookies;
5463 		priv_addr->dstate = VIO_DESC_FREE;
5464 
5465 		if (pub_addr != NULL) {
5466 
5467 			/* link pub and private sides */
5468 			priv_addr->descp = pub_addr;
5469 
5470 			pub_addr->ncookies = priv_addr->ncookies;
5471 
5472 			for (nc = 0; nc < pub_addr->ncookies; nc++) {
5473 				bcopy(&priv_addr->memcookie[nc],
5474 				    &pub_addr->memcookie[nc],
5475 				    sizeof (ldc_mem_cookie_t));
5476 			}
5477 
5478 			pub_addr->hdr.dstate = VIO_DESC_FREE;
5479 			pub_addr++;
5480 		}
5481 
5482 		/*
5483 		 * move to next element in the dring and the next
5484 		 * position in the data buffer.
5485 		 */
5486 		priv_addr++;
5487 		tmpp += offset;
5488 	}
5489 
5490 	return (0);
5491 
5492 setup_ring_cleanup:
5493 	priv_addr = dp->priv_addr;
5494 
5495 	for (j = 0; j < i; j++) {
5496 		(void) ldc_mem_unbind_handle(priv_addr->memhandle);
5497 		(void) ldc_mem_free_handle(priv_addr->memhandle);
5498 
5499 		mutex_destroy(&priv_addr->dstate_lock);
5500 
5501 		priv_addr++;
5502 	}
5503 	kmem_free(dp->data_addr, dp->data_sz);
5504 
5505 	return (1);
5506 }
5507 
5508 /*
5509  * Searches the private section of a ring for a free descriptor,
5510  * starting at the location of the last free descriptor found
5511  * previously.
5512  *
5513  * Returns 0 if free descriptor is available, and updates state
5514  * of private descriptor to VIO_DESC_READY,  otherwise returns 1.
5515  *
5516  * FUTURE: might need to return contiguous range of descriptors
5517  * as dring info msg assumes all will be contiguous.
5518  */
5519 static int
5520 vsw_dring_find_free_desc(dring_info_t *dringp,
5521 		vsw_private_desc_t **priv_p, int *idx)
5522 {
5523 	vsw_private_desc_t	*addr = NULL;
5524 	int			num = vsw_ntxds;
5525 	int			ret = 1;
5526 
5527 	D1(NULL, "%s enter\n", __func__);
5528 
5529 	ASSERT(dringp->priv_addr != NULL);
5530 
5531 	D2(NULL, "%s: searching ring, dringp 0x%llx : start pos %lld",
5532 	    __func__, dringp, dringp->end_idx);
5533 
5534 	addr = (vsw_private_desc_t *)dringp->priv_addr + dringp->end_idx;
5535 
5536 	mutex_enter(&addr->dstate_lock);
5537 	if (addr->dstate == VIO_DESC_FREE) {
5538 		addr->dstate = VIO_DESC_READY;
5539 		*priv_p = addr;
5540 		*idx = dringp->end_idx;
5541 		dringp->end_idx = (dringp->end_idx + 1) % num;
5542 		ret = 0;
5543 
5544 	}
5545 	mutex_exit(&addr->dstate_lock);
5546 
5547 	/* ring full */
5548 	if (ret == 1) {
5549 		D2(NULL, "%s: no desp free: started at %d", __func__,
5550 		    dringp->end_idx);
5551 	}
5552 
5553 	D1(NULL, "%s: exit\n", __func__);
5554 
5555 	return (ret);
5556 }
5557 
5558 /*
5559  * Map from a dring identifier to the ring itself. Returns
5560  * pointer to ring or NULL if no match found.
5561  *
5562  * Should be called with dlistrw rwlock held as reader.
5563  */
5564 static dring_info_t *
5565 vsw_ident2dring(lane_t *lane, uint64_t ident)
5566 {
5567 	dring_info_t	*dp = NULL;
5568 
5569 	if ((dp = lane->dringp) == NULL) {
5570 		return (NULL);
5571 	} else {
5572 		if (dp->ident == ident)
5573 			return (dp);
5574 
5575 		while (dp != NULL) {
5576 			if (dp->ident == ident)
5577 				break;
5578 			dp = dp->next;
5579 		}
5580 	}
5581 
5582 	return (dp);
5583 }
5584 
5585 /*
5586  * Set the default lane attributes. These are copied into
5587  * the attr msg we send to our peer. If they are not acceptable
5588  * then (currently) the handshake ends.
5589  */
5590 static void
5591 vsw_set_lane_attr(vsw_t *vswp, lane_t *lp)
5592 {
5593 	bzero(lp, sizeof (lane_t));
5594 
5595 	READ_ENTER(&vswp->if_lockrw);
5596 	ether_copy(&(vswp->if_addr), &(lp->addr));
5597 	RW_EXIT(&vswp->if_lockrw);
5598 
5599 	lp->mtu = vswp->max_frame_size;
5600 	lp->addr_type = ADDR_TYPE_MAC;
5601 	lp->xfer_mode = VIO_DRING_MODE_V1_0;
5602 	lp->ack_freq = 0;	/* for shared mode */
5603 	lp->seq_num = VNET_ISS;
5604 }
5605 
5606 /*
5607  * Verify that the attributes are acceptable.
5608  *
5609  * FUTURE: If some attributes are not acceptable, change them
5610  * our desired values.
5611  */
5612 static int
5613 vsw_check_attr(vnet_attr_msg_t *pkt, vsw_ldc_t *ldcp)
5614 {
5615 	int			ret = 0;
5616 	struct ether_addr	ea;
5617 	vsw_port_t		*port = ldcp->ldc_port;
5618 	lane_t			*lp = &ldcp->lane_out;
5619 
5620 	D1(NULL, "vsw_check_attr enter\n");
5621 
5622 	if ((pkt->xfer_mode != VIO_DESC_MODE) &&
5623 	    (pkt->xfer_mode != lp->xfer_mode)) {
5624 		D2(NULL, "vsw_check_attr: unknown mode %x\n", pkt->xfer_mode);
5625 		ret = 1;
5626 	}
5627 
5628 	/* Only support MAC addresses at moment. */
5629 	if ((pkt->addr_type != ADDR_TYPE_MAC) || (pkt->addr == 0)) {
5630 		D2(NULL, "vsw_check_attr: invalid addr_type %x, "
5631 		    "or address 0x%llx\n", pkt->addr_type, pkt->addr);
5632 		ret = 1;
5633 	}
5634 
5635 	/*
5636 	 * MAC address supplied by device should match that stored
5637 	 * in the vsw-port OBP node. Need to decide what to do if they
5638 	 * don't match, for the moment just warn but don't fail.
5639 	 */
5640 	vnet_macaddr_ultostr(pkt->addr, ea.ether_addr_octet);
5641 	if (ether_cmp(&ea, &port->p_macaddr) != 0) {
5642 		DERR(NULL, "vsw_check_attr: device supplied address "
5643 		    "0x%llx doesn't match node address 0x%llx\n",
5644 		    pkt->addr, port->p_macaddr);
5645 	}
5646 
5647 	/*
5648 	 * Ack freq only makes sense in pkt mode, in shared
5649 	 * mode the ring descriptors say whether or not to
5650 	 * send back an ACK.
5651 	 */
5652 	if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
5653 	    (pkt->xfer_mode & VIO_DRING_MODE_V1_2)) ||
5654 	    (VSW_VER_LT(ldcp, 1, 2) &&
5655 	    (pkt->xfer_mode == VIO_DRING_MODE_V1_0))) {
5656 		if (pkt->ack_freq > 0) {
5657 			D2(NULL, "vsw_check_attr: non zero ack freq "
5658 			    " in SHM mode\n");
5659 			ret = 1;
5660 		}
5661 	}
5662 
5663 	if (VSW_VER_LT(ldcp, 1, 4)) {
5664 		/* versions < 1.4, mtu must match */
5665 		if (pkt->mtu != lp->mtu) {
5666 			D2(NULL, "vsw_check_attr: invalid MTU (0x%llx)\n",
5667 			    pkt->mtu);
5668 			ret = 1;
5669 		}
5670 	} else {
5671 		/* Ver >= 1.4, validate mtu of the peer is at least ETHERMAX */
5672 		if (pkt->mtu < ETHERMAX) {
5673 			ret = 1;
5674 		}
5675 	}
5676 
5677 	D1(NULL, "vsw_check_attr exit\n");
5678 
5679 	return (ret);
5680 }
5681 
5682 /*
5683  * Returns 1 if there is a problem, 0 otherwise.
5684  */
5685 static int
5686 vsw_check_dring_info(vio_dring_reg_msg_t *pkt)
5687 {
5688 	_NOTE(ARGUNUSED(pkt))
5689 
5690 	int	ret = 0;
5691 
5692 	D1(NULL, "vsw_check_dring_info enter\n");
5693 
5694 	if ((pkt->num_descriptors == 0) ||
5695 	    (pkt->descriptor_size == 0) ||
5696 	    (pkt->ncookies != 1)) {
5697 		DERR(NULL, "vsw_check_dring_info: invalid dring msg");
5698 		ret = 1;
5699 	}
5700 
5701 	D1(NULL, "vsw_check_dring_info exit\n");
5702 
5703 	return (ret);
5704 }
5705 
5706 /*
5707  * Returns 1 if two memory cookies match. Otherwise returns 0.
5708  */
5709 static int
5710 vsw_mem_cookie_match(ldc_mem_cookie_t *m1, ldc_mem_cookie_t *m2)
5711 {
5712 	if ((m1->addr != m2->addr) ||
5713 	    (m2->size != m2->size)) {
5714 		return (0);
5715 	} else {
5716 		return (1);
5717 	}
5718 }
5719 
5720 /*
5721  * Returns 1 if ring described in reg message matches that
5722  * described by dring_info structure. Otherwise returns 0.
5723  */
5724 static int
5725 vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg)
5726 {
5727 	if ((msg->descriptor_size != dp->descriptor_size) ||
5728 	    (msg->num_descriptors != dp->num_descriptors) ||
5729 	    (msg->ncookies != dp->ncookies) ||
5730 	    !(vsw_mem_cookie_match(&msg->cookie[0], &dp->cookie[0]))) {
5731 		return (0);
5732 	} else {
5733 		return (1);
5734 	}
5735 
5736 }
5737 
/*
 * Format the six bytes at 'a' as colon separated hexadecimal into
 * 'ebuf' and return 'ebuf'.
 *
 * Each byte is printed with "%x", so values are not zero padded and
 * the result is at most 17 characters plus the terminating NUL. No
 * buffer length is passed in: the caller has to supply at least 18
 * bytes.
 */
static caddr_t
vsw_print_ethaddr(uint8_t *a, char *ebuf)
{
	(void) sprintf(ebuf, "%x:%x:%x:%x:%x:%x",
	    a[0], a[1], a[2], a[3], a[4], a[5]);
	return (ebuf);
}
5745 
5746 /*
5747  * Reset and free all the resources associated with
5748  * the channel.
5749  */
5750 static void
5751 vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir)
5752 {
5753 	dring_info_t		*dp, *dpp;
5754 	lane_t			*lp = NULL;
5755 
5756 	ASSERT(ldcp != NULL);
5757 
5758 	D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id);
5759 
5760 	if (dir == INBOUND) {
5761 		D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane"
5762 		    " of channel %lld", __func__, ldcp->ldc_id);
5763 		lp = &ldcp->lane_in;
5764 	} else {
5765 		D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane"
5766 		    " of channel %lld", __func__, ldcp->ldc_id);
5767 		lp = &ldcp->lane_out;
5768 	}
5769 
5770 	lp->lstate = VSW_LANE_INACTIV;
5771 	lp->seq_num = VNET_ISS;
5772 
5773 	if (lp->dringp) {
5774 		if (dir == INBOUND) {
5775 			WRITE_ENTER(&lp->dlistrw);
5776 			dp = lp->dringp;
5777 			while (dp != NULL) {
5778 				dpp = dp->next;
5779 				if (dp->handle != NULL)
5780 					(void) ldc_mem_dring_unmap(dp->handle);
5781 				kmem_free(dp, sizeof (dring_info_t));
5782 				dp = dpp;
5783 			}
5784 			RW_EXIT(&lp->dlistrw);
5785 		} else {
5786 			/*
5787 			 * unbind, destroy exported dring, free dring struct
5788 			 */
5789 			WRITE_ENTER(&lp->dlistrw);
5790 			dp = lp->dringp;
5791 			vsw_free_ring(dp);
5792 			RW_EXIT(&lp->dlistrw);
5793 		}
5794 		lp->dringp = NULL;
5795 	}
5796 
5797 	D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id);
5798 }
5799 
5800 /*
5801  * Free ring and all associated resources.
5802  *
5803  * Should be called with dlistrw rwlock held as writer.
5804  */
5805 static void
5806 vsw_free_ring(dring_info_t *dp)
5807 {
5808 	vsw_private_desc_t	*paddr = NULL;
5809 	dring_info_t		*dpp;
5810 	int			i;
5811 
5812 	while (dp != NULL) {
5813 		mutex_enter(&dp->dlock);
5814 		dpp = dp->next;
5815 		if (dp->priv_addr != NULL) {
5816 			/*
5817 			 * First unbind and free the memory handles
5818 			 * stored in each descriptor within the ring.
5819 			 */
5820 			for (i = 0; i < vsw_ntxds; i++) {
5821 				paddr = (vsw_private_desc_t *)
5822 				    dp->priv_addr + i;
5823 				if (paddr->memhandle != NULL) {
5824 					if (paddr->bound == 1) {
5825 						if (ldc_mem_unbind_handle(
5826 						    paddr->memhandle) != 0) {
5827 							DERR(NULL, "error "
5828 							"unbinding handle for "
5829 							"ring 0x%llx at pos %d",
5830 							    dp, i);
5831 							continue;
5832 						}
5833 						paddr->bound = 0;
5834 					}
5835 
5836 					if (ldc_mem_free_handle(
5837 					    paddr->memhandle) != 0) {
5838 						DERR(NULL, "error freeing "
5839 						    "handle for ring 0x%llx "
5840 						    "at pos %d", dp, i);
5841 						continue;
5842 					}
5843 					paddr->memhandle = NULL;
5844 				}
5845 				mutex_destroy(&paddr->dstate_lock);
5846 			}
5847 			kmem_free(dp->priv_addr,
5848 			    (sizeof (vsw_private_desc_t) * vsw_ntxds));
5849 		}
5850 
5851 		/*
5852 		 * Now unbind and destroy the ring itself.
5853 		 */
5854 		if (dp->handle != NULL) {
5855 			(void) ldc_mem_dring_unbind(dp->handle);
5856 			(void) ldc_mem_dring_destroy(dp->handle);
5857 		}
5858 
5859 		if (dp->data_addr != NULL) {
5860 			kmem_free(dp->data_addr, dp->data_sz);
5861 		}
5862 
5863 		mutex_exit(&dp->dlock);
5864 		mutex_destroy(&dp->dlock);
5865 		mutex_destroy(&dp->restart_lock);
5866 		kmem_free(dp, sizeof (dring_info_t));
5867 
5868 		dp = dpp;
5869 	}
5870 }
5871 
/*
 * vsw_ldc_rx_worker -- A per LDC worker thread to receive data.
 * This thread is woken up by the LDC interrupt handler to process
 * LDC packets and receive data.
 *
 * 'arg' is the vsw_ldc_t of the channel being served. The thread
 * sets VSW_WTHR_RUNNING in rx_thr_flags on entry, then loops waiting
 * on rx_thr_cv until VSW_WTHR_DATARCVD or VSW_WTHR_STOP is set. For
 * data it calls vsw_process_pkt() with ldc_cblock held and
 * rx_thr_lock dropped. On a stop request it clears VSW_WTHR_RUNNING,
 * signals rx_thr_cv and exits; this function does not return.
 */
static void
vsw_ldc_rx_worker(void *arg)
{
	callb_cpr_t	cprinfo;
	vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
	vsw_t *vswp = ldcp->ldc_vswp;

	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
	/* register with CPR, using rx_thr_lock as the associated lock */
	CALLB_CPR_INIT(&cprinfo, &ldcp->rx_thr_lock, callb_generic_cpr,
	    "vsw_rx_thread");
	mutex_enter(&ldcp->rx_thr_lock);
	ldcp->rx_thr_flags |= VSW_WTHR_RUNNING;
	while (!(ldcp->rx_thr_flags & VSW_WTHR_STOP)) {

		/* the wait below is bracketed as a CPR-safe region */
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		/*
		 * Wait until the data is received or a stop
		 * request is received.
		 */
		while (!(ldcp->rx_thr_flags &
		    (VSW_WTHR_DATARCVD | VSW_WTHR_STOP))) {
			cv_wait(&ldcp->rx_thr_cv, &ldcp->rx_thr_lock);
		}
		CALLB_CPR_SAFE_END(&cprinfo, &ldcp->rx_thr_lock)

		/*
		 * First process the stop request.
		 */
		if (ldcp->rx_thr_flags & VSW_WTHR_STOP) {
			D2(vswp, "%s(%lld):Rx thread stopped\n",
			    __func__, ldcp->ldc_id);
			break;
		}
		/*
		 * Clear the data flag before dropping rx_thr_lock, so a
		 * notification posted while the packets are being
		 * processed sets it again and is not lost.
		 */
		ldcp->rx_thr_flags &= ~VSW_WTHR_DATARCVD;
		mutex_exit(&ldcp->rx_thr_lock);
		D1(vswp, "%s(%lld):calling vsw_process_pkt\n",
		    __func__, ldcp->ldc_id);
		/* packets are processed under ldc_cblock, not rx_thr_lock */
		mutex_enter(&ldcp->ldc_cblock);
		vsw_process_pkt(ldcp);
		mutex_exit(&ldcp->ldc_cblock);
		mutex_enter(&ldcp->rx_thr_lock);
	}

	/*
	 * Update the run status and wakeup the thread that
	 * has sent the stop request.
	 */
	ldcp->rx_thr_flags &= ~VSW_WTHR_RUNNING;
	cv_signal(&ldcp->rx_thr_cv);
	/*
	 * rx_thr_lock is held at this point and is not released
	 * explicitly before the thread exits.
	 * NOTE(review): presumably CALLB_CPR_EXIT drops it -- confirm.
	 */
	CALLB_CPR_EXIT(&cprinfo);
	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
	thread_exit();
}
5930 
5931 /* vsw_stop_rx_thread -- Co-ordinate with receive thread to stop it */
5932 static void
5933 vsw_stop_rx_thread(vsw_ldc_t *ldcp)
5934 {
5935 	vsw_t *vswp = ldcp->ldc_vswp;
5936 
5937 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
5938 	/*
5939 	 * Send a stop request by setting the stop flag and
5940 	 * wait until the receive thread stops.
5941 	 */
5942 	mutex_enter(&ldcp->rx_thr_lock);
5943 	if (ldcp->rx_thr_flags & VSW_WTHR_RUNNING) {
5944 		ldcp->rx_thr_flags |= VSW_WTHR_STOP;
5945 		cv_signal(&ldcp->rx_thr_cv);
5946 		while (ldcp->rx_thr_flags & VSW_WTHR_RUNNING) {
5947 			cv_wait(&ldcp->rx_thr_cv, &ldcp->rx_thr_lock);
5948 		}
5949 	}
5950 	mutex_exit(&ldcp->rx_thr_lock);
5951 	ldcp->rx_thread = NULL;
5952 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
5953 }
5954 
/*
 * vsw_ldc_tx_worker -- A per LDC worker thread to transmit data.
 * This thread is woken up by the vsw_portsend to transmit
 * packets.
 *
 * 'arg' is the vsw_ldc_t of the channel being served. The thread
 * sets VSW_WTHR_RUNNING in tx_thr_flags on entry, then loops waiting
 * on tx_thr_cv until packets are queued on tx_mhead or VSW_WTHR_STOP
 * is set. Queued packets are detached as a whole chain and passed one
 * at a time to vsw_ldcsend() with tx_thr_lock dropped. On a stop
 * request it clears VSW_WTHR_RUNNING, signals tx_thr_cv and exits;
 * this function does not return.
 */
static void
vsw_ldc_tx_worker(void *arg)
{
	callb_cpr_t	cprinfo;
	vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
	vsw_t *vswp = ldcp->ldc_vswp;
	mblk_t *mp;
	mblk_t *tmp;

	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
	/*
	 * Register with CPR, using tx_thr_lock as the associated lock.
	 * NOTE(review): the name string says "vnet" although this is
	 * the vsw transmit thread -- looks like a copy/paste leftover.
	 */
	CALLB_CPR_INIT(&cprinfo, &ldcp->tx_thr_lock, callb_generic_cpr,
	    "vnet_tx_thread");
	mutex_enter(&ldcp->tx_thr_lock);
	ldcp->tx_thr_flags |= VSW_WTHR_RUNNING;
	while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP)) {

		/* the wait below is bracketed as a CPR-safe region */
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		/*
		 * Wait until there are packets queued for transmit
		 * (tx_mhead is non-NULL) or a stop request is received.
		 */
		while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP) &&
		    (ldcp->tx_mhead == NULL)) {
			cv_wait(&ldcp->tx_thr_cv, &ldcp->tx_thr_lock);
		}
		CALLB_CPR_SAFE_END(&cprinfo, &ldcp->tx_thr_lock)

		/*
		 * First process the stop request. Packets still on the
		 * queue at this point are left there.
		 */
		if (ldcp->tx_thr_flags & VSW_WTHR_STOP) {
			D2(vswp, "%s(%lld):tx thread stopped\n",
			    __func__, ldcp->ldc_id);
			break;
		}
		/*
		 * Take the whole queue in one go and reset it, so the
		 * lock is not held while the packets are being sent.
		 */
		mp = ldcp->tx_mhead;
		ldcp->tx_mhead = ldcp->tx_mtail = NULL;
		ldcp->tx_cnt = 0;
		mutex_exit(&ldcp->tx_thr_lock);
		D2(vswp, "%s(%lld):calling vsw_ldcsend\n",
		    __func__, ldcp->ldc_id);
		while (mp != NULL) {
			/* unlink each packet from the chain before sending */
			tmp = mp->b_next;
			mp->b_next = mp->b_prev = NULL;
			/* the result of the send is deliberately ignored */
			(void) vsw_ldcsend(ldcp, mp, vsw_ldc_tx_retries);
			mp = tmp;
		}
		mutex_enter(&ldcp->tx_thr_lock);
	}

	/*
	 * Update the run status and wakeup the thread that
	 * has sent the stop request.
	 */
	ldcp->tx_thr_flags &= ~VSW_WTHR_RUNNING;
	cv_signal(&ldcp->tx_thr_cv);
	/*
	 * tx_thr_lock is held at this point and is not released
	 * explicitly before the thread exits.
	 * NOTE(review): presumably CALLB_CPR_EXIT drops it -- confirm.
	 */
	CALLB_CPR_EXIT(&cprinfo);
	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
	thread_exit();
}
6020 
6021 /* vsw_stop_tx_thread -- Co-ordinate with receive thread to stop it */
6022 static void
6023 vsw_stop_tx_thread(vsw_ldc_t *ldcp)
6024 {
6025 	vsw_t *vswp = ldcp->ldc_vswp;
6026 
6027 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
6028 	/*
6029 	 * Send a stop request by setting the stop flag and
6030 	 * wait until the receive thread stops.
6031 	 */
6032 	mutex_enter(&ldcp->tx_thr_lock);
6033 	if (ldcp->tx_thr_flags & VSW_WTHR_RUNNING) {
6034 		ldcp->tx_thr_flags |= VSW_WTHR_STOP;
6035 		cv_signal(&ldcp->tx_thr_cv);
6036 		while (ldcp->tx_thr_flags & VSW_WTHR_RUNNING) {
6037 			cv_wait(&ldcp->tx_thr_cv, &ldcp->tx_thr_lock);
6038 		}
6039 	}
6040 	mutex_exit(&ldcp->tx_thr_lock);
6041 	ldcp->tx_thread = NULL;
6042 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
6043 }
6044 
6045 /* vsw_reclaim_dring -- reclaim descriptors */
6046 static int
6047 vsw_reclaim_dring(dring_info_t *dp, int start)
6048 {
6049 	int i, j, len;
6050 	vsw_private_desc_t *priv_addr;
6051 	vnet_public_desc_t *pub_addr;
6052 
6053 	pub_addr = (vnet_public_desc_t *)dp->pub_addr;
6054 	priv_addr = (vsw_private_desc_t *)dp->priv_addr;
6055 	len = dp->num_descriptors;
6056 
6057 	D2(NULL, "%s: start index %ld\n", __func__, start);
6058 
6059 	j = 0;
6060 	for (i = start; j < len; i = (i + 1) % len, j++) {
6061 		pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
6062 		priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
6063 
6064 		mutex_enter(&priv_addr->dstate_lock);
6065 		if (pub_addr->hdr.dstate != VIO_DESC_DONE) {
6066 			mutex_exit(&priv_addr->dstate_lock);
6067 			break;
6068 		}
6069 		pub_addr->hdr.dstate = VIO_DESC_FREE;
6070 		priv_addr->dstate = VIO_DESC_FREE;
6071 		/* clear all the fields */
6072 		priv_addr->datalen = 0;
6073 		pub_addr->hdr.ack = 0;
6074 		mutex_exit(&priv_addr->dstate_lock);
6075 
6076 		D3(NULL, "claiming descp:%d pub state:0x%llx priv state 0x%llx",
6077 		    i, pub_addr->hdr.dstate, priv_addr->dstate);
6078 	}
6079 	return (j);
6080 }
6081 
6082 /*
6083  * Debugging routines
6084  */
6085 static void
6086 display_state(void)
6087 {
6088 	vsw_t		*vswp;
6089 	vsw_port_list_t	*plist;
6090 	vsw_port_t 	*port;
6091 	vsw_ldc_list_t	*ldcl;
6092 	vsw_ldc_t 	*ldcp;
6093 	extern vsw_t 	*vsw_head;
6094 
6095 	cmn_err(CE_NOTE, "***** system state *****");
6096 
6097 	for (vswp = vsw_head; vswp; vswp = vswp->next) {
6098 		plist = &vswp->plist;
6099 		READ_ENTER(&plist->lockrw);
6100 		cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n",
6101 		    vswp->instance, plist->num_ports);
6102 
6103 		for (port = plist->head; port != NULL; port = port->p_next) {
6104 			ldcl = &port->p_ldclist;
6105 			cmn_err(CE_CONT, "port %d : %d ldcs attached\n",
6106 			    port->p_instance, port->num_ldcs);
6107 			READ_ENTER(&ldcl->lockrw);
6108 			ldcp = ldcl->head;
6109 			for (; ldcp != NULL; ldcp = ldcp->ldc_next) {
6110 				cmn_err(CE_CONT, "chan %lu : dev %d : "
6111 				    "status %d : phase %u\n",
6112 				    ldcp->ldc_id, ldcp->dev_class,
6113 				    ldcp->ldc_status, ldcp->hphase);
6114 				cmn_err(CE_CONT, "chan %lu : lsession %lu : "
6115 				    "psession %lu\n", ldcp->ldc_id,
6116 				    ldcp->local_session, ldcp->peer_session);
6117 
6118 				cmn_err(CE_CONT, "Inbound lane:\n");
6119 				display_lane(&ldcp->lane_in);
6120 				cmn_err(CE_CONT, "Outbound lane:\n");
6121 				display_lane(&ldcp->lane_out);
6122 			}
6123 			RW_EXIT(&ldcl->lockrw);
6124 		}
6125 		RW_EXIT(&plist->lockrw);
6126 	}
6127 	cmn_err(CE_NOTE, "***** system state *****");
6128 }
6129 
6130 static void
6131 display_lane(lane_t *lp)
6132 {
6133 	dring_info_t	*drp;
6134 
6135 	cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n",
6136 	    lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu);
6137 	cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n",
6138 	    lp->addr_type, lp->addr, lp->xfer_mode);
6139 	cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp);
6140 
6141 	cmn_err(CE_CONT, "Dring info:\n");
6142 	for (drp = lp->dringp; drp != NULL; drp = drp->next) {
6143 		cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n",
6144 		    drp->num_descriptors, drp->descriptor_size);
6145 		cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->handle);
6146 		cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n",
6147 		    (uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr);
6148 		cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n",
6149 		    drp->ident, drp->end_idx);
6150 		display_ring(drp);
6151 	}
6152 }
6153 
6154 static void
6155 display_ring(dring_info_t *dringp)
6156 {
6157 	uint64_t		i;
6158 	uint64_t		priv_count = 0;
6159 	uint64_t		pub_count = 0;
6160 	vnet_public_desc_t	*pub_addr = NULL;
6161 	vsw_private_desc_t	*priv_addr = NULL;
6162 
6163 	for (i = 0; i < vsw_ntxds; i++) {
6164 		if (dringp->pub_addr != NULL) {
6165 			pub_addr = (vnet_public_desc_t *)dringp->pub_addr + i;
6166 
6167 			if (pub_addr->hdr.dstate == VIO_DESC_FREE)
6168 				pub_count++;
6169 		}
6170 
6171 		if (dringp->priv_addr != NULL) {
6172 			priv_addr = (vsw_private_desc_t *)dringp->priv_addr + i;
6173 
6174 			if (priv_addr->dstate == VIO_DESC_FREE)
6175 				priv_count++;
6176 		}
6177 	}
6178 	cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n",
6179 	    i, priv_count, pub_count);
6180 }
6181 
6182 static void
6183 dump_flags(uint64_t state)
6184 {
6185 	int	i;
6186 
6187 	typedef struct flag_name {
6188 		int	flag_val;
6189 		char	*flag_name;
6190 	} flag_name_t;
6191 
6192 	flag_name_t	flags[] = {
6193 		VSW_VER_INFO_SENT, "VSW_VER_INFO_SENT",
6194 		VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV",
6195 		VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV",
6196 		VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT",
6197 		VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV",
6198 		VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT",
6199 		VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT",
6200 		VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV",
6201 		VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT",
6202 		VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV",
6203 		VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT",
6204 		VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV",
6205 		VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT",
6206 		VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV",
6207 		VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT",
6208 		VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV",
6209 		VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT",
6210 		VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV",
6211 		VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT",
6212 		VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV",
6213 		VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT",
6214 		VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV",
6215 		VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT",
6216 		VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV",
6217 		VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT",
6218 		VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV",
6219 		VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT",
6220 		VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV",
6221 		VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT",
6222 		VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV",
6223 		VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE"};
6224 
6225 	DERR(NULL, "DUMP_FLAGS: %llx\n", state);
6226 	for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) {
6227 		if (state & flags[i].flag_val)
6228 			DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name);
6229 	}
6230 }
6231