xref: /titanic_50/usr/src/uts/sun4v/io/vsw_ldc.c (revision 7257d1b4d25bfac0c802847390e98a464fd787ac)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/errno.h>
31 #include <sys/debug.h>
32 #include <sys/time.h>
33 #include <sys/sysmacros.h>
34 #include <sys/systm.h>
35 #include <sys/user.h>
36 #include <sys/stropts.h>
37 #include <sys/stream.h>
38 #include <sys/strlog.h>
39 #include <sys/strsubr.h>
40 #include <sys/cmn_err.h>
41 #include <sys/cpu.h>
42 #include <sys/kmem.h>
43 #include <sys/conf.h>
44 #include <sys/ddi.h>
45 #include <sys/sunddi.h>
46 #include <sys/ksynch.h>
47 #include <sys/stat.h>
48 #include <sys/kstat.h>
49 #include <sys/vtrace.h>
50 #include <sys/strsun.h>
51 #include <sys/dlpi.h>
52 #include <sys/ethernet.h>
53 #include <net/if.h>
54 #include <sys/varargs.h>
55 #include <sys/machsystm.h>
56 #include <sys/modctl.h>
57 #include <sys/modhash.h>
58 #include <sys/mac.h>
59 #include <sys/mac_ether.h>
60 #include <sys/taskq.h>
61 #include <sys/note.h>
62 #include <sys/mach_descrip.h>
63 #include <sys/mac.h>
64 #include <sys/mdeg.h>
65 #include <sys/ldc.h>
66 #include <sys/vsw_fdb.h>
67 #include <sys/vsw.h>
68 #include <sys/vio_mailbox.h>
69 #include <sys/vnet_mailbox.h>
70 #include <sys/vnet_common.h>
71 #include <sys/vio_util.h>
72 #include <sys/sdt.h>
73 #include <sys/atomic.h>
74 #include <sys/callb.h>
75 #include <sys/vlan.h>
76 
77 /* Port add/deletion/etc routines */
78 static	int vsw_port_delete(vsw_port_t *port);
79 static	int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id);
80 static	int vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id);
81 static	int vsw_init_ldcs(vsw_port_t *port);
82 static	int vsw_uninit_ldcs(vsw_port_t *port);
83 static	int vsw_ldc_init(vsw_ldc_t *ldcp);
84 static	int vsw_ldc_uninit(vsw_ldc_t *ldcp);
85 static	int vsw_drain_ldcs(vsw_port_t *port);
86 static	int vsw_drain_port_taskq(vsw_port_t *port);
87 static	void vsw_marker_task(void *);
88 static	int vsw_plist_del_node(vsw_t *, vsw_port_t *port);
89 int vsw_detach_ports(vsw_t *vswp);
90 int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
91 mcst_addr_t *vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr);
92 int vsw_port_detach(vsw_t *vswp, int p_instance);
93 int vsw_portsend(vsw_port_t *port, mblk_t *mp, mblk_t *mpt, uint32_t count);
94 int vsw_port_attach(vsw_port_t *portp);
95 vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);
96 void vsw_vlan_unaware_port_reset(vsw_port_t *portp);
97 int vsw_send_msg(vsw_ldc_t *, void *, int, boolean_t);
98 void vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate);
99 
100 /* Interrupt routines */
101 static	uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg);
102 
103 /* Handshake routines */
104 static	void vsw_ldc_reinit(vsw_ldc_t *);
105 static	void vsw_process_conn_evt(vsw_ldc_t *, uint16_t);
106 static	void vsw_conn_task(void *);
107 static	int vsw_check_flag(vsw_ldc_t *, int, uint64_t);
108 static	void vsw_next_milestone(vsw_ldc_t *);
109 static	int vsw_supported_version(vio_ver_msg_t *);
110 static	void vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp);
111 static	void vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp);
112 
113 /* Data processing routines */
114 static void vsw_process_pkt(void *);
115 static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t *);
116 static void vsw_process_ctrl_pkt(void *);
117 static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *);
118 static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *);
119 static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *);
120 static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *);
121 static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *);
122 static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *);
123 static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *,
124 	uint32_t);
125 static void vsw_process_data_dring_pkt(vsw_ldc_t *, void *);
126 static void vsw_process_pkt_data_nop(void *, void *, uint32_t);
127 static void vsw_process_pkt_data(void *, void *, uint32_t);
128 static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *);
129 static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *);
130 
131 /* Switching/data transmit routines */
132 static	int vsw_dringsend(vsw_ldc_t *, mblk_t *);
133 static	int vsw_descrsend(vsw_ldc_t *, mblk_t *);
134 static void vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp);
135 static int vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries);
136 static int vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count);
137 static int vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count);
138 
139 /* Packet creation routines */
140 static void vsw_send_ver(void *);
141 static void vsw_send_attr(vsw_ldc_t *);
142 static vio_dring_reg_msg_t *vsw_create_dring_info_pkt(vsw_ldc_t *);
143 static void vsw_send_dring_info(vsw_ldc_t *);
144 static void vsw_send_rdx(vsw_ldc_t *);
145 
146 /* Dring routines */
147 static dring_info_t *vsw_create_dring(vsw_ldc_t *);
148 static void vsw_create_privring(vsw_ldc_t *);
149 static int vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp);
150 static int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **,
151     int *);
152 static dring_info_t *vsw_ident2dring(lane_t *, uint64_t);
153 static int vsw_reclaim_dring(dring_info_t *dp, int start);
154 
155 static void vsw_set_lane_attr(vsw_t *, lane_t *);
156 static int vsw_check_attr(vnet_attr_msg_t *, vsw_ldc_t *);
157 static int vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg);
158 static int vsw_mem_cookie_match(ldc_mem_cookie_t *, ldc_mem_cookie_t *);
159 static int vsw_check_dring_info(vio_dring_reg_msg_t *);
160 
161 /* Rcv/Tx thread routines */
162 static void vsw_stop_tx_thread(vsw_ldc_t *ldcp);
163 static void vsw_ldc_tx_worker(void *arg);
164 static void vsw_stop_rx_thread(vsw_ldc_t *ldcp);
165 static void vsw_ldc_rx_worker(void *arg);
166 
167 /* Misc support routines */
168 static	caddr_t vsw_print_ethaddr(uint8_t *addr, char *ebuf);
169 static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t);
170 static int vsw_free_ring(dring_info_t *);
171 static void vsw_save_lmacaddr(vsw_t *vswp, uint64_t macaddr);
172 static int vsw_get_same_dest_list(struct ether_header *ehp,
173     mblk_t **rhead, mblk_t **rtail, mblk_t **mpp);
174 static mblk_t *vsw_dupmsgchain(mblk_t *mp);
175 
176 /* Debugging routines */
177 static void dump_flags(uint64_t);
178 static void display_state(void);
179 static void display_lane(lane_t *);
180 static void display_ring(dring_info_t *);
181 
182 /*
183  * Functions imported from other files.
184  */
185 extern int vsw_set_hw(vsw_t *, vsw_port_t *, int);
186 extern int vsw_unset_hw(vsw_t *, vsw_port_t *, int);
187 extern void vsw_reconfig_hw(vsw_t *);
188 extern int vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port);
189 extern void vsw_del_mcst_port(vsw_port_t *port);
190 extern int vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
191 extern int vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
192 extern void vsw_fdbe_add(vsw_t *vswp, void *port);
193 extern void vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr);
194 extern void vsw_create_vlans(void *arg, int type);
195 extern void vsw_destroy_vlans(void *arg, int type);
196 extern void vsw_vlan_add_ids(void *arg, int type);
197 extern void vsw_vlan_remove_ids(void *arg, int type);
198 extern boolean_t vsw_frame_lookup_vid(void *arg, int caller,
199 	struct ether_header *ehp, uint16_t *vidp);
200 extern mblk_t *vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp);
201 extern uint32_t vsw_vlan_frame_untag(void *arg, int type, mblk_t **np,
202 	mblk_t **npt);
203 extern boolean_t vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid);
204 extern void vsw_hio_start(vsw_t *vswp, vsw_ldc_t *ldcp);
205 extern void vsw_hio_stop(vsw_t *vswp, vsw_ldc_t *ldcp);
206 extern void vsw_process_dds_msg(vsw_t *vswp, vsw_ldc_t *ldcp, void *msg);
207 extern void vsw_hio_stop_port(vsw_port_t *portp);
208 
209 #define	VSW_NUM_VMPOOLS		3	/* number of vio mblk pools */
210 
211 /*
212  * Tunables used in this file.
213  */
214 extern int vsw_num_handshakes;
215 extern int vsw_wretries;
216 extern int vsw_desc_delay;
217 extern int vsw_read_attempts;
218 extern int vsw_ldc_tx_delay;
219 extern int vsw_ldc_tx_retries;
220 extern boolean_t vsw_ldc_rxthr_enabled;
221 extern boolean_t vsw_ldc_txthr_enabled;
222 extern uint32_t vsw_ntxds;
223 extern uint32_t vsw_max_tx_qcount;
224 extern uint32_t vsw_chain_len;
225 extern uint32_t vsw_mblk_size1;
226 extern uint32_t vsw_mblk_size2;
227 extern uint32_t vsw_mblk_size3;
228 extern uint32_t vsw_num_mblks1;
229 extern uint32_t vsw_num_mblks2;
230 extern uint32_t vsw_num_mblks3;
231 extern boolean_t vsw_obp_ver_proto_workaround;
232 
/*
 * Acquire / release the three per-channel locks (callback, receive and
 * transmit) as a unit.  The locks are released in the reverse of the
 * order in which they are acquired.
 *
 * Each macro expands to a single statement, so it is safe to use as the
 * body of an unbraced if/else or loop.  Invoke it with a trailing
 * semicolon, like a function call.
 */
#define	LDC_ENTER_LOCK(ldcp)	\
	do {	\
		mutex_enter(&((ldcp)->ldc_cblock));	\
		mutex_enter(&((ldcp)->ldc_rxlock));	\
		mutex_enter(&((ldcp)->ldc_txlock));	\
	} while (0)
#define	LDC_EXIT_LOCK(ldcp)	\
	do {	\
		mutex_exit(&((ldcp)->ldc_txlock));	\
		mutex_exit(&((ldcp)->ldc_rxlock));	\
		mutex_exit(&((ldcp)->ldc_cblock));	\
	} while (0)
241 
/*
 * Compare the version recorded in a channel's outbound lane against a
 * (major, minor) pair.  The major number takes precedence; the minor
 * number only decides the result when the majors are equal.
 */
#define	VSW_VER_EQ(ldcp, major, minor)	\
	(((ldcp)->lane_out.ver_minor == (minor)) &&	\
	    ((ldcp)->lane_out.ver_major == (major)))

#define	VSW_VER_LT(ldcp, major, minor)	\
	(((ldcp)->lane_out.ver_major == (major)) ?	\
	    ((ldcp)->lane_out.ver_minor < (minor)) :	\
	    ((ldcp)->lane_out.ver_major < (major)))

#define	VSW_VER_GTEQ(ldcp, major, minor)	\
	(((ldcp)->lane_out.ver_major == (major)) ?	\
	    ((ldcp)->lane_out.ver_minor >= (minor)) :	\
	    ((ldcp)->lane_out.ver_major > (major)))
255 
256 /* supported versions */
257 static	ver_sup_t	vsw_versions[] = { {1, 3} };
258 
259 /*
260  * For the moment the state dump routines have their own
261  * private flag.
262  */
263 #define	DUMP_STATE	0
264 
265 #if DUMP_STATE
266 
267 #define	DUMP_TAG(tag) \
268 {			\
269 	D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype); \
270 	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype);	\
271 	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env);	\
272 }
273 
274 #define	DUMP_TAG_PTR(tag) \
275 {			\
276 	D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype); \
277 	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype);	\
278 	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env);	\
279 }
280 
281 #define	DUMP_FLAGS(flags) dump_flags(flags);
282 #define	DISPLAY_STATE()	display_state()
283 
284 #else
285 
286 #define	DUMP_TAG(tag)
287 #define	DUMP_TAG_PTR(tag)
288 #define	DUMP_FLAGS(state)
289 #define	DISPLAY_STATE()
290 
291 #endif	/* DUMP_STATE */
292 
293 /*
294  * Attach the specified port.
295  *
296  * Returns 0 on success, 1 on failure.
297  */
298 int
299 vsw_port_attach(vsw_port_t *port)
300 {
301 	vsw_t			*vswp = port->p_vswp;
302 	vsw_port_list_t		*plist = &vswp->plist;
303 	vsw_port_t		*p, **pp;
304 	int			i;
305 	int			nids = port->num_ldcs;
306 	uint64_t		*ldcids;
307 
308 	D1(vswp, "%s: enter : port %d", __func__, port->p_instance);
309 
310 	/* port already exists? */
311 	READ_ENTER(&plist->lockrw);
312 	for (p = plist->head; p != NULL; p = p->p_next) {
313 		if (p->p_instance == port->p_instance) {
314 			DWARN(vswp, "%s: port instance %d already attached",
315 			    __func__, p->p_instance);
316 			RW_EXIT(&plist->lockrw);
317 			return (1);
318 		}
319 	}
320 	RW_EXIT(&plist->lockrw);
321 
322 	rw_init(&port->p_ldclist.lockrw, NULL, RW_DRIVER, NULL);
323 
324 	mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL);
325 	mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL);
326 
327 	mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL);
328 	cv_init(&port->state_cv, NULL, CV_DRIVER, NULL);
329 	port->state = VSW_PORT_INIT;
330 
331 	D2(vswp, "%s: %d nids", __func__, nids);
332 	ldcids = port->ldc_ids;
333 	for (i = 0; i < nids; i++) {
334 		D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[i]);
335 		if (vsw_ldc_attach(port, (uint64_t)ldcids[i]) != 0) {
336 			DERR(vswp, "%s: ldc_attach failed", __func__);
337 
338 			rw_destroy(&port->p_ldclist.lockrw);
339 
340 			cv_destroy(&port->state_cv);
341 			mutex_destroy(&port->state_lock);
342 
343 			mutex_destroy(&port->tx_lock);
344 			mutex_destroy(&port->mca_lock);
345 			kmem_free(port, sizeof (vsw_port_t));
346 			return (1);
347 		}
348 	}
349 
350 	if (vswp->switching_setup_done == B_TRUE) {
351 		/*
352 		 * If the underlying physical device has been setup,
353 		 * program the mac address of this port in it.
354 		 * Otherwise, port macaddr will be set after the physical
355 		 * device is successfully setup by the timeout handler.
356 		 */
357 		mutex_enter(&vswp->hw_lock);
358 		(void) vsw_set_hw(vswp, port, VSW_VNETPORT);
359 		mutex_exit(&vswp->hw_lock);
360 	}
361 
362 	/* create the fdb entry for this port/mac address */
363 	vsw_fdbe_add(vswp, port);
364 
365 	vsw_create_vlans(port, VSW_VNETPORT);
366 
367 	WRITE_ENTER(&plist->lockrw);
368 
369 	/* link it into the list of ports for this vsw instance */
370 	pp = (vsw_port_t **)(&plist->head);
371 	port->p_next = *pp;
372 	*pp = port;
373 	plist->num_ports++;
374 
375 	RW_EXIT(&plist->lockrw);
376 
377 	/*
378 	 * Initialise the port and any ldc's under it.
379 	 */
380 	(void) vsw_init_ldcs(port);
381 
382 	D1(vswp, "%s: exit", __func__);
383 	return (0);
384 }
385 
386 /*
387  * Detach the specified port.
388  *
389  * Returns 0 on success, 1 on failure.
390  */
391 int
392 vsw_port_detach(vsw_t *vswp, int p_instance)
393 {
394 	vsw_port_t	*port = NULL;
395 	vsw_port_list_t	*plist = &vswp->plist;
396 
397 	D1(vswp, "%s: enter: port id %d", __func__, p_instance);
398 
399 	WRITE_ENTER(&plist->lockrw);
400 
401 	if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) {
402 		RW_EXIT(&plist->lockrw);
403 		return (1);
404 	}
405 
406 	if (vsw_plist_del_node(vswp, port)) {
407 		RW_EXIT(&plist->lockrw);
408 		return (1);
409 	}
410 
411 	/* cleanup any HybridIO for this port */
412 	vsw_hio_stop_port(port);
413 
414 	/*
415 	 * No longer need to hold writer lock on port list now
416 	 * that we have unlinked the target port from the list.
417 	 */
418 	RW_EXIT(&plist->lockrw);
419 
420 	/* Remove the fdb entry for this port/mac address */
421 	vsw_fdbe_del(vswp, &(port->p_macaddr));
422 	vsw_destroy_vlans(port, VSW_VNETPORT);
423 
424 	/* Remove any multicast addresses.. */
425 	vsw_del_mcst_port(port);
426 
427 	/* Remove address if was programmed into HW. */
428 	mutex_enter(&vswp->hw_lock);
429 
430 	/*
431 	 * Port's address may not have been set in hardware. This could
432 	 * happen if the underlying physical device is not yet available and
433 	 * vsw_setup_switching_timeout() may be in progress.
434 	 * We remove its addr from hardware only if it has been set before.
435 	 */
436 	if (port->addr_set != VSW_ADDR_UNSET)
437 		(void) vsw_unset_hw(vswp, port, VSW_VNETPORT);
438 
439 	if (vswp->recfg_reqd)
440 		vsw_reconfig_hw(vswp);
441 
442 	mutex_exit(&vswp->hw_lock);
443 
444 	if (vsw_port_delete(port)) {
445 		return (1);
446 	}
447 
448 	D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance);
449 	return (0);
450 }
451 
452 /*
453  * Detach all active ports.
454  *
455  * Returns 0 on success, 1 on failure.
456  */
457 int
458 vsw_detach_ports(vsw_t *vswp)
459 {
460 	vsw_port_list_t 	*plist = &vswp->plist;
461 	vsw_port_t		*port = NULL;
462 
463 	D1(vswp, "%s: enter", __func__);
464 
465 	WRITE_ENTER(&plist->lockrw);
466 
467 	while ((port = plist->head) != NULL) {
468 		if (vsw_plist_del_node(vswp, port)) {
469 			DERR(vswp, "%s: Error deleting port %d"
470 			    " from port list", __func__, port->p_instance);
471 			RW_EXIT(&plist->lockrw);
472 			return (1);
473 		}
474 
475 		/* Remove address if was programmed into HW. */
476 		mutex_enter(&vswp->hw_lock);
477 		(void) vsw_unset_hw(vswp, port, VSW_VNETPORT);
478 		mutex_exit(&vswp->hw_lock);
479 
480 		/* Remove the fdb entry for this port/mac address */
481 		vsw_fdbe_del(vswp, &(port->p_macaddr));
482 		vsw_destroy_vlans(port, VSW_VNETPORT);
483 
484 		/* Remove any multicast addresses.. */
485 		vsw_del_mcst_port(port);
486 
487 		/*
488 		 * No longer need to hold the lock on the port list
489 		 * now that we have unlinked the target port from the
490 		 * list.
491 		 */
492 		RW_EXIT(&plist->lockrw);
493 		if (vsw_port_delete(port)) {
494 			DERR(vswp, "%s: Error deleting port %d",
495 			    __func__, port->p_instance);
496 			return (1);
497 		}
498 		WRITE_ENTER(&plist->lockrw);
499 	}
500 	RW_EXIT(&plist->lockrw);
501 
502 	D1(vswp, "%s: exit", __func__);
503 
504 	return (0);
505 }
506 
507 /*
508  * Delete the specified port.
509  *
510  * Returns 0 on success, 1 on failure.
511  */
512 static int
513 vsw_port_delete(vsw_port_t *port)
514 {
515 	vsw_ldc_list_t 		*ldcl;
516 	vsw_t			*vswp = port->p_vswp;
517 	int			num_ldcs;
518 
519 	D1(vswp, "%s: enter : port id %d", __func__, port->p_instance);
520 
521 	(void) vsw_uninit_ldcs(port);
522 
523 	/*
524 	 * Wait for any pending ctrl msg tasks which reference this
525 	 * port to finish.
526 	 */
527 	if (vsw_drain_port_taskq(port))
528 		return (1);
529 
530 	/*
531 	 * Wait for any active callbacks to finish
532 	 */
533 	if (vsw_drain_ldcs(port))
534 		return (1);
535 
536 	ldcl = &port->p_ldclist;
537 	num_ldcs = port->num_ldcs;
538 	WRITE_ENTER(&ldcl->lockrw);
539 	while (num_ldcs > 0) {
540 		if (vsw_ldc_detach(port, ldcl->head->ldc_id) != 0) {
541 			cmn_err(CE_WARN, "!vsw%d: unable to detach ldc %ld",
542 			    vswp->instance, ldcl->head->ldc_id);
543 			RW_EXIT(&ldcl->lockrw);
544 			port->num_ldcs = num_ldcs;
545 			return (1);
546 		}
547 		num_ldcs--;
548 	}
549 	RW_EXIT(&ldcl->lockrw);
550 
551 	rw_destroy(&port->p_ldclist.lockrw);
552 
553 	mutex_destroy(&port->mca_lock);
554 	mutex_destroy(&port->tx_lock);
555 
556 	cv_destroy(&port->state_cv);
557 	mutex_destroy(&port->state_lock);
558 
559 	if (port->num_ldcs != 0) {
560 		kmem_free(port->ldc_ids, port->num_ldcs * sizeof (uint64_t));
561 		port->num_ldcs = 0;
562 	}
563 	kmem_free(port, sizeof (vsw_port_t));
564 
565 	D1(vswp, "%s: exit", __func__);
566 
567 	return (0);
568 }
569 
570 /*
571  * Attach a logical domain channel (ldc) under a specified port.
572  *
573  * Returns 0 on success, 1 on failure.
574  */
575 static int
576 vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id)
577 {
578 	vsw_t 		*vswp = port->p_vswp;
579 	vsw_ldc_list_t *ldcl = &port->p_ldclist;
580 	vsw_ldc_t 	*ldcp = NULL;
581 	ldc_attr_t 	attr;
582 	ldc_status_t	istatus;
583 	int 		status = DDI_FAILURE;
584 	int		rv;
585 	char		kname[MAXNAMELEN];
586 	enum		{ PROG_init = 0x0, PROG_mblks = 0x1,
587 			    PROG_callback = 0x2, PROG_rx_thread = 0x4,
588 			    PROG_tx_thread = 0x8}
589 			progress;
590 
591 	progress = PROG_init;
592 
593 	D1(vswp, "%s: enter", __func__);
594 
595 	ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP);
596 	if (ldcp == NULL) {
597 		DERR(vswp, "%s: kmem_zalloc failed", __func__);
598 		return (1);
599 	}
600 	ldcp->ldc_id = ldc_id;
601 
602 	/* Allocate pools of receive mblks */
603 	rv = vio_init_multipools(&ldcp->vmp, VSW_NUM_VMPOOLS,
604 	    vsw_mblk_size1, vsw_mblk_size2, vsw_mblk_size3,
605 	    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3);
606 	if (rv) {
607 		DWARN(vswp, "%s: unable to create free mblk pools for"
608 		    " channel %ld (rv %d)", __func__, ldc_id, rv);
609 		kmem_free(ldcp, sizeof (vsw_ldc_t));
610 		return (1);
611 	}
612 
613 	progress |= PROG_mblks;
614 
615 	mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL);
616 	mutex_init(&ldcp->ldc_rxlock, NULL, MUTEX_DRIVER, NULL);
617 	mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL);
618 	mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL);
619 	cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL);
620 	rw_init(&ldcp->lane_in.dlistrw, NULL, RW_DRIVER, NULL);
621 	rw_init(&ldcp->lane_out.dlistrw, NULL, RW_DRIVER, NULL);
622 
623 	/* required for handshake with peer */
624 	ldcp->local_session = (uint64_t)ddi_get_lbolt();
625 	ldcp->peer_session = 0;
626 	ldcp->session_status = 0;
627 	ldcp->hss_id = 1;	/* Initial handshake session id */
628 
629 	(void) atomic_swap_32(&port->p_hio_capable, B_FALSE);
630 
631 	/* only set for outbound lane, inbound set by peer */
632 	vsw_set_lane_attr(vswp, &ldcp->lane_out);
633 
634 	attr.devclass = LDC_DEV_NT_SVC;
635 	attr.instance = ddi_get_instance(vswp->dip);
636 	attr.mode = LDC_MODE_UNRELIABLE;
637 	attr.mtu = VSW_LDC_MTU;
638 	status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle);
639 	if (status != 0) {
640 		DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)",
641 		    __func__, ldc_id, status);
642 		goto ldc_attach_fail;
643 	}
644 
645 	if (vsw_ldc_rxthr_enabled) {
646 		ldcp->rx_thr_flags = 0;
647 
648 		mutex_init(&ldcp->rx_thr_lock, NULL, MUTEX_DRIVER, NULL);
649 		cv_init(&ldcp->rx_thr_cv, NULL, CV_DRIVER, NULL);
650 		ldcp->rx_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
651 		    vsw_ldc_rx_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
652 
653 		progress |= PROG_rx_thread;
654 		if (ldcp->rx_thread == NULL) {
655 			DWARN(vswp, "%s(%lld): Failed to create worker thread",
656 			    __func__, ldc_id);
657 			goto ldc_attach_fail;
658 		}
659 	}
660 
661 	if (vsw_ldc_txthr_enabled) {
662 		ldcp->tx_thr_flags = 0;
663 		ldcp->tx_mhead = ldcp->tx_mtail = NULL;
664 
665 		mutex_init(&ldcp->tx_thr_lock, NULL, MUTEX_DRIVER, NULL);
666 		cv_init(&ldcp->tx_thr_cv, NULL, CV_DRIVER, NULL);
667 		ldcp->tx_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
668 		    vsw_ldc_tx_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
669 
670 		progress |= PROG_tx_thread;
671 		if (ldcp->tx_thread == NULL) {
672 			DWARN(vswp, "%s(%lld): Failed to create worker thread",
673 			    __func__, ldc_id);
674 			goto ldc_attach_fail;
675 		}
676 	}
677 
678 	status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp);
679 	if (status != 0) {
680 		DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)",
681 		    __func__, ldc_id, status);
682 		(void) ldc_fini(ldcp->ldc_handle);
683 		goto ldc_attach_fail;
684 	}
685 	/*
686 	 * allocate a message for ldc_read()s, big enough to hold ctrl and
687 	 * data msgs, including raw data msgs used to recv priority frames.
688 	 */
689 	ldcp->msglen = VIO_PKT_DATA_HDRSIZE + vswp->max_frame_size;
690 	ldcp->ldcmsg = kmem_alloc(ldcp->msglen, KM_SLEEP);
691 
692 	progress |= PROG_callback;
693 
694 	mutex_init(&ldcp->status_lock, NULL, MUTEX_DRIVER, NULL);
695 
696 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
697 		DERR(vswp, "%s: ldc_status failed", __func__);
698 		mutex_destroy(&ldcp->status_lock);
699 		goto ldc_attach_fail;
700 	}
701 
702 	ldcp->ldc_status = istatus;
703 	ldcp->ldc_port = port;
704 	ldcp->ldc_vswp = vswp;
705 
706 	vsw_reset_vnet_proto_ops(ldcp);
707 
708 	(void) sprintf(kname, "%sldc0x%lx", DRV_NAME, ldcp->ldc_id);
709 	ldcp->ksp = vgen_setup_kstats(DRV_NAME, vswp->instance,
710 	    kname, &ldcp->ldc_stats);
711 	if (ldcp->ksp == NULL) {
712 		DERR(vswp, "%s: kstats setup failed", __func__);
713 		goto ldc_attach_fail;
714 	}
715 
716 	/* link it into the list of channels for this port */
717 	WRITE_ENTER(&ldcl->lockrw);
718 	ldcp->ldc_next = ldcl->head;
719 	ldcl->head = ldcp;
720 	RW_EXIT(&ldcl->lockrw);
721 
722 	D1(vswp, "%s: exit", __func__);
723 	return (0);
724 
725 ldc_attach_fail:
726 
727 	if (progress & PROG_callback) {
728 		(void) ldc_unreg_callback(ldcp->ldc_handle);
729 		kmem_free(ldcp->ldcmsg, ldcp->msglen);
730 	}
731 
732 	if (progress & PROG_rx_thread) {
733 		if (ldcp->rx_thread != NULL) {
734 			vsw_stop_rx_thread(ldcp);
735 		}
736 		mutex_destroy(&ldcp->rx_thr_lock);
737 		cv_destroy(&ldcp->rx_thr_cv);
738 	}
739 
740 	if (progress & PROG_tx_thread) {
741 		if (ldcp->tx_thread != NULL) {
742 			vsw_stop_tx_thread(ldcp);
743 		}
744 		mutex_destroy(&ldcp->tx_thr_lock);
745 		cv_destroy(&ldcp->tx_thr_cv);
746 	}
747 	if (ldcp->ksp != NULL) {
748 		vgen_destroy_kstats(ldcp->ksp);
749 	}
750 	mutex_destroy(&ldcp->ldc_txlock);
751 	mutex_destroy(&ldcp->ldc_rxlock);
752 	mutex_destroy(&ldcp->ldc_cblock);
753 	mutex_destroy(&ldcp->drain_cv_lock);
754 
755 	cv_destroy(&ldcp->drain_cv);
756 
757 	rw_destroy(&ldcp->lane_in.dlistrw);
758 	rw_destroy(&ldcp->lane_out.dlistrw);
759 
760 	if (progress & PROG_mblks) {
761 		vio_destroy_multipools(&ldcp->vmp, &vswp->rxh);
762 	}
763 	kmem_free(ldcp, sizeof (vsw_ldc_t));
764 
765 	return (1);
766 }
767 
768 /*
769  * Detach a logical domain channel (ldc) belonging to a
770  * particular port.
771  *
772  * Returns 0 on success, 1 on failure.
773  */
774 static int
775 vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id)
776 {
777 	vsw_t 		*vswp = port->p_vswp;
778 	vsw_ldc_t 	*ldcp, *prev_ldcp;
779 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
780 	int 		rv;
781 
782 	prev_ldcp = ldcl->head;
783 	for (; (ldcp = prev_ldcp) != NULL; prev_ldcp = ldcp->ldc_next) {
784 		if (ldcp->ldc_id == ldc_id) {
785 			break;
786 		}
787 	}
788 
789 	/* specified ldc id not found */
790 	if (ldcp == NULL) {
791 		DERR(vswp, "%s: ldcp = NULL", __func__);
792 		return (1);
793 	}
794 
795 	D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id);
796 
797 	/* Stop the receive thread */
798 	if (ldcp->rx_thread != NULL) {
799 		vsw_stop_rx_thread(ldcp);
800 		mutex_destroy(&ldcp->rx_thr_lock);
801 		cv_destroy(&ldcp->rx_thr_cv);
802 	}
803 	kmem_free(ldcp->ldcmsg, ldcp->msglen);
804 
805 	/* Stop the tx thread */
806 	if (ldcp->tx_thread != NULL) {
807 		vsw_stop_tx_thread(ldcp);
808 		mutex_destroy(&ldcp->tx_thr_lock);
809 		cv_destroy(&ldcp->tx_thr_cv);
810 		if (ldcp->tx_mhead != NULL) {
811 			freemsgchain(ldcp->tx_mhead);
812 			ldcp->tx_mhead = ldcp->tx_mtail = NULL;
813 			ldcp->tx_cnt = 0;
814 		}
815 	}
816 
817 	/* Destory kstats */
818 	vgen_destroy_kstats(ldcp->ksp);
819 
820 	/*
821 	 * Before we can close the channel we must release any mapped
822 	 * resources (e.g. drings).
823 	 */
824 	vsw_free_lane_resources(ldcp, INBOUND);
825 	vsw_free_lane_resources(ldcp, OUTBOUND);
826 
827 	/*
828 	 * If the close fails we are in serious trouble, as won't
829 	 * be able to delete the parent port.
830 	 */
831 	if ((rv = ldc_close(ldcp->ldc_handle)) != 0) {
832 		DERR(vswp, "%s: error %d closing channel %lld",
833 		    __func__, rv, ldcp->ldc_id);
834 		return (1);
835 	}
836 
837 	(void) ldc_fini(ldcp->ldc_handle);
838 
839 	ldcp->ldc_status = LDC_INIT;
840 	ldcp->ldc_handle = NULL;
841 	ldcp->ldc_vswp = NULL;
842 
843 
844 	/*
845 	 * Most likely some mblks are still in use and
846 	 * have not been returned to the pool. These mblks are
847 	 * added to the pool that is maintained in the device instance.
848 	 * Another attempt will be made to destroy the pool
849 	 * when the device detaches.
850 	 */
851 	vio_destroy_multipools(&ldcp->vmp, &vswp->rxh);
852 
853 	/* unlink it from the list */
854 	prev_ldcp = ldcp->ldc_next;
855 
856 	mutex_destroy(&ldcp->ldc_txlock);
857 	mutex_destroy(&ldcp->ldc_rxlock);
858 	mutex_destroy(&ldcp->ldc_cblock);
859 	cv_destroy(&ldcp->drain_cv);
860 	mutex_destroy(&ldcp->drain_cv_lock);
861 	mutex_destroy(&ldcp->status_lock);
862 	rw_destroy(&ldcp->lane_in.dlistrw);
863 	rw_destroy(&ldcp->lane_out.dlistrw);
864 
865 	kmem_free(ldcp, sizeof (vsw_ldc_t));
866 
867 	return (0);
868 }
869 
870 /*
871  * Open and attempt to bring up the channel. Note that channel
872  * can only be brought up if peer has also opened channel.
873  *
874  * Returns 0 if can open and bring up channel, otherwise
875  * returns 1.
876  */
877 static int
878 vsw_ldc_init(vsw_ldc_t *ldcp)
879 {
880 	vsw_t 		*vswp = ldcp->ldc_vswp;
881 	ldc_status_t	istatus = 0;
882 	int		rv;
883 
884 	D1(vswp, "%s: enter", __func__);
885 
886 	LDC_ENTER_LOCK(ldcp);
887 
888 	/* don't start at 0 in case clients don't like that */
889 	ldcp->next_ident = 1;
890 
891 	rv = ldc_open(ldcp->ldc_handle);
892 	if (rv != 0) {
893 		DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)",
894 		    __func__, ldcp->ldc_id, rv);
895 		LDC_EXIT_LOCK(ldcp);
896 		return (1);
897 	}
898 
899 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
900 		DERR(vswp, "%s: unable to get status", __func__);
901 		LDC_EXIT_LOCK(ldcp);
902 		return (1);
903 
904 	} else if (istatus != LDC_OPEN && istatus != LDC_READY) {
905 		DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY",
906 		    __func__, ldcp->ldc_id, istatus);
907 		LDC_EXIT_LOCK(ldcp);
908 		return (1);
909 	}
910 
911 	mutex_enter(&ldcp->status_lock);
912 	ldcp->ldc_status = istatus;
913 	mutex_exit(&ldcp->status_lock);
914 
915 	rv = ldc_up(ldcp->ldc_handle);
916 	if (rv != 0) {
917 		/*
918 		 * Not a fatal error for ldc_up() to fail, as peer
919 		 * end point may simply not be ready yet.
920 		 */
921 		D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__,
922 		    ldcp->ldc_id, rv);
923 		LDC_EXIT_LOCK(ldcp);
924 		return (1);
925 	}
926 
927 	/*
928 	 * ldc_up() call is non-blocking so need to explicitly
929 	 * check channel status to see if in fact the channel
930 	 * is UP.
931 	 */
932 	mutex_enter(&ldcp->status_lock);
933 	if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) {
934 		DERR(vswp, "%s: unable to get status", __func__);
935 		mutex_exit(&ldcp->status_lock);
936 		LDC_EXIT_LOCK(ldcp);
937 		return (1);
938 
939 	}
940 
941 	if (ldcp->ldc_status == LDC_UP) {
942 		D2(vswp, "%s: channel %ld now UP (%ld)", __func__,
943 		    ldcp->ldc_id, istatus);
944 		mutex_exit(&ldcp->status_lock);
945 		LDC_EXIT_LOCK(ldcp);
946 
947 		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
948 		return (0);
949 	}
950 
951 	mutex_exit(&ldcp->status_lock);
952 	LDC_EXIT_LOCK(ldcp);
953 
954 	D1(vswp, "%s: exit", __func__);
955 	return (0);
956 }
957 
958 /* disable callbacks on the channel */
959 static int
960 vsw_ldc_uninit(vsw_ldc_t *ldcp)
961 {
962 	vsw_t	*vswp = ldcp->ldc_vswp;
963 	int	rv;
964 
965 	D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id);
966 
967 	LDC_ENTER_LOCK(ldcp);
968 
969 	rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE);
970 	if (rv != 0) {
971 		DERR(vswp, "vsw_ldc_uninit(%lld): error disabling "
972 		    "interrupts (rv = %d)\n", ldcp->ldc_id, rv);
973 		LDC_EXIT_LOCK(ldcp);
974 		return (1);
975 	}
976 
977 	mutex_enter(&ldcp->status_lock);
978 	ldcp->ldc_status = LDC_INIT;
979 	mutex_exit(&ldcp->status_lock);
980 
981 	LDC_EXIT_LOCK(ldcp);
982 
983 	D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id);
984 
985 	return (0);
986 }
987 
988 static int
989 vsw_init_ldcs(vsw_port_t *port)
990 {
991 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
992 	vsw_ldc_t	*ldcp;
993 
994 	READ_ENTER(&ldcl->lockrw);
995 	ldcp =  ldcl->head;
996 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
997 		(void) vsw_ldc_init(ldcp);
998 	}
999 	RW_EXIT(&ldcl->lockrw);
1000 
1001 	return (0);
1002 }
1003 
1004 static int
1005 vsw_uninit_ldcs(vsw_port_t *port)
1006 {
1007 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
1008 	vsw_ldc_t	*ldcp;
1009 
1010 	D1(NULL, "vsw_uninit_ldcs: enter\n");
1011 
1012 	READ_ENTER(&ldcl->lockrw);
1013 	ldcp =  ldcl->head;
1014 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
1015 		(void) vsw_ldc_uninit(ldcp);
1016 	}
1017 	RW_EXIT(&ldcl->lockrw);
1018 
1019 	D1(NULL, "vsw_uninit_ldcs: exit\n");
1020 
1021 	return (0);
1022 }
1023 
1024 /*
1025  * Wait until the callback(s) associated with the ldcs under the specified
1026  * port have completed.
1027  *
1028  * Prior to this function being invoked each channel under this port
1029  * should have been quiesced via ldc_set_cb_mode(DISABLE).
1030  *
1031  * A short explaination of what we are doing below..
1032  *
1033  * The simplest approach would be to have a reference counter in
1034  * the ldc structure which is increment/decremented by the callbacks as
1035  * they use the channel. The drain function could then simply disable any
1036  * further callbacks and do a cv_wait for the ref to hit zero. Unfortunately
1037  * there is a tiny window here - before the callback is able to get the lock
1038  * on the channel it is interrupted and this function gets to execute. It
1039  * sees that the ref count is zero and believes its free to delete the
1040  * associated data structures.
1041  *
1042  * We get around this by taking advantage of the fact that before the ldc
1043  * framework invokes a callback it sets a flag to indicate that there is a
1044  * callback active (or about to become active). If when we attempt to
1045  * unregister a callback when this active flag is set then the unregister
1046  * will fail with EWOULDBLOCK.
1047  *
1048  * If the unregister fails we do a cv_timedwait. We will either be signaled
1049  * by the callback as it is exiting (note we have to wait a short period to
1050  * allow the callback to return fully to the ldc framework and it to clear
1051  * the active flag), or by the timer expiring. In either case we again attempt
1052  * the unregister. We repeat this until we can succesfully unregister the
1053  * callback.
1054  *
1055  * The reason we use a cv_timedwait rather than a simple cv_wait is to catch
1056  * the case where the callback has finished but the ldc framework has not yet
1057  * cleared the active flag. In this case we would never get a cv_signal.
1058  */
1059 static int
1060 vsw_drain_ldcs(vsw_port_t *port)
1061 {
1062 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
1063 	vsw_ldc_t	*ldcp;
1064 	vsw_t		*vswp = port->p_vswp;
1065 
1066 	D1(vswp, "%s: enter", __func__);
1067 
1068 	READ_ENTER(&ldcl->lockrw);
1069 
1070 	ldcp = ldcl->head;
1071 
1072 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
1073 		/*
1074 		 * If we can unregister the channel callback then we
1075 		 * know that there is no callback either running or
1076 		 * scheduled to run for this channel so move on to next
1077 		 * channel in the list.
1078 		 */
1079 		mutex_enter(&ldcp->drain_cv_lock);
1080 
1081 		/* prompt active callbacks to quit */
1082 		ldcp->drain_state = VSW_LDC_DRAINING;
1083 
1084 		if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) {
1085 			D2(vswp, "%s: unreg callback for chan %ld", __func__,
1086 			    ldcp->ldc_id);
1087 			mutex_exit(&ldcp->drain_cv_lock);
1088 			continue;
1089 		} else {
1090 			/*
1091 			 * If we end up here we know that either 1) a callback
1092 			 * is currently executing, 2) is about to start (i.e.
1093 			 * the ldc framework has set the active flag but
1094 			 * has not actually invoked the callback yet, or 3)
1095 			 * has finished and has returned to the ldc framework
1096 			 * but the ldc framework has not yet cleared the
1097 			 * active bit.
1098 			 *
1099 			 * Wait for it to finish.
1100 			 */
1101 			while (ldc_unreg_callback(ldcp->ldc_handle)
1102 			    == EWOULDBLOCK)
1103 				(void) cv_timedwait(&ldcp->drain_cv,
1104 				    &ldcp->drain_cv_lock, lbolt + hz);
1105 
1106 			mutex_exit(&ldcp->drain_cv_lock);
1107 			D2(vswp, "%s: unreg callback for chan %ld after "
1108 			    "timeout", __func__, ldcp->ldc_id);
1109 		}
1110 	}
1111 	RW_EXIT(&ldcl->lockrw);
1112 
1113 	D1(vswp, "%s: exit", __func__);
1114 	return (0);
1115 }
1116 
1117 /*
1118  * Wait until all tasks which reference this port have completed.
1119  *
1120  * Prior to this function being invoked each channel under this port
1121  * should have been quiesced via ldc_set_cb_mode(DISABLE).
1122  */
1123 static int
1124 vsw_drain_port_taskq(vsw_port_t *port)
1125 {
1126 	vsw_t		*vswp = port->p_vswp;
1127 
1128 	D1(vswp, "%s: enter", __func__);
1129 
1130 	/*
1131 	 * Mark the port as in the process of being detached, and
1132 	 * dispatch a marker task to the queue so we know when all
1133 	 * relevant tasks have completed.
1134 	 */
1135 	mutex_enter(&port->state_lock);
1136 	port->state = VSW_PORT_DETACHING;
1137 
1138 	if ((vswp->taskq_p == NULL) ||
1139 	    (ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task,
1140 	    port, DDI_NOSLEEP) != DDI_SUCCESS)) {
1141 		DERR(vswp, "%s: unable to dispatch marker task",
1142 		    __func__);
1143 		mutex_exit(&port->state_lock);
1144 		return (1);
1145 	}
1146 
1147 	/*
1148 	 * Wait for the marker task to finish.
1149 	 */
1150 	while (port->state != VSW_PORT_DETACHABLE)
1151 		cv_wait(&port->state_cv, &port->state_lock);
1152 
1153 	mutex_exit(&port->state_lock);
1154 
1155 	D1(vswp, "%s: exit", __func__);
1156 
1157 	return (0);
1158 }
1159 
1160 static void
1161 vsw_marker_task(void *arg)
1162 {
1163 	vsw_port_t	*port = arg;
1164 	vsw_t		*vswp = port->p_vswp;
1165 
1166 	D1(vswp, "%s: enter", __func__);
1167 
1168 	mutex_enter(&port->state_lock);
1169 
1170 	/*
1171 	 * No further tasks should be dispatched which reference
1172 	 * this port so ok to mark it as safe to detach.
1173 	 */
1174 	port->state = VSW_PORT_DETACHABLE;
1175 
1176 	cv_signal(&port->state_cv);
1177 
1178 	mutex_exit(&port->state_lock);
1179 
1180 	D1(vswp, "%s: exit", __func__);
1181 }
1182 
1183 vsw_port_t *
1184 vsw_lookup_port(vsw_t *vswp, int p_instance)
1185 {
1186 	vsw_port_list_t *plist = &vswp->plist;
1187 	vsw_port_t	*port;
1188 
1189 	for (port = plist->head; port != NULL; port = port->p_next) {
1190 		if (port->p_instance == p_instance) {
1191 			D2(vswp, "vsw_lookup_port: found p_instance\n");
1192 			return (port);
1193 		}
1194 	}
1195 
1196 	return (NULL);
1197 }
1198 
1199 void
1200 vsw_vlan_unaware_port_reset(vsw_port_t *portp)
1201 {
1202 	vsw_ldc_list_t 	*ldclp;
1203 	vsw_ldc_t	*ldcp;
1204 
1205 	ldclp = &portp->p_ldclist;
1206 
1207 	READ_ENTER(&ldclp->lockrw);
1208 
1209 	/*
1210 	 * NOTE: for now, we will assume we have a single channel.
1211 	 */
1212 	if (ldclp->head == NULL) {
1213 		RW_EXIT(&ldclp->lockrw);
1214 		return;
1215 	}
1216 	ldcp = ldclp->head;
1217 
1218 	mutex_enter(&ldcp->ldc_cblock);
1219 
1220 	/*
1221 	 * If the peer is vlan_unaware(ver < 1.3), reset channel and terminate
1222 	 * the connection. See comments in vsw_set_vnet_proto_ops().
1223 	 */
1224 	if (ldcp->hphase == VSW_MILESTONE4 && VSW_VER_LT(ldcp, 1, 3) &&
1225 	    portp->nvids != 0) {
1226 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1227 	}
1228 
1229 	mutex_exit(&ldcp->ldc_cblock);
1230 
1231 	RW_EXIT(&ldclp->lockrw);
1232 }
1233 
1234 void
1235 vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate)
1236 {
1237 	vsw_ldc_list_t	*ldclp;
1238 	vsw_ldc_t	*ldcp;
1239 
1240 	ldclp = &portp->p_ldclist;
1241 
1242 	READ_ENTER(&ldclp->lockrw);
1243 
1244 	/*
1245 	 * NOTE: for now, we will assume we have a single channel.
1246 	 */
1247 	if (ldclp->head == NULL) {
1248 		RW_EXIT(&ldclp->lockrw);
1249 		return;
1250 	}
1251 	ldcp = ldclp->head;
1252 
1253 	mutex_enter(&ldcp->ldc_cblock);
1254 
1255 	/*
1256 	 * If the peer is HybridIO capable (ver >= 1.3), reset channel
1257 	 * to trigger re-negotiation, which inturn trigger HybridIO
1258 	 * setup/cleanup.
1259 	 */
1260 	if ((ldcp->hphase == VSW_MILESTONE4) &&
1261 	    (portp->p_hio_capable == B_TRUE)) {
1262 		if (immediate == B_TRUE) {
1263 			(void) ldc_down(ldcp->ldc_handle);
1264 		} else {
1265 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1266 		}
1267 	}
1268 
1269 	mutex_exit(&ldcp->ldc_cblock);
1270 
1271 	RW_EXIT(&ldclp->lockrw);
1272 }
1273 
1274 /*
1275  * Search for and remove the specified port from the port
1276  * list. Returns 0 if able to locate and remove port, otherwise
1277  * returns 1.
1278  */
1279 static int
1280 vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port)
1281 {
1282 	vsw_port_list_t *plist = &vswp->plist;
1283 	vsw_port_t	*curr_p, *prev_p;
1284 
1285 	if (plist->head == NULL)
1286 		return (1);
1287 
1288 	curr_p = prev_p = plist->head;
1289 
1290 	while (curr_p != NULL) {
1291 		if (curr_p == port) {
1292 			if (prev_p == curr_p) {
1293 				plist->head = curr_p->p_next;
1294 			} else {
1295 				prev_p->p_next = curr_p->p_next;
1296 			}
1297 			plist->num_ports--;
1298 			break;
1299 		} else {
1300 			prev_p = curr_p;
1301 			curr_p = curr_p->p_next;
1302 		}
1303 	}
1304 	return (0);
1305 }
1306 
1307 /*
1308  * Interrupt handler for ldc messages.
1309  */
1310 static uint_t
1311 vsw_ldc_cb(uint64_t event, caddr_t arg)
1312 {
1313 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
1314 	vsw_t 		*vswp = ldcp->ldc_vswp;
1315 
1316 	D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
1317 
1318 	mutex_enter(&ldcp->ldc_cblock);
1319 	ldcp->ldc_stats.callbacks++;
1320 
1321 	mutex_enter(&ldcp->status_lock);
1322 	if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) {
1323 		mutex_exit(&ldcp->status_lock);
1324 		mutex_exit(&ldcp->ldc_cblock);
1325 		return (LDC_SUCCESS);
1326 	}
1327 	mutex_exit(&ldcp->status_lock);
1328 
1329 	if (event & LDC_EVT_UP) {
1330 		/*
1331 		 * Channel has come up.
1332 		 */
1333 		D2(vswp, "%s: id(%ld) event(%llx) UP: status(%ld)",
1334 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1335 
1336 		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
1337 
1338 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
1339 	}
1340 
1341 	if (event & LDC_EVT_READ) {
1342 		/*
1343 		 * Data available for reading.
1344 		 */
1345 		D2(vswp, "%s: id(ld) event(%llx) data READ",
1346 		    __func__, ldcp->ldc_id, event);
1347 
1348 		if (ldcp->rx_thread != NULL) {
1349 			/*
1350 			 * If the receive thread is enabled, then
1351 			 * wakeup the receive thread to process the
1352 			 * LDC messages.
1353 			 */
1354 			mutex_exit(&ldcp->ldc_cblock);
1355 			mutex_enter(&ldcp->rx_thr_lock);
1356 			if (!(ldcp->rx_thr_flags & VSW_WTHR_DATARCVD)) {
1357 				ldcp->rx_thr_flags |= VSW_WTHR_DATARCVD;
1358 				cv_signal(&ldcp->rx_thr_cv);
1359 			}
1360 			mutex_exit(&ldcp->rx_thr_lock);
1361 			mutex_enter(&ldcp->ldc_cblock);
1362 		} else {
1363 			vsw_process_pkt(ldcp);
1364 		}
1365 
1366 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
1367 
1368 		goto vsw_cb_exit;
1369 	}
1370 
1371 	if (event & (LDC_EVT_DOWN | LDC_EVT_RESET)) {
1372 		D2(vswp, "%s: id(%ld) event (%lx) DOWN/RESET: status(%ld)",
1373 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1374 
1375 		vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
1376 	}
1377 
1378 	/*
1379 	 * Catch either LDC_EVT_WRITE which we don't support or any
1380 	 * unknown event.
1381 	 */
1382 	if (event &
1383 	    ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ)) {
1384 		DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)",
1385 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1386 	}
1387 
1388 vsw_cb_exit:
1389 	mutex_exit(&ldcp->ldc_cblock);
1390 
1391 	/*
1392 	 * Let the drain function know we are finishing if it
1393 	 * is waiting.
1394 	 */
1395 	mutex_enter(&ldcp->drain_cv_lock);
1396 	if (ldcp->drain_state == VSW_LDC_DRAINING)
1397 		cv_signal(&ldcp->drain_cv);
1398 	mutex_exit(&ldcp->drain_cv_lock);
1399 
1400 	return (LDC_SUCCESS);
1401 }
1402 
1403 /*
1404  * Reinitialise data structures associated with the channel.
1405  */
1406 static void
1407 vsw_ldc_reinit(vsw_ldc_t *ldcp)
1408 {
1409 	vsw_t		*vswp = ldcp->ldc_vswp;
1410 	vsw_port_t	*port;
1411 	vsw_ldc_list_t	*ldcl;
1412 
1413 	D1(vswp, "%s: enter", __func__);
1414 
1415 	port = ldcp->ldc_port;
1416 	ldcl = &port->p_ldclist;
1417 
1418 	READ_ENTER(&ldcl->lockrw);
1419 
1420 	D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__,
1421 	    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
1422 
1423 	vsw_free_lane_resources(ldcp, INBOUND);
1424 	vsw_free_lane_resources(ldcp, OUTBOUND);
1425 	RW_EXIT(&ldcl->lockrw);
1426 
1427 	ldcp->lane_in.lstate = 0;
1428 	ldcp->lane_out.lstate = 0;
1429 
1430 	/* Remove the fdb entry for this port/mac address */
1431 	vsw_fdbe_del(vswp, &(port->p_macaddr));
1432 
1433 	/* remove the port from vlans it has been assigned to */
1434 	vsw_vlan_remove_ids(port, VSW_VNETPORT);
1435 
1436 	/*
1437 	 * Remove parent port from any multicast groups
1438 	 * it may have registered with. Client must resend
1439 	 * multicast add command after handshake completes.
1440 	 */
1441 	vsw_del_mcst_port(port);
1442 
1443 	ldcp->peer_session = 0;
1444 	ldcp->session_status = 0;
1445 	ldcp->hcnt = 0;
1446 	ldcp->hphase = VSW_MILESTONE0;
1447 
1448 	vsw_reset_vnet_proto_ops(ldcp);
1449 
1450 	D1(vswp, "%s: exit", __func__);
1451 }
1452 
1453 /*
1454  * Process a connection event.
1455  *
1456  * Note - care must be taken to ensure that this function is
1457  * not called with the dlistrw lock held.
1458  */
1459 static void
1460 vsw_process_conn_evt(vsw_ldc_t *ldcp, uint16_t evt)
1461 {
1462 	vsw_t		*vswp = ldcp->ldc_vswp;
1463 	vsw_conn_evt_t	*conn = NULL;
1464 
1465 	D1(vswp, "%s: enter", __func__);
1466 
1467 	/*
1468 	 * Check if either a reset or restart event is pending
1469 	 * or in progress. If so just return.
1470 	 *
1471 	 * A VSW_CONN_RESET event originates either with a LDC_RESET_EVT
1472 	 * being received by the callback handler, or a ECONNRESET error
1473 	 * code being returned from a ldc_read() or ldc_write() call.
1474 	 *
1475 	 * A VSW_CONN_RESTART event occurs when some error checking code
1476 	 * decides that there is a problem with data from the channel,
1477 	 * and that the handshake should be restarted.
1478 	 */
1479 	if (((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART)) &&
1480 	    (ldstub((uint8_t *)&ldcp->reset_active)))
1481 		return;
1482 
1483 	/*
1484 	 * If it is an LDC_UP event we first check the recorded
1485 	 * state of the channel. If this is UP then we know that
1486 	 * the channel moving to the UP state has already been dealt
1487 	 * with and don't need to dispatch a  new task.
1488 	 *
1489 	 * The reason for this check is that when we do a ldc_up(),
1490 	 * depending on the state of the peer, we may or may not get
1491 	 * a LDC_UP event. As we can't depend on getting a LDC_UP evt
1492 	 * every time we do ldc_up() we explicitly check the channel
1493 	 * status to see has it come up (ldc_up() is asynch and will
1494 	 * complete at some undefined time), and take the appropriate
1495 	 * action.
1496 	 *
1497 	 * The flip side of this is that we may get a LDC_UP event
1498 	 * when we have already seen that the channel is up and have
1499 	 * dealt with that.
1500 	 */
1501 	mutex_enter(&ldcp->status_lock);
1502 	if (evt == VSW_CONN_UP) {
1503 		if ((ldcp->ldc_status == LDC_UP) || (ldcp->reset_active != 0)) {
1504 			mutex_exit(&ldcp->status_lock);
1505 			return;
1506 		}
1507 	}
1508 	mutex_exit(&ldcp->status_lock);
1509 
1510 	/*
1511 	 * The transaction group id allows us to identify and discard
1512 	 * any tasks which are still pending on the taskq and refer
1513 	 * to the handshake session we are about to restart or reset.
1514 	 * These stale messages no longer have any real meaning.
1515 	 */
1516 	(void) atomic_inc_32(&ldcp->hss_id);
1517 
1518 	ASSERT(vswp->taskq_p != NULL);
1519 
1520 	if ((conn = kmem_zalloc(sizeof (vsw_conn_evt_t), KM_NOSLEEP)) == NULL) {
1521 		cmn_err(CE_WARN, "!vsw%d: unable to allocate memory for"
1522 		    " connection event", vswp->instance);
1523 		goto err_exit;
1524 	}
1525 
1526 	conn->evt = evt;
1527 	conn->ldcp = ldcp;
1528 
1529 	if (ddi_taskq_dispatch(vswp->taskq_p, vsw_conn_task, conn,
1530 	    DDI_NOSLEEP) != DDI_SUCCESS) {
1531 		cmn_err(CE_WARN, "!vsw%d: Can't dispatch connection task",
1532 		    vswp->instance);
1533 
1534 		kmem_free(conn, sizeof (vsw_conn_evt_t));
1535 		goto err_exit;
1536 	}
1537 
1538 	D1(vswp, "%s: exit", __func__);
1539 	return;
1540 
1541 err_exit:
1542 	/*
1543 	 * Have mostly likely failed due to memory shortage. Clear the flag so
1544 	 * that future requests will at least be attempted and will hopefully
1545 	 * succeed.
1546 	 */
1547 	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
1548 		ldcp->reset_active = 0;
1549 }
1550 
1551 /*
1552  * Deal with events relating to a connection. Invoked from a taskq.
1553  */
1554 static void
1555 vsw_conn_task(void *arg)
1556 {
1557 	vsw_conn_evt_t	*conn = (vsw_conn_evt_t *)arg;
1558 	vsw_ldc_t	*ldcp = NULL;
1559 	vsw_port_t	*portp;
1560 	vsw_t		*vswp = NULL;
1561 	uint16_t	evt;
1562 	ldc_status_t	curr_status;
1563 
1564 	ldcp = conn->ldcp;
1565 	evt = conn->evt;
1566 	vswp = ldcp->ldc_vswp;
1567 	portp = ldcp->ldc_port;
1568 
1569 	D1(vswp, "%s: enter", __func__);
1570 
1571 	/* can safely free now have copied out data */
1572 	kmem_free(conn, sizeof (vsw_conn_evt_t));
1573 
1574 	mutex_enter(&ldcp->status_lock);
1575 	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
1576 		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
1577 		    "channel %ld", vswp->instance, ldcp->ldc_id);
1578 		mutex_exit(&ldcp->status_lock);
1579 		return;
1580 	}
1581 
1582 	/*
1583 	 * If we wish to restart the handshake on this channel, then if
1584 	 * the channel is UP we bring it DOWN to flush the underlying
1585 	 * ldc queue.
1586 	 */
1587 	if ((evt == VSW_CONN_RESTART) && (curr_status == LDC_UP))
1588 		(void) ldc_down(ldcp->ldc_handle);
1589 
1590 	if ((portp->p_hio_capable) && (portp->p_hio_enabled)) {
1591 		vsw_hio_stop(vswp, ldcp);
1592 	}
1593 
1594 	/*
1595 	 * re-init all the associated data structures.
1596 	 */
1597 	vsw_ldc_reinit(ldcp);
1598 
1599 	/*
1600 	 * Bring the channel back up (note it does no harm to
1601 	 * do this even if the channel is already UP, Just
1602 	 * becomes effectively a no-op).
1603 	 */
1604 	(void) ldc_up(ldcp->ldc_handle);
1605 
1606 	/*
1607 	 * Check if channel is now UP. This will only happen if
1608 	 * peer has also done a ldc_up().
1609 	 */
1610 	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
1611 		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
1612 		    "channel %ld", vswp->instance, ldcp->ldc_id);
1613 		mutex_exit(&ldcp->status_lock);
1614 		return;
1615 	}
1616 
1617 	ldcp->ldc_status = curr_status;
1618 
1619 	/* channel UP so restart handshake by sending version info */
1620 	if (curr_status == LDC_UP) {
1621 		if (ldcp->hcnt++ > vsw_num_handshakes) {
1622 			cmn_err(CE_WARN, "!vsw%d: exceeded number of permitted"
1623 			    " handshake attempts (%d) on channel %ld",
1624 			    vswp->instance, ldcp->hcnt, ldcp->ldc_id);
1625 			mutex_exit(&ldcp->status_lock);
1626 			return;
1627 		}
1628 
1629 		if (vsw_obp_ver_proto_workaround == B_FALSE &&
1630 		    (ddi_taskq_dispatch(vswp->taskq_p, vsw_send_ver, ldcp,
1631 		    DDI_NOSLEEP) != DDI_SUCCESS)) {
1632 			cmn_err(CE_WARN, "!vsw%d: Can't dispatch version task",
1633 			    vswp->instance);
1634 
1635 			/*
1636 			 * Don't count as valid restart attempt if couldn't
1637 			 * send version msg.
1638 			 */
1639 			if (ldcp->hcnt > 0)
1640 				ldcp->hcnt--;
1641 		}
1642 	}
1643 
1644 	/*
1645 	 * Mark that the process is complete by clearing the flag.
1646 	 *
1647 	 * Note is it possible that the taskq dispatch above may have failed,
1648 	 * most likely due to memory shortage. We still clear the flag so
1649 	 * future attempts will at least be attempted and will hopefully
1650 	 * succeed.
1651 	 */
1652 	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
1653 		ldcp->reset_active = 0;
1654 
1655 	mutex_exit(&ldcp->status_lock);
1656 
1657 	D1(vswp, "%s: exit", __func__);
1658 }
1659 
1660 /*
1661  * returns 0 if legal for event signified by flag to have
1662  * occured at the time it did. Otherwise returns 1.
1663  */
1664 int
1665 vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag)
1666 {
1667 	vsw_t		*vswp = ldcp->ldc_vswp;
1668 	uint64_t	state;
1669 	uint64_t	phase;
1670 
1671 	if (dir == INBOUND)
1672 		state = ldcp->lane_in.lstate;
1673 	else
1674 		state = ldcp->lane_out.lstate;
1675 
1676 	phase = ldcp->hphase;
1677 
1678 	switch (flag) {
1679 	case VSW_VER_INFO_RECV:
1680 		if (phase > VSW_MILESTONE0) {
1681 			DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV"
1682 			    " when in state %d\n", ldcp->ldc_id, phase);
1683 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1684 			return (1);
1685 		}
1686 		break;
1687 
1688 	case VSW_VER_ACK_RECV:
1689 	case VSW_VER_NACK_RECV:
1690 		if (!(state & VSW_VER_INFO_SENT)) {
1691 			DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK or "
1692 			    "VER_NACK when in state %d\n", ldcp->ldc_id, phase);
1693 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1694 			return (1);
1695 		} else
1696 			state &= ~VSW_VER_INFO_SENT;
1697 		break;
1698 
1699 	case VSW_ATTR_INFO_RECV:
1700 		if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) {
1701 			DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV"
1702 			    " when in state %d\n", ldcp->ldc_id, phase);
1703 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1704 			return (1);
1705 		}
1706 		break;
1707 
1708 	case VSW_ATTR_ACK_RECV:
1709 	case VSW_ATTR_NACK_RECV:
1710 		if (!(state & VSW_ATTR_INFO_SENT)) {
1711 			DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK"
1712 			    " or ATTR_NACK when in state %d\n",
1713 			    ldcp->ldc_id, phase);
1714 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1715 			return (1);
1716 		} else
1717 			state &= ~VSW_ATTR_INFO_SENT;
1718 		break;
1719 
1720 	case VSW_DRING_INFO_RECV:
1721 		if (phase < VSW_MILESTONE1) {
1722 			DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV"
1723 			    " when in state %d\n", ldcp->ldc_id, phase);
1724 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1725 			return (1);
1726 		}
1727 		break;
1728 
1729 	case VSW_DRING_ACK_RECV:
1730 	case VSW_DRING_NACK_RECV:
1731 		if (!(state & VSW_DRING_INFO_SENT)) {
1732 			DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK "
1733 			    " or DRING_NACK when in state %d\n",
1734 			    ldcp->ldc_id, phase);
1735 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1736 			return (1);
1737 		} else
1738 			state &= ~VSW_DRING_INFO_SENT;
1739 		break;
1740 
1741 	case VSW_RDX_INFO_RECV:
1742 		if (phase < VSW_MILESTONE3) {
1743 			DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV"
1744 			    " when in state %d\n", ldcp->ldc_id, phase);
1745 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1746 			return (1);
1747 		}
1748 		break;
1749 
1750 	case VSW_RDX_ACK_RECV:
1751 	case VSW_RDX_NACK_RECV:
1752 		if (!(state & VSW_RDX_INFO_SENT)) {
1753 			DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK or "
1754 			    "RDX_NACK when in state %d\n", ldcp->ldc_id, phase);
1755 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1756 			return (1);
1757 		} else
1758 			state &= ~VSW_RDX_INFO_SENT;
1759 		break;
1760 
1761 	case VSW_MCST_INFO_RECV:
1762 		if (phase < VSW_MILESTONE3) {
1763 			DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV"
1764 			    " when in state %d\n", ldcp->ldc_id, phase);
1765 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1766 			return (1);
1767 		}
1768 		break;
1769 
1770 	default:
1771 		DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)",
1772 		    ldcp->ldc_id, flag);
1773 		return (1);
1774 	}
1775 
1776 	if (dir == INBOUND)
1777 		ldcp->lane_in.lstate = state;
1778 	else
1779 		ldcp->lane_out.lstate = state;
1780 
1781 	D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id);
1782 
1783 	return (0);
1784 }
1785 
1786 void
1787 vsw_next_milestone(vsw_ldc_t *ldcp)
1788 {
1789 	vsw_t		*vswp = ldcp->ldc_vswp;
1790 	vsw_port_t	*portp = ldcp->ldc_port;
1791 
1792 	D1(vswp, "%s (chan %lld): enter (phase %ld)", __func__,
1793 	    ldcp->ldc_id, ldcp->hphase);
1794 
1795 	DUMP_FLAGS(ldcp->lane_in.lstate);
1796 	DUMP_FLAGS(ldcp->lane_out.lstate);
1797 
1798 	switch (ldcp->hphase) {
1799 
1800 	case VSW_MILESTONE0:
1801 		/*
1802 		 * If we haven't started to handshake with our peer,
1803 		 * start to do so now.
1804 		 */
1805 		if (ldcp->lane_out.lstate == 0) {
1806 			D2(vswp, "%s: (chan %lld) starting handshake "
1807 			    "with peer", __func__, ldcp->ldc_id);
1808 			vsw_process_conn_evt(ldcp, VSW_CONN_UP);
1809 		}
1810 
1811 		/*
1812 		 * Only way to pass this milestone is to have successfully
1813 		 * negotiated version info.
1814 		 */
1815 		if ((ldcp->lane_in.lstate & VSW_VER_ACK_SENT) &&
1816 		    (ldcp->lane_out.lstate & VSW_VER_ACK_RECV)) {
1817 
1818 			D2(vswp, "%s: (chan %lld) leaving milestone 0",
1819 			    __func__, ldcp->ldc_id);
1820 
1821 			vsw_set_vnet_proto_ops(ldcp);
1822 
1823 			/*
1824 			 * Next milestone is passed when attribute
1825 			 * information has been successfully exchanged.
1826 			 */
1827 			ldcp->hphase = VSW_MILESTONE1;
1828 			vsw_send_attr(ldcp);
1829 
1830 		}
1831 		break;
1832 
1833 	case VSW_MILESTONE1:
1834 		/*
1835 		 * Only way to pass this milestone is to have successfully
1836 		 * negotiated attribute information.
1837 		 */
1838 		if (ldcp->lane_in.lstate & VSW_ATTR_ACK_SENT) {
1839 
1840 			ldcp->hphase = VSW_MILESTONE2;
1841 
1842 			/*
1843 			 * If the peer device has said it wishes to
1844 			 * use descriptor rings then we send it our ring
1845 			 * info, otherwise we just set up a private ring
1846 			 * which we use an internal buffer
1847 			 */
1848 			if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
1849 			    (ldcp->lane_in.xfer_mode & VIO_DRING_MODE_V1_2)) ||
1850 			    (VSW_VER_LT(ldcp, 1, 2) &&
1851 			    (ldcp->lane_in.xfer_mode ==
1852 			    VIO_DRING_MODE_V1_0))) {
1853 				vsw_send_dring_info(ldcp);
1854 			}
1855 		}
1856 		break;
1857 
1858 	case VSW_MILESTONE2:
1859 		/*
1860 		 * If peer has indicated in its attribute message that
1861 		 * it wishes to use descriptor rings then the only way
1862 		 * to pass this milestone is for us to have received
1863 		 * valid dring info.
1864 		 *
1865 		 * If peer is not using descriptor rings then just fall
1866 		 * through.
1867 		 */
1868 		if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
1869 		    (ldcp->lane_in.xfer_mode & VIO_DRING_MODE_V1_2)) ||
1870 		    (VSW_VER_LT(ldcp, 1, 2) &&
1871 		    (ldcp->lane_in.xfer_mode ==
1872 		    VIO_DRING_MODE_V1_0))) {
1873 			if (!(ldcp->lane_in.lstate & VSW_DRING_ACK_SENT))
1874 				break;
1875 		}
1876 
1877 		D2(vswp, "%s: (chan %lld) leaving milestone 2",
1878 		    __func__, ldcp->ldc_id);
1879 
1880 		ldcp->hphase = VSW_MILESTONE3;
1881 		vsw_send_rdx(ldcp);
1882 		break;
1883 
1884 	case VSW_MILESTONE3:
1885 		/*
1886 		 * Pass this milestone when all paramaters have been
1887 		 * successfully exchanged and RDX sent in both directions.
1888 		 *
1889 		 * Mark outbound lane as available to transmit data.
1890 		 */
1891 		if ((ldcp->lane_out.lstate & VSW_RDX_ACK_SENT) &&
1892 		    (ldcp->lane_in.lstate & VSW_RDX_ACK_RECV)) {
1893 
1894 			D2(vswp, "%s: (chan %lld) leaving milestone 3",
1895 			    __func__, ldcp->ldc_id);
1896 			D2(vswp, "%s: ** handshake complete (0x%llx : "
1897 			    "0x%llx) **", __func__, ldcp->lane_in.lstate,
1898 			    ldcp->lane_out.lstate);
1899 			ldcp->lane_out.lstate |= VSW_LANE_ACTIVE;
1900 			ldcp->hphase = VSW_MILESTONE4;
1901 			ldcp->hcnt = 0;
1902 			DISPLAY_STATE();
1903 			/* Start HIO if enabled and capable */
1904 			if ((portp->p_hio_enabled) && (portp->p_hio_capable)) {
1905 				D2(vswp, "%s: start HybridIO setup", __func__);
1906 				vsw_hio_start(vswp, ldcp);
1907 			}
1908 		} else {
1909 			D2(vswp, "%s: still in milestone 3 (0x%llx : 0x%llx)",
1910 			    __func__, ldcp->lane_in.lstate,
1911 			    ldcp->lane_out.lstate);
1912 		}
1913 		break;
1914 
1915 	case VSW_MILESTONE4:
1916 		D2(vswp, "%s: (chan %lld) in milestone 4", __func__,
1917 		    ldcp->ldc_id);
1918 		break;
1919 
1920 	default:
1921 		DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__,
1922 		    ldcp->ldc_id, ldcp->hphase);
1923 	}
1924 
1925 	D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id,
1926 	    ldcp->hphase);
1927 }
1928 
1929 /*
1930  * Check if major version is supported.
1931  *
1932  * Returns 0 if finds supported major number, and if necessary
1933  * adjusts the minor field.
1934  *
1935  * Returns 1 if can't match major number exactly. Sets mjor/minor
1936  * to next lowest support values, or to zero if no other values possible.
1937  */
1938 static int
1939 vsw_supported_version(vio_ver_msg_t *vp)
1940 {
1941 	int	i;
1942 
1943 	D1(NULL, "vsw_supported_version: enter");
1944 
1945 	for (i = 0; i < VSW_NUM_VER; i++) {
1946 		if (vsw_versions[i].ver_major == vp->ver_major) {
1947 			/*
1948 			 * Matching or lower major version found. Update
1949 			 * minor number if necessary.
1950 			 */
1951 			if (vp->ver_minor > vsw_versions[i].ver_minor) {
1952 				D2(NULL, "%s: adjusting minor value from %d "
1953 				    "to %d", __func__, vp->ver_minor,
1954 				    vsw_versions[i].ver_minor);
1955 				vp->ver_minor = vsw_versions[i].ver_minor;
1956 			}
1957 
1958 			return (0);
1959 		}
1960 
1961 		/*
1962 		 * If the message contains a higher major version number, set
1963 		 * the message's major/minor versions to the current values
1964 		 * and return false, so this message will get resent with
1965 		 * these values.
1966 		 */
1967 		if (vsw_versions[i].ver_major < vp->ver_major) {
1968 			D2(NULL, "%s: adjusting major and minor "
1969 			    "values to %d, %d\n",
1970 			    __func__, vsw_versions[i].ver_major,
1971 			    vsw_versions[i].ver_minor);
1972 			vp->ver_major = vsw_versions[i].ver_major;
1973 			vp->ver_minor = vsw_versions[i].ver_minor;
1974 			return (1);
1975 		}
1976 	}
1977 
1978 	/* No match was possible, zero out fields */
1979 	vp->ver_major = 0;
1980 	vp->ver_minor = 0;
1981 
1982 	D1(NULL, "vsw_supported_version: exit");
1983 
1984 	return (1);
1985 }
1986 
1987 /*
1988  * Set vnet-protocol-version dependent functions based on version.
1989  */
1990 static void
1991 vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp)
1992 {
1993 	vsw_t	*vswp = ldcp->ldc_vswp;
1994 	lane_t	*lp = &ldcp->lane_out;
1995 
1996 	if (VSW_VER_GTEQ(ldcp, 1, 3)) {
1997 		/*
1998 		 * If the version negotiated with peer is >= 1.3,
1999 		 * set the mtu in our attributes to max_frame_size.
2000 		 */
2001 		lp->mtu = vswp->max_frame_size;
2002 	} else {
2003 		vsw_port_t	*portp = ldcp->ldc_port;
2004 		/*
2005 		 * Pre-1.3 peers expect max frame size of ETHERMAX.
2006 		 * We can negotiate that size with those peers provided the
2007 		 * following conditions are true:
2008 		 * - Our max_frame_size is greater only by VLAN_TAGSZ (4).
2009 		 * - Only pvid is defined for our peer and there are no vids.
2010 		 * If the above conditions are true, then we can send/recv only
2011 		 * untagged frames of max size ETHERMAX. Note that pvid of the
2012 		 * peer can be different, as vsw has to serve the vnet in that
2013 		 * vlan even if itself is not assigned to that vlan.
2014 		 */
2015 		if ((vswp->max_frame_size == ETHERMAX + VLAN_TAGSZ) &&
2016 		    portp->nvids == 0) {
2017 			lp->mtu = ETHERMAX;
2018 		}
2019 	}
2020 
2021 	if (VSW_VER_GTEQ(ldcp, 1, 2)) {
2022 		/* Versions >= 1.2 */
2023 
2024 		if (VSW_PRI_ETH_DEFINED(vswp)) {
2025 			/*
2026 			 * enable priority routines and pkt mode only if
2027 			 * at least one pri-eth-type is specified in MD.
2028 			 */
2029 			ldcp->tx = vsw_ldctx_pri;
2030 			ldcp->rx_pktdata = vsw_process_pkt_data;
2031 
2032 			/* set xfer mode for vsw_send_attr() */
2033 			lp->xfer_mode = VIO_PKT_MODE | VIO_DRING_MODE_V1_2;
2034 		} else {
2035 			/* no priority eth types defined in MD */
2036 
2037 			ldcp->tx = vsw_ldctx;
2038 			ldcp->rx_pktdata = vsw_process_pkt_data_nop;
2039 
2040 			/* set xfer mode for vsw_send_attr() */
2041 			lp->xfer_mode = VIO_DRING_MODE_V1_2;
2042 		}
2043 
2044 	} else {
2045 		/* Versions prior to 1.2  */
2046 
2047 		vsw_reset_vnet_proto_ops(ldcp);
2048 	}
2049 }
2050 
2051 /*
2052  * Reset vnet-protocol-version dependent functions to v1.0.
2053  */
2054 static void
2055 vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp)
2056 {
2057 	lane_t	*lp = &ldcp->lane_out;
2058 
2059 	ldcp->tx = vsw_ldctx;
2060 	ldcp->rx_pktdata = vsw_process_pkt_data_nop;
2061 
2062 	/* set xfer mode for vsw_send_attr() */
2063 	lp->xfer_mode = VIO_DRING_MODE_V1_0;
2064 }
2065 
2066 /*
2067  * Main routine for processing messages received over LDC.
2068  */
2069 static void
2070 vsw_process_pkt(void *arg)
2071 {
2072 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
2073 	vsw_t 		*vswp = ldcp->ldc_vswp;
2074 	size_t		msglen;
2075 	vio_msg_tag_t	*tagp;
2076 	uint64_t	*ldcmsg;
2077 	int 		rv = 0;
2078 
2079 
2080 	D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
2081 
2082 	ASSERT(MUTEX_HELD(&ldcp->ldc_cblock));
2083 
2084 	ldcmsg = ldcp->ldcmsg;
2085 	/*
2086 	 * If channel is up read messages until channel is empty.
2087 	 */
2088 	do {
2089 		msglen = ldcp->msglen;
2090 		rv = ldc_read(ldcp->ldc_handle, (caddr_t)ldcmsg, &msglen);
2091 
2092 		if (rv != 0) {
2093 			DERR(vswp, "%s :ldc_read err id(%lld) rv(%d) len(%d)\n",
2094 			    __func__, ldcp->ldc_id, rv, msglen);
2095 		}
2096 
2097 		/* channel has been reset */
2098 		if (rv == ECONNRESET) {
2099 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
2100 			break;
2101 		}
2102 
2103 		if (msglen == 0) {
2104 			D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__,
2105 			    ldcp->ldc_id);
2106 			break;
2107 		}
2108 
2109 		D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__,
2110 		    ldcp->ldc_id, msglen);
2111 
2112 		/*
2113 		 * Figure out what sort of packet we have gotten by
2114 		 * examining the msg tag, and then switch it appropriately.
2115 		 */
2116 		tagp = (vio_msg_tag_t *)ldcmsg;
2117 
2118 		switch (tagp->vio_msgtype) {
2119 		case VIO_TYPE_CTRL:
2120 			vsw_dispatch_ctrl_task(ldcp, ldcmsg, tagp);
2121 			break;
2122 		case VIO_TYPE_DATA:
2123 			vsw_process_data_pkt(ldcp, ldcmsg, tagp, msglen);
2124 			break;
2125 		case VIO_TYPE_ERR:
2126 			vsw_process_err_pkt(ldcp, ldcmsg, tagp);
2127 			break;
2128 		default:
2129 			DERR(vswp, "%s: Unknown tag(%lx) ", __func__,
2130 			    "id(%lx)\n", tagp->vio_msgtype, ldcp->ldc_id);
2131 			break;
2132 		}
2133 	} while (msglen);
2134 
2135 	D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id);
2136 }
2137 
2138 /*
2139  * Dispatch a task to process a VIO control message.
2140  */
2141 static void
2142 vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t *tagp)
2143 {
2144 	vsw_ctrl_task_t		*ctaskp = NULL;
2145 	vsw_port_t		*port = ldcp->ldc_port;
2146 	vsw_t			*vswp = port->p_vswp;
2147 
2148 	D1(vswp, "%s: enter", __func__);
2149 
2150 	/*
2151 	 * We need to handle RDX ACK messages in-band as once they
2152 	 * are exchanged it is possible that we will get an
2153 	 * immediate (legitimate) data packet.
2154 	 */
2155 	if ((tagp->vio_subtype_env == VIO_RDX) &&
2156 	    (tagp->vio_subtype == VIO_SUBTYPE_ACK)) {
2157 
2158 		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_ACK_RECV))
2159 			return;
2160 
2161 		ldcp->lane_in.lstate |= VSW_RDX_ACK_RECV;
2162 		D2(vswp, "%s (%ld) handling RDX_ACK in place "
2163 		    "(ostate 0x%llx : hphase %d)", __func__,
2164 		    ldcp->ldc_id, ldcp->lane_in.lstate, ldcp->hphase);
2165 		vsw_next_milestone(ldcp);
2166 		return;
2167 	}
2168 
2169 	ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP);
2170 
2171 	if (ctaskp == NULL) {
2172 		DERR(vswp, "%s: unable to alloc space for ctrl msg", __func__);
2173 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2174 		return;
2175 	}
2176 
2177 	ctaskp->ldcp = ldcp;
2178 	bcopy((def_msg_t *)cpkt, &ctaskp->pktp, sizeof (def_msg_t));
2179 	ctaskp->hss_id = ldcp->hss_id;
2180 
2181 	/*
2182 	 * Dispatch task to processing taskq if port is not in
2183 	 * the process of being detached.
2184 	 */
2185 	mutex_enter(&port->state_lock);
2186 	if (port->state == VSW_PORT_INIT) {
2187 		if ((vswp->taskq_p == NULL) ||
2188 		    (ddi_taskq_dispatch(vswp->taskq_p, vsw_process_ctrl_pkt,
2189 		    ctaskp, DDI_NOSLEEP) != DDI_SUCCESS)) {
2190 			DERR(vswp, "%s: unable to dispatch task to taskq",
2191 			    __func__);
2192 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2193 			mutex_exit(&port->state_lock);
2194 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2195 			return;
2196 		}
2197 	} else {
2198 		DWARN(vswp, "%s: port %d detaching, not dispatching "
2199 		    "task", __func__, port->p_instance);
2200 	}
2201 
2202 	mutex_exit(&port->state_lock);
2203 
2204 	D2(vswp, "%s: dispatched task to taskq for chan %d", __func__,
2205 	    ldcp->ldc_id);
2206 	D1(vswp, "%s: exit", __func__);
2207 }
2208 
2209 /*
2210  * Process a VIO ctrl message. Invoked from taskq.
2211  */
2212 static void
2213 vsw_process_ctrl_pkt(void *arg)
2214 {
2215 	vsw_ctrl_task_t	*ctaskp = (vsw_ctrl_task_t *)arg;
2216 	vsw_ldc_t	*ldcp = ctaskp->ldcp;
2217 	vsw_t 		*vswp = ldcp->ldc_vswp;
2218 	vio_msg_tag_t	tag;
2219 	uint16_t	env;
2220 
2221 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2222 
2223 	bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t));
2224 	env = tag.vio_subtype_env;
2225 
2226 	/* stale pkt check */
2227 	if (ctaskp->hss_id < ldcp->hss_id) {
2228 		DWARN(vswp, "%s: discarding stale packet belonging to earlier"
2229 		    " (%ld) handshake session", __func__, ctaskp->hss_id);
2230 		return;
2231 	}
2232 
2233 	/* session id check */
2234 	if (ldcp->session_status & VSW_PEER_SESSION) {
2235 		if (ldcp->peer_session != tag.vio_sid) {
2236 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
2237 			    __func__, ldcp->ldc_id, tag.vio_sid);
2238 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2239 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2240 			return;
2241 		}
2242 	}
2243 
2244 	/*
2245 	 * Switch on vio_subtype envelope, then let lower routines
2246 	 * decide if its an INFO, ACK or NACK packet.
2247 	 */
2248 	switch (env) {
2249 	case VIO_VER_INFO:
2250 		vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp);
2251 		break;
2252 	case VIO_DRING_REG:
2253 		vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp);
2254 		break;
2255 	case VIO_DRING_UNREG:
2256 		vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp);
2257 		break;
2258 	case VIO_ATTR_INFO:
2259 		vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp);
2260 		break;
2261 	case VNET_MCAST_INFO:
2262 		vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp);
2263 		break;
2264 	case VIO_RDX:
2265 		vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp);
2266 		break;
2267 	case VIO_DDS_INFO:
2268 		vsw_process_dds_msg(vswp, ldcp, &ctaskp->pktp);
2269 		break;
2270 	default:
2271 		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env);
2272 	}
2273 
2274 	kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2275 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
2276 }
2277 
2278 /*
2279  * Version negotiation. We can end up here either because our peer
2280  * has responded to a handshake message we have sent it, or our peer
2281  * has initiated a handshake with us. If its the former then can only
2282  * be ACK or NACK, if its the later can only be INFO.
2283  *
2284  * If its an ACK we move to the next stage of the handshake, namely
2285  * attribute exchange. If its a NACK we see if we can specify another
2286  * version, if we can't we stop.
2287  *
2288  * If it is an INFO we reset all params associated with communication
2289  * in that direction over this channel (remember connection is
2290  * essentially 2 independent simplex channels).
2291  */
2292 void
2293 vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt)
2294 {
2295 	vio_ver_msg_t	*ver_pkt;
2296 	vsw_t 		*vswp = ldcp->ldc_vswp;
2297 
2298 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2299 
2300 	/*
2301 	 * We know this is a ctrl/version packet so
2302 	 * cast it into the correct structure.
2303 	 */
2304 	ver_pkt = (vio_ver_msg_t *)pkt;
2305 
2306 	switch (ver_pkt->tag.vio_subtype) {
2307 	case VIO_SUBTYPE_INFO:
2308 		D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n");
2309 
2310 		/*
2311 		 * Record the session id, which we will use from now
2312 		 * until we see another VER_INFO msg. Even then the
2313 		 * session id in most cases will be unchanged, execpt
2314 		 * if channel was reset.
2315 		 */
2316 		if ((ldcp->session_status & VSW_PEER_SESSION) &&
2317 		    (ldcp->peer_session != ver_pkt->tag.vio_sid)) {
2318 			DERR(vswp, "%s: updating session id for chan %lld "
2319 			    "from %llx to %llx", __func__, ldcp->ldc_id,
2320 			    ldcp->peer_session, ver_pkt->tag.vio_sid);
2321 		}
2322 
2323 		ldcp->peer_session = ver_pkt->tag.vio_sid;
2324 		ldcp->session_status |= VSW_PEER_SESSION;
2325 
2326 		/* Legal message at this time ? */
2327 		if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV))
2328 			return;
2329 
2330 		/*
2331 		 * First check the device class. Currently only expect
2332 		 * to be talking to a network device. In the future may
2333 		 * also talk to another switch.
2334 		 */
2335 		if (ver_pkt->dev_class != VDEV_NETWORK) {
2336 			DERR(vswp, "%s: illegal device class %d", __func__,
2337 			    ver_pkt->dev_class);
2338 
2339 			ver_pkt->tag.vio_sid = ldcp->local_session;
2340 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2341 
2342 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2343 
2344 			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2345 			    sizeof (vio_ver_msg_t), B_TRUE);
2346 
2347 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
2348 			vsw_next_milestone(ldcp);
2349 			return;
2350 		} else {
2351 			ldcp->dev_class = ver_pkt->dev_class;
2352 		}
2353 
2354 		/*
2355 		 * Now check the version.
2356 		 */
2357 		if (vsw_supported_version(ver_pkt) == 0) {
2358 			/*
2359 			 * Support this major version and possibly
2360 			 * adjusted minor version.
2361 			 */
2362 
2363 			D2(vswp, "%s: accepted ver %d:%d", __func__,
2364 			    ver_pkt->ver_major, ver_pkt->ver_minor);
2365 
2366 			/* Store accepted values */
2367 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
2368 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
2369 
2370 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2371 
2372 			ldcp->lane_in.lstate |= VSW_VER_ACK_SENT;
2373 
2374 			if (vsw_obp_ver_proto_workaround == B_TRUE) {
2375 				/*
2376 				 * Send a version info message
2377 				 * using the accepted version that
2378 				 * we are about to ack. Also note that
2379 				 * we send our ver info before we ack.
2380 				 * Otherwise, as soon as receiving the
2381 				 * ack, obp sends attr info msg, which
2382 				 * breaks vsw_check_flag() invoked
2383 				 * from vsw_process_ctrl_attr_pkt();
2384 				 * as we also need VSW_VER_ACK_RECV to
2385 				 * be set in lane_out.lstate, before
2386 				 * we can receive attr info.
2387 				 */
2388 				vsw_send_ver(ldcp);
2389 			}
2390 		} else {
2391 			/*
2392 			 * NACK back with the next lower major/minor
2393 			 * pairing we support (if don't suuport any more
2394 			 * versions then they will be set to zero.
2395 			 */
2396 
2397 			D2(vswp, "%s: replying with ver %d:%d", __func__,
2398 			    ver_pkt->ver_major, ver_pkt->ver_minor);
2399 
2400 			/* Store updated values */
2401 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
2402 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
2403 
2404 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2405 
2406 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
2407 		}
2408 
2409 		DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2410 		ver_pkt->tag.vio_sid = ldcp->local_session;
2411 		(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2412 		    sizeof (vio_ver_msg_t), B_TRUE);
2413 
2414 		vsw_next_milestone(ldcp);
2415 		break;
2416 
2417 	case VIO_SUBTYPE_ACK:
2418 		D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__);
2419 
2420 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV))
2421 			return;
2422 
2423 		/* Store updated values */
2424 		ldcp->lane_out.ver_major = ver_pkt->ver_major;
2425 		ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
2426 
2427 		ldcp->lane_out.lstate |= VSW_VER_ACK_RECV;
2428 		vsw_next_milestone(ldcp);
2429 
2430 		break;
2431 
2432 	case VIO_SUBTYPE_NACK:
2433 		D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__);
2434 
2435 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV))
2436 			return;
2437 
2438 		/*
2439 		 * If our peer sent us a NACK with the ver fields set to
2440 		 * zero then there is nothing more we can do. Otherwise see
2441 		 * if we support either the version suggested, or a lesser
2442 		 * one.
2443 		 */
2444 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
2445 			DERR(vswp, "%s: peer unable to negotiate any "
2446 			    "further.", __func__);
2447 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
2448 			vsw_next_milestone(ldcp);
2449 			return;
2450 		}
2451 
2452 		/*
2453 		 * Check to see if we support this major version or
2454 		 * a lower one. If we don't then maj/min will be set
2455 		 * to zero.
2456 		 */
2457 		(void) vsw_supported_version(ver_pkt);
2458 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
2459 			/* Nothing more we can do */
2460 			DERR(vswp, "%s: version negotiation failed.\n",
2461 			    __func__);
2462 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
2463 			vsw_next_milestone(ldcp);
2464 		} else {
2465 			/* found a supported major version */
2466 			ldcp->lane_out.ver_major = ver_pkt->ver_major;
2467 			ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
2468 
2469 			D2(vswp, "%s: resending with updated values (%x, %x)",
2470 			    __func__, ver_pkt->ver_major, ver_pkt->ver_minor);
2471 
2472 			ldcp->lane_out.lstate |= VSW_VER_INFO_SENT;
2473 			ver_pkt->tag.vio_sid = ldcp->local_session;
2474 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
2475 
2476 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2477 
2478 			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2479 			    sizeof (vio_ver_msg_t), B_TRUE);
2480 
2481 			vsw_next_milestone(ldcp);
2482 
2483 		}
2484 		break;
2485 
2486 	default:
2487 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
2488 		    ver_pkt->tag.vio_subtype);
2489 	}
2490 
2491 	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
2492 }
2493 
2494 /*
2495  * Process an attribute packet. We can end up here either because our peer
2496  * has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or our
2497  * peer has sent us an attribute INFO message
2498  *
2499  * If its an ACK we then move to the next stage of the handshake which
2500  * is to send our descriptor ring info to our peer. If its a NACK then
2501  * there is nothing more we can (currently) do.
2502  *
2503  * If we get a valid/acceptable INFO packet (and we have already negotiated
2504  * a version) we ACK back and set channel state to ATTR_RECV, otherwise we
2505  * NACK back and reset channel state to INACTIV.
2506  *
2507  * FUTURE: in time we will probably negotiate over attributes, but for
2508  * the moment unacceptable attributes are regarded as a fatal error.
2509  *
2510  */
2511 void
2512 vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt)
2513 {
2514 	vnet_attr_msg_t		*attr_pkt;
2515 	vsw_t			*vswp = ldcp->ldc_vswp;
2516 	vsw_port_t		*port = ldcp->ldc_port;
2517 	uint64_t		macaddr = 0;
2518 	int			i;
2519 
2520 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
2521 
2522 	/*
2523 	 * We know this is a ctrl/attr packet so
2524 	 * cast it into the correct structure.
2525 	 */
2526 	attr_pkt = (vnet_attr_msg_t *)pkt;
2527 
2528 	switch (attr_pkt->tag.vio_subtype) {
2529 	case VIO_SUBTYPE_INFO:
2530 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2531 
2532 		if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV))
2533 			return;
2534 
2535 		/*
2536 		 * If the attributes are unacceptable then we NACK back.
2537 		 */
2538 		if (vsw_check_attr(attr_pkt, ldcp)) {
2539 
2540 			DERR(vswp, "%s (chan %d): invalid attributes",
2541 			    __func__, ldcp->ldc_id);
2542 
2543 			vsw_free_lane_resources(ldcp, INBOUND);
2544 
2545 			attr_pkt->tag.vio_sid = ldcp->local_session;
2546 			attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2547 
2548 			DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
2549 			ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT;
2550 			(void) vsw_send_msg(ldcp, (void *)attr_pkt,
2551 			    sizeof (vnet_attr_msg_t), B_TRUE);
2552 
2553 			vsw_next_milestone(ldcp);
2554 			return;
2555 		}
2556 
2557 		/*
2558 		 * Otherwise store attributes for this lane and update
2559 		 * lane state.
2560 		 */
2561 		ldcp->lane_in.mtu = attr_pkt->mtu;
2562 		ldcp->lane_in.addr = attr_pkt->addr;
2563 		ldcp->lane_in.addr_type = attr_pkt->addr_type;
2564 		ldcp->lane_in.xfer_mode = attr_pkt->xfer_mode;
2565 		ldcp->lane_in.ack_freq = attr_pkt->ack_freq;
2566 
2567 		macaddr = ldcp->lane_in.addr;
2568 		for (i = ETHERADDRL - 1; i >= 0; i--) {
2569 			port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF;
2570 			macaddr >>= 8;
2571 		}
2572 
2573 		/* create the fdb entry for this port/mac address */
2574 		vsw_fdbe_add(vswp, port);
2575 
2576 		/* add the port to the specified vlans */
2577 		vsw_vlan_add_ids(port, VSW_VNETPORT);
2578 
2579 		/* setup device specifc xmit routines */
2580 		mutex_enter(&port->tx_lock);
2581 		if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
2582 		    (ldcp->lane_in.xfer_mode & VIO_DRING_MODE_V1_2)) ||
2583 		    (VSW_VER_LT(ldcp, 1, 2) &&
2584 		    (ldcp->lane_in.xfer_mode == VIO_DRING_MODE_V1_0))) {
2585 			D2(vswp, "%s: mode = VIO_DRING_MODE", __func__);
2586 			port->transmit = vsw_dringsend;
2587 		} else if (ldcp->lane_in.xfer_mode == VIO_DESC_MODE) {
2588 			D2(vswp, "%s: mode = VIO_DESC_MODE", __func__);
2589 			vsw_create_privring(ldcp);
2590 			port->transmit = vsw_descrsend;
2591 			ldcp->lane_out.xfer_mode = VIO_DESC_MODE;
2592 		}
2593 
2594 		/*
2595 		 * HybridIO is supported only vnet, not by OBP.
2596 		 * So, set hio_capable to true only when in DRING mode.
2597 		 */
2598 		if (VSW_VER_GTEQ(ldcp, 1, 3) &&
2599 		    (ldcp->lane_in.xfer_mode != VIO_DESC_MODE)) {
2600 			(void) atomic_swap_32(&port->p_hio_capable, B_TRUE);
2601 		} else {
2602 			(void) atomic_swap_32(&port->p_hio_capable, B_FALSE);
2603 		}
2604 
2605 		mutex_exit(&port->tx_lock);
2606 
2607 		attr_pkt->tag.vio_sid = ldcp->local_session;
2608 		attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2609 
2610 		DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
2611 
2612 		ldcp->lane_in.lstate |= VSW_ATTR_ACK_SENT;
2613 
2614 		(void) vsw_send_msg(ldcp, (void *)attr_pkt,
2615 		    sizeof (vnet_attr_msg_t), B_TRUE);
2616 
2617 		vsw_next_milestone(ldcp);
2618 		break;
2619 
2620 	case VIO_SUBTYPE_ACK:
2621 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
2622 
2623 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV))
2624 			return;
2625 
2626 		ldcp->lane_out.lstate |= VSW_ATTR_ACK_RECV;
2627 		vsw_next_milestone(ldcp);
2628 		break;
2629 
2630 	case VIO_SUBTYPE_NACK:
2631 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
2632 
2633 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV))
2634 			return;
2635 
2636 		ldcp->lane_out.lstate |= VSW_ATTR_NACK_RECV;
2637 		vsw_next_milestone(ldcp);
2638 		break;
2639 
2640 	default:
2641 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
2642 		    attr_pkt->tag.vio_subtype);
2643 	}
2644 
2645 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
2646 }
2647 
2648 /*
2649  * Process a dring info packet. We can end up here either because our peer
2650  * has ACK/NACK'ed back to an earlier DRING msg we had sent it, or our
2651  * peer has sent us a dring INFO message.
2652  *
2653  * If we get a valid/acceptable INFO packet (and we have already negotiated
2654  * a version) we ACK back and update the lane state, otherwise we NACK back.
2655  *
2656  * FUTURE: nothing to stop client from sending us info on multiple dring's
2657  * but for the moment we will just use the first one we are given.
2658  *
2659  */
2660 void
2661 vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt)
2662 {
2663 	vio_dring_reg_msg_t	*dring_pkt;
2664 	vsw_t			*vswp = ldcp->ldc_vswp;
2665 	ldc_mem_info_t		minfo;
2666 	dring_info_t		*dp, *dbp;
2667 	int			dring_found = 0;
2668 
2669 	/*
2670 	 * We know this is a ctrl/dring packet so
2671 	 * cast it into the correct structure.
2672 	 */
2673 	dring_pkt = (vio_dring_reg_msg_t *)pkt;
2674 
2675 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
2676 
2677 	switch (dring_pkt->tag.vio_subtype) {
2678 	case VIO_SUBTYPE_INFO:
2679 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2680 
2681 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
2682 			return;
2683 
2684 		/*
2685 		 * If the dring params are unacceptable then we NACK back.
2686 		 */
2687 		if (vsw_check_dring_info(dring_pkt)) {
2688 
2689 			DERR(vswp, "%s (%lld): invalid dring info",
2690 			    __func__, ldcp->ldc_id);
2691 
2692 			vsw_free_lane_resources(ldcp, INBOUND);
2693 
2694 			dring_pkt->tag.vio_sid = ldcp->local_session;
2695 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2696 
2697 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
2698 
2699 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
2700 
2701 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
2702 			    sizeof (vio_dring_reg_msg_t), B_TRUE);
2703 
2704 			vsw_next_milestone(ldcp);
2705 			return;
2706 		}
2707 
2708 		/*
2709 		 * Otherwise, attempt to map in the dring using the
2710 		 * cookie. If that succeeds we send back a unique dring
2711 		 * identifier that the sending side will use in future
2712 		 * to refer to this descriptor ring.
2713 		 */
2714 		dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
2715 
2716 		dp->num_descriptors = dring_pkt->num_descriptors;
2717 		dp->descriptor_size = dring_pkt->descriptor_size;
2718 		dp->options = dring_pkt->options;
2719 		dp->ncookies = dring_pkt->ncookies;
2720 
2721 		/*
2722 		 * Note: should only get one cookie. Enforced in
2723 		 * the ldc layer.
2724 		 */
2725 		bcopy(&dring_pkt->cookie[0], &dp->cookie[0],
2726 		    sizeof (ldc_mem_cookie_t));
2727 
2728 		D2(vswp, "%s: num_desc %ld : desc_size %ld", __func__,
2729 		    dp->num_descriptors, dp->descriptor_size);
2730 		D2(vswp, "%s: options 0x%lx: ncookies %ld", __func__,
2731 		    dp->options, dp->ncookies);
2732 
2733 		if ((ldc_mem_dring_map(ldcp->ldc_handle, &dp->cookie[0],
2734 		    dp->ncookies, dp->num_descriptors, dp->descriptor_size,
2735 		    LDC_SHADOW_MAP, &(dp->handle))) != 0) {
2736 
2737 			DERR(vswp, "%s: dring_map failed\n", __func__);
2738 
2739 			kmem_free(dp, sizeof (dring_info_t));
2740 			vsw_free_lane_resources(ldcp, INBOUND);
2741 
2742 			dring_pkt->tag.vio_sid = ldcp->local_session;
2743 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2744 
2745 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
2746 
2747 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
2748 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
2749 			    sizeof (vio_dring_reg_msg_t), B_TRUE);
2750 
2751 			vsw_next_milestone(ldcp);
2752 			return;
2753 		}
2754 
2755 		if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
2756 
2757 			DERR(vswp, "%s: dring_addr failed\n", __func__);
2758 
2759 			kmem_free(dp, sizeof (dring_info_t));
2760 			vsw_free_lane_resources(ldcp, INBOUND);
2761 
2762 			dring_pkt->tag.vio_sid = ldcp->local_session;
2763 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2764 
2765 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
2766 
2767 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
2768 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
2769 			    sizeof (vio_dring_reg_msg_t), B_TRUE);
2770 
2771 			vsw_next_milestone(ldcp);
2772 			return;
2773 		} else {
2774 			/* store the address of the pub part of ring */
2775 			dp->pub_addr = minfo.vaddr;
2776 		}
2777 
2778 		/* no private section as we are importing */
2779 		dp->priv_addr = NULL;
2780 
2781 		/*
2782 		 * Using simple mono increasing int for ident at
2783 		 * the moment.
2784 		 */
2785 		dp->ident = ldcp->next_ident;
2786 		ldcp->next_ident++;
2787 
2788 		dp->end_idx = 0;
2789 		dp->next = NULL;
2790 
2791 		/*
2792 		 * Link it onto the end of the list of drings
2793 		 * for this lane.
2794 		 */
2795 		if (ldcp->lane_in.dringp == NULL) {
2796 			D2(vswp, "%s: adding first INBOUND dring", __func__);
2797 			ldcp->lane_in.dringp = dp;
2798 		} else {
2799 			dbp = ldcp->lane_in.dringp;
2800 
2801 			while (dbp->next != NULL)
2802 				dbp = dbp->next;
2803 
2804 			dbp->next = dp;
2805 		}
2806 
2807 		/* acknowledge it */
2808 		dring_pkt->tag.vio_sid = ldcp->local_session;
2809 		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2810 		dring_pkt->dring_ident = dp->ident;
2811 
2812 		(void) vsw_send_msg(ldcp, (void *)dring_pkt,
2813 		    sizeof (vio_dring_reg_msg_t), B_TRUE);
2814 
2815 		ldcp->lane_in.lstate |= VSW_DRING_ACK_SENT;
2816 		vsw_next_milestone(ldcp);
2817 		break;
2818 
2819 	case VIO_SUBTYPE_ACK:
2820 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
2821 
2822 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV))
2823 			return;
2824 
2825 		/*
2826 		 * Peer is acknowledging our dring info and will have
2827 		 * sent us a dring identifier which we will use to
2828 		 * refer to this ring w.r.t. our peer.
2829 		 */
2830 		dp = ldcp->lane_out.dringp;
2831 		if (dp != NULL) {
2832 			/*
2833 			 * Find the ring this ident should be associated
2834 			 * with.
2835 			 */
2836 			if (vsw_dring_match(dp, dring_pkt)) {
2837 				dring_found = 1;
2838 
2839 			} else while (dp != NULL) {
2840 				if (vsw_dring_match(dp, dring_pkt)) {
2841 					dring_found = 1;
2842 					break;
2843 				}
2844 				dp = dp->next;
2845 			}
2846 
2847 			if (dring_found == 0) {
2848 				DERR(NULL, "%s: unrecognised ring cookie",
2849 				    __func__);
2850 				vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2851 				return;
2852 			}
2853 
2854 		} else {
2855 			DERR(vswp, "%s: DRING ACK received but no drings "
2856 			    "allocated", __func__);
2857 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2858 			return;
2859 		}
2860 
2861 		/* store ident */
2862 		dp->ident = dring_pkt->dring_ident;
2863 		ldcp->lane_out.lstate |= VSW_DRING_ACK_RECV;
2864 		vsw_next_milestone(ldcp);
2865 		break;
2866 
2867 	case VIO_SUBTYPE_NACK:
2868 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
2869 
2870 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV))
2871 			return;
2872 
2873 		ldcp->lane_out.lstate |= VSW_DRING_NACK_RECV;
2874 		vsw_next_milestone(ldcp);
2875 		break;
2876 
2877 	default:
2878 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
2879 		    dring_pkt->tag.vio_subtype);
2880 	}
2881 
2882 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
2883 }
2884 
2885 /*
2886  * Process a request from peer to unregister a dring.
2887  *
2888  * For the moment we just restart the handshake if our
2889  * peer endpoint attempts to unregister a dring.
2890  */
2891 void
2892 vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt)
2893 {
2894 	vsw_t			*vswp = ldcp->ldc_vswp;
2895 	vio_dring_unreg_msg_t	*dring_pkt;
2896 
2897 	/*
2898 	 * We know this is a ctrl/dring packet so
2899 	 * cast it into the correct structure.
2900 	 */
2901 	dring_pkt = (vio_dring_unreg_msg_t *)pkt;
2902 
2903 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2904 
2905 	switch (dring_pkt->tag.vio_subtype) {
2906 	case VIO_SUBTYPE_INFO:
2907 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2908 
2909 		DWARN(vswp, "%s: restarting handshake..", __func__);
2910 		break;
2911 
2912 	case VIO_SUBTYPE_ACK:
2913 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
2914 
2915 		DWARN(vswp, "%s: restarting handshake..", __func__);
2916 		break;
2917 
2918 	case VIO_SUBTYPE_NACK:
2919 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
2920 
2921 		DWARN(vswp, "%s: restarting handshake..", __func__);
2922 		break;
2923 
2924 	default:
2925 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
2926 		    dring_pkt->tag.vio_subtype);
2927 	}
2928 
2929 	vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2930 
2931 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
2932 }
2933 
/*
 * Turn the multicast message `pkt' into a NACK carrying our local
 * session id and send it back over `ldcp'.
 *
 * Wrapped in do { } while (0), with its arguments parenthesized, so
 * that it expands to exactly one statement and accepts any pointer
 * expression. Both arguments are evaluated more than once; pass
 * side-effect free expressions.
 */
#define	SND_MCST_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		(void) vsw_send_msg((ldcp), (void *)(pkt), \
		    sizeof (vnet_mcast_msg_t), B_TRUE); \
	} while (0)
2939 
2940 /*
2941  * Process a multicast request from a vnet.
2942  *
2943  * Vnet's specify a multicast address that they are interested in. This
2944  * address is used as a key into the hash table which forms the multicast
2945  * forwarding database (mFDB).
2946  *
2947  * The table keys are the multicast addresses, while the table entries
2948  * are pointers to lists of ports which wish to receive packets for the
2949  * specified multicast address.
2950  *
2951  * When a multicast packet is being switched we use the address as a key
2952  * into the hash table, and then walk the appropriate port list forwarding
2953  * the pkt to each port in turn.
2954  *
2955  * If a vnet is no longer interested in a particular multicast grouping
2956  * we simply find the correct location in the hash table and then delete
2957  * the relevant port from the port list.
2958  *
2959  * To deal with the case whereby a port is being deleted without first
2960  * removing itself from the lists in the hash table, we maintain a list
2961  * of multicast addresses the port has registered an interest in, within
2962  * the port structure itself. We then simply walk that list of addresses
2963  * using them as keys into the hash table and remove the port from the
2964  * appropriate lists.
2965  */
2966 static void
2967 vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt)
2968 {
2969 	vnet_mcast_msg_t	*mcst_pkt;
2970 	vsw_port_t		*port = ldcp->ldc_port;
2971 	vsw_t			*vswp = ldcp->ldc_vswp;
2972 	int			i;
2973 
2974 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2975 
2976 	/*
2977 	 * We know this is a ctrl/mcast packet so
2978 	 * cast it into the correct structure.
2979 	 */
2980 	mcst_pkt = (vnet_mcast_msg_t *)pkt;
2981 
2982 	switch (mcst_pkt->tag.vio_subtype) {
2983 	case VIO_SUBTYPE_INFO:
2984 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2985 
2986 		/*
2987 		 * Check if in correct state to receive a multicast
2988 		 * message (i.e. handshake complete). If not reset
2989 		 * the handshake.
2990 		 */
2991 		if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV))
2992 			return;
2993 
2994 		/*
2995 		 * Before attempting to add or remove address check
2996 		 * that they are valid multicast addresses.
2997 		 * If not, then NACK back.
2998 		 */
2999 		for (i = 0; i < mcst_pkt->count; i++) {
3000 			if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) {
3001 				DERR(vswp, "%s: invalid multicast address",
3002 				    __func__);
3003 				SND_MCST_NACK(ldcp, mcst_pkt);
3004 				return;
3005 			}
3006 		}
3007 
3008 		/*
3009 		 * Now add/remove the addresses. If this fails we
3010 		 * NACK back.
3011 		 */
3012 		if (vsw_add_rem_mcst(mcst_pkt, port) != 0) {
3013 			SND_MCST_NACK(ldcp, mcst_pkt);
3014 			return;
3015 		}
3016 
3017 		mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3018 		mcst_pkt->tag.vio_sid = ldcp->local_session;
3019 
3020 		DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt);
3021 
3022 		(void) vsw_send_msg(ldcp, (void *)mcst_pkt,
3023 		    sizeof (vnet_mcast_msg_t), B_TRUE);
3024 		break;
3025 
3026 	case VIO_SUBTYPE_ACK:
3027 		DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3028 
3029 		/*
3030 		 * We shouldn't ever get a multicast ACK message as
3031 		 * at the moment we never request multicast addresses
3032 		 * to be set on some other device. This may change in
3033 		 * the future if we have cascading switches.
3034 		 */
3035 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV))
3036 			return;
3037 
3038 				/* Do nothing */
3039 		break;
3040 
3041 	case VIO_SUBTYPE_NACK:
3042 		DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3043 
3044 		/*
3045 		 * We shouldn't get a multicast NACK packet for the
3046 		 * same reasons as we shouldn't get a ACK packet.
3047 		 */
3048 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV))
3049 			return;
3050 
3051 				/* Do nothing */
3052 		break;
3053 
3054 	default:
3055 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
3056 		    mcst_pkt->tag.vio_subtype);
3057 	}
3058 
3059 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3060 }
3061 
3062 static void
3063 vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt)
3064 {
3065 	vio_rdx_msg_t	*rdx_pkt;
3066 	vsw_t		*vswp = ldcp->ldc_vswp;
3067 
3068 	/*
3069 	 * We know this is a ctrl/rdx packet so
3070 	 * cast it into the correct structure.
3071 	 */
3072 	rdx_pkt = (vio_rdx_msg_t *)pkt;
3073 
3074 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
3075 
3076 	switch (rdx_pkt->tag.vio_subtype) {
3077 	case VIO_SUBTYPE_INFO:
3078 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3079 
3080 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_INFO_RECV))
3081 			return;
3082 
3083 		rdx_pkt->tag.vio_sid = ldcp->local_session;
3084 		rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3085 
3086 		DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt);
3087 
3088 		ldcp->lane_out.lstate |= VSW_RDX_ACK_SENT;
3089 
3090 		(void) vsw_send_msg(ldcp, (void *)rdx_pkt,
3091 		    sizeof (vio_rdx_msg_t), B_TRUE);
3092 
3093 		vsw_next_milestone(ldcp);
3094 		break;
3095 
3096 	case VIO_SUBTYPE_ACK:
3097 		/*
3098 		 * Should be handled in-band by callback handler.
3099 		 */
3100 		DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__);
3101 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3102 		break;
3103 
3104 	case VIO_SUBTYPE_NACK:
3105 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3106 
3107 		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_NACK_RECV))
3108 			return;
3109 
3110 		ldcp->lane_in.lstate |= VSW_RDX_NACK_RECV;
3111 		vsw_next_milestone(ldcp);
3112 		break;
3113 
3114 	default:
3115 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3116 		    rdx_pkt->tag.vio_subtype);
3117 	}
3118 
3119 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3120 }
3121 
3122 static void
3123 vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t *tagp,
3124 	uint32_t msglen)
3125 {
3126 	uint16_t	env = tagp->vio_subtype_env;
3127 	vsw_t		*vswp = ldcp->ldc_vswp;
3128 
3129 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3130 
3131 	/* session id check */
3132 	if (ldcp->session_status & VSW_PEER_SESSION) {
3133 		if (ldcp->peer_session != tagp->vio_sid) {
3134 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
3135 			    __func__, ldcp->ldc_id, tagp->vio_sid);
3136 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3137 			return;
3138 		}
3139 	}
3140 
3141 	/*
3142 	 * It is an error for us to be getting data packets
3143 	 * before the handshake has completed.
3144 	 */
3145 	if (ldcp->hphase != VSW_MILESTONE4) {
3146 		DERR(vswp, "%s: got data packet before handshake complete "
3147 		    "hphase %d (%x: %x)", __func__, ldcp->hphase,
3148 		    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
3149 		DUMP_FLAGS(ldcp->lane_in.lstate);
3150 		DUMP_FLAGS(ldcp->lane_out.lstate);
3151 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3152 		return;
3153 	}
3154 
3155 	/*
3156 	 * To reduce the locking contention, release the
3157 	 * ldc_cblock here and re-acquire it once we are done
3158 	 * receiving packets.
3159 	 */
3160 	mutex_exit(&ldcp->ldc_cblock);
3161 	mutex_enter(&ldcp->ldc_rxlock);
3162 
3163 	/*
3164 	 * Switch on vio_subtype envelope, then let lower routines
3165 	 * decide if its an INFO, ACK or NACK packet.
3166 	 */
3167 	if (env == VIO_DRING_DATA) {
3168 		vsw_process_data_dring_pkt(ldcp, dpkt);
3169 	} else if (env == VIO_PKT_DATA) {
3170 		ldcp->rx_pktdata(ldcp, dpkt, msglen);
3171 	} else if (env == VIO_DESC_DATA) {
3172 		vsw_process_data_ibnd_pkt(ldcp, dpkt);
3173 	} else {
3174 		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env);
3175 	}
3176 
3177 	mutex_exit(&ldcp->ldc_rxlock);
3178 	mutex_enter(&ldcp->ldc_cblock);
3179 
3180 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3181 }
3182 
/*
 * Turn the dring message 'pkt' into a NACK carrying our local session
 * id and send it back over 'ldcp'.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to
 * a single statement and is safe inside an unbraced if/else. Both
 * arguments are evaluated more than once, so they must not have side
 * effects.
 */
#define	SND_DRING_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		(void) vsw_send_msg((ldcp), (void *)(pkt), \
		    sizeof (vio_dring_msg_t), B_TRUE); \
	} while (0)
3188 
3189 static void
3190 vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt)
3191 {
3192 	vio_dring_msg_t		*dring_pkt;
3193 	vnet_public_desc_t	*pub_addr = NULL;
3194 	vsw_private_desc_t	*priv_addr = NULL;
3195 	dring_info_t		*dp = NULL;
3196 	vsw_t			*vswp = ldcp->ldc_vswp;
3197 	mblk_t			*mp = NULL;
3198 	mblk_t			*bp = NULL;
3199 	mblk_t			*bpt = NULL;
3200 	size_t			nbytes = 0;
3201 	uint64_t		ncookies = 0;
3202 	uint64_t		chain = 0;
3203 	uint64_t		len;
3204 	uint32_t		pos, start, datalen;
3205 	uint32_t		range_start, range_end;
3206 	int32_t			end, num, cnt = 0;
3207 	int			i, rv, msg_rv = 0;
3208 	boolean_t		ack_needed = B_FALSE;
3209 	boolean_t		prev_desc_ack = B_FALSE;
3210 	int			read_attempts = 0;
3211 	struct ether_header	*ehp;
3212 
3213 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3214 
3215 	/*
3216 	 * We know this is a data/dring packet so
3217 	 * cast it into the correct structure.
3218 	 */
3219 	dring_pkt = (vio_dring_msg_t *)dpkt;
3220 
3221 	/*
3222 	 * Switch on the vio_subtype. If its INFO then we need to
3223 	 * process the data. If its an ACK we need to make sure
3224 	 * it makes sense (i.e did we send an earlier data/info),
3225 	 * and if its a NACK then we maybe attempt a retry.
3226 	 */
3227 	switch (dring_pkt->tag.vio_subtype) {
3228 	case VIO_SUBTYPE_INFO:
3229 		D2(vswp, "%s(%lld): VIO_SUBTYPE_INFO", __func__, ldcp->ldc_id);
3230 
3231 		READ_ENTER(&ldcp->lane_in.dlistrw);
3232 		if ((dp = vsw_ident2dring(&ldcp->lane_in,
3233 		    dring_pkt->dring_ident)) == NULL) {
3234 			RW_EXIT(&ldcp->lane_in.dlistrw);
3235 
3236 			DERR(vswp, "%s(%lld): unable to find dring from "
3237 			    "ident 0x%llx", __func__, ldcp->ldc_id,
3238 			    dring_pkt->dring_ident);
3239 
3240 			SND_DRING_NACK(ldcp, dring_pkt);
3241 			return;
3242 		}
3243 
3244 		start = pos = dring_pkt->start_idx;
3245 		end = dring_pkt->end_idx;
3246 		len = dp->num_descriptors;
3247 
3248 		range_start = range_end = pos;
3249 
3250 		D2(vswp, "%s(%lld): start index %ld : end %ld\n",
3251 		    __func__, ldcp->ldc_id, start, end);
3252 
3253 		if (end == -1) {
3254 			num = -1;
3255 		} else if (end >= 0) {
3256 			num = end >= pos ? end - pos + 1: (len - pos + 1) + end;
3257 
3258 			/* basic sanity check */
3259 			if (end > len) {
3260 				RW_EXIT(&ldcp->lane_in.dlistrw);
3261 				DERR(vswp, "%s(%lld): endpoint %lld outside "
3262 				    "ring length %lld", __func__,
3263 				    ldcp->ldc_id, end, len);
3264 
3265 				SND_DRING_NACK(ldcp, dring_pkt);
3266 				return;
3267 			}
3268 		} else {
3269 			RW_EXIT(&ldcp->lane_in.dlistrw);
3270 			DERR(vswp, "%s(%lld): invalid endpoint %lld",
3271 			    __func__, ldcp->ldc_id, end);
3272 			SND_DRING_NACK(ldcp, dring_pkt);
3273 			return;
3274 		}
3275 
3276 		while (cnt != num) {
3277 vsw_recheck_desc:
3278 			if ((rv = ldc_mem_dring_acquire(dp->handle,
3279 			    pos, pos)) != 0) {
3280 				RW_EXIT(&ldcp->lane_in.dlistrw);
3281 				DERR(vswp, "%s(%lld): unable to acquire "
3282 				    "descriptor at pos %d: err %d",
3283 				    __func__, pos, ldcp->ldc_id, rv);
3284 				SND_DRING_NACK(ldcp, dring_pkt);
3285 				ldcp->ldc_stats.ierrors++;
3286 				return;
3287 			}
3288 
3289 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + pos;
3290 
3291 			/*
3292 			 * When given a bounded range of descriptors
3293 			 * to process, its an error to hit a descriptor
3294 			 * which is not ready. In the non-bounded case
3295 			 * (end_idx == -1) this simply indicates we have
3296 			 * reached the end of the current active range.
3297 			 */
3298 			if (pub_addr->hdr.dstate != VIO_DESC_READY) {
3299 				/* unbound - no error */
3300 				if (end == -1) {
3301 					if (read_attempts == vsw_read_attempts)
3302 						break;
3303 
3304 					delay(drv_usectohz(vsw_desc_delay));
3305 					read_attempts++;
3306 					goto vsw_recheck_desc;
3307 				}
3308 
3309 				/* bounded - error - so NACK back */
3310 				RW_EXIT(&ldcp->lane_in.dlistrw);
3311 				DERR(vswp, "%s(%lld): descriptor not READY "
3312 				    "(%d)", __func__, ldcp->ldc_id,
3313 				    pub_addr->hdr.dstate);
3314 				SND_DRING_NACK(ldcp, dring_pkt);
3315 				return;
3316 			}
3317 
3318 			DTRACE_PROBE1(read_attempts, int, read_attempts);
3319 
3320 			range_end = pos;
3321 
3322 			/*
3323 			 * If we ACK'd the previous descriptor then now
3324 			 * record the new range start position for later
3325 			 * ACK's.
3326 			 */
3327 			if (prev_desc_ack) {
3328 				range_start = pos;
3329 
3330 				D2(vswp, "%s(%lld): updating range start to be "
3331 				    "%d", __func__, ldcp->ldc_id, range_start);
3332 
3333 				prev_desc_ack = B_FALSE;
3334 			}
3335 
3336 			/*
3337 			 * Data is padded to align on 8 byte boundary,
3338 			 * datalen is actual data length, i.e. minus that
3339 			 * padding.
3340 			 */
3341 			datalen = pub_addr->nbytes;
3342 
3343 			/*
3344 			 * Does peer wish us to ACK when we have finished
3345 			 * with this descriptor ?
3346 			 */
3347 			if (pub_addr->hdr.ack)
3348 				ack_needed = B_TRUE;
3349 
3350 			D2(vswp, "%s(%lld): processing desc %lld at pos"
3351 			    " 0x%llx : dstate 0x%lx : datalen 0x%lx",
3352 			    __func__, ldcp->ldc_id, pos, pub_addr,
3353 			    pub_addr->hdr.dstate, datalen);
3354 
3355 			/*
3356 			 * Mark that we are starting to process descriptor.
3357 			 */
3358 			pub_addr->hdr.dstate = VIO_DESC_ACCEPTED;
3359 
3360 			/*
3361 			 * Ensure that we ask ldc for an aligned
3362 			 * number of bytes.
3363 			 */
3364 			nbytes = (datalen + VNET_IPALIGN + 7) & ~7;
3365 
3366 			mp = vio_multipool_allocb(&ldcp->vmp, nbytes);
3367 			if (mp == NULL) {
3368 				ldcp->ldc_stats.rx_vio_allocb_fail++;
3369 				/*
3370 				 * No free receive buffers available, so
3371 				 * fallback onto allocb(9F). Make sure that
3372 				 * we get a data buffer which is a multiple
3373 				 * of 8 as this is required by ldc_mem_copy.
3374 				 */
3375 				DTRACE_PROBE(allocb);
3376 				if ((mp = allocb(datalen + VNET_IPALIGN + 8,
3377 				    BPRI_MED)) == NULL) {
3378 					DERR(vswp, "%s(%ld): allocb failed",
3379 					    __func__, ldcp->ldc_id);
3380 					pub_addr->hdr.dstate = VIO_DESC_DONE;
3381 					(void) ldc_mem_dring_release(dp->handle,
3382 					    pos, pos);
3383 					ldcp->ldc_stats.ierrors++;
3384 					ldcp->ldc_stats.rx_allocb_fail++;
3385 					break;
3386 				}
3387 			}
3388 
3389 			ncookies = pub_addr->ncookies;
3390 			rv = ldc_mem_copy(ldcp->ldc_handle,
3391 			    (caddr_t)mp->b_rptr, 0, &nbytes,
3392 			    pub_addr->memcookie, ncookies, LDC_COPY_IN);
3393 
3394 			if (rv != 0) {
3395 				DERR(vswp, "%s(%d): unable to copy in data "
3396 				    "from %d cookies in desc %d (rv %d)",
3397 				    __func__, ldcp->ldc_id, ncookies, pos, rv);
3398 				freemsg(mp);
3399 
3400 				pub_addr->hdr.dstate = VIO_DESC_DONE;
3401 				(void) ldc_mem_dring_release(dp->handle,
3402 				    pos, pos);
3403 				ldcp->ldc_stats.ierrors++;
3404 				break;
3405 			} else {
3406 				D2(vswp, "%s(%d): copied in %ld bytes"
3407 				    " using %d cookies", __func__,
3408 				    ldcp->ldc_id, nbytes, ncookies);
3409 			}
3410 
3411 			/* adjust the read pointer to skip over the padding */
3412 			mp->b_rptr += VNET_IPALIGN;
3413 
3414 			/* point to the actual end of data */
3415 			mp->b_wptr = mp->b_rptr + datalen;
3416 
3417 			/* update statistics */
3418 			ehp = (struct ether_header *)mp->b_rptr;
3419 			if (IS_BROADCAST(ehp))
3420 				ldcp->ldc_stats.brdcstrcv++;
3421 			else if (IS_MULTICAST(ehp))
3422 				ldcp->ldc_stats.multircv++;
3423 
3424 			ldcp->ldc_stats.ipackets++;
3425 			ldcp->ldc_stats.rbytes += datalen;
3426 
3427 			/*
3428 			 * IPALIGN space can be used for VLAN_TAG
3429 			 */
3430 			(void) vsw_vlan_frame_pretag(ldcp->ldc_port,
3431 			    VSW_VNETPORT, mp);
3432 
3433 			/* build a chain of received packets */
3434 			if (bp == NULL) {
3435 				/* first pkt */
3436 				bp = mp;
3437 				bp->b_next = bp->b_prev = NULL;
3438 				bpt = bp;
3439 				chain = 1;
3440 			} else {
3441 				mp->b_next = mp->b_prev = NULL;
3442 				bpt->b_next = mp;
3443 				bpt = mp;
3444 				chain++;
3445 			}
3446 
3447 			/* mark we are finished with this descriptor */
3448 			pub_addr->hdr.dstate = VIO_DESC_DONE;
3449 
3450 			(void) ldc_mem_dring_release(dp->handle, pos, pos);
3451 
3452 			/*
3453 			 * Send an ACK back to peer if requested.
3454 			 */
3455 			if (ack_needed) {
3456 				ack_needed = B_FALSE;
3457 
3458 				dring_pkt->start_idx = range_start;
3459 				dring_pkt->end_idx = range_end;
3460 
3461 				DERR(vswp, "%s(%lld): processed %d %d, ACK"
3462 				    " requested", __func__, ldcp->ldc_id,
3463 				    dring_pkt->start_idx, dring_pkt->end_idx);
3464 
3465 				dring_pkt->dring_process_state = VIO_DP_ACTIVE;
3466 				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3467 				dring_pkt->tag.vio_sid = ldcp->local_session;
3468 
3469 				msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
3470 				    sizeof (vio_dring_msg_t), B_FALSE);
3471 
3472 				/*
3473 				 * Check if ACK was successfully sent. If not
3474 				 * we break and deal with that below.
3475 				 */
3476 				if (msg_rv != 0)
3477 					break;
3478 
3479 				prev_desc_ack = B_TRUE;
3480 				range_start = pos;
3481 			}
3482 
3483 			/* next descriptor */
3484 			pos = (pos + 1) % len;
3485 			cnt++;
3486 
3487 			/*
3488 			 * Break out of loop here and stop processing to
3489 			 * allow some other network device (or disk) to
3490 			 * get access to the cpu.
3491 			 */
3492 			if (chain > vsw_chain_len) {
3493 				D3(vswp, "%s(%lld): switching chain of %d "
3494 				    "msgs", __func__, ldcp->ldc_id, chain);
3495 				break;
3496 			}
3497 		}
3498 		RW_EXIT(&ldcp->lane_in.dlistrw);
3499 
3500 		/*
3501 		 * If when we attempted to send the ACK we found that the
3502 		 * channel had been reset then now handle this. We deal with
3503 		 * it here as we cannot reset the channel while holding the
3504 		 * dlistrw lock, and we don't want to acquire/release it
3505 		 * continuously in the above loop, as a channel reset should
3506 		 * be a rare event.
3507 		 */
3508 		if (msg_rv == ECONNRESET) {
3509 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
3510 			break;
3511 		}
3512 
3513 		/* send the chain of packets to be switched */
3514 		if (bp != NULL) {
3515 			DTRACE_PROBE1(vsw_rcv_msgs, int, chain);
3516 			D3(vswp, "%s(%lld): switching chain of %d msgs",
3517 			    __func__, ldcp->ldc_id, chain);
3518 			vswp->vsw_switch_frame(vswp, bp, VSW_VNETPORT,
3519 			    ldcp->ldc_port, NULL);
3520 		}
3521 
3522 		DTRACE_PROBE1(msg_cnt, int, cnt);
3523 
3524 		/*
3525 		 * We are now finished so ACK back with the state
3526 		 * set to STOPPING so our peer knows we are finished
3527 		 */
3528 		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3529 		dring_pkt->tag.vio_sid = ldcp->local_session;
3530 
3531 		dring_pkt->dring_process_state = VIO_DP_STOPPED;
3532 
3533 		DTRACE_PROBE(stop_process_sent);
3534 
3535 		/*
3536 		 * We have not processed any more descriptors beyond
3537 		 * the last one we ACK'd.
3538 		 */
3539 		if (prev_desc_ack)
3540 			range_start = range_end;
3541 
3542 		dring_pkt->start_idx = range_start;
3543 		dring_pkt->end_idx = range_end;
3544 
3545 		D2(vswp, "%s(%lld) processed : %d : %d, now stopping",
3546 		    __func__, ldcp->ldc_id, dring_pkt->start_idx,
3547 		    dring_pkt->end_idx);
3548 
3549 		(void) vsw_send_msg(ldcp, (void *)dring_pkt,
3550 		    sizeof (vio_dring_msg_t), B_TRUE);
3551 		break;
3552 
3553 	case VIO_SUBTYPE_ACK:
3554 		D2(vswp, "%s(%lld): VIO_SUBTYPE_ACK", __func__, ldcp->ldc_id);
3555 		/*
3556 		 * Verify that the relevant descriptors are all
3557 		 * marked as DONE
3558 		 */
3559 		READ_ENTER(&ldcp->lane_out.dlistrw);
3560 		if ((dp = vsw_ident2dring(&ldcp->lane_out,
3561 		    dring_pkt->dring_ident)) == NULL) {
3562 			RW_EXIT(&ldcp->lane_out.dlistrw);
3563 			DERR(vswp, "%s: unknown ident in ACK", __func__);
3564 			return;
3565 		}
3566 
3567 		start = end = 0;
3568 		start = dring_pkt->start_idx;
3569 		end = dring_pkt->end_idx;
3570 		len = dp->num_descriptors;
3571 
3572 
3573 		mutex_enter(&dp->dlock);
3574 		dp->last_ack_recv = end;
3575 		ldcp->ldc_stats.dring_data_acks++;
3576 		mutex_exit(&dp->dlock);
3577 
3578 		(void) vsw_reclaim_dring(dp, start);
3579 
3580 		/*
3581 		 * If our peer is stopping processing descriptors then
3582 		 * we check to make sure it has processed all the descriptors
3583 		 * we have updated. If not then we send it a new message
3584 		 * to prompt it to restart.
3585 		 */
3586 		if (dring_pkt->dring_process_state == VIO_DP_STOPPED) {
3587 			DTRACE_PROBE(stop_process_recv);
3588 			D2(vswp, "%s(%lld): got stopping msg : %d : %d",
3589 			    __func__, ldcp->ldc_id, dring_pkt->start_idx,
3590 			    dring_pkt->end_idx);
3591 
3592 			/*
3593 			 * Check next descriptor in public section of ring.
3594 			 * If its marked as READY then we need to prompt our
3595 			 * peer to start processing the ring again.
3596 			 */
3597 			i = (end + 1) % len;
3598 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
3599 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
3600 
3601 			/*
3602 			 * Hold the restart lock across all of this to
3603 			 * make sure that its not possible for us to
3604 			 * decide that a msg needs to be sent in the future
3605 			 * but the sending code having already checked is
3606 			 * about to exit.
3607 			 */
3608 			mutex_enter(&dp->restart_lock);
3609 			ldcp->ldc_stats.dring_stopped_acks++;
3610 			mutex_enter(&priv_addr->dstate_lock);
3611 			if (pub_addr->hdr.dstate == VIO_DESC_READY) {
3612 
3613 				mutex_exit(&priv_addr->dstate_lock);
3614 
3615 				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
3616 				dring_pkt->tag.vio_sid = ldcp->local_session;
3617 
3618 				dring_pkt->start_idx = (end + 1) % len;
3619 				dring_pkt->end_idx = -1;
3620 
3621 				D2(vswp, "%s(%lld) : sending restart msg:"
3622 				    " %d : %d", __func__, ldcp->ldc_id,
3623 				    dring_pkt->start_idx, dring_pkt->end_idx);
3624 
3625 				msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
3626 				    sizeof (vio_dring_msg_t), B_FALSE);
3627 				ldcp->ldc_stats.dring_data_msgs++;
3628 
3629 			} else {
3630 				mutex_exit(&priv_addr->dstate_lock);
3631 				dp->restart_reqd = B_TRUE;
3632 			}
3633 			mutex_exit(&dp->restart_lock);
3634 		}
3635 		RW_EXIT(&ldcp->lane_out.dlistrw);
3636 
3637 		/* only do channel reset after dropping dlistrw lock */
3638 		if (msg_rv == ECONNRESET)
3639 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
3640 
3641 		break;
3642 
3643 	case VIO_SUBTYPE_NACK:
3644 		DWARN(vswp, "%s(%lld): VIO_SUBTYPE_NACK",
3645 		    __func__, ldcp->ldc_id);
3646 		/*
3647 		 * Something is badly wrong if we are getting NACK's
3648 		 * for our data pkts. So reset the channel.
3649 		 */
3650 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3651 
3652 		break;
3653 
3654 	default:
3655 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
3656 		    ldcp->ldc_id, dring_pkt->tag.vio_subtype);
3657 	}
3658 
3659 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
3660 }
3661 
3662 /*
3663  * dummy pkt data handler function for vnet protocol version 1.0
3664  */
3665 static void
3666 vsw_process_pkt_data_nop(void *arg1, void *arg2, uint32_t msglen)
3667 {
3668 	_NOTE(ARGUNUSED(arg1, arg2, msglen))
3669 }
3670 
3671 /*
3672  * This function handles raw pkt data messages received over the channel.
3673  * Currently, only priority-eth-type frames are received through this mechanism.
3674  * In this case, the frame(data) is present within the message itself which
3675  * is copied into an mblk before switching it.
3676  */
3677 static void
3678 vsw_process_pkt_data(void *arg1, void *arg2, uint32_t msglen)
3679 {
3680 	vsw_ldc_t		*ldcp = (vsw_ldc_t *)arg1;
3681 	vio_raw_data_msg_t	*dpkt = (vio_raw_data_msg_t *)arg2;
3682 	uint32_t		size;
3683 	mblk_t			*mp;
3684 	vsw_t			*vswp = ldcp->ldc_vswp;
3685 	vgen_stats_t		*statsp = &ldcp->ldc_stats;
3686 	lane_t			*lp = &ldcp->lane_out;
3687 
3688 	size = msglen - VIO_PKT_DATA_HDRSIZE;
3689 	if (size < ETHERMIN || size > lp->mtu) {
3690 		(void) atomic_inc_32(&statsp->rx_pri_fail);
3691 		DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__,
3692 		    ldcp->ldc_id, size);
3693 		return;
3694 	}
3695 
3696 	mp = vio_multipool_allocb(&ldcp->vmp, size + VLAN_TAGSZ);
3697 	if (mp == NULL) {
3698 		mp = allocb(size + VLAN_TAGSZ, BPRI_MED);
3699 		if (mp == NULL) {
3700 			(void) atomic_inc_32(&statsp->rx_pri_fail);
3701 			DWARN(vswp, "%s(%lld) allocb failure, "
3702 			    "unable to process priority frame\n", __func__,
3703 			    ldcp->ldc_id);
3704 			return;
3705 		}
3706 	}
3707 
3708 	/* skip over the extra space for vlan tag */
3709 	mp->b_rptr += VLAN_TAGSZ;
3710 
3711 	/* copy the frame from the payload of raw data msg into the mblk */
3712 	bcopy(dpkt->data, mp->b_rptr, size);
3713 	mp->b_wptr = mp->b_rptr + size;
3714 
3715 	/* update stats */
3716 	(void) atomic_inc_64(&statsp->rx_pri_packets);
3717 	(void) atomic_add_64(&statsp->rx_pri_bytes, size);
3718 
3719 	/*
3720 	 * VLAN_TAGSZ of extra space has been pre-alloc'd if tag is needed.
3721 	 */
3722 	(void) vsw_vlan_frame_pretag(ldcp->ldc_port, VSW_VNETPORT, mp);
3723 
3724 	/* switch the frame to destination */
3725 	vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT, ldcp->ldc_port, NULL);
3726 }
3727 
3728 /*
3729  * Process an in-band descriptor message (most likely from
3730  * OBP).
3731  */
3732 static void
3733 vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt)
3734 {
3735 	vnet_ibnd_desc_t	*ibnd_desc;
3736 	dring_info_t		*dp = NULL;
3737 	vsw_private_desc_t	*priv_addr = NULL;
3738 	vsw_t			*vswp = ldcp->ldc_vswp;
3739 	mblk_t			*mp = NULL;
3740 	size_t			nbytes = 0;
3741 	size_t			off = 0;
3742 	uint64_t		idx = 0;
3743 	uint32_t		num = 1, len, datalen = 0;
3744 	uint64_t		ncookies = 0;
3745 	int			i, rv;
3746 	int			j = 0;
3747 
3748 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3749 
3750 	ibnd_desc = (vnet_ibnd_desc_t *)pkt;
3751 
3752 	switch (ibnd_desc->hdr.tag.vio_subtype) {
3753 	case VIO_SUBTYPE_INFO:
3754 		D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3755 
3756 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
3757 			return;
3758 
3759 		/*
3760 		 * Data is padded to align on a 8 byte boundary,
3761 		 * nbytes is actual data length, i.e. minus that
3762 		 * padding.
3763 		 */
3764 		datalen = ibnd_desc->nbytes;
3765 
3766 		D2(vswp, "%s(%lld): processing inband desc : "
3767 		    ": datalen 0x%lx", __func__, ldcp->ldc_id, datalen);
3768 
3769 		ncookies = ibnd_desc->ncookies;
3770 
3771 		/*
3772 		 * allocb(9F) returns an aligned data block. We
3773 		 * need to ensure that we ask ldc for an aligned
3774 		 * number of bytes also.
3775 		 */
3776 		nbytes = datalen;
3777 		if (nbytes & 0x7) {
3778 			off = 8 - (nbytes & 0x7);
3779 			nbytes += off;
3780 		}
3781 
3782 		/* alloc extra space for VLAN_TAG */
3783 		mp = allocb(datalen + 8, BPRI_MED);
3784 		if (mp == NULL) {
3785 			DERR(vswp, "%s(%lld): allocb failed",
3786 			    __func__, ldcp->ldc_id);
3787 			ldcp->ldc_stats.rx_allocb_fail++;
3788 			return;
3789 		}
3790 
3791 		/* skip over the extra space for VLAN_TAG */
3792 		mp->b_rptr += 8;
3793 
3794 		rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr,
3795 		    0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies,
3796 		    LDC_COPY_IN);
3797 
3798 		if (rv != 0) {
3799 			DERR(vswp, "%s(%d): unable to copy in data from "
3800 			    "%d cookie(s)", __func__, ldcp->ldc_id, ncookies);
3801 			freemsg(mp);
3802 			ldcp->ldc_stats.ierrors++;
3803 			return;
3804 		}
3805 
3806 		D2(vswp, "%s(%d): copied in %ld bytes using %d cookies",
3807 		    __func__, ldcp->ldc_id, nbytes, ncookies);
3808 
3809 		/* point to the actual end of data */
3810 		mp->b_wptr = mp->b_rptr + datalen;
3811 		ldcp->ldc_stats.ipackets++;
3812 		ldcp->ldc_stats.rbytes += datalen;
3813 
3814 		/*
3815 		 * We ACK back every in-band descriptor message we process
3816 		 */
3817 		ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK;
3818 		ibnd_desc->hdr.tag.vio_sid = ldcp->local_session;
3819 		(void) vsw_send_msg(ldcp, (void *)ibnd_desc,
3820 		    sizeof (vnet_ibnd_desc_t), B_TRUE);
3821 
3822 		/*
3823 		 * there is extra space alloc'd for VLAN_TAG
3824 		 */
3825 		(void) vsw_vlan_frame_pretag(ldcp->ldc_port, VSW_VNETPORT, mp);
3826 
3827 		/* send the packet to be switched */
3828 		vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT,
3829 		    ldcp->ldc_port, NULL);
3830 
3831 		break;
3832 
3833 	case VIO_SUBTYPE_ACK:
3834 		D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3835 
3836 		/* Verify the ACK is valid */
3837 		idx = ibnd_desc->hdr.desc_handle;
3838 
3839 		if (idx >= vsw_ntxds) {
3840 			cmn_err(CE_WARN, "!vsw%d: corrupted ACK received "
3841 			    "(idx %ld)", vswp->instance, idx);
3842 			return;
3843 		}
3844 
3845 		if ((dp = ldcp->lane_out.dringp) == NULL) {
3846 			DERR(vswp, "%s: no dring found", __func__);
3847 			return;
3848 		}
3849 
3850 		len = dp->num_descriptors;
3851 		/*
3852 		 * If the descriptor we are being ACK'ed for is not the
3853 		 * one we expected, then pkts were lost somwhere, either
3854 		 * when we tried to send a msg, or a previous ACK msg from
3855 		 * our peer. In either case we now reclaim the descriptors
3856 		 * in the range from the last ACK we received up to the
3857 		 * current ACK.
3858 		 */
3859 		if (idx != dp->last_ack_recv) {
3860 			DWARN(vswp, "%s: dropped pkts detected, (%ld, %ld)",
3861 			    __func__, dp->last_ack_recv, idx);
3862 			num = idx >= dp->last_ack_recv ?
3863 			    idx - dp->last_ack_recv + 1:
3864 			    (len - dp->last_ack_recv + 1) + idx;
3865 		}
3866 
3867 		/*
3868 		 * When we sent the in-band message to our peer we
3869 		 * marked the copy in our private ring as READY. We now
3870 		 * check that the descriptor we are being ACK'ed for is in
3871 		 * fact READY, i.e. it is one we have shared with our peer.
3872 		 *
3873 		 * If its not we flag an error, but still reset the descr
3874 		 * back to FREE.
3875 		 */
3876 		for (i = dp->last_ack_recv; j < num; i = (i + 1) % len, j++) {
3877 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
3878 			mutex_enter(&priv_addr->dstate_lock);
3879 			if (priv_addr->dstate != VIO_DESC_READY) {
3880 				DERR(vswp, "%s: (%ld) desc at index %ld not "
3881 				    "READY (0x%lx)", __func__,
3882 				    ldcp->ldc_id, idx, priv_addr->dstate);
3883 				DERR(vswp, "%s: bound %d: ncookies %ld : "
3884 				    "datalen %ld", __func__,
3885 				    priv_addr->bound, priv_addr->ncookies,
3886 				    priv_addr->datalen);
3887 			}
3888 			D2(vswp, "%s: (%lld) freeing descp at %lld", __func__,
3889 			    ldcp->ldc_id, idx);
3890 			/* release resources associated with sent msg */
3891 			priv_addr->datalen = 0;
3892 			priv_addr->dstate = VIO_DESC_FREE;
3893 			mutex_exit(&priv_addr->dstate_lock);
3894 		}
3895 		/* update to next expected value */
3896 		dp->last_ack_recv = (idx + 1) % dp->num_descriptors;
3897 
3898 		break;
3899 
3900 	case VIO_SUBTYPE_NACK:
3901 		DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3902 
3903 		/*
3904 		 * We should only get a NACK if our peer doesn't like
3905 		 * something about a message we have sent it. If this
3906 		 * happens we just release the resources associated with
3907 		 * the message. (We are relying on higher layers to decide
3908 		 * whether or not to resend.
3909 		 */
3910 
3911 		/* limit check */
3912 		idx = ibnd_desc->hdr.desc_handle;
3913 
3914 		if (idx >= vsw_ntxds) {
3915 			DERR(vswp, "%s: corrupted NACK received (idx %lld)",
3916 			    __func__, idx);
3917 			return;
3918 		}
3919 
3920 		if ((dp = ldcp->lane_out.dringp) == NULL) {
3921 			DERR(vswp, "%s: no dring found", __func__);
3922 			return;
3923 		}
3924 
3925 		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
3926 
3927 		/* move to correct location in ring */
3928 		priv_addr += idx;
3929 
3930 		/* release resources associated with sent msg */
3931 		mutex_enter(&priv_addr->dstate_lock);
3932 		priv_addr->datalen = 0;
3933 		priv_addr->dstate = VIO_DESC_FREE;
3934 		mutex_exit(&priv_addr->dstate_lock);
3935 
3936 		break;
3937 
3938 	default:
3939 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
3940 		    ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype);
3941 	}
3942 
3943 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
3944 }
3945 
3946 static void
3947 vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t *tagp)
3948 {
3949 	_NOTE(ARGUNUSED(epkt))
3950 
3951 	vsw_t		*vswp = ldcp->ldc_vswp;
3952 	uint16_t	env = tagp->vio_subtype_env;
3953 
3954 	D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
3955 
3956 	/*
3957 	 * Error vio_subtypes have yet to be defined. So for
3958 	 * the moment we can't do anything.
3959 	 */
3960 	D2(vswp, "%s: (%x) vio_subtype env", __func__, env);
3961 
3962 	D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
3963 }
3964 
3965 /* transmit the packet over the given port */
3966 int
3967 vsw_portsend(vsw_port_t *port, mblk_t *mp, mblk_t *mpt, uint32_t count)
3968 {
3969 	vsw_ldc_list_t 	*ldcl = &port->p_ldclist;
3970 	vsw_ldc_t 	*ldcp;
3971 	int		status = 0;
3972 	uint32_t	n;
3973 
3974 	READ_ENTER(&ldcl->lockrw);
3975 	/*
3976 	 * Note for now, we have a single channel.
3977 	 */
3978 	ldcp = ldcl->head;
3979 	if (ldcp == NULL) {
3980 		DERR(port->p_vswp, "vsw_portsend: no ldc: dropping packet\n");
3981 		freemsgchain(mp);
3982 		RW_EXIT(&ldcl->lockrw);
3983 		return (1);
3984 	}
3985 
3986 	n = vsw_vlan_frame_untag(port, VSW_VNETPORT, &mp, &mpt);
3987 
3988 	count -= n;
3989 	if (count == 0) {
3990 		goto vsw_portsend_exit;
3991 	}
3992 
3993 	status = ldcp->tx(ldcp, mp, mpt, count);
3994 
3995 vsw_portsend_exit:
3996 	RW_EXIT(&ldcl->lockrw);
3997 
3998 	return (status);
3999 }
4000 
4001 /*
4002  * Break up frames into 2 seperate chains: normal and
4003  * priority, based on the frame type. The number of
4004  * priority frames is also counted and returned.
4005  *
4006  * Params:
4007  * 	vswp:	pointer to the instance of vsw
4008  *	np:	head of packet chain to be broken
4009  *	npt:	tail of packet chain to be broken
4010  *
4011  * Returns:
4012  *	np:	head of normal data packets
4013  *	npt:	tail of normal data packets
4014  *	hp:	head of high priority packets
4015  *	hpt:	tail of high priority packets
4016  */
4017 static uint32_t
4018 vsw_get_pri_packets(vsw_t *vswp, mblk_t **np, mblk_t **npt,
4019 	mblk_t **hp, mblk_t **hpt)
4020 {
4021 	mblk_t			*tmp = NULL;
4022 	mblk_t			*smp = NULL;
4023 	mblk_t			*hmp = NULL;	/* high prio pkts head */
4024 	mblk_t			*hmpt = NULL;	/* high prio pkts tail */
4025 	mblk_t			*nmp = NULL;	/* normal pkts head */
4026 	mblk_t			*nmpt = NULL;	/* normal pkts tail */
4027 	uint32_t		count = 0;
4028 	int			i;
4029 	struct ether_header	*ehp;
4030 	uint32_t		num_types;
4031 	uint16_t		*types;
4032 
4033 	tmp = *np;
4034 	while (tmp != NULL) {
4035 
4036 		smp = tmp;
4037 		tmp = tmp->b_next;
4038 		smp->b_next = NULL;
4039 		smp->b_prev = NULL;
4040 
4041 		ehp = (struct ether_header *)smp->b_rptr;
4042 		num_types = vswp->pri_num_types;
4043 		types = vswp->pri_types;
4044 		for (i = 0; i < num_types; i++) {
4045 			if (ehp->ether_type == types[i]) {
4046 				/* high priority frame */
4047 
4048 				if (hmp != NULL) {
4049 					hmpt->b_next = smp;
4050 					hmpt = smp;
4051 				} else {
4052 					hmp = hmpt = smp;
4053 				}
4054 				count++;
4055 				break;
4056 			}
4057 		}
4058 		if (i == num_types) {
4059 			/* normal data frame */
4060 
4061 			if (nmp != NULL) {
4062 				nmpt->b_next = smp;
4063 				nmpt = smp;
4064 			} else {
4065 				nmp = nmpt = smp;
4066 			}
4067 		}
4068 	}
4069 
4070 	*hp = hmp;
4071 	*hpt = hmpt;
4072 	*np = nmp;
4073 	*npt = nmpt;
4074 
4075 	return (count);
4076 }
4077 
4078 /*
4079  * Wrapper function to transmit normal and/or priority frames over the channel.
4080  */
4081 static int
4082 vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count)
4083 {
4084 	vsw_ldc_t 		*ldcp = (vsw_ldc_t *)arg;
4085 	mblk_t			*tmp;
4086 	mblk_t			*smp;
4087 	mblk_t			*hmp;	/* high prio pkts head */
4088 	mblk_t			*hmpt;	/* high prio pkts tail */
4089 	mblk_t			*nmp;	/* normal pkts head */
4090 	mblk_t			*nmpt;	/* normal pkts tail */
4091 	uint32_t		n = 0;
4092 	vsw_t			*vswp = ldcp->ldc_vswp;
4093 
4094 	ASSERT(VSW_PRI_ETH_DEFINED(vswp));
4095 	ASSERT(count != 0);
4096 
4097 	nmp = mp;
4098 	nmpt = mpt;
4099 
4100 	/* gather any priority frames from the chain of packets */
4101 	n = vsw_get_pri_packets(vswp, &nmp, &nmpt, &hmp, &hmpt);
4102 
4103 	/* transmit priority frames */
4104 	tmp = hmp;
4105 	while (tmp != NULL) {
4106 		smp = tmp;
4107 		tmp = tmp->b_next;
4108 		smp->b_next = NULL;
4109 		vsw_ldcsend_pkt(ldcp, smp);
4110 	}
4111 
4112 	count -= n;
4113 
4114 	if (count == 0) {
4115 		/* no normal data frames to process */
4116 		return (0);
4117 	}
4118 
4119 	return (vsw_ldctx(ldcp, nmp, nmpt, count));
4120 }
4121 
4122 /*
4123  * Wrapper function to transmit normal frames over the channel.
4124  */
4125 static int
4126 vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count)
4127 {
4128 	vsw_ldc_t 	*ldcp = (vsw_ldc_t *)arg;
4129 	mblk_t		*tmp = NULL;
4130 
4131 	ASSERT(count != 0);
4132 	/*
4133 	 * If the TX thread is enabled, then queue the
4134 	 * ordinary frames and signal the tx thread.
4135 	 */
4136 	if (ldcp->tx_thread != NULL) {
4137 
4138 		mutex_enter(&ldcp->tx_thr_lock);
4139 
4140 		if ((ldcp->tx_cnt + count) >= vsw_max_tx_qcount) {
4141 			/*
4142 			 * If we reached queue limit,
4143 			 * do not queue new packets,
4144 			 * drop them.
4145 			 */
4146 			ldcp->ldc_stats.tx_qfull += count;
4147 			mutex_exit(&ldcp->tx_thr_lock);
4148 			freemsgchain(mp);
4149 			goto exit;
4150 		}
4151 		if (ldcp->tx_mhead == NULL) {
4152 			ldcp->tx_mhead = mp;
4153 			ldcp->tx_mtail = mpt;
4154 			cv_signal(&ldcp->tx_thr_cv);
4155 		} else {
4156 			ldcp->tx_mtail->b_next = mp;
4157 			ldcp->tx_mtail = mpt;
4158 		}
4159 		ldcp->tx_cnt += count;
4160 		mutex_exit(&ldcp->tx_thr_lock);
4161 	} else {
4162 		while (mp != NULL) {
4163 			tmp = mp->b_next;
4164 			mp->b_next = mp->b_prev = NULL;
4165 			(void) vsw_ldcsend(ldcp, mp, 1);
4166 			mp = tmp;
4167 		}
4168 	}
4169 
4170 exit:
4171 	return (0);
4172 }
4173 
4174 /*
4175  * This function transmits the frame in the payload of a raw data
4176  * (VIO_PKT_DATA) message. Thus, it provides an Out-Of-Band path to
4177  * send special frames with high priorities, without going through
4178  * the normal data path which uses descriptor ring mechanism.
4179  */
4180 static void
4181 vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp)
4182 {
4183 	vio_raw_data_msg_t	*pkt;
4184 	mblk_t			*bp;
4185 	mblk_t			*nmp = NULL;
4186 	caddr_t			dst;
4187 	uint32_t		mblksz;
4188 	uint32_t		size;
4189 	uint32_t		nbytes;
4190 	int			rv;
4191 	vsw_t			*vswp = ldcp->ldc_vswp;
4192 	vgen_stats_t		*statsp = &ldcp->ldc_stats;
4193 
4194 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
4195 	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
4196 		(void) atomic_inc_32(&statsp->tx_pri_fail);
4197 		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
4198 		    "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
4199 		    ldcp->lane_out.lstate);
4200 		goto send_pkt_exit;
4201 	}
4202 
4203 	size = msgsize(mp);
4204 
4205 	/* frame size bigger than available payload len of raw data msg ? */
4206 	if (size > (size_t)(ldcp->msglen - VIO_PKT_DATA_HDRSIZE)) {
4207 		(void) atomic_inc_32(&statsp->tx_pri_fail);
4208 		DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__,
4209 		    ldcp->ldc_id, size);
4210 		goto send_pkt_exit;
4211 	}
4212 
4213 	if (size < ETHERMIN)
4214 		size = ETHERMIN;
4215 
4216 	/* alloc space for a raw data message */
4217 	nmp = vio_allocb(vswp->pri_tx_vmp);
4218 	if (nmp == NULL) {
4219 		(void) atomic_inc_32(&statsp->tx_pri_fail);
4220 		DWARN(vswp, "vio_allocb failed\n");
4221 		goto send_pkt_exit;
4222 	}
4223 	pkt = (vio_raw_data_msg_t *)nmp->b_rptr;
4224 
4225 	/* copy frame into the payload of raw data message */
4226 	dst = (caddr_t)pkt->data;
4227 	for (bp = mp; bp != NULL; bp = bp->b_cont) {
4228 		mblksz = MBLKL(bp);
4229 		bcopy(bp->b_rptr, dst, mblksz);
4230 		dst += mblksz;
4231 	}
4232 
4233 	/* setup the raw data msg */
4234 	pkt->tag.vio_msgtype = VIO_TYPE_DATA;
4235 	pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
4236 	pkt->tag.vio_subtype_env = VIO_PKT_DATA;
4237 	pkt->tag.vio_sid = ldcp->local_session;
4238 	nbytes = VIO_PKT_DATA_HDRSIZE + size;
4239 
4240 	/* send the msg over ldc */
4241 	rv = vsw_send_msg(ldcp, (void *)pkt, nbytes, B_TRUE);
4242 	if (rv != 0) {
4243 		(void) atomic_inc_32(&statsp->tx_pri_fail);
4244 		DWARN(vswp, "%s(%lld) Error sending priority frame\n", __func__,
4245 		    ldcp->ldc_id);
4246 		goto send_pkt_exit;
4247 	}
4248 
4249 	/* update stats */
4250 	(void) atomic_inc_64(&statsp->tx_pri_packets);
4251 	(void) atomic_add_64(&statsp->tx_pri_packets, size);
4252 
4253 send_pkt_exit:
4254 	if (nmp != NULL)
4255 		freemsg(nmp);
4256 	freemsg(mp);
4257 }
4258 
4259 /*
4260  * Transmit the packet over the given LDC channel.
4261  *
4262  * The 'retries' argument indicates how many times a packet
4263  * is retried before it is dropped. Note, the retry is done
4264  * only for a resource related failure, for all other failures
4265  * the packet is dropped immediately.
4266  */
4267 static int
4268 vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries)
4269 {
4270 	int i;
4271 	int rc;
4272 	int status = 0;
4273 	vsw_port_t *port = ldcp->ldc_port;
4274 	dring_info_t *dp = NULL;
4275 
4276 
4277 	for (i = 0; i < retries; ) {
4278 		/*
4279 		 * Send the message out using the appropriate
4280 		 * transmit function which will free mblock when it
4281 		 * is finished with it.
4282 		 */
4283 		mutex_enter(&port->tx_lock);
4284 		if (port->transmit != NULL) {
4285 			status = (*port->transmit)(ldcp, mp);
4286 		}
4287 		if (status == LDC_TX_SUCCESS) {
4288 			mutex_exit(&port->tx_lock);
4289 			break;
4290 		}
4291 		i++;	/* increment the counter here */
4292 
4293 		/* If its the last retry, then update the oerror */
4294 		if ((i == retries) && (status == LDC_TX_NORESOURCES)) {
4295 			ldcp->ldc_stats.oerrors++;
4296 		}
4297 		mutex_exit(&port->tx_lock);
4298 
4299 		if (status != LDC_TX_NORESOURCES) {
4300 			/*
4301 			 * No retrying required for errors un-related
4302 			 * to resources.
4303 			 */
4304 			break;
4305 		}
4306 		READ_ENTER(&ldcp->lane_out.dlistrw);
4307 		if (((dp = ldcp->lane_out.dringp) != NULL) &&
4308 		    ((VSW_VER_GTEQ(ldcp, 1, 2) &&
4309 		    (ldcp->lane_out.xfer_mode & VIO_DRING_MODE_V1_2)) ||
4310 		    ((VSW_VER_LT(ldcp, 1, 2) &&
4311 		    (ldcp->lane_out.xfer_mode == VIO_DRING_MODE_V1_0))))) {
4312 			rc = vsw_reclaim_dring(dp, dp->end_idx);
4313 		} else {
4314 			/*
4315 			 * If there is no dring or the xfer_mode is
4316 			 * set to DESC_MODE(ie., OBP), then simply break here.
4317 			 */
4318 			RW_EXIT(&ldcp->lane_out.dlistrw);
4319 			break;
4320 		}
4321 		RW_EXIT(&ldcp->lane_out.dlistrw);
4322 
4323 		/*
4324 		 * Delay only if none were reclaimed
4325 		 * and its not the last retry.
4326 		 */
4327 		if ((rc == 0) && (i < retries)) {
4328 			delay(drv_usectohz(vsw_ldc_tx_delay));
4329 		}
4330 	}
4331 	freemsg(mp);
4332 	return (status);
4333 }
4334 
4335 /*
4336  * Send packet out via descriptor ring to a logical device.
4337  */
4338 static int
4339 vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp)
4340 {
4341 	vio_dring_msg_t		dring_pkt;
4342 	dring_info_t		*dp = NULL;
4343 	vsw_private_desc_t	*priv_desc = NULL;
4344 	vnet_public_desc_t	*pub = NULL;
4345 	vsw_t			*vswp = ldcp->ldc_vswp;
4346 	mblk_t			*bp;
4347 	size_t			n, size;
4348 	caddr_t			bufp;
4349 	int			idx;
4350 	int			status = LDC_TX_SUCCESS;
4351 	struct ether_header	*ehp = (struct ether_header *)mp->b_rptr;
4352 	lane_t			*lp = &ldcp->lane_out;
4353 
4354 	D1(vswp, "%s(%lld): enter\n", __func__, ldcp->ldc_id);
4355 
4356 	/* TODO: make test a macro */
4357 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
4358 	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
4359 		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
4360 		    "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
4361 		    ldcp->lane_out.lstate);
4362 		ldcp->ldc_stats.oerrors++;
4363 		return (LDC_TX_FAILURE);
4364 	}
4365 
4366 	/*
4367 	 * Note - using first ring only, this may change
4368 	 * in the future.
4369 	 */
4370 	READ_ENTER(&ldcp->lane_out.dlistrw);
4371 	if ((dp = ldcp->lane_out.dringp) == NULL) {
4372 		RW_EXIT(&ldcp->lane_out.dlistrw);
4373 		DERR(vswp, "%s(%lld): no dring for outbound lane on"
4374 		    " channel %d", __func__, ldcp->ldc_id, ldcp->ldc_id);
4375 		ldcp->ldc_stats.oerrors++;
4376 		return (LDC_TX_FAILURE);
4377 	}
4378 
4379 	size = msgsize(mp);
4380 	if (size > (size_t)lp->mtu) {
4381 		RW_EXIT(&ldcp->lane_out.dlistrw);
4382 		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
4383 		    ldcp->ldc_id, size);
4384 		ldcp->ldc_stats.oerrors++;
4385 		return (LDC_TX_FAILURE);
4386 	}
4387 
4388 	/*
4389 	 * Find a free descriptor
4390 	 *
4391 	 * Note: for the moment we are assuming that we will only
4392 	 * have one dring going from the switch to each of its
4393 	 * peers. This may change in the future.
4394 	 */
4395 	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
4396 		D2(vswp, "%s(%lld): no descriptor available for ring "
4397 		    "at 0x%llx", __func__, ldcp->ldc_id, dp);
4398 
4399 		/* nothing more we can do */
4400 		status = LDC_TX_NORESOURCES;
4401 		ldcp->ldc_stats.tx_no_desc++;
4402 		goto vsw_dringsend_free_exit;
4403 	} else {
4404 		D2(vswp, "%s(%lld): free private descriptor found at pos %ld "
4405 		    "addr 0x%llx\n", __func__, ldcp->ldc_id, idx, priv_desc);
4406 	}
4407 
4408 	/* copy data into the descriptor */
4409 	bufp = priv_desc->datap;
4410 	bufp += VNET_IPALIGN;
4411 	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
4412 		n = MBLKL(bp);
4413 		bcopy(bp->b_rptr, bufp, n);
4414 		bufp += n;
4415 	}
4416 
4417 	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
4418 
4419 	pub = priv_desc->descp;
4420 	pub->nbytes = priv_desc->datalen;
4421 
4422 	/* update statistics */
4423 	if (IS_BROADCAST(ehp))
4424 		ldcp->ldc_stats.brdcstxmt++;
4425 	else if (IS_MULTICAST(ehp))
4426 		ldcp->ldc_stats.multixmt++;
4427 	ldcp->ldc_stats.opackets++;
4428 	ldcp->ldc_stats.obytes += priv_desc->datalen;
4429 
4430 	mutex_enter(&priv_desc->dstate_lock);
4431 	pub->hdr.dstate = VIO_DESC_READY;
4432 	mutex_exit(&priv_desc->dstate_lock);
4433 
4434 	/*
4435 	 * Determine whether or not we need to send a message to our
4436 	 * peer prompting them to read our newly updated descriptor(s).
4437 	 */
4438 	mutex_enter(&dp->restart_lock);
4439 	if (dp->restart_reqd) {
4440 		dp->restart_reqd = B_FALSE;
4441 		ldcp->ldc_stats.dring_data_msgs++;
4442 		mutex_exit(&dp->restart_lock);
4443 
4444 		/*
4445 		 * Send a vio_dring_msg to peer to prompt them to read
4446 		 * the updated descriptor ring.
4447 		 */
4448 		dring_pkt.tag.vio_msgtype = VIO_TYPE_DATA;
4449 		dring_pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
4450 		dring_pkt.tag.vio_subtype_env = VIO_DRING_DATA;
4451 		dring_pkt.tag.vio_sid = ldcp->local_session;
4452 
4453 		/* Note - for now using first ring */
4454 		dring_pkt.dring_ident = dp->ident;
4455 
4456 		/*
4457 		 * If last_ack_recv is -1 then we know we've not
4458 		 * received any ack's yet, so this must be the first
4459 		 * msg sent, so set the start to the begining of the ring.
4460 		 */
4461 		mutex_enter(&dp->dlock);
4462 		if (dp->last_ack_recv == -1) {
4463 			dring_pkt.start_idx = 0;
4464 		} else {
4465 			dring_pkt.start_idx =
4466 			    (dp->last_ack_recv + 1) % dp->num_descriptors;
4467 		}
4468 		dring_pkt.end_idx = -1;
4469 		mutex_exit(&dp->dlock);
4470 
4471 		D3(vswp, "%s(%lld): dring 0x%llx : ident 0x%llx\n", __func__,
4472 		    ldcp->ldc_id, dp, dring_pkt.dring_ident);
4473 		D3(vswp, "%s(%lld): start %lld : end %lld :\n",
4474 		    __func__, ldcp->ldc_id, dring_pkt.start_idx,
4475 		    dring_pkt.end_idx);
4476 
4477 		RW_EXIT(&ldcp->lane_out.dlistrw);
4478 
4479 		(void) vsw_send_msg(ldcp, (void *)&dring_pkt,
4480 		    sizeof (vio_dring_msg_t), B_TRUE);
4481 
4482 		return (status);
4483 
4484 	} else {
4485 		mutex_exit(&dp->restart_lock);
4486 		D2(vswp, "%s(%lld): updating descp %d", __func__,
4487 		    ldcp->ldc_id, idx);
4488 	}
4489 
4490 vsw_dringsend_free_exit:
4491 
4492 	RW_EXIT(&ldcp->lane_out.dlistrw);
4493 
4494 	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
4495 	return (status);
4496 }
4497 
4498 /*
4499  * Send an in-band descriptor message over ldc.
4500  */
4501 static int
4502 vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp)
4503 {
4504 	vsw_t			*vswp = ldcp->ldc_vswp;
4505 	vnet_ibnd_desc_t	ibnd_msg;
4506 	vsw_private_desc_t	*priv_desc = NULL;
4507 	dring_info_t		*dp = NULL;
4508 	size_t			n, size = 0;
4509 	caddr_t			bufp;
4510 	mblk_t			*bp;
4511 	int			idx, i;
4512 	int			status = LDC_TX_SUCCESS;
4513 	static int		warn_msg = 1;
4514 	lane_t			*lp = &ldcp->lane_out;
4515 
4516 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4517 
4518 	ASSERT(mp != NULL);
4519 
4520 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
4521 	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
4522 		DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt",
4523 		    __func__, ldcp->ldc_id, ldcp->ldc_status,
4524 		    ldcp->lane_out.lstate);
4525 		ldcp->ldc_stats.oerrors++;
4526 		return (LDC_TX_FAILURE);
4527 	}
4528 
4529 	/*
4530 	 * only expect single dring to exist, which we use
4531 	 * as an internal buffer, rather than a transfer channel.
4532 	 */
4533 	READ_ENTER(&ldcp->lane_out.dlistrw);
4534 	if ((dp = ldcp->lane_out.dringp) == NULL) {
4535 		DERR(vswp, "%s(%lld): no dring for outbound lane",
4536 		    __func__, ldcp->ldc_id);
4537 		DERR(vswp, "%s(%lld) status(%d) state (0x%llx)", __func__,
4538 		    ldcp->ldc_id, ldcp->ldc_status, ldcp->lane_out.lstate);
4539 		RW_EXIT(&ldcp->lane_out.dlistrw);
4540 		ldcp->ldc_stats.oerrors++;
4541 		return (LDC_TX_FAILURE);
4542 	}
4543 
4544 	size = msgsize(mp);
4545 	if (size > (size_t)lp->mtu) {
4546 		RW_EXIT(&ldcp->lane_out.dlistrw);
4547 		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
4548 		    ldcp->ldc_id, size);
4549 		ldcp->ldc_stats.oerrors++;
4550 		return (LDC_TX_FAILURE);
4551 	}
4552 
4553 	/*
4554 	 * Find a free descriptor in our buffer ring
4555 	 */
4556 	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
4557 		RW_EXIT(&ldcp->lane_out.dlistrw);
4558 		if (warn_msg) {
4559 			DERR(vswp, "%s(%lld): no descriptor available for ring "
4560 			    "at 0x%llx", __func__, ldcp->ldc_id, dp);
4561 			warn_msg = 0;
4562 		}
4563 
4564 		/* nothing more we can do */
4565 		status = LDC_TX_NORESOURCES;
4566 		goto vsw_descrsend_free_exit;
4567 	} else {
4568 		D2(vswp, "%s(%lld): free private descriptor found at pos "
4569 		    "%ld addr 0x%x\n", __func__, ldcp->ldc_id, idx, priv_desc);
4570 		warn_msg = 1;
4571 	}
4572 
4573 	/* copy data into the descriptor */
4574 	bufp = priv_desc->datap;
4575 	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
4576 		n = MBLKL(bp);
4577 		bcopy(bp->b_rptr, bufp, n);
4578 		bufp += n;
4579 	}
4580 
4581 	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
4582 
4583 	/* create and send the in-band descp msg */
4584 	ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA;
4585 	ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO;
4586 	ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA;
4587 	ibnd_msg.hdr.tag.vio_sid = ldcp->local_session;
4588 
4589 	/*
4590 	 * Copy the mem cookies describing the data from the
4591 	 * private region of the descriptor ring into the inband
4592 	 * descriptor.
4593 	 */
4594 	for (i = 0; i < priv_desc->ncookies; i++) {
4595 		bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i],
4596 		    sizeof (ldc_mem_cookie_t));
4597 	}
4598 
4599 	ibnd_msg.hdr.desc_handle = idx;
4600 	ibnd_msg.ncookies = priv_desc->ncookies;
4601 	ibnd_msg.nbytes = size;
4602 
4603 	ldcp->ldc_stats.opackets++;
4604 	ldcp->ldc_stats.obytes += size;
4605 
4606 	RW_EXIT(&ldcp->lane_out.dlistrw);
4607 
4608 	(void) vsw_send_msg(ldcp, (void *)&ibnd_msg,
4609 	    sizeof (vnet_ibnd_desc_t), B_TRUE);
4610 
4611 vsw_descrsend_free_exit:
4612 
4613 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4614 	return (status);
4615 }
4616 
4617 static void
4618 vsw_send_ver(void *arg)
4619 {
4620 	vsw_ldc_t	*ldcp = (vsw_ldc_t *)arg;
4621 	vsw_t		*vswp = ldcp->ldc_vswp;
4622 	lane_t		*lp = &ldcp->lane_out;
4623 	vio_ver_msg_t	ver_msg;
4624 
4625 	D1(vswp, "%s enter", __func__);
4626 
4627 	ver_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4628 	ver_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4629 	ver_msg.tag.vio_subtype_env = VIO_VER_INFO;
4630 	ver_msg.tag.vio_sid = ldcp->local_session;
4631 
4632 	if (vsw_obp_ver_proto_workaround == B_FALSE) {
4633 		ver_msg.ver_major = vsw_versions[0].ver_major;
4634 		ver_msg.ver_minor = vsw_versions[0].ver_minor;
4635 	} else {
4636 		/* use the major,minor that we've ack'd */
4637 		lane_t	*lpi = &ldcp->lane_in;
4638 		ver_msg.ver_major = lpi->ver_major;
4639 		ver_msg.ver_minor = lpi->ver_minor;
4640 	}
4641 	ver_msg.dev_class = VDEV_NETWORK_SWITCH;
4642 
4643 	lp->lstate |= VSW_VER_INFO_SENT;
4644 	lp->ver_major = ver_msg.ver_major;
4645 	lp->ver_minor = ver_msg.ver_minor;
4646 
4647 	DUMP_TAG(ver_msg.tag);
4648 
4649 	(void) vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t), B_TRUE);
4650 
4651 	D1(vswp, "%s (%d): exit", __func__, ldcp->ldc_id);
4652 }
4653 
4654 static void
4655 vsw_send_attr(vsw_ldc_t *ldcp)
4656 {
4657 	vsw_t			*vswp = ldcp->ldc_vswp;
4658 	lane_t			*lp = &ldcp->lane_out;
4659 	vnet_attr_msg_t		attr_msg;
4660 
4661 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
4662 
4663 	/*
4664 	 * Subtype is set to INFO by default
4665 	 */
4666 	attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4667 	attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4668 	attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO;
4669 	attr_msg.tag.vio_sid = ldcp->local_session;
4670 
4671 	/* payload copied from default settings for lane */
4672 	attr_msg.mtu = lp->mtu;
4673 	attr_msg.addr_type = lp->addr_type;
4674 	attr_msg.xfer_mode = lp->xfer_mode;
4675 	attr_msg.ack_freq = lp->xfer_mode;
4676 
4677 	READ_ENTER(&vswp->if_lockrw);
4678 	attr_msg.addr = vnet_macaddr_strtoul((vswp->if_addr).ether_addr_octet);
4679 	RW_EXIT(&vswp->if_lockrw);
4680 
4681 	ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT;
4682 
4683 	DUMP_TAG(attr_msg.tag);
4684 
4685 	(void) vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t), B_TRUE);
4686 
4687 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
4688 }
4689 
4690 /*
4691  * Create dring info msg (which also results in the creation of
4692  * a dring).
4693  */
4694 static vio_dring_reg_msg_t *
4695 vsw_create_dring_info_pkt(vsw_ldc_t *ldcp)
4696 {
4697 	vio_dring_reg_msg_t	*mp;
4698 	dring_info_t		*dp;
4699 	vsw_t			*vswp = ldcp->ldc_vswp;
4700 
4701 	D1(vswp, "vsw_create_dring_info_pkt enter\n");
4702 
4703 	/*
4704 	 * If we can't create a dring, obviously no point sending
4705 	 * a message.
4706 	 */
4707 	if ((dp = vsw_create_dring(ldcp)) == NULL)
4708 		return (NULL);
4709 
4710 	mp = kmem_zalloc(sizeof (vio_dring_reg_msg_t), KM_SLEEP);
4711 
4712 	mp->tag.vio_msgtype = VIO_TYPE_CTRL;
4713 	mp->tag.vio_subtype = VIO_SUBTYPE_INFO;
4714 	mp->tag.vio_subtype_env = VIO_DRING_REG;
4715 	mp->tag.vio_sid = ldcp->local_session;
4716 
4717 	/* payload */
4718 	mp->num_descriptors = dp->num_descriptors;
4719 	mp->descriptor_size = dp->descriptor_size;
4720 	mp->options = dp->options;
4721 	mp->ncookies = dp->ncookies;
4722 	bcopy(&dp->cookie[0], &mp->cookie[0], sizeof (ldc_mem_cookie_t));
4723 
4724 	mp->dring_ident = 0;
4725 
4726 	D1(vswp, "vsw_create_dring_info_pkt exit\n");
4727 
4728 	return (mp);
4729 }
4730 
4731 static void
4732 vsw_send_dring_info(vsw_ldc_t *ldcp)
4733 {
4734 	vio_dring_reg_msg_t	*dring_msg;
4735 	vsw_t			*vswp = ldcp->ldc_vswp;
4736 
4737 	D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id);
4738 
4739 	dring_msg = vsw_create_dring_info_pkt(ldcp);
4740 	if (dring_msg == NULL) {
4741 		cmn_err(CE_WARN, "!vsw%d: %s: error creating msg",
4742 		    vswp->instance, __func__);
4743 		return;
4744 	}
4745 
4746 	ldcp->lane_out.lstate |= VSW_DRING_INFO_SENT;
4747 
4748 	DUMP_TAG_PTR((vio_msg_tag_t *)dring_msg);
4749 
4750 	(void) vsw_send_msg(ldcp, dring_msg,
4751 	    sizeof (vio_dring_reg_msg_t), B_TRUE);
4752 
4753 	kmem_free(dring_msg, sizeof (vio_dring_reg_msg_t));
4754 
4755 	D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id);
4756 }
4757 
4758 static void
4759 vsw_send_rdx(vsw_ldc_t *ldcp)
4760 {
4761 	vsw_t		*vswp = ldcp->ldc_vswp;
4762 	vio_rdx_msg_t	rdx_msg;
4763 
4764 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
4765 
4766 	rdx_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4767 	rdx_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4768 	rdx_msg.tag.vio_subtype_env = VIO_RDX;
4769 	rdx_msg.tag.vio_sid = ldcp->local_session;
4770 
4771 	ldcp->lane_in.lstate |= VSW_RDX_INFO_SENT;
4772 
4773 	DUMP_TAG(rdx_msg.tag);
4774 
4775 	(void) vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t), B_TRUE);
4776 
4777 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
4778 }
4779 
4780 /*
4781  * Generic routine to send message out over ldc channel.
4782  *
4783  * It is possible that when we attempt to write over the ldc channel
4784  * that we get notified that it has been reset. Depending on the value
4785  * of the handle_reset flag we either handle that event here or simply
4786  * notify the caller that the channel was reset.
4787  */
4788 int
4789 vsw_send_msg(vsw_ldc_t *ldcp, void *msgp, int size, boolean_t handle_reset)
4790 {
4791 	int			rv;
4792 	size_t			msglen = size;
4793 	vio_msg_tag_t		*tag = (vio_msg_tag_t *)msgp;
4794 	vsw_t			*vswp = ldcp->ldc_vswp;
4795 	vio_dring_msg_t		*dmsg;
4796 	vio_raw_data_msg_t	*rmsg;
4797 	vnet_ibnd_desc_t	*imsg;
4798 	boolean_t		data_msg = B_FALSE;
4799 
4800 	D1(vswp, "vsw_send_msg (%lld) enter : sending %d bytes",
4801 	    ldcp->ldc_id, size);
4802 
4803 	D2(vswp, "send_msg: type 0x%llx", tag->vio_msgtype);
4804 	D2(vswp, "send_msg: stype 0x%llx", tag->vio_subtype);
4805 	D2(vswp, "send_msg: senv 0x%llx", tag->vio_subtype_env);
4806 
4807 	mutex_enter(&ldcp->ldc_txlock);
4808 
4809 	if (tag->vio_subtype == VIO_SUBTYPE_INFO) {
4810 		if (tag->vio_subtype_env == VIO_DRING_DATA) {
4811 			dmsg = (vio_dring_msg_t *)tag;
4812 			dmsg->seq_num = ldcp->lane_out.seq_num;
4813 			data_msg = B_TRUE;
4814 		} else if (tag->vio_subtype_env == VIO_PKT_DATA) {
4815 			rmsg = (vio_raw_data_msg_t *)tag;
4816 			rmsg->seq_num = ldcp->lane_out.seq_num;
4817 			data_msg = B_TRUE;
4818 		} else if (tag->vio_subtype_env == VIO_DESC_DATA) {
4819 			imsg = (vnet_ibnd_desc_t *)tag;
4820 			imsg->hdr.seq_num = ldcp->lane_out.seq_num;
4821 			data_msg = B_TRUE;
4822 		}
4823 	}
4824 
4825 	do {
4826 		msglen = size;
4827 		rv = ldc_write(ldcp->ldc_handle, (caddr_t)msgp, &msglen);
4828 	} while (rv == EWOULDBLOCK && --vsw_wretries > 0);
4829 
4830 	if (rv == 0 && data_msg == B_TRUE) {
4831 		ldcp->lane_out.seq_num++;
4832 	}
4833 
4834 	if ((rv != 0) || (msglen != size)) {
4835 		DERR(vswp, "vsw_send_msg:ldc_write failed: chan(%lld) rv(%d) "
4836 		    "size (%d) msglen(%d)\n", ldcp->ldc_id, rv, size, msglen);
4837 		ldcp->ldc_stats.oerrors++;
4838 	}
4839 
4840 	mutex_exit(&ldcp->ldc_txlock);
4841 
4842 	/*
4843 	 * If channel has been reset we either handle it here or
4844 	 * simply report back that it has been reset and let caller
4845 	 * decide what to do.
4846 	 */
4847 	if (rv == ECONNRESET) {
4848 		DWARN(vswp, "%s (%lld) channel reset", __func__, ldcp->ldc_id);
4849 
4850 		/*
4851 		 * N.B - must never be holding the dlistrw lock when
4852 		 * we do a reset of the channel.
4853 		 */
4854 		if (handle_reset) {
4855 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
4856 		}
4857 	}
4858 
4859 	return (rv);
4860 }
4861 
4862 /*
4863  * Remove the specified address from the list of address maintained
4864  * in this port node.
4865  */
4866 mcst_addr_t *
4867 vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr)
4868 {
4869 	vsw_t		*vswp = NULL;
4870 	vsw_port_t	*port = NULL;
4871 	mcst_addr_t	*prev_p = NULL;
4872 	mcst_addr_t	*curr_p = NULL;
4873 
4874 	D1(NULL, "%s: enter : devtype %d : addr 0x%llx",
4875 	    __func__, devtype, addr);
4876 
4877 	if (devtype == VSW_VNETPORT) {
4878 		port = (vsw_port_t *)arg;
4879 		mutex_enter(&port->mca_lock);
4880 		prev_p = curr_p = port->mcap;
4881 	} else {
4882 		vswp = (vsw_t *)arg;
4883 		mutex_enter(&vswp->mca_lock);
4884 		prev_p = curr_p = vswp->mcap;
4885 	}
4886 
4887 	while (curr_p != NULL) {
4888 		if (curr_p->addr == addr) {
4889 			D2(NULL, "%s: address found", __func__);
4890 			/* match found */
4891 			if (prev_p == curr_p) {
4892 				/* list head */
4893 				if (devtype == VSW_VNETPORT)
4894 					port->mcap = curr_p->nextp;
4895 				else
4896 					vswp->mcap = curr_p->nextp;
4897 			} else {
4898 				prev_p->nextp = curr_p->nextp;
4899 			}
4900 			break;
4901 		} else {
4902 			prev_p = curr_p;
4903 			curr_p = curr_p->nextp;
4904 		}
4905 	}
4906 
4907 	if (devtype == VSW_VNETPORT)
4908 		mutex_exit(&port->mca_lock);
4909 	else
4910 		mutex_exit(&vswp->mca_lock);
4911 
4912 	D1(NULL, "%s: exit", __func__);
4913 
4914 	return (curr_p);
4915 }
4916 
4917 /*
4918  * Creates a descriptor ring (dring) and links it into the
4919  * link of outbound drings for this channel.
4920  *
4921  * Returns NULL if creation failed.
4922  */
4923 static dring_info_t *
4924 vsw_create_dring(vsw_ldc_t *ldcp)
4925 {
4926 	vsw_private_desc_t	*priv_addr = NULL;
4927 	vsw_t			*vswp = ldcp->ldc_vswp;
4928 	ldc_mem_info_t		minfo;
4929 	dring_info_t		*dp, *tp;
4930 	int			i;
4931 
4932 	dp = (dring_info_t *)kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
4933 
4934 	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
4935 
4936 	/* create public section of ring */
4937 	if ((ldc_mem_dring_create(vsw_ntxds,
4938 	    VSW_PUB_SIZE, &dp->handle)) != 0) {
4939 
4940 		DERR(vswp, "vsw_create_dring(%lld): ldc dring create "
4941 		    "failed", ldcp->ldc_id);
4942 		goto create_fail_exit;
4943 	}
4944 
4945 	ASSERT(dp->handle != NULL);
4946 
4947 	/*
4948 	 * Get the base address of the public section of the ring.
4949 	 */
4950 	if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
4951 		DERR(vswp, "vsw_create_dring(%lld): dring info failed\n",
4952 		    ldcp->ldc_id);
4953 		goto dring_fail_exit;
4954 	} else {
4955 		ASSERT(minfo.vaddr != 0);
4956 		dp->pub_addr = minfo.vaddr;
4957 	}
4958 
4959 	dp->num_descriptors = vsw_ntxds;
4960 	dp->descriptor_size = VSW_PUB_SIZE;
4961 	dp->options = VIO_TX_DRING;
4962 	dp->ncookies = 1;	/* guaranteed by ldc */
4963 
4964 	/*
4965 	 * create private portion of ring
4966 	 */
4967 	dp->priv_addr = (vsw_private_desc_t *)kmem_zalloc(
4968 	    (sizeof (vsw_private_desc_t) * vsw_ntxds), KM_SLEEP);
4969 
4970 	if (vsw_setup_ring(ldcp, dp)) {
4971 		DERR(vswp, "%s: unable to setup ring", __func__);
4972 		goto dring_fail_exit;
4973 	}
4974 
4975 	/* haven't used any descriptors yet */
4976 	dp->end_idx = 0;
4977 	dp->last_ack_recv = -1;
4978 
4979 	/* bind dring to the channel */
4980 	if ((ldc_mem_dring_bind(ldcp->ldc_handle, dp->handle,
4981 	    LDC_SHADOW_MAP, LDC_MEM_RW,
4982 	    &dp->cookie[0], &dp->ncookies)) != 0) {
4983 		DERR(vswp, "vsw_create_dring: unable to bind to channel "
4984 		    "%lld", ldcp->ldc_id);
4985 		goto dring_fail_exit;
4986 	}
4987 
4988 	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
4989 	dp->restart_reqd = B_TRUE;
4990 
4991 	/*
4992 	 * Only ever create rings for outgoing lane. Link it onto
4993 	 * end of list.
4994 	 */
4995 	WRITE_ENTER(&ldcp->lane_out.dlistrw);
4996 	if (ldcp->lane_out.dringp == NULL) {
4997 		D2(vswp, "vsw_create_dring: adding first outbound ring");
4998 		ldcp->lane_out.dringp = dp;
4999 	} else {
5000 		tp = ldcp->lane_out.dringp;
5001 		while (tp->next != NULL)
5002 			tp = tp->next;
5003 
5004 		tp->next = dp;
5005 	}
5006 	RW_EXIT(&ldcp->lane_out.dlistrw);
5007 
5008 	return (dp);
5009 
5010 dring_fail_exit:
5011 	(void) ldc_mem_dring_destroy(dp->handle);
5012 
5013 create_fail_exit:
5014 	if (dp->priv_addr != NULL) {
5015 		priv_addr = dp->priv_addr;
5016 		for (i = 0; i < vsw_ntxds; i++) {
5017 			if (priv_addr->memhandle != NULL)
5018 				(void) ldc_mem_free_handle(
5019 				    priv_addr->memhandle);
5020 			priv_addr++;
5021 		}
5022 		kmem_free(dp->priv_addr,
5023 		    (sizeof (vsw_private_desc_t) * vsw_ntxds));
5024 	}
5025 	mutex_destroy(&dp->dlock);
5026 
5027 	kmem_free(dp, sizeof (dring_info_t));
5028 	return (NULL);
5029 }
5030 
5031 /*
5032  * Create a ring consisting of just a private portion and link
5033  * it into the list of rings for the outbound lane.
5034  *
5035  * These type of rings are used primarily for temporary data
5036  * storage (i.e. as data buffers).
5037  */
5038 void
5039 vsw_create_privring(vsw_ldc_t *ldcp)
5040 {
5041 	dring_info_t		*dp, *tp;
5042 	vsw_t			*vswp = ldcp->ldc_vswp;
5043 
5044 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
5045 
5046 	dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
5047 
5048 	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
5049 
5050 	/* no public section */
5051 	dp->pub_addr = NULL;
5052 
5053 	dp->priv_addr = kmem_zalloc(
5054 	    (sizeof (vsw_private_desc_t) * vsw_ntxds), KM_SLEEP);
5055 
5056 	dp->num_descriptors = vsw_ntxds;
5057 
5058 	if (vsw_setup_ring(ldcp, dp)) {
5059 		DERR(vswp, "%s: setup of ring failed", __func__);
5060 		kmem_free(dp->priv_addr,
5061 		    (sizeof (vsw_private_desc_t) * vsw_ntxds));
5062 		mutex_destroy(&dp->dlock);
5063 		kmem_free(dp, sizeof (dring_info_t));
5064 		return;
5065 	}
5066 
5067 	/* haven't used any descriptors yet */
5068 	dp->end_idx = 0;
5069 
5070 	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
5071 	dp->restart_reqd = B_TRUE;
5072 
5073 	/*
5074 	 * Only ever create rings for outgoing lane. Link it onto
5075 	 * end of list.
5076 	 */
5077 	WRITE_ENTER(&ldcp->lane_out.dlistrw);
5078 	if (ldcp->lane_out.dringp == NULL) {
5079 		D2(vswp, "%s: adding first outbound privring", __func__);
5080 		ldcp->lane_out.dringp = dp;
5081 	} else {
5082 		tp = ldcp->lane_out.dringp;
5083 		while (tp->next != NULL)
5084 			tp = tp->next;
5085 
5086 		tp->next = dp;
5087 	}
5088 	RW_EXIT(&ldcp->lane_out.dlistrw);
5089 
5090 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
5091 }
5092 
5093 /*
5094  * Setup the descriptors in the dring. Returns 0 on success, 1 on
5095  * failure.
5096  */
5097 int
5098 vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp)
5099 {
5100 	vnet_public_desc_t	*pub_addr = NULL;
5101 	vsw_private_desc_t	*priv_addr = NULL;
5102 	vsw_t			*vswp = ldcp->ldc_vswp;
5103 	uint64_t		*tmpp;
5104 	uint64_t		offset = 0;
5105 	uint32_t		ncookies = 0;
5106 	static char		*name = "vsw_setup_ring";
5107 	int			i, j, nc, rv;
5108 	size_t			data_sz;
5109 
5110 	priv_addr = dp->priv_addr;
5111 	pub_addr = dp->pub_addr;
5112 
5113 	/* public section may be null but private should never be */
5114 	ASSERT(priv_addr != NULL);
5115 
5116 	/*
5117 	 * Allocate the region of memory which will be used to hold
5118 	 * the data the descriptors will refer to.
5119 	 */
5120 	data_sz = vswp->max_frame_size + VNET_IPALIGN + VNET_LDCALIGN;
5121 	data_sz = VNET_ROUNDUP_2K(data_sz);
5122 	dp->desc_data_sz = data_sz;
5123 	dp->data_sz = vsw_ntxds * data_sz;
5124 	dp->data_addr = kmem_alloc(dp->data_sz, KM_SLEEP);
5125 
5126 	D2(vswp, "%s: allocated %lld bytes at 0x%llx\n", name,
5127 	    dp->data_sz, dp->data_addr);
5128 
5129 	tmpp = (uint64_t *)dp->data_addr;
5130 	offset = dp->desc_data_sz/sizeof (tmpp);
5131 
5132 	/*
5133 	 * Initialise some of the private and public (if they exist)
5134 	 * descriptor fields.
5135 	 */
5136 	for (i = 0; i < vsw_ntxds; i++) {
5137 		mutex_init(&priv_addr->dstate_lock, NULL, MUTEX_DRIVER, NULL);
5138 
5139 		if ((ldc_mem_alloc_handle(ldcp->ldc_handle,
5140 		    &priv_addr->memhandle)) != 0) {
5141 			DERR(vswp, "%s: alloc mem handle failed", name);
5142 			goto setup_ring_cleanup;
5143 		}
5144 
5145 		priv_addr->datap = (void *)tmpp;
5146 
5147 		rv = ldc_mem_bind_handle(priv_addr->memhandle,
5148 		    (caddr_t)priv_addr->datap, dp->desc_data_sz,
5149 		    LDC_SHADOW_MAP, LDC_MEM_R|LDC_MEM_W,
5150 		    &(priv_addr->memcookie[0]), &ncookies);
5151 		if (rv != 0) {
5152 			DERR(vswp, "%s(%lld): ldc_mem_bind_handle failed "
5153 			    "(rv %d)", name, ldcp->ldc_id, rv);
5154 			goto setup_ring_cleanup;
5155 		}
5156 		priv_addr->bound = 1;
5157 
5158 		D2(vswp, "%s: %d: memcookie 0 : addr 0x%llx : size 0x%llx",
5159 		    name, i, priv_addr->memcookie[0].addr,
5160 		    priv_addr->memcookie[0].size);
5161 
5162 		if (ncookies >= (uint32_t)(VSW_MAX_COOKIES + 1)) {
5163 			DERR(vswp, "%s(%lld) ldc_mem_bind_handle returned "
5164 			    "invalid num of cookies (%d) for size 0x%llx",
5165 			    name, ldcp->ldc_id, ncookies, VSW_RING_EL_DATA_SZ);
5166 
5167 			goto setup_ring_cleanup;
5168 		} else {
5169 			for (j = 1; j < ncookies; j++) {
5170 				rv = ldc_mem_nextcookie(priv_addr->memhandle,
5171 				    &(priv_addr->memcookie[j]));
5172 				if (rv != 0) {
5173 					DERR(vswp, "%s: ldc_mem_nextcookie "
5174 					    "failed rv (%d)", name, rv);
5175 					goto setup_ring_cleanup;
5176 				}
5177 				D3(vswp, "%s: memcookie %d : addr 0x%llx : "
5178 				    "size 0x%llx", name, j,
5179 				    priv_addr->memcookie[j].addr,
5180 				    priv_addr->memcookie[j].size);
5181 			}
5182 
5183 		}
5184 		priv_addr->ncookies = ncookies;
5185 		priv_addr->dstate = VIO_DESC_FREE;
5186 
5187 		if (pub_addr != NULL) {
5188 
5189 			/* link pub and private sides */
5190 			priv_addr->descp = pub_addr;
5191 
5192 			pub_addr->ncookies = priv_addr->ncookies;
5193 
5194 			for (nc = 0; nc < pub_addr->ncookies; nc++) {
5195 				bcopy(&priv_addr->memcookie[nc],
5196 				    &pub_addr->memcookie[nc],
5197 				    sizeof (ldc_mem_cookie_t));
5198 			}
5199 
5200 			pub_addr->hdr.dstate = VIO_DESC_FREE;
5201 			pub_addr++;
5202 		}
5203 
5204 		/*
5205 		 * move to next element in the dring and the next
5206 		 * position in the data buffer.
5207 		 */
5208 		priv_addr++;
5209 		tmpp += offset;
5210 	}
5211 
5212 	return (0);
5213 
5214 setup_ring_cleanup:
5215 	priv_addr = dp->priv_addr;
5216 
5217 	for (j = 0; j < i; j++) {
5218 		(void) ldc_mem_unbind_handle(priv_addr->memhandle);
5219 		(void) ldc_mem_free_handle(priv_addr->memhandle);
5220 
5221 		mutex_destroy(&priv_addr->dstate_lock);
5222 
5223 		priv_addr++;
5224 	}
5225 	kmem_free(dp->data_addr, dp->data_sz);
5226 
5227 	return (1);
5228 }
5229 
5230 /*
5231  * Searches the private section of a ring for a free descriptor,
5232  * starting at the location of the last free descriptor found
5233  * previously.
5234  *
5235  * Returns 0 if free descriptor is available, and updates state
5236  * of private descriptor to VIO_DESC_READY,  otherwise returns 1.
5237  *
5238  * FUTURE: might need to return contiguous range of descriptors
5239  * as dring info msg assumes all will be contiguous.
5240  */
5241 static int
5242 vsw_dring_find_free_desc(dring_info_t *dringp,
5243 		vsw_private_desc_t **priv_p, int *idx)
5244 {
5245 	vsw_private_desc_t	*addr = NULL;
5246 	int			num = vsw_ntxds;
5247 	int			ret = 1;
5248 
5249 	D1(NULL, "%s enter\n", __func__);
5250 
5251 	ASSERT(dringp->priv_addr != NULL);
5252 
5253 	D2(NULL, "%s: searching ring, dringp 0x%llx : start pos %lld",
5254 	    __func__, dringp, dringp->end_idx);
5255 
5256 	addr = (vsw_private_desc_t *)dringp->priv_addr + dringp->end_idx;
5257 
5258 	mutex_enter(&addr->dstate_lock);
5259 	if (addr->dstate == VIO_DESC_FREE) {
5260 		addr->dstate = VIO_DESC_READY;
5261 		*priv_p = addr;
5262 		*idx = dringp->end_idx;
5263 		dringp->end_idx = (dringp->end_idx + 1) % num;
5264 		ret = 0;
5265 
5266 	}
5267 	mutex_exit(&addr->dstate_lock);
5268 
5269 	/* ring full */
5270 	if (ret == 1) {
5271 		D2(NULL, "%s: no desp free: started at %d", __func__,
5272 		    dringp->end_idx);
5273 	}
5274 
5275 	D1(NULL, "%s: exit\n", __func__);
5276 
5277 	return (ret);
5278 }
5279 
5280 /*
5281  * Map from a dring identifier to the ring itself. Returns
5282  * pointer to ring or NULL if no match found.
5283  *
5284  * Should be called with dlistrw rwlock held as reader.
5285  */
5286 static dring_info_t *
5287 vsw_ident2dring(lane_t *lane, uint64_t ident)
5288 {
5289 	dring_info_t	*dp = NULL;
5290 
5291 	if ((dp = lane->dringp) == NULL) {
5292 		return (NULL);
5293 	} else {
5294 		if (dp->ident == ident)
5295 			return (dp);
5296 
5297 		while (dp != NULL) {
5298 			if (dp->ident == ident)
5299 				break;
5300 			dp = dp->next;
5301 		}
5302 	}
5303 
5304 	return (dp);
5305 }
5306 
5307 /*
5308  * Set the default lane attributes. These are copied into
5309  * the attr msg we send to our peer. If they are not acceptable
5310  * then (currently) the handshake ends.
5311  */
5312 static void
5313 vsw_set_lane_attr(vsw_t *vswp, lane_t *lp)
5314 {
5315 	bzero(lp, sizeof (lane_t));
5316 
5317 	READ_ENTER(&vswp->if_lockrw);
5318 	ether_copy(&(vswp->if_addr), &(lp->addr));
5319 	RW_EXIT(&vswp->if_lockrw);
5320 
5321 	lp->mtu = vswp->max_frame_size;
5322 	lp->addr_type = ADDR_TYPE_MAC;
5323 	lp->xfer_mode = VIO_DRING_MODE_V1_0;
5324 	lp->ack_freq = 0;	/* for shared mode */
5325 	lp->seq_num = VNET_ISS;
5326 }
5327 
5328 /*
5329  * Verify that the attributes are acceptable.
5330  *
5331  * FUTURE: If some attributes are not acceptable, change them
5332  * our desired values.
5333  */
5334 static int
5335 vsw_check_attr(vnet_attr_msg_t *pkt, vsw_ldc_t *ldcp)
5336 {
5337 	int			ret = 0;
5338 	struct ether_addr	ea;
5339 	vsw_port_t		*port = ldcp->ldc_port;
5340 	lane_t			*lp = &ldcp->lane_out;
5341 
5342 	D1(NULL, "vsw_check_attr enter\n");
5343 
5344 	if ((pkt->xfer_mode != VIO_DESC_MODE) &&
5345 	    (pkt->xfer_mode != lp->xfer_mode)) {
5346 		D2(NULL, "vsw_check_attr: unknown mode %x\n", pkt->xfer_mode);
5347 		ret = 1;
5348 	}
5349 
5350 	/* Only support MAC addresses at moment. */
5351 	if ((pkt->addr_type != ADDR_TYPE_MAC) || (pkt->addr == 0)) {
5352 		D2(NULL, "vsw_check_attr: invalid addr_type %x, "
5353 		    "or address 0x%llx\n", pkt->addr_type, pkt->addr);
5354 		ret = 1;
5355 	}
5356 
5357 	/*
5358 	 * MAC address supplied by device should match that stored
5359 	 * in the vsw-port OBP node. Need to decide what to do if they
5360 	 * don't match, for the moment just warn but don't fail.
5361 	 */
5362 	vnet_macaddr_ultostr(pkt->addr, ea.ether_addr_octet);
5363 	if (ether_cmp(&ea, &port->p_macaddr) != 0) {
5364 		DERR(NULL, "vsw_check_attr: device supplied address "
5365 		    "0x%llx doesn't match node address 0x%llx\n",
5366 		    pkt->addr, port->p_macaddr);
5367 	}
5368 
5369 	/*
5370 	 * Ack freq only makes sense in pkt mode, in shared
5371 	 * mode the ring descriptors say whether or not to
5372 	 * send back an ACK.
5373 	 */
5374 	if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
5375 	    (pkt->xfer_mode & VIO_DRING_MODE_V1_2)) ||
5376 	    (VSW_VER_LT(ldcp, 1, 2) &&
5377 	    (pkt->xfer_mode == VIO_DRING_MODE_V1_0))) {
5378 		if (pkt->ack_freq > 0) {
5379 			D2(NULL, "vsw_check_attr: non zero ack freq "
5380 			    " in SHM mode\n");
5381 			ret = 1;
5382 		}
5383 	}
5384 
5385 	/*
5386 	 * Note: for the moment we only support ETHER
5387 	 * frames. This may change in the future.
5388 	 */
5389 	if ((pkt->mtu > lp->mtu) || (pkt->mtu <= 0)) {
5390 		D2(NULL, "vsw_check_attr: invalid MTU (0x%llx)\n",
5391 		    pkt->mtu);
5392 		ret = 1;
5393 	}
5394 
5395 	D1(NULL, "vsw_check_attr exit\n");
5396 
5397 	return (ret);
5398 }
5399 
5400 /*
5401  * Returns 1 if there is a problem, 0 otherwise.
5402  */
5403 static int
5404 vsw_check_dring_info(vio_dring_reg_msg_t *pkt)
5405 {
5406 	_NOTE(ARGUNUSED(pkt))
5407 
5408 	int	ret = 0;
5409 
5410 	D1(NULL, "vsw_check_dring_info enter\n");
5411 
5412 	if ((pkt->num_descriptors == 0) ||
5413 	    (pkt->descriptor_size == 0) ||
5414 	    (pkt->ncookies != 1)) {
5415 		DERR(NULL, "vsw_check_dring_info: invalid dring msg");
5416 		ret = 1;
5417 	}
5418 
5419 	D1(NULL, "vsw_check_dring_info exit\n");
5420 
5421 	return (ret);
5422 }
5423 
5424 /*
5425  * Returns 1 if two memory cookies match. Otherwise returns 0.
5426  */
5427 static int
5428 vsw_mem_cookie_match(ldc_mem_cookie_t *m1, ldc_mem_cookie_t *m2)
5429 {
5430 	if ((m1->addr != m2->addr) ||
5431 	    (m2->size != m2->size)) {
5432 		return (0);
5433 	} else {
5434 		return (1);
5435 	}
5436 }
5437 
5438 /*
5439  * Returns 1 if ring described in reg message matches that
5440  * described by dring_info structure. Otherwise returns 0.
5441  */
5442 static int
5443 vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg)
5444 {
5445 	if ((msg->descriptor_size != dp->descriptor_size) ||
5446 	    (msg->num_descriptors != dp->num_descriptors) ||
5447 	    (msg->ncookies != dp->ncookies) ||
5448 	    !(vsw_mem_cookie_match(&msg->cookie[0], &dp->cookie[0]))) {
5449 		return (0);
5450 	} else {
5451 		return (1);
5452 	}
5453 
5454 }
5455 
5456 static caddr_t
5457 vsw_print_ethaddr(uint8_t *a, char *ebuf)
5458 {
5459 	(void) sprintf(ebuf, "%x:%x:%x:%x:%x:%x",
5460 	    a[0], a[1], a[2], a[3], a[4], a[5]);
5461 	return (ebuf);
5462 }
5463 
5464 /*
5465  * Reset and free all the resources associated with
5466  * the channel.
5467  */
5468 static void
5469 vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir)
5470 {
5471 	dring_info_t		*dp, *dpp;
5472 	lane_t			*lp = NULL;
5473 	int			rv = 0;
5474 
5475 	ASSERT(ldcp != NULL);
5476 
5477 	D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id);
5478 
5479 	if (dir == INBOUND) {
5480 		D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane"
5481 		    " of channel %lld", __func__, ldcp->ldc_id);
5482 		lp = &ldcp->lane_in;
5483 	} else {
5484 		D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane"
5485 		    " of channel %lld", __func__, ldcp->ldc_id);
5486 		lp = &ldcp->lane_out;
5487 	}
5488 
5489 	lp->lstate = VSW_LANE_INACTIV;
5490 	lp->seq_num = VNET_ISS;
5491 
5492 	if (lp->dringp) {
5493 		if (dir == INBOUND) {
5494 			WRITE_ENTER(&lp->dlistrw);
5495 			dp = lp->dringp;
5496 			while (dp != NULL) {
5497 				dpp = dp->next;
5498 				if (dp->handle != NULL)
5499 					(void) ldc_mem_dring_unmap(dp->handle);
5500 				kmem_free(dp, sizeof (dring_info_t));
5501 				dp = dpp;
5502 			}
5503 			RW_EXIT(&lp->dlistrw);
5504 		} else {
5505 			/*
5506 			 * unbind, destroy exported dring, free dring struct
5507 			 */
5508 			WRITE_ENTER(&lp->dlistrw);
5509 			dp = lp->dringp;
5510 			rv = vsw_free_ring(dp);
5511 			RW_EXIT(&lp->dlistrw);
5512 		}
5513 		if (rv == 0) {
5514 			lp->dringp = NULL;
5515 		}
5516 	}
5517 
5518 	D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id);
5519 }
5520 
5521 /*
5522  * Free ring and all associated resources.
5523  *
5524  * Should be called with dlistrw rwlock held as writer.
5525  */
5526 static int
5527 vsw_free_ring(dring_info_t *dp)
5528 {
5529 	vsw_private_desc_t	*paddr = NULL;
5530 	dring_info_t		*dpp;
5531 	int			i, rv = 1;
5532 
5533 	while (dp != NULL) {
5534 		mutex_enter(&dp->dlock);
5535 		dpp = dp->next;
5536 		if (dp->priv_addr != NULL) {
5537 			/*
5538 			 * First unbind and free the memory handles
5539 			 * stored in each descriptor within the ring.
5540 			 */
5541 			for (i = 0; i < vsw_ntxds; i++) {
5542 				paddr = (vsw_private_desc_t *)
5543 				    dp->priv_addr + i;
5544 				if (paddr->memhandle != NULL) {
5545 					if (paddr->bound == 1) {
5546 						rv = ldc_mem_unbind_handle(
5547 						    paddr->memhandle);
5548 
5549 						if (rv != 0) {
5550 							DERR(NULL, "error "
5551 							"unbinding handle for "
5552 							"ring 0x%llx at pos %d",
5553 							    dp, i);
5554 							mutex_exit(&dp->dlock);
5555 							return (rv);
5556 						}
5557 						paddr->bound = 0;
5558 					}
5559 
5560 					rv = ldc_mem_free_handle(
5561 					    paddr->memhandle);
5562 					if (rv != 0) {
5563 						DERR(NULL, "error freeing "
5564 						    "handle for ring 0x%llx "
5565 						    "at pos %d", dp, i);
5566 						mutex_exit(&dp->dlock);
5567 						return (rv);
5568 					}
5569 					paddr->memhandle = NULL;
5570 				}
5571 				mutex_destroy(&paddr->dstate_lock);
5572 			}
5573 			kmem_free(dp->priv_addr,
5574 			    (sizeof (vsw_private_desc_t) * vsw_ntxds));
5575 		}
5576 
5577 		/*
5578 		 * Now unbind and destroy the ring itself.
5579 		 */
5580 		if (dp->handle != NULL) {
5581 			(void) ldc_mem_dring_unbind(dp->handle);
5582 			(void) ldc_mem_dring_destroy(dp->handle);
5583 		}
5584 
5585 		if (dp->data_addr != NULL) {
5586 			kmem_free(dp->data_addr, dp->data_sz);
5587 		}
5588 
5589 		mutex_exit(&dp->dlock);
5590 		mutex_destroy(&dp->dlock);
5591 		mutex_destroy(&dp->restart_lock);
5592 		kmem_free(dp, sizeof (dring_info_t));
5593 
5594 		dp = dpp;
5595 	}
5596 	return (0);
5597 }
5598 
5599 /*
5600  * vsw_ldc_rx_worker -- A per LDC worker thread to receive data.
5601  * This thread is woken up by the LDC interrupt handler to process
5602  * LDC packets and receive data.
5603  */
5604 static void
5605 vsw_ldc_rx_worker(void *arg)
5606 {
5607 	callb_cpr_t	cprinfo;
5608 	vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
5609 	vsw_t *vswp = ldcp->ldc_vswp;
5610 
5611 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
5612 	CALLB_CPR_INIT(&cprinfo, &ldcp->rx_thr_lock, callb_generic_cpr,
5613 	    "vsw_rx_thread");
5614 	mutex_enter(&ldcp->rx_thr_lock);
5615 	ldcp->rx_thr_flags |= VSW_WTHR_RUNNING;
5616 	while (!(ldcp->rx_thr_flags & VSW_WTHR_STOP)) {
5617 
5618 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
5619 		/*
5620 		 * Wait until the data is received or a stop
5621 		 * request is received.
5622 		 */
5623 		while (!(ldcp->rx_thr_flags &
5624 		    (VSW_WTHR_DATARCVD | VSW_WTHR_STOP))) {
5625 			cv_wait(&ldcp->rx_thr_cv, &ldcp->rx_thr_lock);
5626 		}
5627 		CALLB_CPR_SAFE_END(&cprinfo, &ldcp->rx_thr_lock)
5628 
5629 		/*
5630 		 * First process the stop request.
5631 		 */
5632 		if (ldcp->rx_thr_flags & VSW_WTHR_STOP) {
5633 			D2(vswp, "%s(%lld):Rx thread stopped\n",
5634 			    __func__, ldcp->ldc_id);
5635 			break;
5636 		}
5637 		ldcp->rx_thr_flags &= ~VSW_WTHR_DATARCVD;
5638 		mutex_exit(&ldcp->rx_thr_lock);
5639 		D1(vswp, "%s(%lld):calling vsw_process_pkt\n",
5640 		    __func__, ldcp->ldc_id);
5641 		mutex_enter(&ldcp->ldc_cblock);
5642 		vsw_process_pkt(ldcp);
5643 		mutex_exit(&ldcp->ldc_cblock);
5644 		mutex_enter(&ldcp->rx_thr_lock);
5645 	}
5646 
5647 	/*
5648 	 * Update the run status and wakeup the thread that
5649 	 * has sent the stop request.
5650 	 */
5651 	ldcp->rx_thr_flags &= ~VSW_WTHR_RUNNING;
5652 	cv_signal(&ldcp->rx_thr_cv);
5653 	CALLB_CPR_EXIT(&cprinfo);
5654 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
5655 	thread_exit();
5656 }
5657 
5658 /* vsw_stop_rx_thread -- Co-ordinate with receive thread to stop it */
5659 static void
5660 vsw_stop_rx_thread(vsw_ldc_t *ldcp)
5661 {
5662 	vsw_t *vswp = ldcp->ldc_vswp;
5663 
5664 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
5665 	/*
5666 	 * Send a stop request by setting the stop flag and
5667 	 * wait until the receive thread stops.
5668 	 */
5669 	mutex_enter(&ldcp->rx_thr_lock);
5670 	if (ldcp->rx_thr_flags & VSW_WTHR_RUNNING) {
5671 		ldcp->rx_thr_flags |= VSW_WTHR_STOP;
5672 		cv_signal(&ldcp->rx_thr_cv);
5673 		while (ldcp->rx_thr_flags & VSW_WTHR_RUNNING) {
5674 			cv_wait(&ldcp->rx_thr_cv, &ldcp->rx_thr_lock);
5675 		}
5676 	}
5677 	mutex_exit(&ldcp->rx_thr_lock);
5678 	ldcp->rx_thread = NULL;
5679 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
5680 }
5681 
5682 /*
5683  * vsw_ldc_tx_worker -- A per LDC worker thread to transmit data.
5684  * This thread is woken up by the vsw_portsend to transmit
5685  * packets.
5686  */
5687 static void
5688 vsw_ldc_tx_worker(void *arg)
5689 {
5690 	callb_cpr_t	cprinfo;
5691 	vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
5692 	vsw_t *vswp = ldcp->ldc_vswp;
5693 	mblk_t *mp;
5694 	mblk_t *tmp;
5695 
5696 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
5697 	CALLB_CPR_INIT(&cprinfo, &ldcp->tx_thr_lock, callb_generic_cpr,
5698 	    "vnet_tx_thread");
5699 	mutex_enter(&ldcp->tx_thr_lock);
5700 	ldcp->tx_thr_flags |= VSW_WTHR_RUNNING;
5701 	while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP)) {
5702 
5703 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
5704 		/*
5705 		 * Wait until the data is received or a stop
5706 		 * request is received.
5707 		 */
5708 		while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP) &&
5709 		    (ldcp->tx_mhead == NULL)) {
5710 			cv_wait(&ldcp->tx_thr_cv, &ldcp->tx_thr_lock);
5711 		}
5712 		CALLB_CPR_SAFE_END(&cprinfo, &ldcp->tx_thr_lock)
5713 
5714 		/*
5715 		 * First process the stop request.
5716 		 */
5717 		if (ldcp->tx_thr_flags & VSW_WTHR_STOP) {
5718 			D2(vswp, "%s(%lld):tx thread stopped\n",
5719 			    __func__, ldcp->ldc_id);
5720 			break;
5721 		}
5722 		mp = ldcp->tx_mhead;
5723 		ldcp->tx_mhead = ldcp->tx_mtail = NULL;
5724 		ldcp->tx_cnt = 0;
5725 		mutex_exit(&ldcp->tx_thr_lock);
5726 		D2(vswp, "%s(%lld):calling vsw_ldcsend\n",
5727 		    __func__, ldcp->ldc_id);
5728 		while (mp != NULL) {
5729 			tmp = mp->b_next;
5730 			mp->b_next = mp->b_prev = NULL;
5731 			(void) vsw_ldcsend(ldcp, mp, vsw_ldc_tx_retries);
5732 			mp = tmp;
5733 		}
5734 		mutex_enter(&ldcp->tx_thr_lock);
5735 	}
5736 
5737 	/*
5738 	 * Update the run status and wakeup the thread that
5739 	 * has sent the stop request.
5740 	 */
5741 	ldcp->tx_thr_flags &= ~VSW_WTHR_RUNNING;
5742 	cv_signal(&ldcp->tx_thr_cv);
5743 	CALLB_CPR_EXIT(&cprinfo);
5744 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
5745 	thread_exit();
5746 }
5747 
5748 /* vsw_stop_tx_thread -- Co-ordinate with receive thread to stop it */
5749 static void
5750 vsw_stop_tx_thread(vsw_ldc_t *ldcp)
5751 {
5752 	vsw_t *vswp = ldcp->ldc_vswp;
5753 
5754 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
5755 	/*
5756 	 * Send a stop request by setting the stop flag and
5757 	 * wait until the receive thread stops.
5758 	 */
5759 	mutex_enter(&ldcp->tx_thr_lock);
5760 	if (ldcp->tx_thr_flags & VSW_WTHR_RUNNING) {
5761 		ldcp->tx_thr_flags |= VSW_WTHR_STOP;
5762 		cv_signal(&ldcp->tx_thr_cv);
5763 		while (ldcp->tx_thr_flags & VSW_WTHR_RUNNING) {
5764 			cv_wait(&ldcp->tx_thr_cv, &ldcp->tx_thr_lock);
5765 		}
5766 	}
5767 	mutex_exit(&ldcp->tx_thr_lock);
5768 	ldcp->tx_thread = NULL;
5769 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
5770 }
5771 
5772 /* vsw_reclaim_dring -- reclaim descriptors */
5773 static int
5774 vsw_reclaim_dring(dring_info_t *dp, int start)
5775 {
5776 	int i, j, len;
5777 	vsw_private_desc_t *priv_addr;
5778 	vnet_public_desc_t *pub_addr;
5779 
5780 	pub_addr = (vnet_public_desc_t *)dp->pub_addr;
5781 	priv_addr = (vsw_private_desc_t *)dp->priv_addr;
5782 	len = dp->num_descriptors;
5783 
5784 	D2(NULL, "%s: start index %ld\n", __func__, start);
5785 
5786 	j = 0;
5787 	for (i = start; j < len; i = (i + 1) % len, j++) {
5788 		pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
5789 		priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
5790 
5791 		mutex_enter(&priv_addr->dstate_lock);
5792 		if (pub_addr->hdr.dstate != VIO_DESC_DONE) {
5793 			mutex_exit(&priv_addr->dstate_lock);
5794 			break;
5795 		}
5796 		pub_addr->hdr.dstate = VIO_DESC_FREE;
5797 		priv_addr->dstate = VIO_DESC_FREE;
5798 		/* clear all the fields */
5799 		priv_addr->datalen = 0;
5800 		pub_addr->hdr.ack = 0;
5801 		mutex_exit(&priv_addr->dstate_lock);
5802 
5803 		D3(NULL, "claiming descp:%d pub state:0x%llx priv state 0x%llx",
5804 		    i, pub_addr->hdr.dstate, priv_addr->dstate);
5805 	}
5806 	return (j);
5807 }
5808 
5809 /*
5810  * Debugging routines
5811  */
5812 static void
5813 display_state(void)
5814 {
5815 	vsw_t		*vswp;
5816 	vsw_port_list_t	*plist;
5817 	vsw_port_t 	*port;
5818 	vsw_ldc_list_t	*ldcl;
5819 	vsw_ldc_t 	*ldcp;
5820 	extern vsw_t 	*vsw_head;
5821 
5822 	cmn_err(CE_NOTE, "***** system state *****");
5823 
5824 	for (vswp = vsw_head; vswp; vswp = vswp->next) {
5825 		plist = &vswp->plist;
5826 		READ_ENTER(&plist->lockrw);
5827 		cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n",
5828 		    vswp->instance, plist->num_ports);
5829 
5830 		for (port = plist->head; port != NULL; port = port->p_next) {
5831 			ldcl = &port->p_ldclist;
5832 			cmn_err(CE_CONT, "port %d : %d ldcs attached\n",
5833 			    port->p_instance, port->num_ldcs);
5834 			READ_ENTER(&ldcl->lockrw);
5835 			ldcp = ldcl->head;
5836 			for (; ldcp != NULL; ldcp = ldcp->ldc_next) {
5837 				cmn_err(CE_CONT, "chan %lu : dev %d : "
5838 				    "status %d : phase %u\n",
5839 				    ldcp->ldc_id, ldcp->dev_class,
5840 				    ldcp->ldc_status, ldcp->hphase);
5841 				cmn_err(CE_CONT, "chan %lu : lsession %lu : "
5842 				    "psession %lu\n", ldcp->ldc_id,
5843 				    ldcp->local_session, ldcp->peer_session);
5844 
5845 				cmn_err(CE_CONT, "Inbound lane:\n");
5846 				display_lane(&ldcp->lane_in);
5847 				cmn_err(CE_CONT, "Outbound lane:\n");
5848 				display_lane(&ldcp->lane_out);
5849 			}
5850 			RW_EXIT(&ldcl->lockrw);
5851 		}
5852 		RW_EXIT(&plist->lockrw);
5853 	}
5854 	cmn_err(CE_NOTE, "***** system state *****");
5855 }
5856 
5857 static void
5858 display_lane(lane_t *lp)
5859 {
5860 	dring_info_t	*drp;
5861 
5862 	cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n",
5863 	    lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu);
5864 	cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n",
5865 	    lp->addr_type, lp->addr, lp->xfer_mode);
5866 	cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp);
5867 
5868 	cmn_err(CE_CONT, "Dring info:\n");
5869 	for (drp = lp->dringp; drp != NULL; drp = drp->next) {
5870 		cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n",
5871 		    drp->num_descriptors, drp->descriptor_size);
5872 		cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->handle);
5873 		cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n",
5874 		    (uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr);
5875 		cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n",
5876 		    drp->ident, drp->end_idx);
5877 		display_ring(drp);
5878 	}
5879 }
5880 
5881 static void
5882 display_ring(dring_info_t *dringp)
5883 {
5884 	uint64_t		i;
5885 	uint64_t		priv_count = 0;
5886 	uint64_t		pub_count = 0;
5887 	vnet_public_desc_t	*pub_addr = NULL;
5888 	vsw_private_desc_t	*priv_addr = NULL;
5889 
5890 	for (i = 0; i < vsw_ntxds; i++) {
5891 		if (dringp->pub_addr != NULL) {
5892 			pub_addr = (vnet_public_desc_t *)dringp->pub_addr + i;
5893 
5894 			if (pub_addr->hdr.dstate == VIO_DESC_FREE)
5895 				pub_count++;
5896 		}
5897 
5898 		if (dringp->priv_addr != NULL) {
5899 			priv_addr = (vsw_private_desc_t *)dringp->priv_addr + i;
5900 
5901 			if (priv_addr->dstate == VIO_DESC_FREE)
5902 				priv_count++;
5903 		}
5904 	}
5905 	cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n",
5906 	    i, priv_count, pub_count);
5907 }
5908 
5909 static void
5910 dump_flags(uint64_t state)
5911 {
5912 	int	i;
5913 
5914 	typedef struct flag_name {
5915 		int	flag_val;
5916 		char	*flag_name;
5917 	} flag_name_t;
5918 
5919 	flag_name_t	flags[] = {
5920 		VSW_VER_INFO_SENT, "VSW_VER_INFO_SENT",
5921 		VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV",
5922 		VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV",
5923 		VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT",
5924 		VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV",
5925 		VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT",
5926 		VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT",
5927 		VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV",
5928 		VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT",
5929 		VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV",
5930 		VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT",
5931 		VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV",
5932 		VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT",
5933 		VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV",
5934 		VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT",
5935 		VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV",
5936 		VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT",
5937 		VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV",
5938 		VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT",
5939 		VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV",
5940 		VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT",
5941 		VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV",
5942 		VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT",
5943 		VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV",
5944 		VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT",
5945 		VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV",
5946 		VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT",
5947 		VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV",
5948 		VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT",
5949 		VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV",
5950 		VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE"};
5951 
5952 	DERR(NULL, "DUMP_FLAGS: %llx\n", state);
5953 	for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) {
5954 		if (state & flags[i].flag_val)
5955 			DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name);
5956 	}
5957 }
5958