xref: /titanic_51/usr/src/uts/sun4v/io/vsw_ldc.c (revision e041b2e79357babb5b90ede68defaeec57ed9145)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/errno.h>
29 #include <sys/debug.h>
30 #include <sys/time.h>
31 #include <sys/sysmacros.h>
32 #include <sys/systm.h>
33 #include <sys/user.h>
34 #include <sys/stropts.h>
35 #include <sys/stream.h>
36 #include <sys/strlog.h>
37 #include <sys/strsubr.h>
38 #include <sys/cmn_err.h>
39 #include <sys/cpu.h>
40 #include <sys/kmem.h>
41 #include <sys/conf.h>
42 #include <sys/ddi.h>
43 #include <sys/sunddi.h>
44 #include <sys/ksynch.h>
45 #include <sys/stat.h>
46 #include <sys/kstat.h>
47 #include <sys/vtrace.h>
48 #include <sys/strsun.h>
49 #include <sys/dlpi.h>
50 #include <sys/ethernet.h>
51 #include <net/if.h>
52 #include <sys/varargs.h>
53 #include <sys/machsystm.h>
54 #include <sys/modctl.h>
55 #include <sys/modhash.h>
56 #include <sys/mac.h>
57 #include <sys/mac_ether.h>
58 #include <sys/taskq.h>
59 #include <sys/note.h>
60 #include <sys/mach_descrip.h>
61 #include <sys/mdeg.h>
62 #include <sys/ldc.h>
63 #include <sys/vsw_fdb.h>
64 #include <sys/vsw.h>
65 #include <sys/vio_mailbox.h>
66 #include <sys/vnet_mailbox.h>
67 #include <sys/vnet_common.h>
68 #include <sys/vio_util.h>
69 #include <sys/sdt.h>
70 #include <sys/atomic.h>
71 #include <sys/callb.h>
72 #include <sys/vlan.h>
73 
74 /* Port add/deletion/etc routines */
75 static	int vsw_port_delete(vsw_port_t *port);
76 static	int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id);
77 static	int vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id);
78 static	int vsw_init_ldcs(vsw_port_t *port);
79 static	int vsw_uninit_ldcs(vsw_port_t *port);
80 static	int vsw_ldc_init(vsw_ldc_t *ldcp);
81 static	int vsw_ldc_uninit(vsw_ldc_t *ldcp);
82 static	int vsw_drain_ldcs(vsw_port_t *port);
83 static	int vsw_drain_port_taskq(vsw_port_t *port);
84 static	void vsw_marker_task(void *);
85 static	int vsw_plist_del_node(vsw_t *, vsw_port_t *port);
86 int vsw_detach_ports(vsw_t *vswp);
87 int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
88 mcst_addr_t *vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr);
89 int vsw_port_detach(vsw_t *vswp, int p_instance);
90 int vsw_portsend(vsw_port_t *port, mblk_t *mp);
91 int vsw_port_attach(vsw_port_t *portp);
92 vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);
93 void vsw_vlan_unaware_port_reset(vsw_port_t *portp);
94 int vsw_send_msg(vsw_ldc_t *, void *, int, boolean_t);
95 void vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate);
96 void vsw_reset_ports(vsw_t *vswp);
97 void vsw_port_reset(vsw_port_t *portp);
98 
99 /* Interrupt routines */
100 static	uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg);
101 
102 /* Handshake routines */
103 static	void vsw_ldc_reinit(vsw_ldc_t *);
104 static	void vsw_process_conn_evt(vsw_ldc_t *, uint16_t);
105 static	void vsw_conn_task(void *);
106 static	int vsw_check_flag(vsw_ldc_t *, int, uint64_t);
107 static	void vsw_next_milestone(vsw_ldc_t *);
108 static	int vsw_supported_version(vio_ver_msg_t *);
109 static	void vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp);
110 static	void vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp);
111 
112 /* Data processing routines */
113 static void vsw_process_pkt(void *);
114 static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t *);
115 static void vsw_process_ctrl_pkt(void *);
116 static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *);
117 static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *);
118 static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *);
119 static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *);
120 static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *);
121 static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *);
122 static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *,
123 	uint32_t);
124 static void vsw_process_data_dring_pkt(vsw_ldc_t *, void *);
125 static void vsw_process_pkt_data_nop(void *, void *, uint32_t);
126 static void vsw_process_pkt_data(void *, void *, uint32_t);
127 static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *);
128 static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *);
129 
130 /* Switching/data transmit routines */
131 static	int vsw_dringsend(vsw_ldc_t *, mblk_t *);
132 static	int vsw_descrsend(vsw_ldc_t *, mblk_t *);
133 static void vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp);
134 static int vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries);
135 static int vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count);
136 static int vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count);
137 
138 /* Packet creation routines */
139 static void vsw_send_ver(void *);
140 static void vsw_send_attr(vsw_ldc_t *);
141 static vio_dring_reg_msg_t *vsw_create_dring_info_pkt(vsw_ldc_t *);
142 static void vsw_send_dring_info(vsw_ldc_t *);
143 static void vsw_send_rdx(vsw_ldc_t *);
144 
145 /* Dring routines */
146 static dring_info_t *vsw_create_dring(vsw_ldc_t *);
147 static void vsw_create_privring(vsw_ldc_t *);
148 static int vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp);
149 static int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **,
150     int *);
151 static dring_info_t *vsw_ident2dring(lane_t *, uint64_t);
152 static int vsw_reclaim_dring(dring_info_t *dp, int start);
153 
154 static void vsw_set_lane_attr(vsw_t *, lane_t *);
155 static int vsw_check_attr(vnet_attr_msg_t *, vsw_ldc_t *);
156 static int vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg);
157 static int vsw_mem_cookie_match(ldc_mem_cookie_t *, ldc_mem_cookie_t *);
158 static int vsw_check_dring_info(vio_dring_reg_msg_t *);
159 
160 /* Rcv/Tx thread routines */
161 static void vsw_stop_tx_thread(vsw_ldc_t *ldcp);
162 static void vsw_ldc_tx_worker(void *arg);
163 static void vsw_stop_rx_thread(vsw_ldc_t *ldcp);
164 static void vsw_ldc_rx_worker(void *arg);
165 
166 /* Misc support routines */
167 static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t);
168 static void vsw_free_ring(dring_info_t *);
169 static void vsw_save_lmacaddr(vsw_t *vswp, uint64_t macaddr);
170 static int vsw_get_same_dest_list(struct ether_header *ehp,
171     mblk_t **rhead, mblk_t **rtail, mblk_t **mpp);
172 static mblk_t *vsw_dupmsgchain(mblk_t *mp);
173 
174 /* Debugging routines */
175 static void dump_flags(uint64_t);
176 static void display_state(void);
177 static void display_lane(lane_t *);
178 static void display_ring(dring_info_t *);
179 
180 /*
181  * Functions imported from other files.
182  */
183 extern int vsw_set_hw(vsw_t *, vsw_port_t *, int);
184 extern void vsw_unset_hw(vsw_t *, vsw_port_t *, int);
185 extern int vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port);
186 extern void vsw_del_mcst_port(vsw_port_t *port);
187 extern int vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
188 extern int vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
189 extern void vsw_fdbe_add(vsw_t *vswp, void *port);
190 extern void vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr);
191 extern void vsw_create_vlans(void *arg, int type);
192 extern void vsw_destroy_vlans(void *arg, int type);
193 extern void vsw_vlan_add_ids(void *arg, int type);
194 extern void vsw_vlan_remove_ids(void *arg, int type);
195 extern boolean_t vsw_frame_lookup_vid(void *arg, int caller,
196 	struct ether_header *ehp, uint16_t *vidp);
197 extern mblk_t *vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp);
198 extern uint32_t vsw_vlan_frame_untag(void *arg, int type, mblk_t **np,
199 	mblk_t **npt);
200 extern boolean_t vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid);
201 extern void vsw_hio_start(vsw_t *vswp, vsw_ldc_t *ldcp);
202 extern void vsw_hio_stop(vsw_t *vswp, vsw_ldc_t *ldcp);
203 extern void vsw_process_dds_msg(vsw_t *vswp, vsw_ldc_t *ldcp, void *msg);
204 extern void vsw_hio_stop_port(vsw_port_t *portp);
205 extern void vsw_publish_macaddr(vsw_t *vswp, vsw_port_t *portp);
206 extern int vsw_mac_client_init(vsw_t *vswp, vsw_port_t *port, int type);
207 extern void vsw_mac_client_cleanup(vsw_t *vswp, vsw_port_t *port, int type);
208 
209 
210 #define	VSW_NUM_VMPOOLS		3	/* number of vio mblk pools */
211 
212 /*
213  * Tunables used in this file.
214  */
215 extern int vsw_num_handshakes;
216 extern int vsw_wretries;
217 extern int vsw_desc_delay;
218 extern int vsw_read_attempts;
219 extern int vsw_ldc_tx_delay;
220 extern int vsw_ldc_tx_retries;
221 extern boolean_t vsw_ldc_rxthr_enabled;
222 extern boolean_t vsw_ldc_txthr_enabled;
223 extern uint32_t vsw_ntxds;
224 extern uint32_t vsw_max_tx_qcount;
225 extern uint32_t vsw_chain_len;
226 extern uint32_t vsw_mblk_size1;
227 extern uint32_t vsw_mblk_size2;
228 extern uint32_t vsw_mblk_size3;
229 extern uint32_t vsw_mblk_size4;
230 extern uint32_t vsw_num_mblks1;
231 extern uint32_t vsw_num_mblks2;
232 extern uint32_t vsw_num_mblks3;
233 extern uint32_t vsw_num_mblks4;
234 extern boolean_t vsw_obp_ver_proto_workaround;
235 extern uint32_t vsw_publish_macaddr_count;
236 extern boolean_t vsw_jumbo_rxpools;
237 
/*
 * Take / drop all three per-channel mutexes in one step.
 *
 * LDC_ENTER_LOCK acquires ldc_cblock, then ldc_rxlock, then ldc_txlock;
 * LDC_EXIT_LOCK releases them in the reverse order.
 *
 * Both macros are wrapped in do { } while (0) and carry no trailing
 * semicolon, so that "LDC_ENTER_LOCK(ldcp);" behaves as exactly one
 * statement, including inside an unbraced if/else.
 */
#define	LDC_ENTER_LOCK(ldcp)	\
	do {	\
		mutex_enter(&((ldcp)->ldc_cblock));	\
		mutex_enter(&((ldcp)->ldc_rxlock));	\
		mutex_enter(&((ldcp)->ldc_txlock));	\
	} while (0)
#define	LDC_EXIT_LOCK(ldcp)	\
	do {	\
		mutex_exit(&((ldcp)->ldc_txlock));	\
		mutex_exit(&((ldcp)->ldc_rxlock));	\
		mutex_exit(&((ldcp)->ldc_cblock));	\
	} while (0)
246 
/*
 * Compare the version recorded in the channel's outbound lane
 * (lane_out.ver_major / lane_out.ver_minor) against major.minor.
 *
 *	VSW_VER_EQ	lane version == major.minor
 *	VSW_VER_LT	lane version <  major.minor
 *	VSW_VER_GTEQ	lane version >= major.minor
 *
 * The major number is compared first; the minor number only matters
 * when the major numbers are equal.
 */
#define	VSW_VER_EQ(ldcp, major, minor)	\
	(((ldcp)->lane_out.ver_major == (major)) &&	\
	    ((ldcp)->lane_out.ver_minor == (minor)))

#define	VSW_VER_LT(ldcp, major, minor)	\
	(((ldcp)->lane_out.ver_major != (major)) ?	\
	    ((ldcp)->lane_out.ver_major < (major)) :	\
	    ((ldcp)->lane_out.ver_minor < (minor)))

#define	VSW_VER_GTEQ(ldcp, major, minor)	\
	(((ldcp)->lane_out.ver_major != (major)) ?	\
	    ((ldcp)->lane_out.ver_major > (major)) :	\
	    ((ldcp)->lane_out.ver_minor >= (minor)))
260 
261 /* supported versions */
262 static	ver_sup_t	vsw_versions[] = { {1, 4} };
263 
/*
 * For the moment the state dump routines have their own
 * private flag. With DUMP_STATE set to 0 every dump macro below
 * expands to nothing; set it non-zero to compile the dump calls in.
 *
 * The enabled forms are single statements (do { } while (0), no
 * embedded semicolon) so that "DUMP_xxx(arg);" is safe inside an
 * unbraced if/else.
 */
#define	DUMP_STATE	0

#if DUMP_STATE

/* Print the three tag fields of a tag passed by value. */
#define	DUMP_TAG(tag) \
	do {	\
		D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype); \
		D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype);	\
		D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env); \
	} while (0)

/* Print the three tag fields of a tag passed by pointer. */
#define	DUMP_TAG_PTR(tag) \
	do {	\
		D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype); \
		D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype);	\
		D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env); \
	} while (0)

#define	DUMP_FLAGS(flags) dump_flags(flags)
#define	DISPLAY_STATE()	display_state()

#else

#define	DUMP_TAG(tag)
#define	DUMP_TAG_PTR(tag)
#define	DUMP_FLAGS(flags)
#define	DISPLAY_STATE()

#endif	/* DUMP_STATE */
297 
298 /*
299  * Attach the specified port.
300  *
301  * Returns 0 on success, 1 on failure.
302  */
303 int
304 vsw_port_attach(vsw_port_t *port)
305 {
306 	vsw_t			*vswp = port->p_vswp;
307 	vsw_port_list_t		*plist = &vswp->plist;
308 	vsw_port_t		*p, **pp;
309 	int			i;
310 	int			nids = port->num_ldcs;
311 	uint64_t		*ldcids;
312 	int			rv;
313 
314 	D1(vswp, "%s: enter : port %d", __func__, port->p_instance);
315 
316 	/* port already exists? */
317 	READ_ENTER(&plist->lockrw);
318 	for (p = plist->head; p != NULL; p = p->p_next) {
319 		if (p->p_instance == port->p_instance) {
320 			DWARN(vswp, "%s: port instance %d already attached",
321 			    __func__, p->p_instance);
322 			RW_EXIT(&plist->lockrw);
323 			return (1);
324 		}
325 	}
326 	RW_EXIT(&plist->lockrw);
327 
328 	rw_init(&port->p_ldclist.lockrw, NULL, RW_DRIVER, NULL);
329 
330 	mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL);
331 	mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL);
332 	rw_init(&port->maccl_rwlock, NULL, RW_DRIVER, NULL);
333 
334 	mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL);
335 	cv_init(&port->state_cv, NULL, CV_DRIVER, NULL);
336 	port->state = VSW_PORT_INIT;
337 
338 	D2(vswp, "%s: %d nids", __func__, nids);
339 	ldcids = port->ldc_ids;
340 	for (i = 0; i < nids; i++) {
341 		D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[i]);
342 		if (vsw_ldc_attach(port, (uint64_t)ldcids[i]) != 0) {
343 			DERR(vswp, "%s: ldc_attach failed", __func__);
344 			goto exit_error;
345 		}
346 	}
347 
348 	if (vswp->switching_setup_done == B_TRUE) {
349 		/*
350 		 * If the underlying network device has been setup,
351 		 * then open a mac client and porgram the mac address
352 		 * for this port.
353 		 */
354 		rv = vsw_mac_client_init(vswp, port, VSW_VNETPORT);
355 		if (rv != 0) {
356 			goto exit_error;
357 		}
358 	}
359 
360 	/* create the fdb entry for this port/mac address */
361 	vsw_fdbe_add(vswp, port);
362 
363 	vsw_create_vlans(port, VSW_VNETPORT);
364 
365 	WRITE_ENTER(&plist->lockrw);
366 
367 	/* link it into the list of ports for this vsw instance */
368 	pp = (vsw_port_t **)(&plist->head);
369 	port->p_next = *pp;
370 	*pp = port;
371 	plist->num_ports++;
372 
373 	RW_EXIT(&plist->lockrw);
374 
375 	/*
376 	 * Initialise the port and any ldc's under it.
377 	 */
378 	(void) vsw_init_ldcs(port);
379 
380 	/* announce macaddr of vnet to the physical switch */
381 	if (vsw_publish_macaddr_count != 0) {	/* enabled */
382 		vsw_publish_macaddr(vswp, port);
383 	}
384 
385 	D1(vswp, "%s: exit", __func__);
386 	return (0);
387 
388 exit_error:
389 	rw_destroy(&port->p_ldclist.lockrw);
390 
391 	cv_destroy(&port->state_cv);
392 	mutex_destroy(&port->state_lock);
393 
394 	rw_destroy(&port->maccl_rwlock);
395 	mutex_destroy(&port->tx_lock);
396 	mutex_destroy(&port->mca_lock);
397 	kmem_free(port, sizeof (vsw_port_t));
398 	return (1);
399 }
400 
401 /*
402  * Detach the specified port.
403  *
404  * Returns 0 on success, 1 on failure.
405  */
406 int
407 vsw_port_detach(vsw_t *vswp, int p_instance)
408 {
409 	vsw_port_t	*port = NULL;
410 	vsw_port_list_t	*plist = &vswp->plist;
411 
412 	D1(vswp, "%s: enter: port id %d", __func__, p_instance);
413 
414 	WRITE_ENTER(&plist->lockrw);
415 
416 	if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) {
417 		RW_EXIT(&plist->lockrw);
418 		return (1);
419 	}
420 
421 	if (vsw_plist_del_node(vswp, port)) {
422 		RW_EXIT(&plist->lockrw);
423 		return (1);
424 	}
425 
426 	/* cleanup any HybridIO for this port */
427 	vsw_hio_stop_port(port);
428 
429 	/*
430 	 * No longer need to hold writer lock on port list now
431 	 * that we have unlinked the target port from the list.
432 	 */
433 	RW_EXIT(&plist->lockrw);
434 
435 	/* Cleanup and close the mac client */
436 	vsw_mac_client_cleanup(vswp, port, VSW_VNETPORT);
437 
438 	/* Remove the fdb entry for this port/mac address */
439 	vsw_fdbe_del(vswp, &(port->p_macaddr));
440 	vsw_destroy_vlans(port, VSW_VNETPORT);
441 
442 	/* Remove any multicast addresses.. */
443 	vsw_del_mcst_port(port);
444 
445 	if (vsw_port_delete(port)) {
446 		return (1);
447 	}
448 
449 	D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance);
450 	return (0);
451 }
452 
453 /*
454  * Detach all active ports.
455  *
456  * Returns 0 on success, 1 on failure.
457  */
458 int
459 vsw_detach_ports(vsw_t *vswp)
460 {
461 	vsw_port_list_t 	*plist = &vswp->plist;
462 	vsw_port_t		*port = NULL;
463 
464 	D1(vswp, "%s: enter", __func__);
465 
466 	WRITE_ENTER(&plist->lockrw);
467 
468 	while ((port = plist->head) != NULL) {
469 		if (vsw_plist_del_node(vswp, port)) {
470 			DERR(vswp, "%s: Error deleting port %d"
471 			    " from port list", __func__, port->p_instance);
472 			RW_EXIT(&plist->lockrw);
473 			return (1);
474 		}
475 
476 		/* Cleanup and close the mac client */
477 		vsw_mac_client_cleanup(vswp, port, VSW_VNETPORT);
478 
479 		/* Remove the fdb entry for this port/mac address */
480 		vsw_fdbe_del(vswp, &(port->p_macaddr));
481 		vsw_destroy_vlans(port, VSW_VNETPORT);
482 
483 		/* Remove any multicast addresses.. */
484 		vsw_del_mcst_port(port);
485 
486 		/*
487 		 * No longer need to hold the lock on the port list
488 		 * now that we have unlinked the target port from the
489 		 * list.
490 		 */
491 		RW_EXIT(&plist->lockrw);
492 		if (vsw_port_delete(port)) {
493 			DERR(vswp, "%s: Error deleting port %d",
494 			    __func__, port->p_instance);
495 			return (1);
496 		}
497 		WRITE_ENTER(&plist->lockrw);
498 	}
499 	RW_EXIT(&plist->lockrw);
500 
501 	D1(vswp, "%s: exit", __func__);
502 
503 	return (0);
504 }
505 
506 /*
507  * Delete the specified port.
508  *
509  * Returns 0 on success, 1 on failure.
510  */
511 static int
512 vsw_port_delete(vsw_port_t *port)
513 {
514 	vsw_ldc_list_t 		*ldcl;
515 	vsw_t			*vswp = port->p_vswp;
516 	int			num_ldcs;
517 
518 	D1(vswp, "%s: enter : port id %d", __func__, port->p_instance);
519 
520 	(void) vsw_uninit_ldcs(port);
521 
522 	/*
523 	 * Wait for any pending ctrl msg tasks which reference this
524 	 * port to finish.
525 	 */
526 	if (vsw_drain_port_taskq(port))
527 		return (1);
528 
529 	/*
530 	 * Wait for any active callbacks to finish
531 	 */
532 	if (vsw_drain_ldcs(port))
533 		return (1);
534 
535 	ldcl = &port->p_ldclist;
536 	num_ldcs = port->num_ldcs;
537 	WRITE_ENTER(&ldcl->lockrw);
538 	while (num_ldcs > 0) {
539 		if (vsw_ldc_detach(port, ldcl->head->ldc_id) != 0) {
540 			cmn_err(CE_WARN, "!vsw%d: unable to detach ldc %ld",
541 			    vswp->instance, ldcl->head->ldc_id);
542 			RW_EXIT(&ldcl->lockrw);
543 			port->num_ldcs = num_ldcs;
544 			return (1);
545 		}
546 		num_ldcs--;
547 	}
548 	RW_EXIT(&ldcl->lockrw);
549 
550 	rw_destroy(&port->p_ldclist.lockrw);
551 
552 	rw_destroy(&port->maccl_rwlock);
553 	mutex_destroy(&port->mca_lock);
554 	mutex_destroy(&port->tx_lock);
555 
556 	cv_destroy(&port->state_cv);
557 	mutex_destroy(&port->state_lock);
558 
559 	if (port->num_ldcs != 0) {
560 		kmem_free(port->ldc_ids, port->num_ldcs * sizeof (uint64_t));
561 		port->num_ldcs = 0;
562 	}
563 
564 	if (port->nvids != 0) {
565 		kmem_free(port->vids, sizeof (vsw_vlanid_t) * port->nvids);
566 	}
567 
568 	kmem_free(port, sizeof (vsw_port_t));
569 
570 	D1(vswp, "%s: exit", __func__);
571 
572 	return (0);
573 }
574 
575 static int
576 vsw_init_multipools(vsw_ldc_t *ldcp, vsw_t *vswp)
577 {
578 	size_t		data_sz;
579 	int		rv;
580 	uint32_t	sz1 = 0;
581 	uint32_t	sz2 = 0;
582 	uint32_t	sz3 = 0;
583 	uint32_t	sz4 = 0;
584 
585 	/*
586 	 * We round up the mtu specified to be a multiple of 2K to limit the
587 	 * number of rx buffer pools created for a given mtu.
588 	 */
589 	data_sz = vswp->max_frame_size + VNET_IPALIGN + VNET_LDCALIGN;
590 	data_sz = VNET_ROUNDUP_2K(data_sz);
591 
592 	/*
593 	 * If pool sizes are specified, use them. Note that the presence of
594 	 * the first tunable will be used as a hint.
595 	 */
596 	if (vsw_mblk_size1 != 0) {
597 		sz1 = vsw_mblk_size1;
598 		sz2 = vsw_mblk_size2;
599 		sz3 = vsw_mblk_size3;
600 		sz4 = vsw_mblk_size4;
601 
602 		if (sz4 == 0) { /* need 3 pools */
603 
604 			ldcp->max_rxpool_size = sz3;
605 			rv = vio_init_multipools(&ldcp->vmp,
606 			    VSW_NUM_VMPOOLS, sz1, sz2, sz3,
607 			    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3);
608 
609 		} else {
610 
611 			ldcp->max_rxpool_size = sz4;
612 			rv = vio_init_multipools(&ldcp->vmp,
613 			    VSW_NUM_VMPOOLS + 1, sz1, sz2, sz3, sz4,
614 			    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3,
615 			    vsw_num_mblks4);
616 
617 		}
618 
619 		return (rv);
620 	}
621 
622 	/*
623 	 * Pool sizes are not specified. We select the pool sizes based on the
624 	 * mtu if vnet_jumbo_rxpools is enabled.
625 	 */
626 	if (vsw_jumbo_rxpools == B_FALSE || data_sz == VNET_2K) {
627 		/*
628 		 * Receive buffer pool allocation based on mtu is disabled.
629 		 * Use the default mechanism of standard size pool allocation.
630 		 */
631 		sz1 = VSW_MBLK_SZ_128;
632 		sz2 = VSW_MBLK_SZ_256;
633 		sz3 = VSW_MBLK_SZ_2048;
634 		ldcp->max_rxpool_size = sz3;
635 
636 		rv = vio_init_multipools(&ldcp->vmp, VSW_NUM_VMPOOLS,
637 		    sz1, sz2, sz3,
638 		    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3);
639 
640 		return (rv);
641 	}
642 
643 	switch (data_sz) {
644 
645 	case VNET_4K:
646 
647 		sz1 = VSW_MBLK_SZ_128;
648 		sz2 = VSW_MBLK_SZ_256;
649 		sz3 = VSW_MBLK_SZ_2048;
650 		sz4 = sz3 << 1;			/* 4K */
651 		ldcp->max_rxpool_size = sz4;
652 
653 		rv = vio_init_multipools(&ldcp->vmp, VSW_NUM_VMPOOLS + 1,
654 		    sz1, sz2, sz3, sz4,
655 		    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3,
656 		    vsw_num_mblks4);
657 		break;
658 
659 	default:	/* data_sz:  4K+ to 16K */
660 
661 		sz1 = VSW_MBLK_SZ_256;
662 		sz2 = VSW_MBLK_SZ_2048;
663 		sz3 = data_sz >> 1;	/* Jumbo-size/2 */
664 		sz4 = data_sz;	/* Jumbo-size */
665 		ldcp->max_rxpool_size = sz4;
666 
667 		rv = vio_init_multipools(&ldcp->vmp, VSW_NUM_VMPOOLS + 1,
668 		    sz1, sz2, sz3, sz4,
669 		    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3,
670 		    vsw_num_mblks4);
671 		break;
672 	}
673 
674 	return (rv);
675 
676 }
677 
678 /*
679  * Attach a logical domain channel (ldc) under a specified port.
680  *
681  * Returns 0 on success, 1 on failure.
682  */
683 static int
684 vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id)
685 {
686 	vsw_t 		*vswp = port->p_vswp;
687 	vsw_ldc_list_t *ldcl = &port->p_ldclist;
688 	vsw_ldc_t 	*ldcp = NULL;
689 	ldc_attr_t 	attr;
690 	ldc_status_t	istatus;
691 	int 		status = DDI_FAILURE;
692 	char		kname[MAXNAMELEN];
693 	enum		{ PROG_init = 0x0,
694 			    PROG_callback = 0x1, PROG_rx_thread = 0x2,
695 			    PROG_tx_thread = 0x4}
696 			progress;
697 
698 	progress = PROG_init;
699 
700 	D1(vswp, "%s: enter", __func__);
701 
702 	ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP);
703 	if (ldcp == NULL) {
704 		DERR(vswp, "%s: kmem_zalloc failed", __func__);
705 		return (1);
706 	}
707 	ldcp->ldc_id = ldc_id;
708 
709 	mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL);
710 	mutex_init(&ldcp->ldc_rxlock, NULL, MUTEX_DRIVER, NULL);
711 	mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL);
712 	mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL);
713 	cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL);
714 	rw_init(&ldcp->lane_in.dlistrw, NULL, RW_DRIVER, NULL);
715 	rw_init(&ldcp->lane_out.dlistrw, NULL, RW_DRIVER, NULL);
716 
717 	/* required for handshake with peer */
718 	ldcp->local_session = (uint64_t)ddi_get_lbolt();
719 	ldcp->peer_session = 0;
720 	ldcp->session_status = 0;
721 	ldcp->hss_id = 1;	/* Initial handshake session id */
722 
723 	(void) atomic_swap_32(&port->p_hio_capable, B_FALSE);
724 
725 	/* only set for outbound lane, inbound set by peer */
726 	vsw_set_lane_attr(vswp, &ldcp->lane_out);
727 
728 	attr.devclass = LDC_DEV_NT_SVC;
729 	attr.instance = ddi_get_instance(vswp->dip);
730 	attr.mode = LDC_MODE_UNRELIABLE;
731 	attr.mtu = VSW_LDC_MTU;
732 	status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle);
733 	if (status != 0) {
734 		DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)",
735 		    __func__, ldc_id, status);
736 		goto ldc_attach_fail;
737 	}
738 
739 	if (vsw_ldc_rxthr_enabled) {
740 		ldcp->rx_thr_flags = 0;
741 
742 		mutex_init(&ldcp->rx_thr_lock, NULL, MUTEX_DRIVER, NULL);
743 		cv_init(&ldcp->rx_thr_cv, NULL, CV_DRIVER, NULL);
744 		ldcp->rx_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
745 		    vsw_ldc_rx_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
746 
747 		progress |= PROG_rx_thread;
748 		if (ldcp->rx_thread == NULL) {
749 			DWARN(vswp, "%s(%lld): Failed to create worker thread",
750 			    __func__, ldc_id);
751 			goto ldc_attach_fail;
752 		}
753 	}
754 
755 	if (vsw_ldc_txthr_enabled) {
756 		ldcp->tx_thr_flags = 0;
757 		ldcp->tx_mhead = ldcp->tx_mtail = NULL;
758 
759 		mutex_init(&ldcp->tx_thr_lock, NULL, MUTEX_DRIVER, NULL);
760 		cv_init(&ldcp->tx_thr_cv, NULL, CV_DRIVER, NULL);
761 		ldcp->tx_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
762 		    vsw_ldc_tx_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
763 
764 		progress |= PROG_tx_thread;
765 		if (ldcp->tx_thread == NULL) {
766 			DWARN(vswp, "%s(%lld): Failed to create worker thread",
767 			    __func__, ldc_id);
768 			goto ldc_attach_fail;
769 		}
770 	}
771 
772 	status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp);
773 	if (status != 0) {
774 		DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)",
775 		    __func__, ldc_id, status);
776 		(void) ldc_fini(ldcp->ldc_handle);
777 		goto ldc_attach_fail;
778 	}
779 	/*
780 	 * allocate a message for ldc_read()s, big enough to hold ctrl and
781 	 * data msgs, including raw data msgs used to recv priority frames.
782 	 */
783 	ldcp->msglen = VIO_PKT_DATA_HDRSIZE + vswp->max_frame_size;
784 	ldcp->ldcmsg = kmem_alloc(ldcp->msglen, KM_SLEEP);
785 
786 	progress |= PROG_callback;
787 
788 	mutex_init(&ldcp->status_lock, NULL, MUTEX_DRIVER, NULL);
789 
790 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
791 		DERR(vswp, "%s: ldc_status failed", __func__);
792 		mutex_destroy(&ldcp->status_lock);
793 		goto ldc_attach_fail;
794 	}
795 
796 	ldcp->ldc_status = istatus;
797 	ldcp->ldc_port = port;
798 	ldcp->ldc_vswp = vswp;
799 
800 	vsw_reset_vnet_proto_ops(ldcp);
801 
802 	(void) sprintf(kname, "%sldc0x%lx", DRV_NAME, ldcp->ldc_id);
803 	ldcp->ksp = vgen_setup_kstats(DRV_NAME, vswp->instance,
804 	    kname, &ldcp->ldc_stats);
805 	if (ldcp->ksp == NULL) {
806 		DERR(vswp, "%s: kstats setup failed", __func__);
807 		goto ldc_attach_fail;
808 	}
809 
810 	/* link it into the list of channels for this port */
811 	WRITE_ENTER(&ldcl->lockrw);
812 	ldcp->ldc_next = ldcl->head;
813 	ldcl->head = ldcp;
814 	RW_EXIT(&ldcl->lockrw);
815 
816 	D1(vswp, "%s: exit", __func__);
817 	return (0);
818 
819 ldc_attach_fail:
820 
821 	if (progress & PROG_callback) {
822 		(void) ldc_unreg_callback(ldcp->ldc_handle);
823 		kmem_free(ldcp->ldcmsg, ldcp->msglen);
824 	}
825 
826 	if (progress & PROG_rx_thread) {
827 		if (ldcp->rx_thread != NULL) {
828 			vsw_stop_rx_thread(ldcp);
829 		}
830 		mutex_destroy(&ldcp->rx_thr_lock);
831 		cv_destroy(&ldcp->rx_thr_cv);
832 	}
833 
834 	if (progress & PROG_tx_thread) {
835 		if (ldcp->tx_thread != NULL) {
836 			vsw_stop_tx_thread(ldcp);
837 		}
838 		mutex_destroy(&ldcp->tx_thr_lock);
839 		cv_destroy(&ldcp->tx_thr_cv);
840 	}
841 	if (ldcp->ksp != NULL) {
842 		vgen_destroy_kstats(ldcp->ksp);
843 	}
844 	mutex_destroy(&ldcp->ldc_txlock);
845 	mutex_destroy(&ldcp->ldc_rxlock);
846 	mutex_destroy(&ldcp->ldc_cblock);
847 	mutex_destroy(&ldcp->drain_cv_lock);
848 
849 	cv_destroy(&ldcp->drain_cv);
850 
851 	rw_destroy(&ldcp->lane_in.dlistrw);
852 	rw_destroy(&ldcp->lane_out.dlistrw);
853 
854 	kmem_free(ldcp, sizeof (vsw_ldc_t));
855 
856 	return (1);
857 }
858 
859 /*
860  * Detach a logical domain channel (ldc) belonging to a
861  * particular port.
862  *
863  * Returns 0 on success, 1 on failure.
864  */
865 static int
866 vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id)
867 {
868 	vsw_t 		*vswp = port->p_vswp;
869 	vsw_ldc_t 	*ldcp, *prev_ldcp;
870 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
871 	int 		rv;
872 
873 	prev_ldcp = ldcl->head;
874 	for (; (ldcp = prev_ldcp) != NULL; prev_ldcp = ldcp->ldc_next) {
875 		if (ldcp->ldc_id == ldc_id) {
876 			break;
877 		}
878 	}
879 
880 	/* specified ldc id not found */
881 	if (ldcp == NULL) {
882 		DERR(vswp, "%s: ldcp = NULL", __func__);
883 		return (1);
884 	}
885 
886 	D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id);
887 
888 	/* Stop the receive thread */
889 	if (ldcp->rx_thread != NULL) {
890 		vsw_stop_rx_thread(ldcp);
891 		mutex_destroy(&ldcp->rx_thr_lock);
892 		cv_destroy(&ldcp->rx_thr_cv);
893 	}
894 	kmem_free(ldcp->ldcmsg, ldcp->msglen);
895 
896 	/* Stop the tx thread */
897 	if (ldcp->tx_thread != NULL) {
898 		vsw_stop_tx_thread(ldcp);
899 		mutex_destroy(&ldcp->tx_thr_lock);
900 		cv_destroy(&ldcp->tx_thr_cv);
901 		if (ldcp->tx_mhead != NULL) {
902 			freemsgchain(ldcp->tx_mhead);
903 			ldcp->tx_mhead = ldcp->tx_mtail = NULL;
904 			ldcp->tx_cnt = 0;
905 		}
906 	}
907 
908 	/* Destory kstats */
909 	vgen_destroy_kstats(ldcp->ksp);
910 
911 	/*
912 	 * Before we can close the channel we must release any mapped
913 	 * resources (e.g. drings).
914 	 */
915 	vsw_free_lane_resources(ldcp, INBOUND);
916 	vsw_free_lane_resources(ldcp, OUTBOUND);
917 
918 	/*
919 	 * If the close fails we are in serious trouble, as won't
920 	 * be able to delete the parent port.
921 	 */
922 	if ((rv = ldc_close(ldcp->ldc_handle)) != 0) {
923 		DERR(vswp, "%s: error %d closing channel %lld",
924 		    __func__, rv, ldcp->ldc_id);
925 		return (1);
926 	}
927 
928 	(void) ldc_fini(ldcp->ldc_handle);
929 
930 	ldcp->ldc_status = LDC_INIT;
931 	ldcp->ldc_handle = NULL;
932 	ldcp->ldc_vswp = NULL;
933 
934 
935 	/*
936 	 * Most likely some mblks are still in use and
937 	 * have not been returned to the pool. These mblks are
938 	 * added to the pool that is maintained in the device instance.
939 	 * Another attempt will be made to destroy the pool
940 	 * when the device detaches.
941 	 */
942 	vio_destroy_multipools(&ldcp->vmp, &vswp->rxh);
943 
944 	/* unlink it from the list */
945 	prev_ldcp = ldcp->ldc_next;
946 
947 	mutex_destroy(&ldcp->ldc_txlock);
948 	mutex_destroy(&ldcp->ldc_rxlock);
949 	mutex_destroy(&ldcp->ldc_cblock);
950 	cv_destroy(&ldcp->drain_cv);
951 	mutex_destroy(&ldcp->drain_cv_lock);
952 	mutex_destroy(&ldcp->status_lock);
953 	rw_destroy(&ldcp->lane_in.dlistrw);
954 	rw_destroy(&ldcp->lane_out.dlistrw);
955 
956 	kmem_free(ldcp, sizeof (vsw_ldc_t));
957 
958 	return (0);
959 }
960 
961 /*
962  * Open and attempt to bring up the channel. Note that channel
963  * can only be brought up if peer has also opened channel.
964  *
965  * Returns 0 if can open and bring up channel, otherwise
966  * returns 1.
967  */
968 static int
969 vsw_ldc_init(vsw_ldc_t *ldcp)
970 {
971 	vsw_t 		*vswp = ldcp->ldc_vswp;
972 	ldc_status_t	istatus = 0;
973 	int		rv;
974 
975 	D1(vswp, "%s: enter", __func__);
976 
977 	LDC_ENTER_LOCK(ldcp);
978 
979 	/* don't start at 0 in case clients don't like that */
980 	ldcp->next_ident = 1;
981 
982 	rv = ldc_open(ldcp->ldc_handle);
983 	if (rv != 0) {
984 		DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)",
985 		    __func__, ldcp->ldc_id, rv);
986 		LDC_EXIT_LOCK(ldcp);
987 		return (1);
988 	}
989 
990 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
991 		DERR(vswp, "%s: unable to get status", __func__);
992 		LDC_EXIT_LOCK(ldcp);
993 		return (1);
994 
995 	} else if (istatus != LDC_OPEN && istatus != LDC_READY) {
996 		DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY",
997 		    __func__, ldcp->ldc_id, istatus);
998 		LDC_EXIT_LOCK(ldcp);
999 		return (1);
1000 	}
1001 
1002 	mutex_enter(&ldcp->status_lock);
1003 	ldcp->ldc_status = istatus;
1004 	mutex_exit(&ldcp->status_lock);
1005 
1006 	rv = ldc_up(ldcp->ldc_handle);
1007 	if (rv != 0) {
1008 		/*
1009 		 * Not a fatal error for ldc_up() to fail, as peer
1010 		 * end point may simply not be ready yet.
1011 		 */
1012 		D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__,
1013 		    ldcp->ldc_id, rv);
1014 		LDC_EXIT_LOCK(ldcp);
1015 		return (1);
1016 	}
1017 
1018 	/*
1019 	 * ldc_up() call is non-blocking so need to explicitly
1020 	 * check channel status to see if in fact the channel
1021 	 * is UP.
1022 	 */
1023 	mutex_enter(&ldcp->status_lock);
1024 	if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) {
1025 		DERR(vswp, "%s: unable to get status", __func__);
1026 		mutex_exit(&ldcp->status_lock);
1027 		LDC_EXIT_LOCK(ldcp);
1028 		return (1);
1029 
1030 	}
1031 
1032 	if (ldcp->ldc_status == LDC_UP) {
1033 		D2(vswp, "%s: channel %ld now UP (%ld)", __func__,
1034 		    ldcp->ldc_id, istatus);
1035 		mutex_exit(&ldcp->status_lock);
1036 		LDC_EXIT_LOCK(ldcp);
1037 
1038 		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
1039 		return (0);
1040 	}
1041 
1042 	mutex_exit(&ldcp->status_lock);
1043 	LDC_EXIT_LOCK(ldcp);
1044 
1045 	D1(vswp, "%s: exit", __func__);
1046 	return (0);
1047 }
1048 
1049 /* disable callbacks on the channel */
1050 static int
1051 vsw_ldc_uninit(vsw_ldc_t *ldcp)
1052 {
1053 	vsw_t	*vswp = ldcp->ldc_vswp;
1054 	int	rv;
1055 
1056 	D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id);
1057 
1058 	LDC_ENTER_LOCK(ldcp);
1059 
1060 	rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE);
1061 	if (rv != 0) {
1062 		DERR(vswp, "vsw_ldc_uninit(%lld): error disabling "
1063 		    "interrupts (rv = %d)\n", ldcp->ldc_id, rv);
1064 		LDC_EXIT_LOCK(ldcp);
1065 		return (1);
1066 	}
1067 
1068 	mutex_enter(&ldcp->status_lock);
1069 	ldcp->ldc_status = LDC_INIT;
1070 	mutex_exit(&ldcp->status_lock);
1071 
1072 	LDC_EXIT_LOCK(ldcp);
1073 
1074 	D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id);
1075 
1076 	return (0);
1077 }
1078 
1079 static int
1080 vsw_init_ldcs(vsw_port_t *port)
1081 {
1082 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
1083 	vsw_ldc_t	*ldcp;
1084 
1085 	READ_ENTER(&ldcl->lockrw);
1086 	ldcp =  ldcl->head;
1087 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
1088 		(void) vsw_ldc_init(ldcp);
1089 	}
1090 	RW_EXIT(&ldcl->lockrw);
1091 
1092 	return (0);
1093 }
1094 
1095 static int
1096 vsw_uninit_ldcs(vsw_port_t *port)
1097 {
1098 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
1099 	vsw_ldc_t	*ldcp;
1100 
1101 	D1(NULL, "vsw_uninit_ldcs: enter\n");
1102 
1103 	READ_ENTER(&ldcl->lockrw);
1104 	ldcp =  ldcl->head;
1105 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
1106 		(void) vsw_ldc_uninit(ldcp);
1107 	}
1108 	RW_EXIT(&ldcl->lockrw);
1109 
1110 	D1(NULL, "vsw_uninit_ldcs: exit\n");
1111 
1112 	return (0);
1113 }
1114 
1115 /*
1116  * Wait until the callback(s) associated with the ldcs under the specified
1117  * port have completed.
1118  *
1119  * Prior to this function being invoked each channel under this port
1120  * should have been quiesced via ldc_set_cb_mode(DISABLE).
1121  *
1122  * A short explaination of what we are doing below..
1123  *
1124  * The simplest approach would be to have a reference counter in
1125  * the ldc structure which is increment/decremented by the callbacks as
1126  * they use the channel. The drain function could then simply disable any
1127  * further callbacks and do a cv_wait for the ref to hit zero. Unfortunately
1128  * there is a tiny window here - before the callback is able to get the lock
1129  * on the channel it is interrupted and this function gets to execute. It
1130  * sees that the ref count is zero and believes its free to delete the
1131  * associated data structures.
1132  *
1133  * We get around this by taking advantage of the fact that before the ldc
1134  * framework invokes a callback it sets a flag to indicate that there is a
1135  * callback active (or about to become active). If when we attempt to
1136  * unregister a callback when this active flag is set then the unregister
1137  * will fail with EWOULDBLOCK.
1138  *
1139  * If the unregister fails we do a cv_timedwait. We will either be signaled
1140  * by the callback as it is exiting (note we have to wait a short period to
1141  * allow the callback to return fully to the ldc framework and it to clear
1142  * the active flag), or by the timer expiring. In either case we again attempt
1143  * the unregister. We repeat this until we can succesfully unregister the
1144  * callback.
1145  *
1146  * The reason we use a cv_timedwait rather than a simple cv_wait is to catch
1147  * the case where the callback has finished but the ldc framework has not yet
1148  * cleared the active flag. In this case we would never get a cv_signal.
1149  */
1150 static int
1151 vsw_drain_ldcs(vsw_port_t *port)
1152 {
1153 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
1154 	vsw_ldc_t	*ldcp;
1155 	vsw_t		*vswp = port->p_vswp;
1156 
1157 	D1(vswp, "%s: enter", __func__);
1158 
1159 	READ_ENTER(&ldcl->lockrw);
1160 
1161 	ldcp = ldcl->head;
1162 
1163 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
1164 		/*
1165 		 * If we can unregister the channel callback then we
1166 		 * know that there is no callback either running or
1167 		 * scheduled to run for this channel so move on to next
1168 		 * channel in the list.
1169 		 */
1170 		mutex_enter(&ldcp->drain_cv_lock);
1171 
1172 		/* prompt active callbacks to quit */
1173 		ldcp->drain_state = VSW_LDC_DRAINING;
1174 
1175 		if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) {
1176 			D2(vswp, "%s: unreg callback for chan %ld", __func__,
1177 			    ldcp->ldc_id);
1178 			mutex_exit(&ldcp->drain_cv_lock);
1179 			continue;
1180 		} else {
1181 			/*
1182 			 * If we end up here we know that either 1) a callback
1183 			 * is currently executing, 2) is about to start (i.e.
1184 			 * the ldc framework has set the active flag but
1185 			 * has not actually invoked the callback yet, or 3)
1186 			 * has finished and has returned to the ldc framework
1187 			 * but the ldc framework has not yet cleared the
1188 			 * active bit.
1189 			 *
1190 			 * Wait for it to finish.
1191 			 */
1192 			while (ldc_unreg_callback(ldcp->ldc_handle)
1193 			    == EWOULDBLOCK)
1194 				(void) cv_timedwait(&ldcp->drain_cv,
1195 				    &ldcp->drain_cv_lock, lbolt + hz);
1196 
1197 			mutex_exit(&ldcp->drain_cv_lock);
1198 			D2(vswp, "%s: unreg callback for chan %ld after "
1199 			    "timeout", __func__, ldcp->ldc_id);
1200 		}
1201 	}
1202 	RW_EXIT(&ldcl->lockrw);
1203 
1204 	D1(vswp, "%s: exit", __func__);
1205 	return (0);
1206 }
1207 
1208 /*
1209  * Wait until all tasks which reference this port have completed.
1210  *
1211  * Prior to this function being invoked each channel under this port
1212  * should have been quiesced via ldc_set_cb_mode(DISABLE).
1213  */
1214 static int
1215 vsw_drain_port_taskq(vsw_port_t *port)
1216 {
1217 	vsw_t		*vswp = port->p_vswp;
1218 
1219 	D1(vswp, "%s: enter", __func__);
1220 
1221 	/*
1222 	 * Mark the port as in the process of being detached, and
1223 	 * dispatch a marker task to the queue so we know when all
1224 	 * relevant tasks have completed.
1225 	 */
1226 	mutex_enter(&port->state_lock);
1227 	port->state = VSW_PORT_DETACHING;
1228 
1229 	if ((vswp->taskq_p == NULL) ||
1230 	    (ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task,
1231 	    port, DDI_NOSLEEP) != DDI_SUCCESS)) {
1232 		DERR(vswp, "%s: unable to dispatch marker task",
1233 		    __func__);
1234 		mutex_exit(&port->state_lock);
1235 		return (1);
1236 	}
1237 
1238 	/*
1239 	 * Wait for the marker task to finish.
1240 	 */
1241 	while (port->state != VSW_PORT_DETACHABLE)
1242 		cv_wait(&port->state_cv, &port->state_lock);
1243 
1244 	mutex_exit(&port->state_lock);
1245 
1246 	D1(vswp, "%s: exit", __func__);
1247 
1248 	return (0);
1249 }
1250 
1251 static void
1252 vsw_marker_task(void *arg)
1253 {
1254 	vsw_port_t	*port = arg;
1255 	vsw_t		*vswp = port->p_vswp;
1256 
1257 	D1(vswp, "%s: enter", __func__);
1258 
1259 	mutex_enter(&port->state_lock);
1260 
1261 	/*
1262 	 * No further tasks should be dispatched which reference
1263 	 * this port so ok to mark it as safe to detach.
1264 	 */
1265 	port->state = VSW_PORT_DETACHABLE;
1266 
1267 	cv_signal(&port->state_cv);
1268 
1269 	mutex_exit(&port->state_lock);
1270 
1271 	D1(vswp, "%s: exit", __func__);
1272 }
1273 
1274 vsw_port_t *
1275 vsw_lookup_port(vsw_t *vswp, int p_instance)
1276 {
1277 	vsw_port_list_t *plist = &vswp->plist;
1278 	vsw_port_t	*port;
1279 
1280 	for (port = plist->head; port != NULL; port = port->p_next) {
1281 		if (port->p_instance == p_instance) {
1282 			D2(vswp, "vsw_lookup_port: found p_instance\n");
1283 			return (port);
1284 		}
1285 	}
1286 
1287 	return (NULL);
1288 }
1289 
1290 void
1291 vsw_vlan_unaware_port_reset(vsw_port_t *portp)
1292 {
1293 	vsw_ldc_list_t 	*ldclp;
1294 	vsw_ldc_t	*ldcp;
1295 
1296 	ldclp = &portp->p_ldclist;
1297 
1298 	READ_ENTER(&ldclp->lockrw);
1299 
1300 	/*
1301 	 * NOTE: for now, we will assume we have a single channel.
1302 	 */
1303 	if (ldclp->head == NULL) {
1304 		RW_EXIT(&ldclp->lockrw);
1305 		return;
1306 	}
1307 	ldcp = ldclp->head;
1308 
1309 	mutex_enter(&ldcp->ldc_cblock);
1310 
1311 	/*
1312 	 * If the peer is vlan_unaware(ver < 1.3), reset channel and terminate
1313 	 * the connection. See comments in vsw_set_vnet_proto_ops().
1314 	 */
1315 	if (ldcp->hphase == VSW_MILESTONE4 && VSW_VER_LT(ldcp, 1, 3) &&
1316 	    portp->nvids != 0) {
1317 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1318 	}
1319 
1320 	mutex_exit(&ldcp->ldc_cblock);
1321 
1322 	RW_EXIT(&ldclp->lockrw);
1323 }
1324 
1325 void
1326 vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate)
1327 {
1328 	vsw_ldc_list_t	*ldclp;
1329 	vsw_ldc_t	*ldcp;
1330 
1331 	ldclp = &portp->p_ldclist;
1332 
1333 	READ_ENTER(&ldclp->lockrw);
1334 
1335 	/*
1336 	 * NOTE: for now, we will assume we have a single channel.
1337 	 */
1338 	if (ldclp->head == NULL) {
1339 		RW_EXIT(&ldclp->lockrw);
1340 		return;
1341 	}
1342 	ldcp = ldclp->head;
1343 
1344 	mutex_enter(&ldcp->ldc_cblock);
1345 
1346 	/*
1347 	 * If the peer is HybridIO capable (ver >= 1.3), reset channel
1348 	 * to trigger re-negotiation, which inturn trigger HybridIO
1349 	 * setup/cleanup.
1350 	 */
1351 	if ((ldcp->hphase == VSW_MILESTONE4) &&
1352 	    (portp->p_hio_capable == B_TRUE)) {
1353 		if (immediate == B_TRUE) {
1354 			(void) ldc_down(ldcp->ldc_handle);
1355 		} else {
1356 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1357 		}
1358 	}
1359 
1360 	mutex_exit(&ldcp->ldc_cblock);
1361 
1362 	RW_EXIT(&ldclp->lockrw);
1363 }
1364 
1365 void
1366 vsw_port_reset(vsw_port_t *portp)
1367 {
1368 	vsw_ldc_list_t 	*ldclp;
1369 	vsw_ldc_t	*ldcp;
1370 
1371 	ldclp = &portp->p_ldclist;
1372 
1373 	READ_ENTER(&ldclp->lockrw);
1374 
1375 	/*
1376 	 * NOTE: for now, we will assume we have a single channel.
1377 	 */
1378 	if (ldclp->head == NULL) {
1379 		RW_EXIT(&ldclp->lockrw);
1380 		return;
1381 	}
1382 	ldcp = ldclp->head;
1383 
1384 	mutex_enter(&ldcp->ldc_cblock);
1385 
1386 	/*
1387 	 * reset channel and terminate the connection.
1388 	 */
1389 	vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1390 
1391 	mutex_exit(&ldcp->ldc_cblock);
1392 
1393 	RW_EXIT(&ldclp->lockrw);
1394 }
1395 
1396 void
1397 vsw_reset_ports(vsw_t *vswp)
1398 {
1399 	vsw_port_list_t	*plist = &vswp->plist;
1400 	vsw_port_t	*portp;
1401 
1402 	READ_ENTER(&plist->lockrw);
1403 	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
1404 		if ((portp->p_hio_capable) && (portp->p_hio_enabled)) {
1405 			vsw_hio_stop_port(portp);
1406 		}
1407 		vsw_port_reset(portp);
1408 	}
1409 	RW_EXIT(&plist->lockrw);
1410 }
1411 
1412 
1413 /*
1414  * Search for and remove the specified port from the port
1415  * list. Returns 0 if able to locate and remove port, otherwise
1416  * returns 1.
1417  */
1418 static int
1419 vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port)
1420 {
1421 	vsw_port_list_t *plist = &vswp->plist;
1422 	vsw_port_t	*curr_p, *prev_p;
1423 
1424 	if (plist->head == NULL)
1425 		return (1);
1426 
1427 	curr_p = prev_p = plist->head;
1428 
1429 	while (curr_p != NULL) {
1430 		if (curr_p == port) {
1431 			if (prev_p == curr_p) {
1432 				plist->head = curr_p->p_next;
1433 			} else {
1434 				prev_p->p_next = curr_p->p_next;
1435 			}
1436 			plist->num_ports--;
1437 			break;
1438 		} else {
1439 			prev_p = curr_p;
1440 			curr_p = curr_p->p_next;
1441 		}
1442 	}
1443 	return (0);
1444 }
1445 
1446 /*
1447  * Interrupt handler for ldc messages.
1448  */
1449 static uint_t
1450 vsw_ldc_cb(uint64_t event, caddr_t arg)
1451 {
1452 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
1453 	vsw_t 		*vswp = ldcp->ldc_vswp;
1454 
1455 	D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
1456 
1457 	mutex_enter(&ldcp->ldc_cblock);
1458 	ldcp->ldc_stats.callbacks++;
1459 
1460 	mutex_enter(&ldcp->status_lock);
1461 	if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) {
1462 		mutex_exit(&ldcp->status_lock);
1463 		mutex_exit(&ldcp->ldc_cblock);
1464 		return (LDC_SUCCESS);
1465 	}
1466 	mutex_exit(&ldcp->status_lock);
1467 
1468 	if (event & LDC_EVT_UP) {
1469 		/*
1470 		 * Channel has come up.
1471 		 */
1472 		D2(vswp, "%s: id(%ld) event(%llx) UP: status(%ld)",
1473 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1474 
1475 		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
1476 
1477 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
1478 	}
1479 
1480 	if (event & LDC_EVT_READ) {
1481 		/*
1482 		 * Data available for reading.
1483 		 */
1484 		D2(vswp, "%s: id(ld) event(%llx) data READ",
1485 		    __func__, ldcp->ldc_id, event);
1486 
1487 		if (ldcp->rx_thread != NULL) {
1488 			/*
1489 			 * If the receive thread is enabled, then
1490 			 * wakeup the receive thread to process the
1491 			 * LDC messages.
1492 			 */
1493 			mutex_exit(&ldcp->ldc_cblock);
1494 			mutex_enter(&ldcp->rx_thr_lock);
1495 			if (!(ldcp->rx_thr_flags & VSW_WTHR_DATARCVD)) {
1496 				ldcp->rx_thr_flags |= VSW_WTHR_DATARCVD;
1497 				cv_signal(&ldcp->rx_thr_cv);
1498 			}
1499 			mutex_exit(&ldcp->rx_thr_lock);
1500 			mutex_enter(&ldcp->ldc_cblock);
1501 		} else {
1502 			vsw_process_pkt(ldcp);
1503 		}
1504 
1505 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
1506 
1507 		goto vsw_cb_exit;
1508 	}
1509 
1510 	if (event & (LDC_EVT_DOWN | LDC_EVT_RESET)) {
1511 		D2(vswp, "%s: id(%ld) event (%lx) DOWN/RESET: status(%ld)",
1512 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1513 
1514 		vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
1515 	}
1516 
1517 	/*
1518 	 * Catch either LDC_EVT_WRITE which we don't support or any
1519 	 * unknown event.
1520 	 */
1521 	if (event &
1522 	    ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ)) {
1523 		DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)",
1524 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1525 	}
1526 
1527 vsw_cb_exit:
1528 	mutex_exit(&ldcp->ldc_cblock);
1529 
1530 	/*
1531 	 * Let the drain function know we are finishing if it
1532 	 * is waiting.
1533 	 */
1534 	mutex_enter(&ldcp->drain_cv_lock);
1535 	if (ldcp->drain_state == VSW_LDC_DRAINING)
1536 		cv_signal(&ldcp->drain_cv);
1537 	mutex_exit(&ldcp->drain_cv_lock);
1538 
1539 	return (LDC_SUCCESS);
1540 }
1541 
1542 /*
1543  * Reinitialise data structures associated with the channel.
1544  */
1545 static void
1546 vsw_ldc_reinit(vsw_ldc_t *ldcp)
1547 {
1548 	vsw_t		*vswp = ldcp->ldc_vswp;
1549 	vsw_port_t	*port;
1550 	vsw_ldc_list_t	*ldcl;
1551 
1552 	D1(vswp, "%s: enter", __func__);
1553 
1554 	/* free receive mblk pools for the channel */
1555 	vio_destroy_multipools(&ldcp->vmp, &vswp->rxh);
1556 
1557 	port = ldcp->ldc_port;
1558 	ldcl = &port->p_ldclist;
1559 
1560 	READ_ENTER(&ldcl->lockrw);
1561 
1562 	D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__,
1563 	    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
1564 
1565 	vsw_free_lane_resources(ldcp, INBOUND);
1566 	vsw_free_lane_resources(ldcp, OUTBOUND);
1567 	RW_EXIT(&ldcl->lockrw);
1568 
1569 	ldcp->lane_in.lstate = 0;
1570 	ldcp->lane_out.lstate = 0;
1571 
1572 	/* Remove the fdb entry for this port/mac address */
1573 	vsw_fdbe_del(vswp, &(port->p_macaddr));
1574 
1575 	/* remove the port from vlans it has been assigned to */
1576 	vsw_vlan_remove_ids(port, VSW_VNETPORT);
1577 
1578 	/*
1579 	 * Remove parent port from any multicast groups
1580 	 * it may have registered with. Client must resend
1581 	 * multicast add command after handshake completes.
1582 	 */
1583 	vsw_del_mcst_port(port);
1584 
1585 	ldcp->peer_session = 0;
1586 	ldcp->session_status = 0;
1587 	ldcp->hcnt = 0;
1588 	ldcp->hphase = VSW_MILESTONE0;
1589 
1590 	vsw_reset_vnet_proto_ops(ldcp);
1591 
1592 	D1(vswp, "%s: exit", __func__);
1593 }
1594 
1595 /*
1596  * Process a connection event.
1597  *
1598  * Note - care must be taken to ensure that this function is
1599  * not called with the dlistrw lock held.
1600  */
1601 static void
1602 vsw_process_conn_evt(vsw_ldc_t *ldcp, uint16_t evt)
1603 {
1604 	vsw_t		*vswp = ldcp->ldc_vswp;
1605 	vsw_conn_evt_t	*conn = NULL;
1606 
1607 	D1(vswp, "%s: enter", __func__);
1608 
1609 	/*
1610 	 * Check if either a reset or restart event is pending
1611 	 * or in progress. If so just return.
1612 	 *
1613 	 * A VSW_CONN_RESET event originates either with a LDC_RESET_EVT
1614 	 * being received by the callback handler, or a ECONNRESET error
1615 	 * code being returned from a ldc_read() or ldc_write() call.
1616 	 *
1617 	 * A VSW_CONN_RESTART event occurs when some error checking code
1618 	 * decides that there is a problem with data from the channel,
1619 	 * and that the handshake should be restarted.
1620 	 */
1621 	if (((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART)) &&
1622 	    (ldstub((uint8_t *)&ldcp->reset_active)))
1623 		return;
1624 
1625 	/*
1626 	 * If it is an LDC_UP event we first check the recorded
1627 	 * state of the channel. If this is UP then we know that
1628 	 * the channel moving to the UP state has already been dealt
1629 	 * with and don't need to dispatch a  new task.
1630 	 *
1631 	 * The reason for this check is that when we do a ldc_up(),
1632 	 * depending on the state of the peer, we may or may not get
1633 	 * a LDC_UP event. As we can't depend on getting a LDC_UP evt
1634 	 * every time we do ldc_up() we explicitly check the channel
1635 	 * status to see has it come up (ldc_up() is asynch and will
1636 	 * complete at some undefined time), and take the appropriate
1637 	 * action.
1638 	 *
1639 	 * The flip side of this is that we may get a LDC_UP event
1640 	 * when we have already seen that the channel is up and have
1641 	 * dealt with that.
1642 	 */
1643 	mutex_enter(&ldcp->status_lock);
1644 	if (evt == VSW_CONN_UP) {
1645 		if ((ldcp->ldc_status == LDC_UP) || (ldcp->reset_active != 0)) {
1646 			mutex_exit(&ldcp->status_lock);
1647 			return;
1648 		}
1649 	}
1650 	mutex_exit(&ldcp->status_lock);
1651 
1652 	/*
1653 	 * The transaction group id allows us to identify and discard
1654 	 * any tasks which are still pending on the taskq and refer
1655 	 * to the handshake session we are about to restart or reset.
1656 	 * These stale messages no longer have any real meaning.
1657 	 */
1658 	(void) atomic_inc_32(&ldcp->hss_id);
1659 
1660 	ASSERT(vswp->taskq_p != NULL);
1661 
1662 	if ((conn = kmem_zalloc(sizeof (vsw_conn_evt_t), KM_NOSLEEP)) == NULL) {
1663 		cmn_err(CE_WARN, "!vsw%d: unable to allocate memory for"
1664 		    " connection event", vswp->instance);
1665 		goto err_exit;
1666 	}
1667 
1668 	conn->evt = evt;
1669 	conn->ldcp = ldcp;
1670 
1671 	if (ddi_taskq_dispatch(vswp->taskq_p, vsw_conn_task, conn,
1672 	    DDI_NOSLEEP) != DDI_SUCCESS) {
1673 		cmn_err(CE_WARN, "!vsw%d: Can't dispatch connection task",
1674 		    vswp->instance);
1675 
1676 		kmem_free(conn, sizeof (vsw_conn_evt_t));
1677 		goto err_exit;
1678 	}
1679 
1680 	D1(vswp, "%s: exit", __func__);
1681 	return;
1682 
1683 err_exit:
1684 	/*
1685 	 * Have mostly likely failed due to memory shortage. Clear the flag so
1686 	 * that future requests will at least be attempted and will hopefully
1687 	 * succeed.
1688 	 */
1689 	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
1690 		ldcp->reset_active = 0;
1691 }
1692 
1693 /*
1694  * Deal with events relating to a connection. Invoked from a taskq.
1695  */
1696 static void
1697 vsw_conn_task(void *arg)
1698 {
1699 	vsw_conn_evt_t	*conn = (vsw_conn_evt_t *)arg;
1700 	vsw_ldc_t	*ldcp = NULL;
1701 	vsw_port_t	*portp;
1702 	vsw_t		*vswp = NULL;
1703 	uint16_t	evt;
1704 	ldc_status_t	curr_status;
1705 
1706 	ldcp = conn->ldcp;
1707 	evt = conn->evt;
1708 	vswp = ldcp->ldc_vswp;
1709 	portp = ldcp->ldc_port;
1710 
1711 	D1(vswp, "%s: enter", __func__);
1712 
1713 	/* can safely free now have copied out data */
1714 	kmem_free(conn, sizeof (vsw_conn_evt_t));
1715 
1716 	mutex_enter(&ldcp->status_lock);
1717 	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
1718 		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
1719 		    "channel %ld", vswp->instance, ldcp->ldc_id);
1720 		mutex_exit(&ldcp->status_lock);
1721 		return;
1722 	}
1723 
1724 	/*
1725 	 * If we wish to restart the handshake on this channel, then if
1726 	 * the channel is UP we bring it DOWN to flush the underlying
1727 	 * ldc queue.
1728 	 */
1729 	if ((evt == VSW_CONN_RESTART) && (curr_status == LDC_UP))
1730 		(void) ldc_down(ldcp->ldc_handle);
1731 
1732 	if ((portp->p_hio_capable) && (portp->p_hio_enabled)) {
1733 		vsw_hio_stop(vswp, ldcp);
1734 	}
1735 
1736 	/*
1737 	 * re-init all the associated data structures.
1738 	 */
1739 	vsw_ldc_reinit(ldcp);
1740 
1741 	/*
1742 	 * Bring the channel back up (note it does no harm to
1743 	 * do this even if the channel is already UP, Just
1744 	 * becomes effectively a no-op).
1745 	 */
1746 	(void) ldc_up(ldcp->ldc_handle);
1747 
1748 	/*
1749 	 * Check if channel is now UP. This will only happen if
1750 	 * peer has also done a ldc_up().
1751 	 */
1752 	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
1753 		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
1754 		    "channel %ld", vswp->instance, ldcp->ldc_id);
1755 		mutex_exit(&ldcp->status_lock);
1756 		return;
1757 	}
1758 
1759 	ldcp->ldc_status = curr_status;
1760 
1761 	/* channel UP so restart handshake by sending version info */
1762 	if (curr_status == LDC_UP) {
1763 		if (ldcp->hcnt++ > vsw_num_handshakes) {
1764 			cmn_err(CE_WARN, "!vsw%d: exceeded number of permitted"
1765 			    " handshake attempts (%d) on channel %ld",
1766 			    vswp->instance, ldcp->hcnt, ldcp->ldc_id);
1767 			mutex_exit(&ldcp->status_lock);
1768 			return;
1769 		}
1770 
1771 		if (vsw_obp_ver_proto_workaround == B_FALSE &&
1772 		    (ddi_taskq_dispatch(vswp->taskq_p, vsw_send_ver, ldcp,
1773 		    DDI_NOSLEEP) != DDI_SUCCESS)) {
1774 			cmn_err(CE_WARN, "!vsw%d: Can't dispatch version task",
1775 			    vswp->instance);
1776 
1777 			/*
1778 			 * Don't count as valid restart attempt if couldn't
1779 			 * send version msg.
1780 			 */
1781 			if (ldcp->hcnt > 0)
1782 				ldcp->hcnt--;
1783 		}
1784 	}
1785 
1786 	/*
1787 	 * Mark that the process is complete by clearing the flag.
1788 	 *
1789 	 * Note is it possible that the taskq dispatch above may have failed,
1790 	 * most likely due to memory shortage. We still clear the flag so
1791 	 * future attempts will at least be attempted and will hopefully
1792 	 * succeed.
1793 	 */
1794 	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
1795 		ldcp->reset_active = 0;
1796 
1797 	mutex_exit(&ldcp->status_lock);
1798 
1799 	D1(vswp, "%s: exit", __func__);
1800 }
1801 
1802 /*
1803  * returns 0 if legal for event signified by flag to have
1804  * occured at the time it did. Otherwise returns 1.
1805  */
1806 int
1807 vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag)
1808 {
1809 	vsw_t		*vswp = ldcp->ldc_vswp;
1810 	uint64_t	state;
1811 	uint64_t	phase;
1812 
1813 	if (dir == INBOUND)
1814 		state = ldcp->lane_in.lstate;
1815 	else
1816 		state = ldcp->lane_out.lstate;
1817 
1818 	phase = ldcp->hphase;
1819 
1820 	switch (flag) {
1821 	case VSW_VER_INFO_RECV:
1822 		if (phase > VSW_MILESTONE0) {
1823 			DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV"
1824 			    " when in state %d\n", ldcp->ldc_id, phase);
1825 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1826 			return (1);
1827 		}
1828 		break;
1829 
1830 	case VSW_VER_ACK_RECV:
1831 	case VSW_VER_NACK_RECV:
1832 		if (!(state & VSW_VER_INFO_SENT)) {
1833 			DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK or "
1834 			    "VER_NACK when in state %d\n", ldcp->ldc_id, phase);
1835 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1836 			return (1);
1837 		} else
1838 			state &= ~VSW_VER_INFO_SENT;
1839 		break;
1840 
1841 	case VSW_ATTR_INFO_RECV:
1842 		if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) {
1843 			DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV"
1844 			    " when in state %d\n", ldcp->ldc_id, phase);
1845 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1846 			return (1);
1847 		}
1848 		break;
1849 
1850 	case VSW_ATTR_ACK_RECV:
1851 	case VSW_ATTR_NACK_RECV:
1852 		if (!(state & VSW_ATTR_INFO_SENT)) {
1853 			DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK"
1854 			    " or ATTR_NACK when in state %d\n",
1855 			    ldcp->ldc_id, phase);
1856 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1857 			return (1);
1858 		} else
1859 			state &= ~VSW_ATTR_INFO_SENT;
1860 		break;
1861 
1862 	case VSW_DRING_INFO_RECV:
1863 		if (phase < VSW_MILESTONE1) {
1864 			DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV"
1865 			    " when in state %d\n", ldcp->ldc_id, phase);
1866 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1867 			return (1);
1868 		}
1869 		break;
1870 
1871 	case VSW_DRING_ACK_RECV:
1872 	case VSW_DRING_NACK_RECV:
1873 		if (!(state & VSW_DRING_INFO_SENT)) {
1874 			DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK "
1875 			    " or DRING_NACK when in state %d\n",
1876 			    ldcp->ldc_id, phase);
1877 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1878 			return (1);
1879 		} else
1880 			state &= ~VSW_DRING_INFO_SENT;
1881 		break;
1882 
1883 	case VSW_RDX_INFO_RECV:
1884 		if (phase < VSW_MILESTONE3) {
1885 			DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV"
1886 			    " when in state %d\n", ldcp->ldc_id, phase);
1887 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1888 			return (1);
1889 		}
1890 		break;
1891 
1892 	case VSW_RDX_ACK_RECV:
1893 	case VSW_RDX_NACK_RECV:
1894 		if (!(state & VSW_RDX_INFO_SENT)) {
1895 			DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK or "
1896 			    "RDX_NACK when in state %d\n", ldcp->ldc_id, phase);
1897 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1898 			return (1);
1899 		} else
1900 			state &= ~VSW_RDX_INFO_SENT;
1901 		break;
1902 
1903 	case VSW_MCST_INFO_RECV:
1904 		if (phase < VSW_MILESTONE3) {
1905 			DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV"
1906 			    " when in state %d\n", ldcp->ldc_id, phase);
1907 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1908 			return (1);
1909 		}
1910 		break;
1911 
1912 	default:
1913 		DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)",
1914 		    ldcp->ldc_id, flag);
1915 		return (1);
1916 	}
1917 
1918 	if (dir == INBOUND)
1919 		ldcp->lane_in.lstate = state;
1920 	else
1921 		ldcp->lane_out.lstate = state;
1922 
1923 	D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id);
1924 
1925 	return (0);
1926 }
1927 
1928 void
1929 vsw_next_milestone(vsw_ldc_t *ldcp)
1930 {
1931 	vsw_t		*vswp = ldcp->ldc_vswp;
1932 	vsw_port_t	*portp = ldcp->ldc_port;
1933 
1934 	D1(vswp, "%s (chan %lld): enter (phase %ld)", __func__,
1935 	    ldcp->ldc_id, ldcp->hphase);
1936 
1937 	DUMP_FLAGS(ldcp->lane_in.lstate);
1938 	DUMP_FLAGS(ldcp->lane_out.lstate);
1939 
1940 	switch (ldcp->hphase) {
1941 
1942 	case VSW_MILESTONE0:
1943 		/*
1944 		 * If we haven't started to handshake with our peer,
1945 		 * start to do so now.
1946 		 */
1947 		if (ldcp->lane_out.lstate == 0) {
1948 			D2(vswp, "%s: (chan %lld) starting handshake "
1949 			    "with peer", __func__, ldcp->ldc_id);
1950 			vsw_process_conn_evt(ldcp, VSW_CONN_UP);
1951 		}
1952 
1953 		/*
1954 		 * Only way to pass this milestone is to have successfully
1955 		 * negotiated version info.
1956 		 */
1957 		if ((ldcp->lane_in.lstate & VSW_VER_ACK_SENT) &&
1958 		    (ldcp->lane_out.lstate & VSW_VER_ACK_RECV)) {
1959 
1960 			D2(vswp, "%s: (chan %lld) leaving milestone 0",
1961 			    __func__, ldcp->ldc_id);
1962 
1963 			vsw_set_vnet_proto_ops(ldcp);
1964 
1965 			/*
1966 			 * Next milestone is passed when attribute
1967 			 * information has been successfully exchanged.
1968 			 */
1969 			ldcp->hphase = VSW_MILESTONE1;
1970 			vsw_send_attr(ldcp);
1971 
1972 		}
1973 		break;
1974 
1975 	case VSW_MILESTONE1:
1976 		/*
1977 		 * Only way to pass this milestone is to have successfully
1978 		 * negotiated attribute information.
1979 		 */
1980 		if (ldcp->lane_in.lstate & VSW_ATTR_ACK_SENT) {
1981 
1982 			ldcp->hphase = VSW_MILESTONE2;
1983 
1984 			/*
1985 			 * If the peer device has said it wishes to
1986 			 * use descriptor rings then we send it our ring
1987 			 * info, otherwise we just set up a private ring
1988 			 * which we use an internal buffer
1989 			 */
1990 			if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
1991 			    (ldcp->lane_in.xfer_mode & VIO_DRING_MODE_V1_2)) ||
1992 			    (VSW_VER_LT(ldcp, 1, 2) &&
1993 			    (ldcp->lane_in.xfer_mode ==
1994 			    VIO_DRING_MODE_V1_0))) {
1995 				vsw_send_dring_info(ldcp);
1996 			}
1997 		}
1998 		break;
1999 
2000 	case VSW_MILESTONE2:
2001 		/*
2002 		 * If peer has indicated in its attribute message that
2003 		 * it wishes to use descriptor rings then the only way
2004 		 * to pass this milestone is for us to have received
2005 		 * valid dring info.
2006 		 *
2007 		 * If peer is not using descriptor rings then just fall
2008 		 * through.
2009 		 */
2010 		if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
2011 		    (ldcp->lane_in.xfer_mode & VIO_DRING_MODE_V1_2)) ||
2012 		    (VSW_VER_LT(ldcp, 1, 2) &&
2013 		    (ldcp->lane_in.xfer_mode ==
2014 		    VIO_DRING_MODE_V1_0))) {
2015 			if (!(ldcp->lane_in.lstate & VSW_DRING_ACK_SENT))
2016 				break;
2017 		}
2018 
2019 		D2(vswp, "%s: (chan %lld) leaving milestone 2",
2020 		    __func__, ldcp->ldc_id);
2021 
2022 		ldcp->hphase = VSW_MILESTONE3;
2023 		vsw_send_rdx(ldcp);
2024 		break;
2025 
2026 	case VSW_MILESTONE3:
2027 		/*
2028 		 * Pass this milestone when all paramaters have been
2029 		 * successfully exchanged and RDX sent in both directions.
2030 		 *
2031 		 * Mark outbound lane as available to transmit data.
2032 		 */
2033 		if ((ldcp->lane_out.lstate & VSW_RDX_ACK_SENT) &&
2034 		    (ldcp->lane_in.lstate & VSW_RDX_ACK_RECV)) {
2035 
2036 			D2(vswp, "%s: (chan %lld) leaving milestone 3",
2037 			    __func__, ldcp->ldc_id);
2038 			D2(vswp, "%s: ** handshake complete (0x%llx : "
2039 			    "0x%llx) **", __func__, ldcp->lane_in.lstate,
2040 			    ldcp->lane_out.lstate);
2041 			ldcp->lane_out.lstate |= VSW_LANE_ACTIVE;
2042 			ldcp->hphase = VSW_MILESTONE4;
2043 			ldcp->hcnt = 0;
2044 			DISPLAY_STATE();
2045 			/* Start HIO if enabled and capable */
2046 			if ((portp->p_hio_enabled) && (portp->p_hio_capable)) {
2047 				D2(vswp, "%s: start HybridIO setup", __func__);
2048 				vsw_hio_start(vswp, ldcp);
2049 			}
2050 		} else {
2051 			D2(vswp, "%s: still in milestone 3 (0x%llx : 0x%llx)",
2052 			    __func__, ldcp->lane_in.lstate,
2053 			    ldcp->lane_out.lstate);
2054 		}
2055 		break;
2056 
2057 	case VSW_MILESTONE4:
2058 		D2(vswp, "%s: (chan %lld) in milestone 4", __func__,
2059 		    ldcp->ldc_id);
2060 		break;
2061 
2062 	default:
2063 		DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__,
2064 		    ldcp->ldc_id, ldcp->hphase);
2065 	}
2066 
2067 	D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id,
2068 	    ldcp->hphase);
2069 }
2070 
2071 /*
2072  * Check if major version is supported.
2073  *
2074  * Returns 0 if finds supported major number, and if necessary
2075  * adjusts the minor field.
2076  *
2077  * Returns 1 if can't match major number exactly. Sets mjor/minor
2078  * to next lowest support values, or to zero if no other values possible.
2079  */
2080 static int
2081 vsw_supported_version(vio_ver_msg_t *vp)
2082 {
2083 	int	i;
2084 
2085 	D1(NULL, "vsw_supported_version: enter");
2086 
2087 	for (i = 0; i < VSW_NUM_VER; i++) {
2088 		if (vsw_versions[i].ver_major == vp->ver_major) {
2089 			/*
2090 			 * Matching or lower major version found. Update
2091 			 * minor number if necessary.
2092 			 */
2093 			if (vp->ver_minor > vsw_versions[i].ver_minor) {
2094 				D2(NULL, "%s: adjusting minor value from %d "
2095 				    "to %d", __func__, vp->ver_minor,
2096 				    vsw_versions[i].ver_minor);
2097 				vp->ver_minor = vsw_versions[i].ver_minor;
2098 			}
2099 
2100 			return (0);
2101 		}
2102 
2103 		/*
2104 		 * If the message contains a higher major version number, set
2105 		 * the message's major/minor versions to the current values
2106 		 * and return false, so this message will get resent with
2107 		 * these values.
2108 		 */
2109 		if (vsw_versions[i].ver_major < vp->ver_major) {
2110 			D2(NULL, "%s: adjusting major and minor "
2111 			    "values to %d, %d\n",
2112 			    __func__, vsw_versions[i].ver_major,
2113 			    vsw_versions[i].ver_minor);
2114 			vp->ver_major = vsw_versions[i].ver_major;
2115 			vp->ver_minor = vsw_versions[i].ver_minor;
2116 			return (1);
2117 		}
2118 	}
2119 
2120 	/* No match was possible, zero out fields */
2121 	vp->ver_major = 0;
2122 	vp->ver_minor = 0;
2123 
2124 	D1(NULL, "vsw_supported_version: exit");
2125 
2126 	return (1);
2127 }
2128 
2129 /*
2130  * Set vnet-protocol-version dependent functions based on version.
2131  */
2132 static void
2133 vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp)
2134 {
2135 	vsw_t	*vswp = ldcp->ldc_vswp;
2136 	lane_t	*lp = &ldcp->lane_out;
2137 
2138 	if (VSW_VER_GTEQ(ldcp, 1, 4)) {
2139 		/*
2140 		 * If the version negotiated with peer is >= 1.4(Jumbo Frame
2141 		 * Support), set the mtu in our attributes to max_frame_size.
2142 		 */
2143 		lp->mtu = vswp->max_frame_size;
2144 	} else if (VSW_VER_EQ(ldcp, 1, 3)) {
2145 		/*
2146 		 * If the version negotiated with peer is == 1.3 (Vlan Tag
2147 		 * Support) set the attr.mtu to ETHERMAX + VLAN_TAGSZ.
2148 		 */
2149 		lp->mtu = ETHERMAX + VLAN_TAGSZ;
2150 	} else {
2151 		vsw_port_t	*portp = ldcp->ldc_port;
2152 		/*
2153 		 * Pre-1.3 peers expect max frame size of ETHERMAX.
2154 		 * We can negotiate that size with those peers provided only
2155 		 * pvid is defined for our peer and there are no vids. Then we
2156 		 * can send/recv only untagged frames of max size ETHERMAX.
2157 		 * Note that pvid of the peer can be different, as vsw has to
2158 		 * serve the vnet in that vlan even if itself is not assigned
2159 		 * to that vlan.
2160 		 */
2161 		if (portp->nvids == 0) {
2162 			lp->mtu = ETHERMAX;
2163 		}
2164 	}
2165 
2166 	if (VSW_VER_GTEQ(ldcp, 1, 2)) {
2167 		/* Versions >= 1.2 */
2168 
2169 		if (VSW_PRI_ETH_DEFINED(vswp)) {
2170 			/*
2171 			 * enable priority routines and pkt mode only if
2172 			 * at least one pri-eth-type is specified in MD.
2173 			 */
2174 			ldcp->tx = vsw_ldctx_pri;
2175 			ldcp->rx_pktdata = vsw_process_pkt_data;
2176 
2177 			/* set xfer mode for vsw_send_attr() */
2178 			lp->xfer_mode = VIO_PKT_MODE | VIO_DRING_MODE_V1_2;
2179 		} else {
2180 			/* no priority eth types defined in MD */
2181 
2182 			ldcp->tx = vsw_ldctx;
2183 			ldcp->rx_pktdata = vsw_process_pkt_data_nop;
2184 
2185 			/* set xfer mode for vsw_send_attr() */
2186 			lp->xfer_mode = VIO_DRING_MODE_V1_2;
2187 		}
2188 
2189 	} else {
2190 		/* Versions prior to 1.2  */
2191 
2192 		vsw_reset_vnet_proto_ops(ldcp);
2193 	}
2194 }
2195 
2196 /*
2197  * Reset vnet-protocol-version dependent functions to v1.0.
2198  */
2199 static void
2200 vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp)
2201 {
2202 	lane_t	*lp = &ldcp->lane_out;
2203 
2204 	ldcp->tx = vsw_ldctx;
2205 	ldcp->rx_pktdata = vsw_process_pkt_data_nop;
2206 
2207 	/* set xfer mode for vsw_send_attr() */
2208 	lp->xfer_mode = VIO_DRING_MODE_V1_0;
2209 }
2210 
2211 /*
2212  * Main routine for processing messages received over LDC.
2213  */
2214 static void
2215 vsw_process_pkt(void *arg)
2216 {
2217 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
2218 	vsw_t 		*vswp = ldcp->ldc_vswp;
2219 	size_t		msglen;
2220 	vio_msg_tag_t	*tagp;
2221 	uint64_t	*ldcmsg;
2222 	int 		rv = 0;
2223 
2224 
2225 	D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
2226 
2227 	ASSERT(MUTEX_HELD(&ldcp->ldc_cblock));
2228 
2229 	ldcmsg = ldcp->ldcmsg;
2230 	/*
2231 	 * If channel is up read messages until channel is empty.
2232 	 */
2233 	do {
2234 		msglen = ldcp->msglen;
2235 		rv = ldc_read(ldcp->ldc_handle, (caddr_t)ldcmsg, &msglen);
2236 
2237 		if (rv != 0) {
2238 			DERR(vswp, "%s :ldc_read err id(%lld) rv(%d) len(%d)\n",
2239 			    __func__, ldcp->ldc_id, rv, msglen);
2240 		}
2241 
2242 		/* channel has been reset */
2243 		if (rv == ECONNRESET) {
2244 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
2245 			break;
2246 		}
2247 
2248 		if (msglen == 0) {
2249 			D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__,
2250 			    ldcp->ldc_id);
2251 			break;
2252 		}
2253 
2254 		D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__,
2255 		    ldcp->ldc_id, msglen);
2256 
2257 		/*
2258 		 * Figure out what sort of packet we have gotten by
2259 		 * examining the msg tag, and then switch it appropriately.
2260 		 */
2261 		tagp = (vio_msg_tag_t *)ldcmsg;
2262 
2263 		switch (tagp->vio_msgtype) {
2264 		case VIO_TYPE_CTRL:
2265 			vsw_dispatch_ctrl_task(ldcp, ldcmsg, tagp);
2266 			break;
2267 		case VIO_TYPE_DATA:
2268 			vsw_process_data_pkt(ldcp, ldcmsg, tagp, msglen);
2269 			break;
2270 		case VIO_TYPE_ERR:
2271 			vsw_process_err_pkt(ldcp, ldcmsg, tagp);
2272 			break;
2273 		default:
2274 			DERR(vswp, "%s: Unknown tag(%lx) ", __func__,
2275 			    "id(%lx)\n", tagp->vio_msgtype, ldcp->ldc_id);
2276 			break;
2277 		}
2278 	} while (msglen);
2279 
2280 	D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id);
2281 }
2282 
2283 /*
2284  * Dispatch a task to process a VIO control message.
2285  */
2286 static void
2287 vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t *tagp)
2288 {
2289 	vsw_ctrl_task_t		*ctaskp = NULL;
2290 	vsw_port_t		*port = ldcp->ldc_port;
2291 	vsw_t			*vswp = port->p_vswp;
2292 
2293 	D1(vswp, "%s: enter", __func__);
2294 
2295 	/*
2296 	 * We need to handle RDX ACK messages in-band as once they
2297 	 * are exchanged it is possible that we will get an
2298 	 * immediate (legitimate) data packet.
2299 	 */
2300 	if ((tagp->vio_subtype_env == VIO_RDX) &&
2301 	    (tagp->vio_subtype == VIO_SUBTYPE_ACK)) {
2302 
2303 		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_ACK_RECV))
2304 			return;
2305 
2306 		ldcp->lane_in.lstate |= VSW_RDX_ACK_RECV;
2307 		D2(vswp, "%s (%ld) handling RDX_ACK in place "
2308 		    "(ostate 0x%llx : hphase %d)", __func__,
2309 		    ldcp->ldc_id, ldcp->lane_in.lstate, ldcp->hphase);
2310 		vsw_next_milestone(ldcp);
2311 		return;
2312 	}
2313 
2314 	ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP);
2315 
2316 	if (ctaskp == NULL) {
2317 		DERR(vswp, "%s: unable to alloc space for ctrl msg", __func__);
2318 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2319 		return;
2320 	}
2321 
2322 	ctaskp->ldcp = ldcp;
2323 	bcopy((def_msg_t *)cpkt, &ctaskp->pktp, sizeof (def_msg_t));
2324 	ctaskp->hss_id = ldcp->hss_id;
2325 
2326 	/*
2327 	 * Dispatch task to processing taskq if port is not in
2328 	 * the process of being detached.
2329 	 */
2330 	mutex_enter(&port->state_lock);
2331 	if (port->state == VSW_PORT_INIT) {
2332 		if ((vswp->taskq_p == NULL) ||
2333 		    (ddi_taskq_dispatch(vswp->taskq_p, vsw_process_ctrl_pkt,
2334 		    ctaskp, DDI_NOSLEEP) != DDI_SUCCESS)) {
2335 			mutex_exit(&port->state_lock);
2336 			DERR(vswp, "%s: unable to dispatch task to taskq",
2337 			    __func__);
2338 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2339 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2340 			return;
2341 		}
2342 	} else {
2343 		kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2344 		DWARN(vswp, "%s: port %d detaching, not dispatching "
2345 		    "task", __func__, port->p_instance);
2346 	}
2347 
2348 	mutex_exit(&port->state_lock);
2349 
2350 	D2(vswp, "%s: dispatched task to taskq for chan %d", __func__,
2351 	    ldcp->ldc_id);
2352 	D1(vswp, "%s: exit", __func__);
2353 }
2354 
2355 /*
2356  * Process a VIO ctrl message. Invoked from taskq.
2357  */
2358 static void
2359 vsw_process_ctrl_pkt(void *arg)
2360 {
2361 	vsw_ctrl_task_t	*ctaskp = (vsw_ctrl_task_t *)arg;
2362 	vsw_ldc_t	*ldcp = ctaskp->ldcp;
2363 	vsw_t 		*vswp = ldcp->ldc_vswp;
2364 	vio_msg_tag_t	tag;
2365 	uint16_t	env;
2366 
2367 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2368 
2369 	bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t));
2370 	env = tag.vio_subtype_env;
2371 
2372 	/* stale pkt check */
2373 	if (ctaskp->hss_id < ldcp->hss_id) {
2374 		DWARN(vswp, "%s: discarding stale packet belonging to earlier"
2375 		    " (%ld) handshake session", __func__, ctaskp->hss_id);
2376 		kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2377 		return;
2378 	}
2379 
2380 	/* session id check */
2381 	if (ldcp->session_status & VSW_PEER_SESSION) {
2382 		if (ldcp->peer_session != tag.vio_sid) {
2383 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
2384 			    __func__, ldcp->ldc_id, tag.vio_sid);
2385 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2386 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2387 			return;
2388 		}
2389 	}
2390 
2391 	/*
2392 	 * Switch on vio_subtype envelope, then let lower routines
2393 	 * decide if its an INFO, ACK or NACK packet.
2394 	 */
2395 	switch (env) {
2396 	case VIO_VER_INFO:
2397 		vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp);
2398 		break;
2399 	case VIO_DRING_REG:
2400 		vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp);
2401 		break;
2402 	case VIO_DRING_UNREG:
2403 		vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp);
2404 		break;
2405 	case VIO_ATTR_INFO:
2406 		vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp);
2407 		break;
2408 	case VNET_MCAST_INFO:
2409 		vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp);
2410 		break;
2411 	case VIO_RDX:
2412 		vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp);
2413 		break;
2414 	case VIO_DDS_INFO:
2415 		vsw_process_dds_msg(vswp, ldcp, &ctaskp->pktp);
2416 		break;
2417 	default:
2418 		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env);
2419 	}
2420 
2421 	kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2422 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
2423 }
2424 
2425 /*
2426  * Version negotiation. We can end up here either because our peer
2427  * has responded to a handshake message we have sent it, or our peer
2428  * has initiated a handshake with us. If its the former then can only
2429  * be ACK or NACK, if its the later can only be INFO.
2430  *
2431  * If its an ACK we move to the next stage of the handshake, namely
2432  * attribute exchange. If its a NACK we see if we can specify another
2433  * version, if we can't we stop.
2434  *
2435  * If it is an INFO we reset all params associated with communication
2436  * in that direction over this channel (remember connection is
2437  * essentially 2 independent simplex channels).
2438  */
2439 void
2440 vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt)
2441 {
2442 	vio_ver_msg_t	*ver_pkt;
2443 	vsw_t 		*vswp = ldcp->ldc_vswp;
2444 
2445 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2446 
2447 	/*
2448 	 * We know this is a ctrl/version packet so
2449 	 * cast it into the correct structure.
2450 	 */
2451 	ver_pkt = (vio_ver_msg_t *)pkt;
2452 
2453 	switch (ver_pkt->tag.vio_subtype) {
2454 	case VIO_SUBTYPE_INFO:
2455 		D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n");
2456 
2457 		/*
2458 		 * Record the session id, which we will use from now
2459 		 * until we see another VER_INFO msg. Even then the
2460 		 * session id in most cases will be unchanged, execpt
2461 		 * if channel was reset.
2462 		 */
2463 		if ((ldcp->session_status & VSW_PEER_SESSION) &&
2464 		    (ldcp->peer_session != ver_pkt->tag.vio_sid)) {
2465 			DERR(vswp, "%s: updating session id for chan %lld "
2466 			    "from %llx to %llx", __func__, ldcp->ldc_id,
2467 			    ldcp->peer_session, ver_pkt->tag.vio_sid);
2468 		}
2469 
2470 		ldcp->peer_session = ver_pkt->tag.vio_sid;
2471 		ldcp->session_status |= VSW_PEER_SESSION;
2472 
2473 		/* Legal message at this time ? */
2474 		if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV))
2475 			return;
2476 
2477 		/*
2478 		 * First check the device class. Currently only expect
2479 		 * to be talking to a network device. In the future may
2480 		 * also talk to another switch.
2481 		 */
2482 		if (ver_pkt->dev_class != VDEV_NETWORK) {
2483 			DERR(vswp, "%s: illegal device class %d", __func__,
2484 			    ver_pkt->dev_class);
2485 
2486 			ver_pkt->tag.vio_sid = ldcp->local_session;
2487 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2488 
2489 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2490 
2491 			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2492 			    sizeof (vio_ver_msg_t), B_TRUE);
2493 
2494 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
2495 			vsw_next_milestone(ldcp);
2496 			return;
2497 		} else {
2498 			ldcp->dev_class = ver_pkt->dev_class;
2499 		}
2500 
2501 		/*
2502 		 * Now check the version.
2503 		 */
2504 		if (vsw_supported_version(ver_pkt) == 0) {
2505 			/*
2506 			 * Support this major version and possibly
2507 			 * adjusted minor version.
2508 			 */
2509 
2510 			D2(vswp, "%s: accepted ver %d:%d", __func__,
2511 			    ver_pkt->ver_major, ver_pkt->ver_minor);
2512 
2513 			/* Store accepted values */
2514 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
2515 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
2516 
2517 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2518 
2519 			ldcp->lane_in.lstate |= VSW_VER_ACK_SENT;
2520 
2521 			if (vsw_obp_ver_proto_workaround == B_TRUE) {
2522 				/*
2523 				 * Send a version info message
2524 				 * using the accepted version that
2525 				 * we are about to ack. Also note that
2526 				 * we send our ver info before we ack.
2527 				 * Otherwise, as soon as receiving the
2528 				 * ack, obp sends attr info msg, which
2529 				 * breaks vsw_check_flag() invoked
2530 				 * from vsw_process_ctrl_attr_pkt();
2531 				 * as we also need VSW_VER_ACK_RECV to
2532 				 * be set in lane_out.lstate, before
2533 				 * we can receive attr info.
2534 				 */
2535 				vsw_send_ver(ldcp);
2536 			}
2537 		} else {
2538 			/*
2539 			 * NACK back with the next lower major/minor
2540 			 * pairing we support (if don't suuport any more
2541 			 * versions then they will be set to zero.
2542 			 */
2543 
2544 			D2(vswp, "%s: replying with ver %d:%d", __func__,
2545 			    ver_pkt->ver_major, ver_pkt->ver_minor);
2546 
2547 			/* Store updated values */
2548 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
2549 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
2550 
2551 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2552 
2553 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
2554 		}
2555 
2556 		DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2557 		ver_pkt->tag.vio_sid = ldcp->local_session;
2558 		(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2559 		    sizeof (vio_ver_msg_t), B_TRUE);
2560 
2561 		vsw_next_milestone(ldcp);
2562 		break;
2563 
2564 	case VIO_SUBTYPE_ACK:
2565 		D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__);
2566 
2567 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV))
2568 			return;
2569 
2570 		/* Store updated values */
2571 		ldcp->lane_out.ver_major = ver_pkt->ver_major;
2572 		ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
2573 
2574 		ldcp->lane_out.lstate |= VSW_VER_ACK_RECV;
2575 		vsw_next_milestone(ldcp);
2576 
2577 		break;
2578 
2579 	case VIO_SUBTYPE_NACK:
2580 		D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__);
2581 
2582 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV))
2583 			return;
2584 
2585 		/*
2586 		 * If our peer sent us a NACK with the ver fields set to
2587 		 * zero then there is nothing more we can do. Otherwise see
2588 		 * if we support either the version suggested, or a lesser
2589 		 * one.
2590 		 */
2591 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
2592 			DERR(vswp, "%s: peer unable to negotiate any "
2593 			    "further.", __func__);
2594 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
2595 			vsw_next_milestone(ldcp);
2596 			return;
2597 		}
2598 
2599 		/*
2600 		 * Check to see if we support this major version or
2601 		 * a lower one. If we don't then maj/min will be set
2602 		 * to zero.
2603 		 */
2604 		(void) vsw_supported_version(ver_pkt);
2605 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
2606 			/* Nothing more we can do */
2607 			DERR(vswp, "%s: version negotiation failed.\n",
2608 			    __func__);
2609 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
2610 			vsw_next_milestone(ldcp);
2611 		} else {
2612 			/* found a supported major version */
2613 			ldcp->lane_out.ver_major = ver_pkt->ver_major;
2614 			ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
2615 
2616 			D2(vswp, "%s: resending with updated values (%x, %x)",
2617 			    __func__, ver_pkt->ver_major, ver_pkt->ver_minor);
2618 
2619 			ldcp->lane_out.lstate |= VSW_VER_INFO_SENT;
2620 			ver_pkt->tag.vio_sid = ldcp->local_session;
2621 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
2622 
2623 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2624 
2625 			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
2626 			    sizeof (vio_ver_msg_t), B_TRUE);
2627 
2628 			vsw_next_milestone(ldcp);
2629 
2630 		}
2631 		break;
2632 
2633 	default:
2634 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
2635 		    ver_pkt->tag.vio_subtype);
2636 	}
2637 
2638 	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
2639 }
2640 
2641 /*
2642  * Process an attribute packet. We can end up here either because our peer
2643  * has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or our
2644  * peer has sent us an attribute INFO message
2645  *
2646  * If its an ACK we then move to the next stage of the handshake which
2647  * is to send our descriptor ring info to our peer. If its a NACK then
2648  * there is nothing more we can (currently) do.
2649  *
2650  * If we get a valid/acceptable INFO packet (and we have already negotiated
2651  * a version) we ACK back and set channel state to ATTR_RECV, otherwise we
2652  * NACK back and reset channel state to INACTIV.
2653  *
2654  * FUTURE: in time we will probably negotiate over attributes, but for
2655  * the moment unacceptable attributes are regarded as a fatal error.
2656  *
2657  */
2658 void
2659 vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt)
2660 {
2661 	vnet_attr_msg_t		*attr_pkt;
2662 	vsw_t			*vswp = ldcp->ldc_vswp;
2663 	vsw_port_t		*port = ldcp->ldc_port;
2664 	uint64_t		macaddr = 0;
2665 	lane_t			*lane_out = &ldcp->lane_out;
2666 	lane_t			*lane_in = &ldcp->lane_in;
2667 	uint32_t		mtu;
2668 	boolean_t		ack = B_TRUE;
2669 	int			i;
2670 
2671 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
2672 
2673 	/*
2674 	 * We know this is a ctrl/attr packet so
2675 	 * cast it into the correct structure.
2676 	 */
2677 	attr_pkt = (vnet_attr_msg_t *)pkt;
2678 
2679 	switch (attr_pkt->tag.vio_subtype) {
2680 	case VIO_SUBTYPE_INFO:
2681 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2682 
2683 		if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV))
2684 			return;
2685 
2686 		/*
2687 		 * If the attributes are unacceptable then we NACK back.
2688 		 */
2689 		if (vsw_check_attr(attr_pkt, ldcp)) {
2690 			ack = B_FALSE;
2691 
2692 			DERR(vswp, "%s (chan %d): invalid attributes",
2693 			    __func__, ldcp->ldc_id);
2694 
2695 		} else {
2696 
2697 			if (VSW_VER_GTEQ(ldcp, 1, 4)) {
2698 				/*
2699 				 * Versions >= 1.4:
2700 				 * The mtu is negotiated down to the
2701 				 * minimum of our mtu and peer's mtu.
2702 				 */
2703 				mtu = MIN(attr_pkt->mtu, vswp->max_frame_size);
2704 
2705 				/*
2706 				 * If we have received an ack for the attr info
2707 				 * that we sent, then check if the mtu computed
2708 				 * above matches the mtu that the peer had ack'd
2709 				 * (saved in local hparams). If they don't
2710 				 * match, we fail the handshake.
2711 				 */
2712 				if (lane_out->lstate & VSW_ATTR_ACK_RECV) {
2713 					if (mtu != lane_out->mtu) {
2714 						/* send NACK */
2715 						ack = B_FALSE;
2716 					}
2717 				} else {
2718 					/*
2719 					 * Save the mtu computed above in our
2720 					 * attr parameters, so it gets sent in
2721 					 * the attr info from us to the peer.
2722 					 */
2723 					lane_out->mtu = mtu;
2724 				}
2725 			}
2726 
2727 		}
2728 
2729 		if (ack == B_FALSE) {
2730 
2731 			vsw_free_lane_resources(ldcp, INBOUND);
2732 
2733 			attr_pkt->tag.vio_sid = ldcp->local_session;
2734 			attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2735 
2736 			DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
2737 			ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT;
2738 			(void) vsw_send_msg(ldcp, (void *)attr_pkt,
2739 			    sizeof (vnet_attr_msg_t), B_TRUE);
2740 
2741 			vsw_next_milestone(ldcp);
2742 			return;
2743 		}
2744 
2745 		/*
2746 		 * Otherwise store attributes for this lane and update
2747 		 * lane state.
2748 		 */
2749 		lane_in->mtu = attr_pkt->mtu;
2750 		lane_in->addr = attr_pkt->addr;
2751 		lane_in->addr_type = attr_pkt->addr_type;
2752 		lane_in->xfer_mode = attr_pkt->xfer_mode;
2753 		lane_in->ack_freq = attr_pkt->ack_freq;
2754 
2755 		if (VSW_VER_GTEQ(ldcp, 1, 4)) {
2756 			/* save the MIN mtu in the msg to be replied */
2757 			attr_pkt->mtu = mtu;
2758 		}
2759 
2760 		macaddr = lane_in->addr;
2761 		for (i = ETHERADDRL - 1; i >= 0; i--) {
2762 			port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF;
2763 			macaddr >>= 8;
2764 		}
2765 
2766 		/* create the fdb entry for this port/mac address */
2767 		vsw_fdbe_add(vswp, port);
2768 
2769 		/* add the port to the specified vlans */
2770 		vsw_vlan_add_ids(port, VSW_VNETPORT);
2771 
2772 		/* setup device specifc xmit routines */
2773 		mutex_enter(&port->tx_lock);
2774 		if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
2775 		    (lane_in->xfer_mode & VIO_DRING_MODE_V1_2)) ||
2776 		    (VSW_VER_LT(ldcp, 1, 2) &&
2777 		    (lane_in->xfer_mode == VIO_DRING_MODE_V1_0))) {
2778 			D2(vswp, "%s: mode = VIO_DRING_MODE", __func__);
2779 			port->transmit = vsw_dringsend;
2780 		} else if (lane_in->xfer_mode == VIO_DESC_MODE) {
2781 			D2(vswp, "%s: mode = VIO_DESC_MODE", __func__);
2782 			vsw_create_privring(ldcp);
2783 			port->transmit = vsw_descrsend;
2784 			lane_out->xfer_mode = VIO_DESC_MODE;
2785 		}
2786 
2787 		/*
2788 		 * HybridIO is supported only vnet, not by OBP.
2789 		 * So, set hio_capable to true only when in DRING mode.
2790 		 */
2791 		if (VSW_VER_GTEQ(ldcp, 1, 3) &&
2792 		    (lane_in->xfer_mode != VIO_DESC_MODE)) {
2793 			(void) atomic_swap_32(&port->p_hio_capable, B_TRUE);
2794 		} else {
2795 			(void) atomic_swap_32(&port->p_hio_capable, B_FALSE);
2796 		}
2797 
2798 		mutex_exit(&port->tx_lock);
2799 
2800 		attr_pkt->tag.vio_sid = ldcp->local_session;
2801 		attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2802 
2803 		DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
2804 
2805 		lane_in->lstate |= VSW_ATTR_ACK_SENT;
2806 
2807 		(void) vsw_send_msg(ldcp, (void *)attr_pkt,
2808 		    sizeof (vnet_attr_msg_t), B_TRUE);
2809 
2810 		vsw_next_milestone(ldcp);
2811 		break;
2812 
2813 	case VIO_SUBTYPE_ACK:
2814 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
2815 
2816 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV))
2817 			return;
2818 
2819 		if (VSW_VER_GTEQ(ldcp, 1, 4)) {
2820 			/*
2821 			 * Versions >= 1.4:
2822 			 * The ack msg sent by the peer contains the minimum of
2823 			 * our mtu (that we had sent in our attr info) and the
2824 			 * peer's mtu.
2825 			 *
2826 			 * If we have sent an ack for the attr info msg from
2827 			 * the peer, check if the mtu that was computed then
2828 			 * (saved in lane_out params) matches the mtu that the
2829 			 * peer has ack'd. If they don't match, we fail the
2830 			 * handshake.
2831 			 */
2832 			if (lane_in->lstate & VSW_ATTR_ACK_SENT) {
2833 				if (lane_out->mtu != attr_pkt->mtu) {
2834 					return;
2835 				}
2836 			} else {
2837 				/*
2838 				 * If the mtu ack'd by the peer is > our mtu
2839 				 * fail handshake. Otherwise, save the mtu, so
2840 				 * we can validate it when we receive attr info
2841 				 * from our peer.
2842 				 */
2843 				if (attr_pkt->mtu > lane_out->mtu) {
2844 					return;
2845 				}
2846 				if (attr_pkt->mtu <= lane_out->mtu) {
2847 					lane_out->mtu = attr_pkt->mtu;
2848 				}
2849 			}
2850 		}
2851 
2852 		lane_out->lstate |= VSW_ATTR_ACK_RECV;
2853 		vsw_next_milestone(ldcp);
2854 		break;
2855 
2856 	case VIO_SUBTYPE_NACK:
2857 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
2858 
2859 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV))
2860 			return;
2861 
2862 		lane_out->lstate |= VSW_ATTR_NACK_RECV;
2863 		vsw_next_milestone(ldcp);
2864 		break;
2865 
2866 	default:
2867 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
2868 		    attr_pkt->tag.vio_subtype);
2869 	}
2870 
2871 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
2872 }
2873 
2874 /*
2875  * Process a dring info packet. We can end up here either because our peer
2876  * has ACK/NACK'ed back to an earlier DRING msg we had sent it, or our
2877  * peer has sent us a dring INFO message.
2878  *
2879  * If we get a valid/acceptable INFO packet (and we have already negotiated
2880  * a version) we ACK back and update the lane state, otherwise we NACK back.
2881  *
2882  * FUTURE: nothing to stop client from sending us info on multiple dring's
2883  * but for the moment we will just use the first one we are given.
2884  *
2885  */
2886 void
2887 vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt)
2888 {
2889 	vio_dring_reg_msg_t	*dring_pkt;
2890 	vsw_t			*vswp = ldcp->ldc_vswp;
2891 	ldc_mem_info_t		minfo;
2892 	dring_info_t		*dp, *dbp;
2893 	int			dring_found = 0;
2894 
2895 	/*
2896 	 * We know this is a ctrl/dring packet so
2897 	 * cast it into the correct structure.
2898 	 */
2899 	dring_pkt = (vio_dring_reg_msg_t *)pkt;
2900 
2901 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
2902 
2903 	switch (dring_pkt->tag.vio_subtype) {
2904 	case VIO_SUBTYPE_INFO:
2905 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2906 
2907 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
2908 			return;
2909 
2910 		/*
2911 		 * If the dring params are unacceptable then we NACK back.
2912 		 */
2913 		if (vsw_check_dring_info(dring_pkt)) {
2914 
2915 			DERR(vswp, "%s (%lld): invalid dring info",
2916 			    __func__, ldcp->ldc_id);
2917 
2918 			vsw_free_lane_resources(ldcp, INBOUND);
2919 
2920 			dring_pkt->tag.vio_sid = ldcp->local_session;
2921 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2922 
2923 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
2924 
2925 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
2926 
2927 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
2928 			    sizeof (vio_dring_reg_msg_t), B_TRUE);
2929 
2930 			vsw_next_milestone(ldcp);
2931 			return;
2932 		}
2933 
2934 		/*
2935 		 * Otherwise, attempt to map in the dring using the
2936 		 * cookie. If that succeeds we send back a unique dring
2937 		 * identifier that the sending side will use in future
2938 		 * to refer to this descriptor ring.
2939 		 */
2940 		dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
2941 
2942 		dp->num_descriptors = dring_pkt->num_descriptors;
2943 		dp->descriptor_size = dring_pkt->descriptor_size;
2944 		dp->options = dring_pkt->options;
2945 		dp->ncookies = dring_pkt->ncookies;
2946 
2947 		/*
2948 		 * Note: should only get one cookie. Enforced in
2949 		 * the ldc layer.
2950 		 */
2951 		bcopy(&dring_pkt->cookie[0], &dp->cookie[0],
2952 		    sizeof (ldc_mem_cookie_t));
2953 
2954 		D2(vswp, "%s: num_desc %ld : desc_size %ld", __func__,
2955 		    dp->num_descriptors, dp->descriptor_size);
2956 		D2(vswp, "%s: options 0x%lx: ncookies %ld", __func__,
2957 		    dp->options, dp->ncookies);
2958 
2959 		if ((ldc_mem_dring_map(ldcp->ldc_handle, &dp->cookie[0],
2960 		    dp->ncookies, dp->num_descriptors, dp->descriptor_size,
2961 		    LDC_DIRECT_MAP, &(dp->handle))) != 0) {
2962 
2963 			DERR(vswp, "%s: dring_map failed\n", __func__);
2964 
2965 			kmem_free(dp, sizeof (dring_info_t));
2966 			vsw_free_lane_resources(ldcp, INBOUND);
2967 
2968 			dring_pkt->tag.vio_sid = ldcp->local_session;
2969 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2970 
2971 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
2972 
2973 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
2974 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
2975 			    sizeof (vio_dring_reg_msg_t), B_TRUE);
2976 
2977 			vsw_next_milestone(ldcp);
2978 			return;
2979 		}
2980 
2981 		if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
2982 
2983 			DERR(vswp, "%s: dring_addr failed\n", __func__);
2984 
2985 			kmem_free(dp, sizeof (dring_info_t));
2986 			vsw_free_lane_resources(ldcp, INBOUND);
2987 
2988 			dring_pkt->tag.vio_sid = ldcp->local_session;
2989 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2990 
2991 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
2992 
2993 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
2994 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
2995 			    sizeof (vio_dring_reg_msg_t), B_TRUE);
2996 
2997 			vsw_next_milestone(ldcp);
2998 			return;
2999 		} else {
3000 			/* store the address of the pub part of ring */
3001 			dp->pub_addr = minfo.vaddr;
3002 
3003 			/* cache the dring mtype */
3004 			dp->dring_mtype = minfo.mtype;
3005 		}
3006 
3007 		/* no private section as we are importing */
3008 		dp->priv_addr = NULL;
3009 
3010 		/*
3011 		 * Using simple mono increasing int for ident at
3012 		 * the moment.
3013 		 */
3014 		dp->ident = ldcp->next_ident;
3015 		ldcp->next_ident++;
3016 
3017 		dp->end_idx = 0;
3018 		dp->next = NULL;
3019 
3020 		/*
3021 		 * Link it onto the end of the list of drings
3022 		 * for this lane.
3023 		 */
3024 		if (ldcp->lane_in.dringp == NULL) {
3025 			D2(vswp, "%s: adding first INBOUND dring", __func__);
3026 			ldcp->lane_in.dringp = dp;
3027 		} else {
3028 			dbp = ldcp->lane_in.dringp;
3029 
3030 			while (dbp->next != NULL)
3031 				dbp = dbp->next;
3032 
3033 			dbp->next = dp;
3034 		}
3035 
3036 		/* acknowledge it */
3037 		dring_pkt->tag.vio_sid = ldcp->local_session;
3038 		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3039 		dring_pkt->dring_ident = dp->ident;
3040 
3041 		(void) vsw_send_msg(ldcp, (void *)dring_pkt,
3042 		    sizeof (vio_dring_reg_msg_t), B_TRUE);
3043 
3044 		ldcp->lane_in.lstate |= VSW_DRING_ACK_SENT;
3045 		vsw_next_milestone(ldcp);
3046 		break;
3047 
3048 	case VIO_SUBTYPE_ACK:
3049 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3050 
3051 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV))
3052 			return;
3053 
3054 		/*
3055 		 * Peer is acknowledging our dring info and will have
3056 		 * sent us a dring identifier which we will use to
3057 		 * refer to this ring w.r.t. our peer.
3058 		 */
3059 		dp = ldcp->lane_out.dringp;
3060 		if (dp != NULL) {
3061 			/*
3062 			 * Find the ring this ident should be associated
3063 			 * with.
3064 			 */
3065 			if (vsw_dring_match(dp, dring_pkt)) {
3066 				dring_found = 1;
3067 
3068 			} else while (dp != NULL) {
3069 				if (vsw_dring_match(dp, dring_pkt)) {
3070 					dring_found = 1;
3071 					break;
3072 				}
3073 				dp = dp->next;
3074 			}
3075 
3076 			if (dring_found == 0) {
3077 				DERR(NULL, "%s: unrecognised ring cookie",
3078 				    __func__);
3079 				vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3080 				return;
3081 			}
3082 
3083 		} else {
3084 			DERR(vswp, "%s: DRING ACK received but no drings "
3085 			    "allocated", __func__);
3086 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3087 			return;
3088 		}
3089 
3090 		/* store ident */
3091 		dp->ident = dring_pkt->dring_ident;
3092 		ldcp->lane_out.lstate |= VSW_DRING_ACK_RECV;
3093 		vsw_next_milestone(ldcp);
3094 		break;
3095 
3096 	case VIO_SUBTYPE_NACK:
3097 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3098 
3099 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV))
3100 			return;
3101 
3102 		ldcp->lane_out.lstate |= VSW_DRING_NACK_RECV;
3103 		vsw_next_milestone(ldcp);
3104 		break;
3105 
3106 	default:
3107 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3108 		    dring_pkt->tag.vio_subtype);
3109 	}
3110 
3111 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
3112 }
3113 
3114 /*
3115  * Process a request from peer to unregister a dring.
3116  *
3117  * For the moment we just restart the handshake if our
3118  * peer endpoint attempts to unregister a dring.
3119  */
3120 void
3121 vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt)
3122 {
3123 	vsw_t			*vswp = ldcp->ldc_vswp;
3124 	vio_dring_unreg_msg_t	*dring_pkt;
3125 
3126 	/*
3127 	 * We know this is a ctrl/dring packet so
3128 	 * cast it into the correct structure.
3129 	 */
3130 	dring_pkt = (vio_dring_unreg_msg_t *)pkt;
3131 
3132 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3133 
3134 	switch (dring_pkt->tag.vio_subtype) {
3135 	case VIO_SUBTYPE_INFO:
3136 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3137 
3138 		DWARN(vswp, "%s: restarting handshake..", __func__);
3139 		break;
3140 
3141 	case VIO_SUBTYPE_ACK:
3142 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3143 
3144 		DWARN(vswp, "%s: restarting handshake..", __func__);
3145 		break;
3146 
3147 	case VIO_SUBTYPE_NACK:
3148 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3149 
3150 		DWARN(vswp, "%s: restarting handshake..", __func__);
3151 		break;
3152 
3153 	default:
3154 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3155 		    dring_pkt->tag.vio_subtype);
3156 	}
3157 
3158 	vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3159 
3160 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3161 }
3162 
/*
 * Turn the multicast message 'pkt' into a NACK carrying our local
 * session id and send it back over channel 'ldcp'. The result of the
 * send is deliberately ignored.
 *
 * The body is wrapped in do { } while (0) and the arguments are
 * parenthesised so that the macro behaves as a single statement (it is
 * safe in an unbraced if/else) and accepts arbitrary pointer
 * expressions. Invoke it with a trailing semicolon.
 */
#define	SND_MCST_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		(void) vsw_send_msg((ldcp), (void *)(pkt), \
		    sizeof (vnet_mcast_msg_t), B_TRUE); \
	} while (0)
3168 
3169 /*
3170  * Process a multicast request from a vnet.
3171  *
3172  * Vnet's specify a multicast address that they are interested in. This
3173  * address is used as a key into the hash table which forms the multicast
3174  * forwarding database (mFDB).
3175  *
3176  * The table keys are the multicast addresses, while the table entries
3177  * are pointers to lists of ports which wish to receive packets for the
3178  * specified multicast address.
3179  *
3180  * When a multicast packet is being switched we use the address as a key
3181  * into the hash table, and then walk the appropriate port list forwarding
3182  * the pkt to each port in turn.
3183  *
3184  * If a vnet is no longer interested in a particular multicast grouping
3185  * we simply find the correct location in the hash table and then delete
3186  * the relevant port from the port list.
3187  *
3188  * To deal with the case whereby a port is being deleted without first
3189  * removing itself from the lists in the hash table, we maintain a list
3190  * of multicast addresses the port has registered an interest in, within
3191  * the port structure itself. We then simply walk that list of addresses
3192  * using them as keys into the hash table and remove the port from the
3193  * appropriate lists.
3194  */
3195 static void
3196 vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt)
3197 {
3198 	vnet_mcast_msg_t	*mcst_pkt;
3199 	vsw_port_t		*port = ldcp->ldc_port;
3200 	vsw_t			*vswp = ldcp->ldc_vswp;
3201 	int			i;
3202 
3203 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3204 
3205 	/*
3206 	 * We know this is a ctrl/mcast packet so
3207 	 * cast it into the correct structure.
3208 	 */
3209 	mcst_pkt = (vnet_mcast_msg_t *)pkt;
3210 
3211 	switch (mcst_pkt->tag.vio_subtype) {
3212 	case VIO_SUBTYPE_INFO:
3213 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3214 
3215 		/*
3216 		 * Check if in correct state to receive a multicast
3217 		 * message (i.e. handshake complete). If not reset
3218 		 * the handshake.
3219 		 */
3220 		if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV))
3221 			return;
3222 
3223 		/*
3224 		 * Before attempting to add or remove address check
3225 		 * that they are valid multicast addresses.
3226 		 * If not, then NACK back.
3227 		 */
3228 		for (i = 0; i < mcst_pkt->count; i++) {
3229 			if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) {
3230 				DERR(vswp, "%s: invalid multicast address",
3231 				    __func__);
3232 				SND_MCST_NACK(ldcp, mcst_pkt);
3233 				return;
3234 			}
3235 		}
3236 
3237 		/*
3238 		 * Now add/remove the addresses. If this fails we
3239 		 * NACK back.
3240 		 */
3241 		if (vsw_add_rem_mcst(mcst_pkt, port) != 0) {
3242 			SND_MCST_NACK(ldcp, mcst_pkt);
3243 			return;
3244 		}
3245 
3246 		mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3247 		mcst_pkt->tag.vio_sid = ldcp->local_session;
3248 
3249 		DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt);
3250 
3251 		(void) vsw_send_msg(ldcp, (void *)mcst_pkt,
3252 		    sizeof (vnet_mcast_msg_t), B_TRUE);
3253 		break;
3254 
3255 	case VIO_SUBTYPE_ACK:
3256 		DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3257 
3258 		/*
3259 		 * We shouldn't ever get a multicast ACK message as
3260 		 * at the moment we never request multicast addresses
3261 		 * to be set on some other device. This may change in
3262 		 * the future if we have cascading switches.
3263 		 */
3264 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV))
3265 			return;
3266 
3267 				/* Do nothing */
3268 		break;
3269 
3270 	case VIO_SUBTYPE_NACK:
3271 		DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3272 
3273 		/*
3274 		 * We shouldn't get a multicast NACK packet for the
3275 		 * same reasons as we shouldn't get a ACK packet.
3276 		 */
3277 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV))
3278 			return;
3279 
3280 				/* Do nothing */
3281 		break;
3282 
3283 	default:
3284 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
3285 		    mcst_pkt->tag.vio_subtype);
3286 	}
3287 
3288 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3289 }
3290 
3291 static void
3292 vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt)
3293 {
3294 	vio_rdx_msg_t	*rdx_pkt;
3295 	vsw_t		*vswp = ldcp->ldc_vswp;
3296 
3297 	/*
3298 	 * We know this is a ctrl/rdx packet so
3299 	 * cast it into the correct structure.
3300 	 */
3301 	rdx_pkt = (vio_rdx_msg_t *)pkt;
3302 
3303 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
3304 
3305 	switch (rdx_pkt->tag.vio_subtype) {
3306 	case VIO_SUBTYPE_INFO:
3307 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3308 
3309 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_INFO_RECV))
3310 			return;
3311 
3312 		rdx_pkt->tag.vio_sid = ldcp->local_session;
3313 		rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3314 
3315 		DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt);
3316 
3317 		ldcp->lane_out.lstate |= VSW_RDX_ACK_SENT;
3318 
3319 		(void) vsw_send_msg(ldcp, (void *)rdx_pkt,
3320 		    sizeof (vio_rdx_msg_t), B_TRUE);
3321 
3322 		vsw_next_milestone(ldcp);
3323 		break;
3324 
3325 	case VIO_SUBTYPE_ACK:
3326 		/*
3327 		 * Should be handled in-band by callback handler.
3328 		 */
3329 		DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__);
3330 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3331 		break;
3332 
3333 	case VIO_SUBTYPE_NACK:
3334 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3335 
3336 		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_NACK_RECV))
3337 			return;
3338 
3339 		ldcp->lane_in.lstate |= VSW_RDX_NACK_RECV;
3340 		vsw_next_milestone(ldcp);
3341 		break;
3342 
3343 	default:
3344 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3345 		    rdx_pkt->tag.vio_subtype);
3346 	}
3347 
3348 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3349 }
3350 
3351 static void
3352 vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t *tagp,
3353 	uint32_t msglen)
3354 {
3355 	uint16_t	env = tagp->vio_subtype_env;
3356 	vsw_t		*vswp = ldcp->ldc_vswp;
3357 
3358 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3359 
3360 	/* session id check */
3361 	if (ldcp->session_status & VSW_PEER_SESSION) {
3362 		if (ldcp->peer_session != tagp->vio_sid) {
3363 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
3364 			    __func__, ldcp->ldc_id, tagp->vio_sid);
3365 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3366 			return;
3367 		}
3368 	}
3369 
3370 	/*
3371 	 * It is an error for us to be getting data packets
3372 	 * before the handshake has completed.
3373 	 */
3374 	if (ldcp->hphase != VSW_MILESTONE4) {
3375 		DERR(vswp, "%s: got data packet before handshake complete "
3376 		    "hphase %d (%x: %x)", __func__, ldcp->hphase,
3377 		    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
3378 		DUMP_FLAGS(ldcp->lane_in.lstate);
3379 		DUMP_FLAGS(ldcp->lane_out.lstate);
3380 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3381 		return;
3382 	}
3383 
3384 	/*
3385 	 * To reduce the locking contention, release the
3386 	 * ldc_cblock here and re-acquire it once we are done
3387 	 * receiving packets.
3388 	 */
3389 	mutex_exit(&ldcp->ldc_cblock);
3390 	mutex_enter(&ldcp->ldc_rxlock);
3391 
3392 	/*
3393 	 * Switch on vio_subtype envelope, then let lower routines
3394 	 * decide if its an INFO, ACK or NACK packet.
3395 	 */
3396 	if (env == VIO_DRING_DATA) {
3397 		vsw_process_data_dring_pkt(ldcp, dpkt);
3398 	} else if (env == VIO_PKT_DATA) {
3399 		ldcp->rx_pktdata(ldcp, dpkt, msglen);
3400 	} else if (env == VIO_DESC_DATA) {
3401 		vsw_process_data_ibnd_pkt(ldcp, dpkt);
3402 	} else {
3403 		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env);
3404 	}
3405 
3406 	mutex_exit(&ldcp->ldc_rxlock);
3407 	mutex_enter(&ldcp->ldc_cblock);
3408 
3409 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3410 }
3411 
/*
 * Turn the dring data message 'pkt' into a NACK carrying our local
 * session id and send it back over channel 'ldcp'. The result of the
 * send is deliberately ignored.
 *
 * The body is wrapped in do { } while (0) and the arguments are
 * parenthesised so that the macro behaves as a single statement (it is
 * safe in an unbraced if/else) and accepts arbitrary pointer
 * expressions. Invoke it with a trailing semicolon.
 */
#define	SND_DRING_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		(void) vsw_send_msg((ldcp), (void *)(pkt), \
		    sizeof (vio_dring_msg_t), B_TRUE); \
	} while (0)
3417 
3418 static void
3419 vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt)
3420 {
3421 	vio_dring_msg_t		*dring_pkt;
3422 	vnet_public_desc_t	desc, *pub_addr = NULL;
3423 	vsw_private_desc_t	*priv_addr = NULL;
3424 	dring_info_t		*dp = NULL;
3425 	vsw_t			*vswp = ldcp->ldc_vswp;
3426 	mblk_t			*mp = NULL;
3427 	mblk_t			*bp = NULL;
3428 	mblk_t			*bpt = NULL;
3429 	size_t			nbytes = 0;
3430 	uint64_t		chain = 0;
3431 	uint64_t		len;
3432 	uint32_t		pos, start;
3433 	uint32_t		range_start, range_end;
3434 	int32_t			end, num, cnt = 0;
3435 	int			i, rv, rng_rv = 0, msg_rv = 0;
3436 	boolean_t		prev_desc_ack = B_FALSE;
3437 	int			read_attempts = 0;
3438 	struct ether_header	*ehp;
3439 	lane_t			*lp = &ldcp->lane_out;
3440 
3441 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3442 
3443 	/*
3444 	 * We know this is a data/dring packet so
3445 	 * cast it into the correct structure.
3446 	 */
3447 	dring_pkt = (vio_dring_msg_t *)dpkt;
3448 
3449 	/*
3450 	 * Switch on the vio_subtype. If its INFO then we need to
3451 	 * process the data. If its an ACK we need to make sure
3452 	 * it makes sense (i.e did we send an earlier data/info),
3453 	 * and if its a NACK then we maybe attempt a retry.
3454 	 */
3455 	switch (dring_pkt->tag.vio_subtype) {
3456 	case VIO_SUBTYPE_INFO:
3457 		D2(vswp, "%s(%lld): VIO_SUBTYPE_INFO", __func__, ldcp->ldc_id);
3458 
3459 		READ_ENTER(&ldcp->lane_in.dlistrw);
3460 		if ((dp = vsw_ident2dring(&ldcp->lane_in,
3461 		    dring_pkt->dring_ident)) == NULL) {
3462 			RW_EXIT(&ldcp->lane_in.dlistrw);
3463 
3464 			DERR(vswp, "%s(%lld): unable to find dring from "
3465 			    "ident 0x%llx", __func__, ldcp->ldc_id,
3466 			    dring_pkt->dring_ident);
3467 
3468 			SND_DRING_NACK(ldcp, dring_pkt);
3469 			return;
3470 		}
3471 
3472 		start = pos = dring_pkt->start_idx;
3473 		end = dring_pkt->end_idx;
3474 		len = dp->num_descriptors;
3475 
3476 		range_start = range_end = pos;
3477 
3478 		D2(vswp, "%s(%lld): start index %ld : end %ld\n",
3479 		    __func__, ldcp->ldc_id, start, end);
3480 
3481 		if (end == -1) {
3482 			num = -1;
3483 		} else if (end >= 0) {
3484 			num = end >= pos ? end - pos + 1: (len - pos + 1) + end;
3485 
3486 			/* basic sanity check */
3487 			if (end > len) {
3488 				RW_EXIT(&ldcp->lane_in.dlistrw);
3489 				DERR(vswp, "%s(%lld): endpoint %lld outside "
3490 				    "ring length %lld", __func__,
3491 				    ldcp->ldc_id, end, len);
3492 
3493 				SND_DRING_NACK(ldcp, dring_pkt);
3494 				return;
3495 			}
3496 		} else {
3497 			RW_EXIT(&ldcp->lane_in.dlistrw);
3498 			DERR(vswp, "%s(%lld): invalid endpoint %lld",
3499 			    __func__, ldcp->ldc_id, end);
3500 			SND_DRING_NACK(ldcp, dring_pkt);
3501 			return;
3502 		}
3503 
3504 		while (cnt != num) {
3505 vsw_recheck_desc:
3506 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + pos;
3507 
3508 			if ((rng_rv = vnet_dring_entry_copy(pub_addr,
3509 			    &desc, dp->dring_mtype, dp->handle,
3510 			    pos, pos)) != 0) {
3511 				DERR(vswp, "%s(%lld): unable to copy "
3512 				    "descriptor at pos %d: err %d",
3513 				    __func__, pos, ldcp->ldc_id, rng_rv);
3514 				ldcp->ldc_stats.ierrors++;
3515 				break;
3516 			}
3517 
3518 			/*
3519 			 * When given a bounded range of descriptors
3520 			 * to process, its an error to hit a descriptor
3521 			 * which is not ready. In the non-bounded case
3522 			 * (end_idx == -1) this simply indicates we have
3523 			 * reached the end of the current active range.
3524 			 */
3525 			if (desc.hdr.dstate != VIO_DESC_READY) {
3526 				/* unbound - no error */
3527 				if (end == -1) {
3528 					if (read_attempts == vsw_read_attempts)
3529 						break;
3530 
3531 					delay(drv_usectohz(vsw_desc_delay));
3532 					read_attempts++;
3533 					goto vsw_recheck_desc;
3534 				}
3535 
3536 				/* bounded - error - so NACK back */
3537 				RW_EXIT(&ldcp->lane_in.dlistrw);
3538 				DERR(vswp, "%s(%lld): descriptor not READY "
3539 				    "(%d)", __func__, ldcp->ldc_id,
3540 				    desc.hdr.dstate);
3541 				SND_DRING_NACK(ldcp, dring_pkt);
3542 				return;
3543 			}
3544 
3545 			DTRACE_PROBE1(read_attempts, int, read_attempts);
3546 
3547 			range_end = pos;
3548 
3549 			/*
3550 			 * If we ACK'd the previous descriptor then now
3551 			 * record the new range start position for later
3552 			 * ACK's.
3553 			 */
3554 			if (prev_desc_ack) {
3555 				range_start = pos;
3556 
3557 				D2(vswp, "%s(%lld): updating range start to be "
3558 				    "%d", __func__, ldcp->ldc_id, range_start);
3559 
3560 				prev_desc_ack = B_FALSE;
3561 			}
3562 
3563 			D2(vswp, "%s(%lld): processing desc %lld at pos"
3564 			    " 0x%llx : dstate 0x%lx : datalen 0x%lx",
3565 			    __func__, ldcp->ldc_id, pos, &desc,
3566 			    desc.hdr.dstate, desc.nbytes);
3567 
3568 			if ((desc.nbytes < ETHERMIN) ||
3569 			    (desc.nbytes > lp->mtu)) {
3570 				/* invalid size; drop the packet */
3571 				ldcp->ldc_stats.ierrors++;
3572 				goto vsw_process_desc_done;
3573 			}
3574 
3575 			/*
3576 			 * Ensure that we ask ldc for an aligned
3577 			 * number of bytes. Data is padded to align on 8
3578 			 * byte boundary, desc.nbytes is actual data length,
3579 			 * i.e. minus that padding.
3580 			 */
3581 			nbytes = (desc.nbytes + VNET_IPALIGN + 7) & ~7;
3582 			if (nbytes > ldcp->max_rxpool_size) {
3583 				mp = allocb(desc.nbytes + VNET_IPALIGN + 8,
3584 				    BPRI_MED);
3585 			} else {
3586 				mp = vio_multipool_allocb(&ldcp->vmp, nbytes);
3587 				if (mp == NULL) {
3588 					ldcp->ldc_stats.rx_vio_allocb_fail++;
3589 					/*
3590 					 * No free receive buffers available,
3591 					 * so fallback onto allocb(9F). Make
3592 					 * sure that we get a data buffer which
3593 					 * is a multiple of 8 as this is
3594 					 * required by ldc_mem_copy.
3595 					 */
3596 					DTRACE_PROBE(allocb);
3597 					mp = allocb(desc.nbytes +
3598 					    VNET_IPALIGN + 8, BPRI_MED);
3599 				}
3600 			}
3601 			if (mp == NULL) {
3602 				DERR(vswp, "%s(%ld): allocb failed",
3603 				    __func__, ldcp->ldc_id);
3604 				rng_rv = vnet_dring_entry_set_dstate(pub_addr,
3605 				    dp->dring_mtype, dp->handle, pos, pos,
3606 				    VIO_DESC_DONE);
3607 				ldcp->ldc_stats.ierrors++;
3608 				ldcp->ldc_stats.rx_allocb_fail++;
3609 				break;
3610 			}
3611 
3612 			rv = ldc_mem_copy(ldcp->ldc_handle,
3613 			    (caddr_t)mp->b_rptr, 0, &nbytes,
3614 			    desc.memcookie, desc.ncookies, LDC_COPY_IN);
3615 			if (rv != 0) {
3616 				DERR(vswp, "%s(%d): unable to copy in data "
3617 				    "from %d cookies in desc %d (rv %d)",
3618 				    __func__, ldcp->ldc_id, desc.ncookies,
3619 				    pos, rv);
3620 				freemsg(mp);
3621 
3622 				rng_rv = vnet_dring_entry_set_dstate(pub_addr,
3623 				    dp->dring_mtype, dp->handle, pos, pos,
3624 				    VIO_DESC_DONE);
3625 				ldcp->ldc_stats.ierrors++;
3626 				break;
3627 			} else {
3628 				D2(vswp, "%s(%d): copied in %ld bytes"
3629 				    " using %d cookies", __func__,
3630 				    ldcp->ldc_id, nbytes, desc.ncookies);
3631 			}
3632 
3633 			/* adjust the read pointer to skip over the padding */
3634 			mp->b_rptr += VNET_IPALIGN;
3635 
3636 			/* point to the actual end of data */
3637 			mp->b_wptr = mp->b_rptr + desc.nbytes;
3638 
3639 			/* update statistics */
3640 			ehp = (struct ether_header *)mp->b_rptr;
3641 			if (IS_BROADCAST(ehp))
3642 				ldcp->ldc_stats.brdcstrcv++;
3643 			else if (IS_MULTICAST(ehp))
3644 				ldcp->ldc_stats.multircv++;
3645 
3646 			ldcp->ldc_stats.ipackets++;
3647 			ldcp->ldc_stats.rbytes += desc.nbytes;
3648 
3649 			/*
3650 			 * IPALIGN space can be used for VLAN_TAG
3651 			 */
3652 			(void) vsw_vlan_frame_pretag(ldcp->ldc_port,
3653 			    VSW_VNETPORT, mp);
3654 
3655 			/* build a chain of received packets */
3656 			if (bp == NULL) {
3657 				/* first pkt */
3658 				bp = mp;
3659 				bp->b_next = bp->b_prev = NULL;
3660 				bpt = bp;
3661 				chain = 1;
3662 			} else {
3663 				mp->b_next = mp->b_prev = NULL;
3664 				bpt->b_next = mp;
3665 				bpt = mp;
3666 				chain++;
3667 			}
3668 
3669 vsw_process_desc_done:
3670 			/* mark we are finished with this descriptor */
3671 			if ((rng_rv = vnet_dring_entry_set_dstate(pub_addr,
3672 			    dp->dring_mtype, dp->handle, pos, pos,
3673 			    VIO_DESC_DONE)) != 0) {
3674 				DERR(vswp, "%s(%lld): unable to update "
3675 				    "dstate at pos %d: err %d",
3676 				    __func__, pos, ldcp->ldc_id, rng_rv);
3677 				ldcp->ldc_stats.ierrors++;
3678 				break;
3679 			}
3680 
3681 			/*
3682 			 * Send an ACK back to peer if requested.
3683 			 */
3684 			if (desc.hdr.ack) {
3685 				dring_pkt->start_idx = range_start;
3686 				dring_pkt->end_idx = range_end;
3687 
3688 				DERR(vswp, "%s(%lld): processed %d %d, ACK"
3689 				    " requested", __func__, ldcp->ldc_id,
3690 				    dring_pkt->start_idx, dring_pkt->end_idx);
3691 
3692 				dring_pkt->dring_process_state = VIO_DP_ACTIVE;
3693 				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3694 				dring_pkt->tag.vio_sid = ldcp->local_session;
3695 
3696 				msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
3697 				    sizeof (vio_dring_msg_t), B_FALSE);
3698 
3699 				/*
3700 				 * Check if ACK was successfully sent. If not
3701 				 * we break and deal with that below.
3702 				 */
3703 				if (msg_rv != 0)
3704 					break;
3705 
3706 				prev_desc_ack = B_TRUE;
3707 				range_start = pos;
3708 			}
3709 
3710 			/* next descriptor */
3711 			pos = (pos + 1) % len;
3712 			cnt++;
3713 
3714 			/*
3715 			 * Break out of loop here and stop processing to
3716 			 * allow some other network device (or disk) to
3717 			 * get access to the cpu.
3718 			 */
3719 			if (chain > vsw_chain_len) {
3720 				D3(vswp, "%s(%lld): switching chain of %d "
3721 				    "msgs", __func__, ldcp->ldc_id, chain);
3722 				break;
3723 			}
3724 		}
3725 		RW_EXIT(&ldcp->lane_in.dlistrw);
3726 
3727 		/* send the chain of packets to be switched */
3728 		if (bp != NULL) {
3729 			DTRACE_PROBE1(vsw_rcv_msgs, int, chain);
3730 			D3(vswp, "%s(%lld): switching chain of %d msgs",
3731 			    __func__, ldcp->ldc_id, chain);
3732 			vswp->vsw_switch_frame(vswp, bp, VSW_VNETPORT,
3733 			    ldcp->ldc_port, NULL);
3734 		}
3735 
3736 		/*
3737 		 * If when we encountered an error when attempting to
3738 		 * access an imported dring, initiate a connection reset.
3739 		 */
3740 		if (rng_rv != 0) {
3741 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3742 			break;
3743 		}
3744 
3745 		/*
3746 		 * If when we attempted to send the ACK we found that the
3747 		 * channel had been reset then now handle this. We deal with
3748 		 * it here as we cannot reset the channel while holding the
3749 		 * dlistrw lock, and we don't want to acquire/release it
3750 		 * continuously in the above loop, as a channel reset should
3751 		 * be a rare event.
3752 		 */
3753 		if (msg_rv == ECONNRESET) {
3754 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
3755 			break;
3756 		}
3757 
3758 		DTRACE_PROBE1(msg_cnt, int, cnt);
3759 
3760 		/*
3761 		 * We are now finished so ACK back with the state
3762 		 * set to STOPPING so our peer knows we are finished
3763 		 */
3764 		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3765 		dring_pkt->tag.vio_sid = ldcp->local_session;
3766 
3767 		dring_pkt->dring_process_state = VIO_DP_STOPPED;
3768 
3769 		DTRACE_PROBE(stop_process_sent);
3770 
3771 		/*
3772 		 * We have not processed any more descriptors beyond
3773 		 * the last one we ACK'd.
3774 		 */
3775 		if (prev_desc_ack)
3776 			range_start = range_end;
3777 
3778 		dring_pkt->start_idx = range_start;
3779 		dring_pkt->end_idx = range_end;
3780 
3781 		D2(vswp, "%s(%lld) processed : %d : %d, now stopping",
3782 		    __func__, ldcp->ldc_id, dring_pkt->start_idx,
3783 		    dring_pkt->end_idx);
3784 
3785 		(void) vsw_send_msg(ldcp, (void *)dring_pkt,
3786 		    sizeof (vio_dring_msg_t), B_TRUE);
3787 		break;
3788 
3789 	case VIO_SUBTYPE_ACK:
3790 		D2(vswp, "%s(%lld): VIO_SUBTYPE_ACK", __func__, ldcp->ldc_id);
3791 		/*
3792 		 * Verify that the relevant descriptors are all
3793 		 * marked as DONE
3794 		 */
3795 		READ_ENTER(&ldcp->lane_out.dlistrw);
3796 		if ((dp = vsw_ident2dring(&ldcp->lane_out,
3797 		    dring_pkt->dring_ident)) == NULL) {
3798 			RW_EXIT(&ldcp->lane_out.dlistrw);
3799 			DERR(vswp, "%s: unknown ident in ACK", __func__);
3800 			return;
3801 		}
3802 
3803 		start = end = 0;
3804 		start = dring_pkt->start_idx;
3805 		end = dring_pkt->end_idx;
3806 		len = dp->num_descriptors;
3807 
3808 
3809 		mutex_enter(&dp->dlock);
3810 		dp->last_ack_recv = end;
3811 		ldcp->ldc_stats.dring_data_acks++;
3812 		mutex_exit(&dp->dlock);
3813 
3814 		(void) vsw_reclaim_dring(dp, start);
3815 
3816 		/*
3817 		 * If our peer is stopping processing descriptors then
3818 		 * we check to make sure it has processed all the descriptors
3819 		 * we have updated. If not then we send it a new message
3820 		 * to prompt it to restart.
3821 		 */
3822 		if (dring_pkt->dring_process_state == VIO_DP_STOPPED) {
3823 			DTRACE_PROBE(stop_process_recv);
3824 			D2(vswp, "%s(%lld): got stopping msg : %d : %d",
3825 			    __func__, ldcp->ldc_id, dring_pkt->start_idx,
3826 			    dring_pkt->end_idx);
3827 
3828 			/*
3829 			 * Check next descriptor in public section of ring.
3830 			 * If its marked as READY then we need to prompt our
3831 			 * peer to start processing the ring again.
3832 			 */
3833 			i = (end + 1) % len;
3834 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
3835 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
3836 
3837 			/*
3838 			 * Hold the restart lock across all of this to
3839 			 * make sure that its not possible for us to
3840 			 * decide that a msg needs to be sent in the future
3841 			 * but the sending code having already checked is
3842 			 * about to exit.
3843 			 */
3844 			mutex_enter(&dp->restart_lock);
3845 			ldcp->ldc_stats.dring_stopped_acks++;
3846 			mutex_enter(&priv_addr->dstate_lock);
3847 			if (pub_addr->hdr.dstate == VIO_DESC_READY) {
3848 
3849 				mutex_exit(&priv_addr->dstate_lock);
3850 
3851 				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
3852 				dring_pkt->tag.vio_sid = ldcp->local_session;
3853 
3854 				dring_pkt->start_idx = (end + 1) % len;
3855 				dring_pkt->end_idx = -1;
3856 
3857 				D2(vswp, "%s(%lld) : sending restart msg:"
3858 				    " %d : %d", __func__, ldcp->ldc_id,
3859 				    dring_pkt->start_idx, dring_pkt->end_idx);
3860 
3861 				msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
3862 				    sizeof (vio_dring_msg_t), B_FALSE);
3863 				ldcp->ldc_stats.dring_data_msgs++;
3864 
3865 			} else {
3866 				mutex_exit(&priv_addr->dstate_lock);
3867 				dp->restart_reqd = B_TRUE;
3868 			}
3869 			mutex_exit(&dp->restart_lock);
3870 		}
3871 		RW_EXIT(&ldcp->lane_out.dlistrw);
3872 
3873 		/* only do channel reset after dropping dlistrw lock */
3874 		if (msg_rv == ECONNRESET)
3875 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
3876 
3877 		break;
3878 
3879 	case VIO_SUBTYPE_NACK:
3880 		DWARN(vswp, "%s(%lld): VIO_SUBTYPE_NACK",
3881 		    __func__, ldcp->ldc_id);
3882 		/*
3883 		 * Something is badly wrong if we are getting NACK's
3884 		 * for our data pkts. So reset the channel.
3885 		 */
3886 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3887 
3888 		break;
3889 
3890 	default:
3891 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
3892 		    ldcp->ldc_id, dring_pkt->tag.vio_subtype);
3893 	}
3894 
3895 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
3896 }
3897 
3898 /*
3899  * dummy pkt data handler function for vnet protocol version 1.0
3900  */
3901 static void
3902 vsw_process_pkt_data_nop(void *arg1, void *arg2, uint32_t msglen)
3903 {
3904 	_NOTE(ARGUNUSED(arg1, arg2, msglen))
3905 }
3906 
3907 /*
3908  * This function handles raw pkt data messages received over the channel.
3909  * Currently, only priority-eth-type frames are received through this mechanism.
3910  * In this case, the frame(data) is present within the message itself which
3911  * is copied into an mblk before switching it.
3912  */
3913 static void
3914 vsw_process_pkt_data(void *arg1, void *arg2, uint32_t msglen)
3915 {
3916 	vsw_ldc_t		*ldcp = (vsw_ldc_t *)arg1;
3917 	vio_raw_data_msg_t	*dpkt = (vio_raw_data_msg_t *)arg2;
3918 	uint32_t		size;
3919 	mblk_t			*mp;
3920 	vsw_t			*vswp = ldcp->ldc_vswp;
3921 	vgen_stats_t		*statsp = &ldcp->ldc_stats;
3922 	lane_t			*lp = &ldcp->lane_out;
3923 
3924 	size = msglen - VIO_PKT_DATA_HDRSIZE;
3925 	if (size < ETHERMIN || size > lp->mtu) {
3926 		(void) atomic_inc_32(&statsp->rx_pri_fail);
3927 		DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__,
3928 		    ldcp->ldc_id, size);
3929 		return;
3930 	}
3931 
3932 	mp = vio_multipool_allocb(&ldcp->vmp, size + VLAN_TAGSZ);
3933 	if (mp == NULL) {
3934 		mp = allocb(size + VLAN_TAGSZ, BPRI_MED);
3935 		if (mp == NULL) {
3936 			(void) atomic_inc_32(&statsp->rx_pri_fail);
3937 			DWARN(vswp, "%s(%lld) allocb failure, "
3938 			    "unable to process priority frame\n", __func__,
3939 			    ldcp->ldc_id);
3940 			return;
3941 		}
3942 	}
3943 
3944 	/* skip over the extra space for vlan tag */
3945 	mp->b_rptr += VLAN_TAGSZ;
3946 
3947 	/* copy the frame from the payload of raw data msg into the mblk */
3948 	bcopy(dpkt->data, mp->b_rptr, size);
3949 	mp->b_wptr = mp->b_rptr + size;
3950 
3951 	/* update stats */
3952 	(void) atomic_inc_64(&statsp->rx_pri_packets);
3953 	(void) atomic_add_64(&statsp->rx_pri_bytes, size);
3954 
3955 	/*
3956 	 * VLAN_TAGSZ of extra space has been pre-alloc'd if tag is needed.
3957 	 */
3958 	(void) vsw_vlan_frame_pretag(ldcp->ldc_port, VSW_VNETPORT, mp);
3959 
3960 	/* switch the frame to destination */
3961 	vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT, ldcp->ldc_port, NULL);
3962 }
3963 
3964 /*
3965  * Process an in-band descriptor message (most likely from
3966  * OBP).
3967  */
3968 static void
3969 vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt)
3970 {
3971 	vnet_ibnd_desc_t	*ibnd_desc;
3972 	dring_info_t		*dp = NULL;
3973 	vsw_private_desc_t	*priv_addr = NULL;
3974 	vsw_t			*vswp = ldcp->ldc_vswp;
3975 	mblk_t			*mp = NULL;
3976 	size_t			nbytes = 0;
3977 	size_t			off = 0;
3978 	uint64_t		idx = 0;
3979 	uint32_t		num = 1, len, datalen = 0;
3980 	uint64_t		ncookies = 0;
3981 	int			i, rv;
3982 	int			j = 0;
3983 
3984 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3985 
3986 	ibnd_desc = (vnet_ibnd_desc_t *)pkt;
3987 
3988 	switch (ibnd_desc->hdr.tag.vio_subtype) {
3989 	case VIO_SUBTYPE_INFO:
3990 		D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3991 
3992 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
3993 			return;
3994 
3995 		/*
3996 		 * Data is padded to align on a 8 byte boundary,
3997 		 * nbytes is actual data length, i.e. minus that
3998 		 * padding.
3999 		 */
4000 		datalen = ibnd_desc->nbytes;
4001 
4002 		D2(vswp, "%s(%lld): processing inband desc : "
4003 		    ": datalen 0x%lx", __func__, ldcp->ldc_id, datalen);
4004 
4005 		ncookies = ibnd_desc->ncookies;
4006 
4007 		/*
4008 		 * allocb(9F) returns an aligned data block. We
4009 		 * need to ensure that we ask ldc for an aligned
4010 		 * number of bytes also.
4011 		 */
4012 		nbytes = datalen;
4013 		if (nbytes & 0x7) {
4014 			off = 8 - (nbytes & 0x7);
4015 			nbytes += off;
4016 		}
4017 
4018 		/* alloc extra space for VLAN_TAG */
4019 		mp = allocb(datalen + 8, BPRI_MED);
4020 		if (mp == NULL) {
4021 			DERR(vswp, "%s(%lld): allocb failed",
4022 			    __func__, ldcp->ldc_id);
4023 			ldcp->ldc_stats.rx_allocb_fail++;
4024 			return;
4025 		}
4026 
4027 		/* skip over the extra space for VLAN_TAG */
4028 		mp->b_rptr += 8;
4029 
4030 		rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr,
4031 		    0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies,
4032 		    LDC_COPY_IN);
4033 
4034 		if (rv != 0) {
4035 			DERR(vswp, "%s(%d): unable to copy in data from "
4036 			    "%d cookie(s)", __func__, ldcp->ldc_id, ncookies);
4037 			freemsg(mp);
4038 			ldcp->ldc_stats.ierrors++;
4039 			return;
4040 		}
4041 
4042 		D2(vswp, "%s(%d): copied in %ld bytes using %d cookies",
4043 		    __func__, ldcp->ldc_id, nbytes, ncookies);
4044 
4045 		/* point to the actual end of data */
4046 		mp->b_wptr = mp->b_rptr + datalen;
4047 		ldcp->ldc_stats.ipackets++;
4048 		ldcp->ldc_stats.rbytes += datalen;
4049 
4050 		/*
4051 		 * We ACK back every in-band descriptor message we process
4052 		 */
4053 		ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK;
4054 		ibnd_desc->hdr.tag.vio_sid = ldcp->local_session;
4055 		(void) vsw_send_msg(ldcp, (void *)ibnd_desc,
4056 		    sizeof (vnet_ibnd_desc_t), B_TRUE);
4057 
4058 		/*
4059 		 * there is extra space alloc'd for VLAN_TAG
4060 		 */
4061 		(void) vsw_vlan_frame_pretag(ldcp->ldc_port, VSW_VNETPORT, mp);
4062 
4063 		/* send the packet to be switched */
4064 		vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT,
4065 		    ldcp->ldc_port, NULL);
4066 
4067 		break;
4068 
4069 	case VIO_SUBTYPE_ACK:
4070 		D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
4071 
4072 		/* Verify the ACK is valid */
4073 		idx = ibnd_desc->hdr.desc_handle;
4074 
4075 		if (idx >= vsw_ntxds) {
4076 			cmn_err(CE_WARN, "!vsw%d: corrupted ACK received "
4077 			    "(idx %ld)", vswp->instance, idx);
4078 			return;
4079 		}
4080 
4081 		if ((dp = ldcp->lane_out.dringp) == NULL) {
4082 			DERR(vswp, "%s: no dring found", __func__);
4083 			return;
4084 		}
4085 
4086 		len = dp->num_descriptors;
4087 		/*
4088 		 * If the descriptor we are being ACK'ed for is not the
4089 		 * one we expected, then pkts were lost somwhere, either
4090 		 * when we tried to send a msg, or a previous ACK msg from
4091 		 * our peer. In either case we now reclaim the descriptors
4092 		 * in the range from the last ACK we received up to the
4093 		 * current ACK.
4094 		 */
4095 		if (idx != dp->last_ack_recv) {
4096 			DWARN(vswp, "%s: dropped pkts detected, (%ld, %ld)",
4097 			    __func__, dp->last_ack_recv, idx);
4098 			num = idx >= dp->last_ack_recv ?
4099 			    idx - dp->last_ack_recv + 1:
4100 			    (len - dp->last_ack_recv + 1) + idx;
4101 		}
4102 
4103 		/*
4104 		 * When we sent the in-band message to our peer we
4105 		 * marked the copy in our private ring as READY. We now
4106 		 * check that the descriptor we are being ACK'ed for is in
4107 		 * fact READY, i.e. it is one we have shared with our peer.
4108 		 *
4109 		 * If its not we flag an error, but still reset the descr
4110 		 * back to FREE.
4111 		 */
4112 		for (i = dp->last_ack_recv; j < num; i = (i + 1) % len, j++) {
4113 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
4114 			mutex_enter(&priv_addr->dstate_lock);
4115 			if (priv_addr->dstate != VIO_DESC_READY) {
4116 				DERR(vswp, "%s: (%ld) desc at index %ld not "
4117 				    "READY (0x%lx)", __func__,
4118 				    ldcp->ldc_id, idx, priv_addr->dstate);
4119 				DERR(vswp, "%s: bound %d: ncookies %ld : "
4120 				    "datalen %ld", __func__,
4121 				    priv_addr->bound, priv_addr->ncookies,
4122 				    priv_addr->datalen);
4123 			}
4124 			D2(vswp, "%s: (%lld) freeing descp at %lld", __func__,
4125 			    ldcp->ldc_id, idx);
4126 			/* release resources associated with sent msg */
4127 			priv_addr->datalen = 0;
4128 			priv_addr->dstate = VIO_DESC_FREE;
4129 			mutex_exit(&priv_addr->dstate_lock);
4130 		}
4131 		/* update to next expected value */
4132 		dp->last_ack_recv = (idx + 1) % dp->num_descriptors;
4133 
4134 		break;
4135 
4136 	case VIO_SUBTYPE_NACK:
4137 		DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
4138 
4139 		/*
4140 		 * We should only get a NACK if our peer doesn't like
4141 		 * something about a message we have sent it. If this
4142 		 * happens we just release the resources associated with
4143 		 * the message. (We are relying on higher layers to decide
4144 		 * whether or not to resend.
4145 		 */
4146 
4147 		/* limit check */
4148 		idx = ibnd_desc->hdr.desc_handle;
4149 
4150 		if (idx >= vsw_ntxds) {
4151 			DERR(vswp, "%s: corrupted NACK received (idx %lld)",
4152 			    __func__, idx);
4153 			return;
4154 		}
4155 
4156 		if ((dp = ldcp->lane_out.dringp) == NULL) {
4157 			DERR(vswp, "%s: no dring found", __func__);
4158 			return;
4159 		}
4160 
4161 		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
4162 
4163 		/* move to correct location in ring */
4164 		priv_addr += idx;
4165 
4166 		/* release resources associated with sent msg */
4167 		mutex_enter(&priv_addr->dstate_lock);
4168 		priv_addr->datalen = 0;
4169 		priv_addr->dstate = VIO_DESC_FREE;
4170 		mutex_exit(&priv_addr->dstate_lock);
4171 
4172 		break;
4173 
4174 	default:
4175 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
4176 		    ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype);
4177 	}
4178 
4179 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
4180 }
4181 
4182 static void
4183 vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t *tagp)
4184 {
4185 	_NOTE(ARGUNUSED(epkt))
4186 
4187 	vsw_t		*vswp = ldcp->ldc_vswp;
4188 	uint16_t	env = tagp->vio_subtype_env;
4189 
4190 	D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
4191 
4192 	/*
4193 	 * Error vio_subtypes have yet to be defined. So for
4194 	 * the moment we can't do anything.
4195 	 */
4196 	D2(vswp, "%s: (%x) vio_subtype env", __func__, env);
4197 
4198 	D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
4199 }
4200 
4201 /* transmit the packet over the given port */
4202 int
4203 vsw_portsend(vsw_port_t *port, mblk_t *mp)
4204 {
4205 	vsw_ldc_list_t 	*ldcl = &port->p_ldclist;
4206 	vsw_ldc_t 	*ldcp;
4207 	mblk_t		*mpt;
4208 	int		count;
4209 	int		status = 0;
4210 
4211 	READ_ENTER(&ldcl->lockrw);
4212 	/*
4213 	 * Note for now, we have a single channel.
4214 	 */
4215 	ldcp = ldcl->head;
4216 	if (ldcp == NULL) {
4217 		DERR(port->p_vswp, "vsw_portsend: no ldc: dropping packet\n");
4218 		freemsgchain(mp);
4219 		RW_EXIT(&ldcl->lockrw);
4220 		return (1);
4221 	}
4222 
4223 	count = vsw_vlan_frame_untag(port, VSW_VNETPORT, &mp, &mpt);
4224 
4225 	if (count != 0) {
4226 		status = ldcp->tx(ldcp, mp, mpt, count);
4227 	}
4228 
4229 	RW_EXIT(&ldcl->lockrw);
4230 	return (status);
4231 }
4232 
4233 /*
4234  * Break up frames into 2 seperate chains: normal and
4235  * priority, based on the frame type. The number of
4236  * priority frames is also counted and returned.
4237  *
4238  * Params:
4239  * 	vswp:	pointer to the instance of vsw
4240  *	np:	head of packet chain to be broken
4241  *	npt:	tail of packet chain to be broken
4242  *
4243  * Returns:
4244  *	np:	head of normal data packets
4245  *	npt:	tail of normal data packets
4246  *	hp:	head of high priority packets
4247  *	hpt:	tail of high priority packets
4248  */
4249 static uint32_t
4250 vsw_get_pri_packets(vsw_t *vswp, mblk_t **np, mblk_t **npt,
4251 	mblk_t **hp, mblk_t **hpt)
4252 {
4253 	mblk_t			*tmp = NULL;
4254 	mblk_t			*smp = NULL;
4255 	mblk_t			*hmp = NULL;	/* high prio pkts head */
4256 	mblk_t			*hmpt = NULL;	/* high prio pkts tail */
4257 	mblk_t			*nmp = NULL;	/* normal pkts head */
4258 	mblk_t			*nmpt = NULL;	/* normal pkts tail */
4259 	uint32_t		count = 0;
4260 	int			i;
4261 	struct ether_header	*ehp;
4262 	uint32_t		num_types;
4263 	uint16_t		*types;
4264 
4265 	tmp = *np;
4266 	while (tmp != NULL) {
4267 
4268 		smp = tmp;
4269 		tmp = tmp->b_next;
4270 		smp->b_next = NULL;
4271 		smp->b_prev = NULL;
4272 
4273 		ehp = (struct ether_header *)smp->b_rptr;
4274 		num_types = vswp->pri_num_types;
4275 		types = vswp->pri_types;
4276 		for (i = 0; i < num_types; i++) {
4277 			if (ehp->ether_type == types[i]) {
4278 				/* high priority frame */
4279 
4280 				if (hmp != NULL) {
4281 					hmpt->b_next = smp;
4282 					hmpt = smp;
4283 				} else {
4284 					hmp = hmpt = smp;
4285 				}
4286 				count++;
4287 				break;
4288 			}
4289 		}
4290 		if (i == num_types) {
4291 			/* normal data frame */
4292 
4293 			if (nmp != NULL) {
4294 				nmpt->b_next = smp;
4295 				nmpt = smp;
4296 			} else {
4297 				nmp = nmpt = smp;
4298 			}
4299 		}
4300 	}
4301 
4302 	*hp = hmp;
4303 	*hpt = hmpt;
4304 	*np = nmp;
4305 	*npt = nmpt;
4306 
4307 	return (count);
4308 }
4309 
4310 /*
4311  * Wrapper function to transmit normal and/or priority frames over the channel.
4312  */
4313 static int
4314 vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count)
4315 {
4316 	vsw_ldc_t 		*ldcp = (vsw_ldc_t *)arg;
4317 	mblk_t			*tmp;
4318 	mblk_t			*smp;
4319 	mblk_t			*hmp;	/* high prio pkts head */
4320 	mblk_t			*hmpt;	/* high prio pkts tail */
4321 	mblk_t			*nmp;	/* normal pkts head */
4322 	mblk_t			*nmpt;	/* normal pkts tail */
4323 	uint32_t		n = 0;
4324 	vsw_t			*vswp = ldcp->ldc_vswp;
4325 
4326 	ASSERT(VSW_PRI_ETH_DEFINED(vswp));
4327 	ASSERT(count != 0);
4328 
4329 	nmp = mp;
4330 	nmpt = mpt;
4331 
4332 	/* gather any priority frames from the chain of packets */
4333 	n = vsw_get_pri_packets(vswp, &nmp, &nmpt, &hmp, &hmpt);
4334 
4335 	/* transmit priority frames */
4336 	tmp = hmp;
4337 	while (tmp != NULL) {
4338 		smp = tmp;
4339 		tmp = tmp->b_next;
4340 		smp->b_next = NULL;
4341 		vsw_ldcsend_pkt(ldcp, smp);
4342 	}
4343 
4344 	count -= n;
4345 
4346 	if (count == 0) {
4347 		/* no normal data frames to process */
4348 		return (0);
4349 	}
4350 
4351 	return (vsw_ldctx(ldcp, nmp, nmpt, count));
4352 }
4353 
4354 /*
4355  * Wrapper function to transmit normal frames over the channel.
4356  */
4357 static int
4358 vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count)
4359 {
4360 	vsw_ldc_t 	*ldcp = (vsw_ldc_t *)arg;
4361 	mblk_t		*tmp = NULL;
4362 
4363 	ASSERT(count != 0);
4364 	/*
4365 	 * If the TX thread is enabled, then queue the
4366 	 * ordinary frames and signal the tx thread.
4367 	 */
4368 	if (ldcp->tx_thread != NULL) {
4369 
4370 		mutex_enter(&ldcp->tx_thr_lock);
4371 
4372 		if ((ldcp->tx_cnt + count) >= vsw_max_tx_qcount) {
4373 			/*
4374 			 * If we reached queue limit,
4375 			 * do not queue new packets,
4376 			 * drop them.
4377 			 */
4378 			ldcp->ldc_stats.tx_qfull += count;
4379 			mutex_exit(&ldcp->tx_thr_lock);
4380 			freemsgchain(mp);
4381 			goto exit;
4382 		}
4383 		if (ldcp->tx_mhead == NULL) {
4384 			ldcp->tx_mhead = mp;
4385 			ldcp->tx_mtail = mpt;
4386 			cv_signal(&ldcp->tx_thr_cv);
4387 		} else {
4388 			ldcp->tx_mtail->b_next = mp;
4389 			ldcp->tx_mtail = mpt;
4390 		}
4391 		ldcp->tx_cnt += count;
4392 		mutex_exit(&ldcp->tx_thr_lock);
4393 	} else {
4394 		while (mp != NULL) {
4395 			tmp = mp->b_next;
4396 			mp->b_next = mp->b_prev = NULL;
4397 			(void) vsw_ldcsend(ldcp, mp, 1);
4398 			mp = tmp;
4399 		}
4400 	}
4401 
4402 exit:
4403 	return (0);
4404 }
4405 
4406 /*
4407  * This function transmits the frame in the payload of a raw data
4408  * (VIO_PKT_DATA) message. Thus, it provides an Out-Of-Band path to
4409  * send special frames with high priorities, without going through
4410  * the normal data path which uses descriptor ring mechanism.
4411  */
4412 static void
4413 vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp)
4414 {
4415 	vio_raw_data_msg_t	*pkt;
4416 	mblk_t			*bp;
4417 	mblk_t			*nmp = NULL;
4418 	caddr_t			dst;
4419 	uint32_t		mblksz;
4420 	uint32_t		size;
4421 	uint32_t		nbytes;
4422 	int			rv;
4423 	vsw_t			*vswp = ldcp->ldc_vswp;
4424 	vgen_stats_t		*statsp = &ldcp->ldc_stats;
4425 
4426 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
4427 	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
4428 		(void) atomic_inc_32(&statsp->tx_pri_fail);
4429 		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
4430 		    "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
4431 		    ldcp->lane_out.lstate);
4432 		goto send_pkt_exit;
4433 	}
4434 
4435 	size = msgsize(mp);
4436 
4437 	/* frame size bigger than available payload len of raw data msg ? */
4438 	if (size > (size_t)(ldcp->msglen - VIO_PKT_DATA_HDRSIZE)) {
4439 		(void) atomic_inc_32(&statsp->tx_pri_fail);
4440 		DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__,
4441 		    ldcp->ldc_id, size);
4442 		goto send_pkt_exit;
4443 	}
4444 
4445 	if (size < ETHERMIN)
4446 		size = ETHERMIN;
4447 
4448 	/* alloc space for a raw data message */
4449 	nmp = vio_allocb(vswp->pri_tx_vmp);
4450 	if (nmp == NULL) {
4451 		(void) atomic_inc_32(&statsp->tx_pri_fail);
4452 		DWARN(vswp, "vio_allocb failed\n");
4453 		goto send_pkt_exit;
4454 	}
4455 	pkt = (vio_raw_data_msg_t *)nmp->b_rptr;
4456 
4457 	/* copy frame into the payload of raw data message */
4458 	dst = (caddr_t)pkt->data;
4459 	for (bp = mp; bp != NULL; bp = bp->b_cont) {
4460 		mblksz = MBLKL(bp);
4461 		bcopy(bp->b_rptr, dst, mblksz);
4462 		dst += mblksz;
4463 	}
4464 
4465 	/* setup the raw data msg */
4466 	pkt->tag.vio_msgtype = VIO_TYPE_DATA;
4467 	pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
4468 	pkt->tag.vio_subtype_env = VIO_PKT_DATA;
4469 	pkt->tag.vio_sid = ldcp->local_session;
4470 	nbytes = VIO_PKT_DATA_HDRSIZE + size;
4471 
4472 	/* send the msg over ldc */
4473 	rv = vsw_send_msg(ldcp, (void *)pkt, nbytes, B_TRUE);
4474 	if (rv != 0) {
4475 		(void) atomic_inc_32(&statsp->tx_pri_fail);
4476 		DWARN(vswp, "%s(%lld) Error sending priority frame\n", __func__,
4477 		    ldcp->ldc_id);
4478 		goto send_pkt_exit;
4479 	}
4480 
4481 	/* update stats */
4482 	(void) atomic_inc_64(&statsp->tx_pri_packets);
4483 	(void) atomic_add_64(&statsp->tx_pri_packets, size);
4484 
4485 send_pkt_exit:
4486 	if (nmp != NULL)
4487 		freemsg(nmp);
4488 	freemsg(mp);
4489 }
4490 
4491 /*
4492  * Transmit the packet over the given LDC channel.
4493  *
4494  * The 'retries' argument indicates how many times a packet
4495  * is retried before it is dropped. Note, the retry is done
4496  * only for a resource related failure, for all other failures
4497  * the packet is dropped immediately.
4498  */
4499 static int
4500 vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries)
4501 {
4502 	int i;
4503 	int rc;
4504 	int status = 0;
4505 	vsw_port_t *port = ldcp->ldc_port;
4506 	dring_info_t *dp = NULL;
4507 
4508 
4509 	for (i = 0; i < retries; ) {
4510 		/*
4511 		 * Send the message out using the appropriate
4512 		 * transmit function which will free mblock when it
4513 		 * is finished with it.
4514 		 */
4515 		mutex_enter(&port->tx_lock);
4516 		if (port->transmit != NULL) {
4517 			status = (*port->transmit)(ldcp, mp);
4518 		}
4519 		if (status == LDC_TX_SUCCESS) {
4520 			mutex_exit(&port->tx_lock);
4521 			break;
4522 		}
4523 		i++;	/* increment the counter here */
4524 
4525 		/* If its the last retry, then update the oerror */
4526 		if ((i == retries) && (status == LDC_TX_NORESOURCES)) {
4527 			ldcp->ldc_stats.oerrors++;
4528 		}
4529 		mutex_exit(&port->tx_lock);
4530 
4531 		if (status != LDC_TX_NORESOURCES) {
4532 			/*
4533 			 * No retrying required for errors un-related
4534 			 * to resources.
4535 			 */
4536 			break;
4537 		}
4538 		READ_ENTER(&ldcp->lane_out.dlistrw);
4539 		if (((dp = ldcp->lane_out.dringp) != NULL) &&
4540 		    ((VSW_VER_GTEQ(ldcp, 1, 2) &&
4541 		    (ldcp->lane_out.xfer_mode & VIO_DRING_MODE_V1_2)) ||
4542 		    ((VSW_VER_LT(ldcp, 1, 2) &&
4543 		    (ldcp->lane_out.xfer_mode == VIO_DRING_MODE_V1_0))))) {
4544 			rc = vsw_reclaim_dring(dp, dp->end_idx);
4545 		} else {
4546 			/*
4547 			 * If there is no dring or the xfer_mode is
4548 			 * set to DESC_MODE(ie., OBP), then simply break here.
4549 			 */
4550 			RW_EXIT(&ldcp->lane_out.dlistrw);
4551 			break;
4552 		}
4553 		RW_EXIT(&ldcp->lane_out.dlistrw);
4554 
4555 		/*
4556 		 * Delay only if none were reclaimed
4557 		 * and its not the last retry.
4558 		 */
4559 		if ((rc == 0) && (i < retries)) {
4560 			delay(drv_usectohz(vsw_ldc_tx_delay));
4561 		}
4562 	}
4563 	freemsg(mp);
4564 	return (status);
4565 }
4566 
4567 /*
4568  * Send packet out via descriptor ring to a logical device.
4569  */
4570 static int
4571 vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp)
4572 {
4573 	vio_dring_msg_t		dring_pkt;
4574 	dring_info_t		*dp = NULL;
4575 	vsw_private_desc_t	*priv_desc = NULL;
4576 	vnet_public_desc_t	*pub = NULL;
4577 	vsw_t			*vswp = ldcp->ldc_vswp;
4578 	mblk_t			*bp;
4579 	size_t			n, size;
4580 	caddr_t			bufp;
4581 	int			idx;
4582 	int			status = LDC_TX_SUCCESS;
4583 	struct ether_header	*ehp = (struct ether_header *)mp->b_rptr;
4584 	lane_t			*lp = &ldcp->lane_out;
4585 
4586 	D1(vswp, "%s(%lld): enter\n", __func__, ldcp->ldc_id);
4587 
4588 	/* TODO: make test a macro */
4589 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
4590 	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
4591 		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
4592 		    "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
4593 		    ldcp->lane_out.lstate);
4594 		ldcp->ldc_stats.oerrors++;
4595 		return (LDC_TX_FAILURE);
4596 	}
4597 
4598 	/*
4599 	 * Note - using first ring only, this may change
4600 	 * in the future.
4601 	 */
4602 	READ_ENTER(&ldcp->lane_out.dlistrw);
4603 	if ((dp = ldcp->lane_out.dringp) == NULL) {
4604 		RW_EXIT(&ldcp->lane_out.dlistrw);
4605 		DERR(vswp, "%s(%lld): no dring for outbound lane on"
4606 		    " channel %d", __func__, ldcp->ldc_id, ldcp->ldc_id);
4607 		ldcp->ldc_stats.oerrors++;
4608 		return (LDC_TX_FAILURE);
4609 	}
4610 
4611 	size = msgsize(mp);
4612 	if (size > (size_t)lp->mtu) {
4613 		RW_EXIT(&ldcp->lane_out.dlistrw);
4614 		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
4615 		    ldcp->ldc_id, size);
4616 		ldcp->ldc_stats.oerrors++;
4617 		return (LDC_TX_FAILURE);
4618 	}
4619 
4620 	/*
4621 	 * Find a free descriptor
4622 	 *
4623 	 * Note: for the moment we are assuming that we will only
4624 	 * have one dring going from the switch to each of its
4625 	 * peers. This may change in the future.
4626 	 */
4627 	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
4628 		D2(vswp, "%s(%lld): no descriptor available for ring "
4629 		    "at 0x%llx", __func__, ldcp->ldc_id, dp);
4630 
4631 		/* nothing more we can do */
4632 		status = LDC_TX_NORESOURCES;
4633 		ldcp->ldc_stats.tx_no_desc++;
4634 		goto vsw_dringsend_free_exit;
4635 	} else {
4636 		D2(vswp, "%s(%lld): free private descriptor found at pos %ld "
4637 		    "addr 0x%llx\n", __func__, ldcp->ldc_id, idx, priv_desc);
4638 	}
4639 
4640 	/* copy data into the descriptor */
4641 	bufp = priv_desc->datap;
4642 	bufp += VNET_IPALIGN;
4643 	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
4644 		n = MBLKL(bp);
4645 		bcopy(bp->b_rptr, bufp, n);
4646 		bufp += n;
4647 	}
4648 
4649 	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
4650 
4651 	pub = priv_desc->descp;
4652 	pub->nbytes = priv_desc->datalen;
4653 
4654 	/* update statistics */
4655 	if (IS_BROADCAST(ehp))
4656 		ldcp->ldc_stats.brdcstxmt++;
4657 	else if (IS_MULTICAST(ehp))
4658 		ldcp->ldc_stats.multixmt++;
4659 	ldcp->ldc_stats.opackets++;
4660 	ldcp->ldc_stats.obytes += priv_desc->datalen;
4661 
4662 	mutex_enter(&priv_desc->dstate_lock);
4663 	pub->hdr.dstate = VIO_DESC_READY;
4664 	mutex_exit(&priv_desc->dstate_lock);
4665 
4666 	/*
4667 	 * Determine whether or not we need to send a message to our
4668 	 * peer prompting them to read our newly updated descriptor(s).
4669 	 */
4670 	mutex_enter(&dp->restart_lock);
4671 	if (dp->restart_reqd) {
4672 		dp->restart_reqd = B_FALSE;
4673 		ldcp->ldc_stats.dring_data_msgs++;
4674 		mutex_exit(&dp->restart_lock);
4675 
4676 		/*
4677 		 * Send a vio_dring_msg to peer to prompt them to read
4678 		 * the updated descriptor ring.
4679 		 */
4680 		dring_pkt.tag.vio_msgtype = VIO_TYPE_DATA;
4681 		dring_pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
4682 		dring_pkt.tag.vio_subtype_env = VIO_DRING_DATA;
4683 		dring_pkt.tag.vio_sid = ldcp->local_session;
4684 
4685 		/* Note - for now using first ring */
4686 		dring_pkt.dring_ident = dp->ident;
4687 
4688 		/*
4689 		 * If last_ack_recv is -1 then we know we've not
4690 		 * received any ack's yet, so this must be the first
4691 		 * msg sent, so set the start to the begining of the ring.
4692 		 */
4693 		mutex_enter(&dp->dlock);
4694 		if (dp->last_ack_recv == -1) {
4695 			dring_pkt.start_idx = 0;
4696 		} else {
4697 			dring_pkt.start_idx =
4698 			    (dp->last_ack_recv + 1) % dp->num_descriptors;
4699 		}
4700 		dring_pkt.end_idx = -1;
4701 		mutex_exit(&dp->dlock);
4702 
4703 		D3(vswp, "%s(%lld): dring 0x%llx : ident 0x%llx\n", __func__,
4704 		    ldcp->ldc_id, dp, dring_pkt.dring_ident);
4705 		D3(vswp, "%s(%lld): start %lld : end %lld :\n",
4706 		    __func__, ldcp->ldc_id, dring_pkt.start_idx,
4707 		    dring_pkt.end_idx);
4708 
4709 		RW_EXIT(&ldcp->lane_out.dlistrw);
4710 
4711 		(void) vsw_send_msg(ldcp, (void *)&dring_pkt,
4712 		    sizeof (vio_dring_msg_t), B_TRUE);
4713 
4714 		return (status);
4715 
4716 	} else {
4717 		mutex_exit(&dp->restart_lock);
4718 		D2(vswp, "%s(%lld): updating descp %d", __func__,
4719 		    ldcp->ldc_id, idx);
4720 	}
4721 
4722 vsw_dringsend_free_exit:
4723 
4724 	RW_EXIT(&ldcp->lane_out.dlistrw);
4725 
4726 	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
4727 	return (status);
4728 }
4729 
4730 /*
4731  * Send an in-band descriptor message over ldc.
4732  */
4733 static int
4734 vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp)
4735 {
4736 	vsw_t			*vswp = ldcp->ldc_vswp;
4737 	vnet_ibnd_desc_t	ibnd_msg;
4738 	vsw_private_desc_t	*priv_desc = NULL;
4739 	dring_info_t		*dp = NULL;
4740 	size_t			n, size = 0;
4741 	caddr_t			bufp;
4742 	mblk_t			*bp;
4743 	int			idx, i;
4744 	int			status = LDC_TX_SUCCESS;
4745 	static int		warn_msg = 1;
4746 	lane_t			*lp = &ldcp->lane_out;
4747 
4748 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4749 
4750 	ASSERT(mp != NULL);
4751 
4752 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
4753 	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
4754 		DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt",
4755 		    __func__, ldcp->ldc_id, ldcp->ldc_status,
4756 		    ldcp->lane_out.lstate);
4757 		ldcp->ldc_stats.oerrors++;
4758 		return (LDC_TX_FAILURE);
4759 	}
4760 
4761 	/*
4762 	 * only expect single dring to exist, which we use
4763 	 * as an internal buffer, rather than a transfer channel.
4764 	 */
4765 	READ_ENTER(&ldcp->lane_out.dlistrw);
4766 	if ((dp = ldcp->lane_out.dringp) == NULL) {
4767 		DERR(vswp, "%s(%lld): no dring for outbound lane",
4768 		    __func__, ldcp->ldc_id);
4769 		DERR(vswp, "%s(%lld) status(%d) state (0x%llx)", __func__,
4770 		    ldcp->ldc_id, ldcp->ldc_status, ldcp->lane_out.lstate);
4771 		RW_EXIT(&ldcp->lane_out.dlistrw);
4772 		ldcp->ldc_stats.oerrors++;
4773 		return (LDC_TX_FAILURE);
4774 	}
4775 
4776 	size = msgsize(mp);
4777 	if (size > (size_t)lp->mtu) {
4778 		RW_EXIT(&ldcp->lane_out.dlistrw);
4779 		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
4780 		    ldcp->ldc_id, size);
4781 		ldcp->ldc_stats.oerrors++;
4782 		return (LDC_TX_FAILURE);
4783 	}
4784 
4785 	/*
4786 	 * Find a free descriptor in our buffer ring
4787 	 */
4788 	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
4789 		RW_EXIT(&ldcp->lane_out.dlistrw);
4790 		if (warn_msg) {
4791 			DERR(vswp, "%s(%lld): no descriptor available for ring "
4792 			    "at 0x%llx", __func__, ldcp->ldc_id, dp);
4793 			warn_msg = 0;
4794 		}
4795 
4796 		/* nothing more we can do */
4797 		status = LDC_TX_NORESOURCES;
4798 		goto vsw_descrsend_free_exit;
4799 	} else {
4800 		D2(vswp, "%s(%lld): free private descriptor found at pos "
4801 		    "%ld addr 0x%x\n", __func__, ldcp->ldc_id, idx, priv_desc);
4802 		warn_msg = 1;
4803 	}
4804 
4805 	/* copy data into the descriptor */
4806 	bufp = priv_desc->datap;
4807 	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
4808 		n = MBLKL(bp);
4809 		bcopy(bp->b_rptr, bufp, n);
4810 		bufp += n;
4811 	}
4812 
4813 	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
4814 
4815 	/* create and send the in-band descp msg */
4816 	ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA;
4817 	ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO;
4818 	ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA;
4819 	ibnd_msg.hdr.tag.vio_sid = ldcp->local_session;
4820 
4821 	/*
4822 	 * Copy the mem cookies describing the data from the
4823 	 * private region of the descriptor ring into the inband
4824 	 * descriptor.
4825 	 */
4826 	for (i = 0; i < priv_desc->ncookies; i++) {
4827 		bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i],
4828 		    sizeof (ldc_mem_cookie_t));
4829 	}
4830 
4831 	ibnd_msg.hdr.desc_handle = idx;
4832 	ibnd_msg.ncookies = priv_desc->ncookies;
4833 	ibnd_msg.nbytes = size;
4834 
4835 	ldcp->ldc_stats.opackets++;
4836 	ldcp->ldc_stats.obytes += size;
4837 
4838 	RW_EXIT(&ldcp->lane_out.dlistrw);
4839 
4840 	(void) vsw_send_msg(ldcp, (void *)&ibnd_msg,
4841 	    sizeof (vnet_ibnd_desc_t), B_TRUE);
4842 
4843 vsw_descrsend_free_exit:
4844 
4845 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4846 	return (status);
4847 }
4848 
4849 static void
4850 vsw_send_ver(void *arg)
4851 {
4852 	vsw_ldc_t	*ldcp = (vsw_ldc_t *)arg;
4853 	vsw_t		*vswp = ldcp->ldc_vswp;
4854 	lane_t		*lp = &ldcp->lane_out;
4855 	vio_ver_msg_t	ver_msg;
4856 
4857 	D1(vswp, "%s enter", __func__);
4858 
4859 	ver_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4860 	ver_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4861 	ver_msg.tag.vio_subtype_env = VIO_VER_INFO;
4862 	ver_msg.tag.vio_sid = ldcp->local_session;
4863 
4864 	if (vsw_obp_ver_proto_workaround == B_FALSE) {
4865 		ver_msg.ver_major = vsw_versions[0].ver_major;
4866 		ver_msg.ver_minor = vsw_versions[0].ver_minor;
4867 	} else {
4868 		/* use the major,minor that we've ack'd */
4869 		lane_t	*lpi = &ldcp->lane_in;
4870 		ver_msg.ver_major = lpi->ver_major;
4871 		ver_msg.ver_minor = lpi->ver_minor;
4872 	}
4873 	ver_msg.dev_class = VDEV_NETWORK_SWITCH;
4874 
4875 	lp->lstate |= VSW_VER_INFO_SENT;
4876 	lp->ver_major = ver_msg.ver_major;
4877 	lp->ver_minor = ver_msg.ver_minor;
4878 
4879 	DUMP_TAG(ver_msg.tag);
4880 
4881 	(void) vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t), B_TRUE);
4882 
4883 	D1(vswp, "%s (%d): exit", __func__, ldcp->ldc_id);
4884 }
4885 
4886 static void
4887 vsw_send_attr(vsw_ldc_t *ldcp)
4888 {
4889 	vsw_t			*vswp = ldcp->ldc_vswp;
4890 	lane_t			*lp = &ldcp->lane_out;
4891 	vnet_attr_msg_t		attr_msg;
4892 
4893 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
4894 
4895 	/*
4896 	 * Subtype is set to INFO by default
4897 	 */
4898 	attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4899 	attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4900 	attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO;
4901 	attr_msg.tag.vio_sid = ldcp->local_session;
4902 
4903 	/* payload copied from default settings for lane */
4904 	attr_msg.mtu = lp->mtu;
4905 	attr_msg.addr_type = lp->addr_type;
4906 	attr_msg.xfer_mode = lp->xfer_mode;
4907 	attr_msg.ack_freq = lp->xfer_mode;
4908 
4909 	READ_ENTER(&vswp->if_lockrw);
4910 	attr_msg.addr = vnet_macaddr_strtoul((vswp->if_addr).ether_addr_octet);
4911 	RW_EXIT(&vswp->if_lockrw);
4912 
4913 	ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT;
4914 
4915 	DUMP_TAG(attr_msg.tag);
4916 
4917 	(void) vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t), B_TRUE);
4918 
4919 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
4920 }
4921 
4922 /*
4923  * Create dring info msg (which also results in the creation of
4924  * a dring).
4925  */
4926 static vio_dring_reg_msg_t *
4927 vsw_create_dring_info_pkt(vsw_ldc_t *ldcp)
4928 {
4929 	vio_dring_reg_msg_t	*mp;
4930 	dring_info_t		*dp;
4931 	vsw_t			*vswp = ldcp->ldc_vswp;
4932 	int			rv;
4933 
4934 	D1(vswp, "vsw_create_dring_info_pkt enter\n");
4935 
4936 	/*
4937 	 * If we can't create a dring, obviously no point sending
4938 	 * a message.
4939 	 */
4940 	if ((dp = vsw_create_dring(ldcp)) == NULL)
4941 		return (NULL);
4942 
4943 	/* Allocate pools of receive mblks */
4944 	rv = vsw_init_multipools(ldcp, vswp);
4945 	if (rv) {
4946 		DWARN(vswp, "%s: unable to create free mblk pools for"
4947 		    " channel %ld (rv %d)", __func__, ldcp->ldc_id, rv);
4948 		vsw_free_lane_resources(ldcp, OUTBOUND);
4949 		return (NULL);
4950 	}
4951 
4952 	mp = kmem_zalloc(sizeof (vio_dring_reg_msg_t), KM_SLEEP);
4953 
4954 	mp->tag.vio_msgtype = VIO_TYPE_CTRL;
4955 	mp->tag.vio_subtype = VIO_SUBTYPE_INFO;
4956 	mp->tag.vio_subtype_env = VIO_DRING_REG;
4957 	mp->tag.vio_sid = ldcp->local_session;
4958 
4959 	/* payload */
4960 	mp->num_descriptors = dp->num_descriptors;
4961 	mp->descriptor_size = dp->descriptor_size;
4962 	mp->options = dp->options;
4963 	mp->ncookies = dp->ncookies;
4964 	bcopy(&dp->cookie[0], &mp->cookie[0], sizeof (ldc_mem_cookie_t));
4965 
4966 	mp->dring_ident = 0;
4967 
4968 	D1(vswp, "vsw_create_dring_info_pkt exit\n");
4969 
4970 	return (mp);
4971 }
4972 
4973 static void
4974 vsw_send_dring_info(vsw_ldc_t *ldcp)
4975 {
4976 	vio_dring_reg_msg_t	*dring_msg;
4977 	vsw_t			*vswp = ldcp->ldc_vswp;
4978 
4979 	D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id);
4980 
4981 	dring_msg = vsw_create_dring_info_pkt(ldcp);
4982 	if (dring_msg == NULL) {
4983 		cmn_err(CE_WARN, "!vsw%d: %s: error creating msg",
4984 		    vswp->instance, __func__);
4985 		return;
4986 	}
4987 
4988 	ldcp->lane_out.lstate |= VSW_DRING_INFO_SENT;
4989 
4990 	DUMP_TAG_PTR((vio_msg_tag_t *)dring_msg);
4991 
4992 	(void) vsw_send_msg(ldcp, dring_msg,
4993 	    sizeof (vio_dring_reg_msg_t), B_TRUE);
4994 
4995 	kmem_free(dring_msg, sizeof (vio_dring_reg_msg_t));
4996 
4997 	D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id);
4998 }
4999 
5000 static void
5001 vsw_send_rdx(vsw_ldc_t *ldcp)
5002 {
5003 	vsw_t		*vswp = ldcp->ldc_vswp;
5004 	vio_rdx_msg_t	rdx_msg;
5005 
5006 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
5007 
5008 	rdx_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
5009 	rdx_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
5010 	rdx_msg.tag.vio_subtype_env = VIO_RDX;
5011 	rdx_msg.tag.vio_sid = ldcp->local_session;
5012 
5013 	ldcp->lane_in.lstate |= VSW_RDX_INFO_SENT;
5014 
5015 	DUMP_TAG(rdx_msg.tag);
5016 
5017 	(void) vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t), B_TRUE);
5018 
5019 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
5020 }
5021 
5022 /*
5023  * Generic routine to send message out over ldc channel.
5024  *
5025  * It is possible that when we attempt to write over the ldc channel
5026  * that we get notified that it has been reset. Depending on the value
5027  * of the handle_reset flag we either handle that event here or simply
5028  * notify the caller that the channel was reset.
5029  */
5030 int
5031 vsw_send_msg(vsw_ldc_t *ldcp, void *msgp, int size, boolean_t handle_reset)
5032 {
5033 	int			rv;
5034 	size_t			msglen = size;
5035 	vio_msg_tag_t		*tag = (vio_msg_tag_t *)msgp;
5036 	vsw_t			*vswp = ldcp->ldc_vswp;
5037 	vio_dring_msg_t		*dmsg;
5038 	vio_raw_data_msg_t	*rmsg;
5039 	vnet_ibnd_desc_t	*imsg;
5040 	boolean_t		data_msg = B_FALSE;
5041 
5042 	D1(vswp, "vsw_send_msg (%lld) enter : sending %d bytes",
5043 	    ldcp->ldc_id, size);
5044 
5045 	D2(vswp, "send_msg: type 0x%llx", tag->vio_msgtype);
5046 	D2(vswp, "send_msg: stype 0x%llx", tag->vio_subtype);
5047 	D2(vswp, "send_msg: senv 0x%llx", tag->vio_subtype_env);
5048 
5049 	mutex_enter(&ldcp->ldc_txlock);
5050 
5051 	if (tag->vio_subtype == VIO_SUBTYPE_INFO) {
5052 		if (tag->vio_subtype_env == VIO_DRING_DATA) {
5053 			dmsg = (vio_dring_msg_t *)tag;
5054 			dmsg->seq_num = ldcp->lane_out.seq_num;
5055 			data_msg = B_TRUE;
5056 		} else if (tag->vio_subtype_env == VIO_PKT_DATA) {
5057 			rmsg = (vio_raw_data_msg_t *)tag;
5058 			rmsg->seq_num = ldcp->lane_out.seq_num;
5059 			data_msg = B_TRUE;
5060 		} else if (tag->vio_subtype_env == VIO_DESC_DATA) {
5061 			imsg = (vnet_ibnd_desc_t *)tag;
5062 			imsg->hdr.seq_num = ldcp->lane_out.seq_num;
5063 			data_msg = B_TRUE;
5064 		}
5065 	}
5066 
5067 	do {
5068 		msglen = size;
5069 		rv = ldc_write(ldcp->ldc_handle, (caddr_t)msgp, &msglen);
5070 	} while (rv == EWOULDBLOCK && --vsw_wretries > 0);
5071 
5072 	if (rv == 0 && data_msg == B_TRUE) {
5073 		ldcp->lane_out.seq_num++;
5074 	}
5075 
5076 	if ((rv != 0) || (msglen != size)) {
5077 		DERR(vswp, "vsw_send_msg:ldc_write failed: chan(%lld) rv(%d) "
5078 		    "size (%d) msglen(%d)\n", ldcp->ldc_id, rv, size, msglen);
5079 		ldcp->ldc_stats.oerrors++;
5080 	}
5081 
5082 	mutex_exit(&ldcp->ldc_txlock);
5083 
5084 	/*
5085 	 * If channel has been reset we either handle it here or
5086 	 * simply report back that it has been reset and let caller
5087 	 * decide what to do.
5088 	 */
5089 	if (rv == ECONNRESET) {
5090 		DWARN(vswp, "%s (%lld) channel reset", __func__, ldcp->ldc_id);
5091 
5092 		/*
5093 		 * N.B - must never be holding the dlistrw lock when
5094 		 * we do a reset of the channel.
5095 		 */
5096 		if (handle_reset) {
5097 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
5098 		}
5099 	}
5100 
5101 	return (rv);
5102 }
5103 
5104 /*
5105  * Remove the specified address from the list of address maintained
5106  * in this port node.
5107  */
5108 mcst_addr_t *
5109 vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr)
5110 {
5111 	vsw_t		*vswp = NULL;
5112 	vsw_port_t	*port = NULL;
5113 	mcst_addr_t	*prev_p = NULL;
5114 	mcst_addr_t	*curr_p = NULL;
5115 
5116 	D1(NULL, "%s: enter : devtype %d : addr 0x%llx",
5117 	    __func__, devtype, addr);
5118 
5119 	if (devtype == VSW_VNETPORT) {
5120 		port = (vsw_port_t *)arg;
5121 		mutex_enter(&port->mca_lock);
5122 		prev_p = curr_p = port->mcap;
5123 	} else {
5124 		vswp = (vsw_t *)arg;
5125 		mutex_enter(&vswp->mca_lock);
5126 		prev_p = curr_p = vswp->mcap;
5127 	}
5128 
5129 	while (curr_p != NULL) {
5130 		if (curr_p->addr == addr) {
5131 			D2(NULL, "%s: address found", __func__);
5132 			/* match found */
5133 			if (prev_p == curr_p) {
5134 				/* list head */
5135 				if (devtype == VSW_VNETPORT)
5136 					port->mcap = curr_p->nextp;
5137 				else
5138 					vswp->mcap = curr_p->nextp;
5139 			} else {
5140 				prev_p->nextp = curr_p->nextp;
5141 			}
5142 			break;
5143 		} else {
5144 			prev_p = curr_p;
5145 			curr_p = curr_p->nextp;
5146 		}
5147 	}
5148 
5149 	if (devtype == VSW_VNETPORT)
5150 		mutex_exit(&port->mca_lock);
5151 	else
5152 		mutex_exit(&vswp->mca_lock);
5153 
5154 	D1(NULL, "%s: exit", __func__);
5155 
5156 	return (curr_p);
5157 }
5158 
5159 /*
5160  * Creates a descriptor ring (dring) and links it into the
5161  * link of outbound drings for this channel.
5162  *
5163  * Returns NULL if creation failed.
5164  */
5165 static dring_info_t *
5166 vsw_create_dring(vsw_ldc_t *ldcp)
5167 {
5168 	vsw_private_desc_t	*priv_addr = NULL;
5169 	vsw_t			*vswp = ldcp->ldc_vswp;
5170 	ldc_mem_info_t		minfo;
5171 	dring_info_t		*dp, *tp;
5172 	int			i;
5173 
5174 	dp = (dring_info_t *)kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
5175 
5176 	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
5177 
5178 	/* create public section of ring */
5179 	if ((ldc_mem_dring_create(vsw_ntxds,
5180 	    VSW_PUB_SIZE, &dp->handle)) != 0) {
5181 
5182 		DERR(vswp, "vsw_create_dring(%lld): ldc dring create "
5183 		    "failed", ldcp->ldc_id);
5184 		goto create_fail_exit;
5185 	}
5186 
5187 	ASSERT(dp->handle != NULL);
5188 
5189 	/*
5190 	 * Get the base address of the public section of the ring.
5191 	 */
5192 	if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
5193 		DERR(vswp, "vsw_create_dring(%lld): dring info failed\n",
5194 		    ldcp->ldc_id);
5195 		goto dring_fail_exit;
5196 	} else {
5197 		ASSERT(minfo.vaddr != 0);
5198 		dp->pub_addr = minfo.vaddr;
5199 	}
5200 
5201 	dp->num_descriptors = vsw_ntxds;
5202 	dp->descriptor_size = VSW_PUB_SIZE;
5203 	dp->options = VIO_TX_DRING;
5204 	dp->ncookies = 1;	/* guaranteed by ldc */
5205 
5206 	/*
5207 	 * create private portion of ring
5208 	 */
5209 	dp->priv_addr = (vsw_private_desc_t *)kmem_zalloc(
5210 	    (sizeof (vsw_private_desc_t) * vsw_ntxds), KM_SLEEP);
5211 
5212 	if (vsw_setup_ring(ldcp, dp)) {
5213 		DERR(vswp, "%s: unable to setup ring", __func__);
5214 		goto dring_fail_exit;
5215 	}
5216 
5217 	/* haven't used any descriptors yet */
5218 	dp->end_idx = 0;
5219 	dp->last_ack_recv = -1;
5220 
5221 	/* bind dring to the channel */
5222 	if ((ldc_mem_dring_bind(ldcp->ldc_handle, dp->handle,
5223 	    LDC_DIRECT_MAP | LDC_SHADOW_MAP, LDC_MEM_RW,
5224 	    &dp->cookie[0], &dp->ncookies)) != 0) {
5225 		DERR(vswp, "vsw_create_dring: unable to bind to channel "
5226 		    "%lld", ldcp->ldc_id);
5227 		goto dring_fail_exit;
5228 	}
5229 
5230 	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
5231 	dp->restart_reqd = B_TRUE;
5232 
5233 	/*
5234 	 * Only ever create rings for outgoing lane. Link it onto
5235 	 * end of list.
5236 	 */
5237 	WRITE_ENTER(&ldcp->lane_out.dlistrw);
5238 	if (ldcp->lane_out.dringp == NULL) {
5239 		D2(vswp, "vsw_create_dring: adding first outbound ring");
5240 		ldcp->lane_out.dringp = dp;
5241 	} else {
5242 		tp = ldcp->lane_out.dringp;
5243 		while (tp->next != NULL)
5244 			tp = tp->next;
5245 
5246 		tp->next = dp;
5247 	}
5248 	RW_EXIT(&ldcp->lane_out.dlistrw);
5249 
5250 	return (dp);
5251 
5252 dring_fail_exit:
5253 	(void) ldc_mem_dring_destroy(dp->handle);
5254 
5255 create_fail_exit:
5256 	if (dp->priv_addr != NULL) {
5257 		priv_addr = dp->priv_addr;
5258 		for (i = 0; i < vsw_ntxds; i++) {
5259 			if (priv_addr->memhandle != NULL)
5260 				(void) ldc_mem_free_handle(
5261 				    priv_addr->memhandle);
5262 			priv_addr++;
5263 		}
5264 		kmem_free(dp->priv_addr,
5265 		    (sizeof (vsw_private_desc_t) * vsw_ntxds));
5266 	}
5267 	mutex_destroy(&dp->dlock);
5268 
5269 	kmem_free(dp, sizeof (dring_info_t));
5270 	return (NULL);
5271 }
5272 
5273 /*
5274  * Create a ring consisting of just a private portion and link
5275  * it into the list of rings for the outbound lane.
5276  *
5277  * These type of rings are used primarily for temporary data
5278  * storage (i.e. as data buffers).
5279  */
5280 void
5281 vsw_create_privring(vsw_ldc_t *ldcp)
5282 {
5283 	dring_info_t		*dp, *tp;
5284 	vsw_t			*vswp = ldcp->ldc_vswp;
5285 
5286 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
5287 
5288 	dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
5289 
5290 	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
5291 
5292 	/* no public section */
5293 	dp->pub_addr = NULL;
5294 
5295 	dp->priv_addr = kmem_zalloc(
5296 	    (sizeof (vsw_private_desc_t) * vsw_ntxds), KM_SLEEP);
5297 
5298 	dp->num_descriptors = vsw_ntxds;
5299 
5300 	if (vsw_setup_ring(ldcp, dp)) {
5301 		DERR(vswp, "%s: setup of ring failed", __func__);
5302 		kmem_free(dp->priv_addr,
5303 		    (sizeof (vsw_private_desc_t) * vsw_ntxds));
5304 		mutex_destroy(&dp->dlock);
5305 		kmem_free(dp, sizeof (dring_info_t));
5306 		return;
5307 	}
5308 
5309 	/* haven't used any descriptors yet */
5310 	dp->end_idx = 0;
5311 
5312 	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
5313 	dp->restart_reqd = B_TRUE;
5314 
5315 	/*
5316 	 * Only ever create rings for outgoing lane. Link it onto
5317 	 * end of list.
5318 	 */
5319 	WRITE_ENTER(&ldcp->lane_out.dlistrw);
5320 	if (ldcp->lane_out.dringp == NULL) {
5321 		D2(vswp, "%s: adding first outbound privring", __func__);
5322 		ldcp->lane_out.dringp = dp;
5323 	} else {
5324 		tp = ldcp->lane_out.dringp;
5325 		while (tp->next != NULL)
5326 			tp = tp->next;
5327 
5328 		tp->next = dp;
5329 	}
5330 	RW_EXIT(&ldcp->lane_out.dlistrw);
5331 
5332 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
5333 }
5334 
5335 /*
5336  * Setup the descriptors in the dring. Returns 0 on success, 1 on
5337  * failure.
5338  */
5339 int
5340 vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp)
5341 {
5342 	vnet_public_desc_t	*pub_addr = NULL;
5343 	vsw_private_desc_t	*priv_addr = NULL;
5344 	vsw_t			*vswp = ldcp->ldc_vswp;
5345 	uint64_t		*tmpp;
5346 	uint64_t		offset = 0;
5347 	uint32_t		ncookies = 0;
5348 	static char		*name = "vsw_setup_ring";
5349 	int			i, j, nc, rv;
5350 	size_t			data_sz;
5351 	void			*data_addr;
5352 
5353 	priv_addr = dp->priv_addr;
5354 	pub_addr = dp->pub_addr;
5355 
5356 	/* public section may be null but private should never be */
5357 	ASSERT(priv_addr != NULL);
5358 
5359 	/*
5360 	 * Allocate the region of memory which will be used to hold
5361 	 * the data the descriptors will refer to.
5362 	 */
5363 	data_sz = vswp->max_frame_size + VNET_IPALIGN + VNET_LDCALIGN;
5364 
5365 	/*
5366 	 * In order to ensure that the number of ldc cookies per descriptor is
5367 	 * limited to be within the default MAX_COOKIES (2), we take the steps
5368 	 * outlined below:
5369 	 *
5370 	 * Align the entire data buffer area to 8K and carve out per descriptor
5371 	 * data buffers starting from this 8K aligned base address.
5372 	 *
5373 	 * We round up the mtu specified to be a multiple of 2K or 4K.
5374 	 * For sizes up to 12K we round up the size to the next 2K.
5375 	 * For sizes > 12K we round up to the next 4K (otherwise sizes such as
5376 	 * 14K could end up needing 3 cookies, with the buffer spread across
5377 	 * 3 8K pages:  8K+6K, 2K+8K+2K, 6K+8K, ...).
5378 	 */
5379 	if (data_sz <= VNET_12K) {
5380 		data_sz = VNET_ROUNDUP_2K(data_sz);
5381 	} else {
5382 		data_sz = VNET_ROUNDUP_4K(data_sz);
5383 	}
5384 
5385 	dp->desc_data_sz = data_sz;
5386 
5387 	/* allocate extra 8K bytes for alignment */
5388 	dp->data_sz = (vsw_ntxds * data_sz) + VNET_8K;
5389 	data_addr = kmem_alloc(dp->data_sz, KM_SLEEP);
5390 	dp->data_addr = data_addr;
5391 
5392 	D2(vswp, "%s: allocated %lld bytes at 0x%llx\n", name,
5393 	    dp->data_sz, dp->data_addr);
5394 
5395 	/* align the starting address of the data area to 8K */
5396 	data_addr = (void *)VNET_ROUNDUP_8K((uintptr_t)data_addr);
5397 
5398 	tmpp = (uint64_t *)data_addr;
5399 	offset = dp->desc_data_sz/sizeof (tmpp);
5400 
5401 	/*
5402 	 * Initialise some of the private and public (if they exist)
5403 	 * descriptor fields.
5404 	 */
5405 	for (i = 0; i < vsw_ntxds; i++) {
5406 		mutex_init(&priv_addr->dstate_lock, NULL, MUTEX_DRIVER, NULL);
5407 
5408 		if ((ldc_mem_alloc_handle(ldcp->ldc_handle,
5409 		    &priv_addr->memhandle)) != 0) {
5410 			DERR(vswp, "%s: alloc mem handle failed", name);
5411 			goto setup_ring_cleanup;
5412 		}
5413 
5414 		priv_addr->datap = (void *)tmpp;
5415 
5416 		rv = ldc_mem_bind_handle(priv_addr->memhandle,
5417 		    (caddr_t)priv_addr->datap, dp->desc_data_sz,
5418 		    LDC_SHADOW_MAP, LDC_MEM_R|LDC_MEM_W,
5419 		    &(priv_addr->memcookie[0]), &ncookies);
5420 		if (rv != 0) {
5421 			DERR(vswp, "%s(%lld): ldc_mem_bind_handle failed "
5422 			    "(rv %d)", name, ldcp->ldc_id, rv);
5423 			goto setup_ring_cleanup;
5424 		}
5425 		priv_addr->bound = 1;
5426 
5427 		D2(vswp, "%s: %d: memcookie 0 : addr 0x%llx : size 0x%llx",
5428 		    name, i, priv_addr->memcookie[0].addr,
5429 		    priv_addr->memcookie[0].size);
5430 
5431 		if (ncookies >= (uint32_t)(VSW_MAX_COOKIES + 1)) {
5432 			DERR(vswp, "%s(%lld) ldc_mem_bind_handle returned "
5433 			    "invalid num of cookies (%d) for size 0x%llx",
5434 			    name, ldcp->ldc_id, ncookies, VSW_RING_EL_DATA_SZ);
5435 
5436 			goto setup_ring_cleanup;
5437 		} else {
5438 			for (j = 1; j < ncookies; j++) {
5439 				rv = ldc_mem_nextcookie(priv_addr->memhandle,
5440 				    &(priv_addr->memcookie[j]));
5441 				if (rv != 0) {
5442 					DERR(vswp, "%s: ldc_mem_nextcookie "
5443 					    "failed rv (%d)", name, rv);
5444 					goto setup_ring_cleanup;
5445 				}
5446 				D3(vswp, "%s: memcookie %d : addr 0x%llx : "
5447 				    "size 0x%llx", name, j,
5448 				    priv_addr->memcookie[j].addr,
5449 				    priv_addr->memcookie[j].size);
5450 			}
5451 
5452 		}
5453 		priv_addr->ncookies = ncookies;
5454 		priv_addr->dstate = VIO_DESC_FREE;
5455 
5456 		if (pub_addr != NULL) {
5457 
5458 			/* link pub and private sides */
5459 			priv_addr->descp = pub_addr;
5460 
5461 			pub_addr->ncookies = priv_addr->ncookies;
5462 
5463 			for (nc = 0; nc < pub_addr->ncookies; nc++) {
5464 				bcopy(&priv_addr->memcookie[nc],
5465 				    &pub_addr->memcookie[nc],
5466 				    sizeof (ldc_mem_cookie_t));
5467 			}
5468 
5469 			pub_addr->hdr.dstate = VIO_DESC_FREE;
5470 			pub_addr++;
5471 		}
5472 
5473 		/*
5474 		 * move to next element in the dring and the next
5475 		 * position in the data buffer.
5476 		 */
5477 		priv_addr++;
5478 		tmpp += offset;
5479 	}
5480 
5481 	return (0);
5482 
5483 setup_ring_cleanup:
5484 	priv_addr = dp->priv_addr;
5485 
5486 	for (j = 0; j < i; j++) {
5487 		(void) ldc_mem_unbind_handle(priv_addr->memhandle);
5488 		(void) ldc_mem_free_handle(priv_addr->memhandle);
5489 
5490 		mutex_destroy(&priv_addr->dstate_lock);
5491 
5492 		priv_addr++;
5493 	}
5494 	kmem_free(dp->data_addr, dp->data_sz);
5495 
5496 	return (1);
5497 }
5498 
5499 /*
5500  * Searches the private section of a ring for a free descriptor,
5501  * starting at the location of the last free descriptor found
5502  * previously.
5503  *
5504  * Returns 0 if free descriptor is available, and updates state
5505  * of private descriptor to VIO_DESC_READY,  otherwise returns 1.
5506  *
5507  * FUTURE: might need to return contiguous range of descriptors
5508  * as dring info msg assumes all will be contiguous.
5509  */
5510 static int
5511 vsw_dring_find_free_desc(dring_info_t *dringp,
5512 		vsw_private_desc_t **priv_p, int *idx)
5513 {
5514 	vsw_private_desc_t	*addr = NULL;
5515 	int			num = vsw_ntxds;
5516 	int			ret = 1;
5517 
5518 	D1(NULL, "%s enter\n", __func__);
5519 
5520 	ASSERT(dringp->priv_addr != NULL);
5521 
5522 	D2(NULL, "%s: searching ring, dringp 0x%llx : start pos %lld",
5523 	    __func__, dringp, dringp->end_idx);
5524 
5525 	addr = (vsw_private_desc_t *)dringp->priv_addr + dringp->end_idx;
5526 
5527 	mutex_enter(&addr->dstate_lock);
5528 	if (addr->dstate == VIO_DESC_FREE) {
5529 		addr->dstate = VIO_DESC_READY;
5530 		*priv_p = addr;
5531 		*idx = dringp->end_idx;
5532 		dringp->end_idx = (dringp->end_idx + 1) % num;
5533 		ret = 0;
5534 
5535 	}
5536 	mutex_exit(&addr->dstate_lock);
5537 
5538 	/* ring full */
5539 	if (ret == 1) {
5540 		D2(NULL, "%s: no desp free: started at %d", __func__,
5541 		    dringp->end_idx);
5542 	}
5543 
5544 	D1(NULL, "%s: exit\n", __func__);
5545 
5546 	return (ret);
5547 }
5548 
5549 /*
5550  * Map from a dring identifier to the ring itself. Returns
5551  * pointer to ring or NULL if no match found.
5552  *
5553  * Should be called with dlistrw rwlock held as reader.
5554  */
5555 static dring_info_t *
5556 vsw_ident2dring(lane_t *lane, uint64_t ident)
5557 {
5558 	dring_info_t	*dp = NULL;
5559 
5560 	if ((dp = lane->dringp) == NULL) {
5561 		return (NULL);
5562 	} else {
5563 		if (dp->ident == ident)
5564 			return (dp);
5565 
5566 		while (dp != NULL) {
5567 			if (dp->ident == ident)
5568 				break;
5569 			dp = dp->next;
5570 		}
5571 	}
5572 
5573 	return (dp);
5574 }
5575 
5576 /*
5577  * Set the default lane attributes. These are copied into
5578  * the attr msg we send to our peer. If they are not acceptable
5579  * then (currently) the handshake ends.
5580  */
5581 static void
5582 vsw_set_lane_attr(vsw_t *vswp, lane_t *lp)
5583 {
5584 	bzero(lp, sizeof (lane_t));
5585 
5586 	READ_ENTER(&vswp->if_lockrw);
5587 	ether_copy(&(vswp->if_addr), &(lp->addr));
5588 	RW_EXIT(&vswp->if_lockrw);
5589 
5590 	lp->mtu = vswp->max_frame_size;
5591 	lp->addr_type = ADDR_TYPE_MAC;
5592 	lp->xfer_mode = VIO_DRING_MODE_V1_0;
5593 	lp->ack_freq = 0;	/* for shared mode */
5594 	lp->seq_num = VNET_ISS;
5595 }
5596 
5597 /*
5598  * Verify that the attributes are acceptable.
5599  *
5600  * FUTURE: If some attributes are not acceptable, change them
5601  * our desired values.
5602  */
5603 static int
5604 vsw_check_attr(vnet_attr_msg_t *pkt, vsw_ldc_t *ldcp)
5605 {
5606 	int			ret = 0;
5607 	struct ether_addr	ea;
5608 	vsw_port_t		*port = ldcp->ldc_port;
5609 	lane_t			*lp = &ldcp->lane_out;
5610 
5611 	D1(NULL, "vsw_check_attr enter\n");
5612 
5613 	if ((pkt->xfer_mode != VIO_DESC_MODE) &&
5614 	    (pkt->xfer_mode != lp->xfer_mode)) {
5615 		D2(NULL, "vsw_check_attr: unknown mode %x\n", pkt->xfer_mode);
5616 		ret = 1;
5617 	}
5618 
5619 	/* Only support MAC addresses at moment. */
5620 	if ((pkt->addr_type != ADDR_TYPE_MAC) || (pkt->addr == 0)) {
5621 		D2(NULL, "vsw_check_attr: invalid addr_type %x, "
5622 		    "or address 0x%llx\n", pkt->addr_type, pkt->addr);
5623 		ret = 1;
5624 	}
5625 
5626 	/*
5627 	 * MAC address supplied by device should match that stored
5628 	 * in the vsw-port OBP node. Need to decide what to do if they
5629 	 * don't match, for the moment just warn but don't fail.
5630 	 */
5631 	vnet_macaddr_ultostr(pkt->addr, ea.ether_addr_octet);
5632 	if (ether_cmp(&ea, &port->p_macaddr) != 0) {
5633 		DERR(NULL, "vsw_check_attr: device supplied address "
5634 		    "0x%llx doesn't match node address 0x%llx\n",
5635 		    pkt->addr, port->p_macaddr);
5636 	}
5637 
5638 	/*
5639 	 * Ack freq only makes sense in pkt mode, in shared
5640 	 * mode the ring descriptors say whether or not to
5641 	 * send back an ACK.
5642 	 */
5643 	if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
5644 	    (pkt->xfer_mode & VIO_DRING_MODE_V1_2)) ||
5645 	    (VSW_VER_LT(ldcp, 1, 2) &&
5646 	    (pkt->xfer_mode == VIO_DRING_MODE_V1_0))) {
5647 		if (pkt->ack_freq > 0) {
5648 			D2(NULL, "vsw_check_attr: non zero ack freq "
5649 			    " in SHM mode\n");
5650 			ret = 1;
5651 		}
5652 	}
5653 
5654 	if (VSW_VER_LT(ldcp, 1, 4)) {
5655 		/* versions < 1.4, mtu must match */
5656 		if (pkt->mtu != lp->mtu) {
5657 			D2(NULL, "vsw_check_attr: invalid MTU (0x%llx)\n",
5658 			    pkt->mtu);
5659 			ret = 1;
5660 		}
5661 	} else {
5662 		/* Ver >= 1.4, validate mtu of the peer is at least ETHERMAX */
5663 		if (pkt->mtu < ETHERMAX) {
5664 			ret = 1;
5665 		}
5666 	}
5667 
5668 	D1(NULL, "vsw_check_attr exit\n");
5669 
5670 	return (ret);
5671 }
5672 
5673 /*
5674  * Returns 1 if there is a problem, 0 otherwise.
5675  */
5676 static int
5677 vsw_check_dring_info(vio_dring_reg_msg_t *pkt)
5678 {
5679 	_NOTE(ARGUNUSED(pkt))
5680 
5681 	int	ret = 0;
5682 
5683 	D1(NULL, "vsw_check_dring_info enter\n");
5684 
5685 	if ((pkt->num_descriptors == 0) ||
5686 	    (pkt->descriptor_size == 0) ||
5687 	    (pkt->ncookies != 1)) {
5688 		DERR(NULL, "vsw_check_dring_info: invalid dring msg");
5689 		ret = 1;
5690 	}
5691 
5692 	D1(NULL, "vsw_check_dring_info exit\n");
5693 
5694 	return (ret);
5695 }
5696 
5697 /*
5698  * Returns 1 if two memory cookies match. Otherwise returns 0.
5699  */
5700 static int
5701 vsw_mem_cookie_match(ldc_mem_cookie_t *m1, ldc_mem_cookie_t *m2)
5702 {
5703 	if ((m1->addr != m2->addr) ||
5704 	    (m2->size != m2->size)) {
5705 		return (0);
5706 	} else {
5707 		return (1);
5708 	}
5709 }
5710 
5711 /*
5712  * Returns 1 if ring described in reg message matches that
5713  * described by dring_info structure. Otherwise returns 0.
5714  */
5715 static int
5716 vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg)
5717 {
5718 	if ((msg->descriptor_size != dp->descriptor_size) ||
5719 	    (msg->num_descriptors != dp->num_descriptors) ||
5720 	    (msg->ncookies != dp->ncookies) ||
5721 	    !(vsw_mem_cookie_match(&msg->cookie[0], &dp->cookie[0]))) {
5722 		return (0);
5723 	} else {
5724 		return (1);
5725 	}
5726 
5727 }
5728 
5729 /*
5730  * Reset and free all the resources associated with
5731  * the channel.
5732  */
5733 static void
5734 vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir)
5735 {
5736 	dring_info_t		*dp, *dpp;
5737 	lane_t			*lp = NULL;
5738 
5739 	ASSERT(ldcp != NULL);
5740 
5741 	D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id);
5742 
5743 	if (dir == INBOUND) {
5744 		D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane"
5745 		    " of channel %lld", __func__, ldcp->ldc_id);
5746 		lp = &ldcp->lane_in;
5747 	} else {
5748 		D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane"
5749 		    " of channel %lld", __func__, ldcp->ldc_id);
5750 		lp = &ldcp->lane_out;
5751 	}
5752 
5753 	lp->lstate = VSW_LANE_INACTIV;
5754 	lp->seq_num = VNET_ISS;
5755 
5756 	if (lp->dringp) {
5757 		if (dir == INBOUND) {
5758 			WRITE_ENTER(&lp->dlistrw);
5759 			dp = lp->dringp;
5760 			while (dp != NULL) {
5761 				dpp = dp->next;
5762 				if (dp->handle != NULL)
5763 					(void) ldc_mem_dring_unmap(dp->handle);
5764 				kmem_free(dp, sizeof (dring_info_t));
5765 				dp = dpp;
5766 			}
5767 			RW_EXIT(&lp->dlistrw);
5768 		} else {
5769 			/*
5770 			 * unbind, destroy exported dring, free dring struct
5771 			 */
5772 			WRITE_ENTER(&lp->dlistrw);
5773 			dp = lp->dringp;
5774 			vsw_free_ring(dp);
5775 			RW_EXIT(&lp->dlistrw);
5776 		}
5777 		lp->dringp = NULL;
5778 	}
5779 
5780 	D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id);
5781 }
5782 
5783 /*
5784  * Free ring and all associated resources.
5785  *
5786  * Should be called with dlistrw rwlock held as writer.
5787  */
5788 static void
5789 vsw_free_ring(dring_info_t *dp)
5790 {
5791 	vsw_private_desc_t	*paddr = NULL;
5792 	dring_info_t		*dpp;
5793 	int			i;
5794 
5795 	while (dp != NULL) {
5796 		mutex_enter(&dp->dlock);
5797 		dpp = dp->next;
5798 		if (dp->priv_addr != NULL) {
5799 			/*
5800 			 * First unbind and free the memory handles
5801 			 * stored in each descriptor within the ring.
5802 			 */
5803 			for (i = 0; i < vsw_ntxds; i++) {
5804 				paddr = (vsw_private_desc_t *)
5805 				    dp->priv_addr + i;
5806 				if (paddr->memhandle != NULL) {
5807 					if (paddr->bound == 1) {
5808 						if (ldc_mem_unbind_handle(
5809 						    paddr->memhandle) != 0) {
5810 							DERR(NULL, "error "
5811 							"unbinding handle for "
5812 							"ring 0x%llx at pos %d",
5813 							    dp, i);
5814 							continue;
5815 						}
5816 						paddr->bound = 0;
5817 					}
5818 
5819 					if (ldc_mem_free_handle(
5820 					    paddr->memhandle) != 0) {
5821 						DERR(NULL, "error freeing "
5822 						    "handle for ring 0x%llx "
5823 						    "at pos %d", dp, i);
5824 						continue;
5825 					}
5826 					paddr->memhandle = NULL;
5827 				}
5828 				mutex_destroy(&paddr->dstate_lock);
5829 			}
5830 			kmem_free(dp->priv_addr,
5831 			    (sizeof (vsw_private_desc_t) * vsw_ntxds));
5832 		}
5833 
5834 		/*
5835 		 * Now unbind and destroy the ring itself.
5836 		 */
5837 		if (dp->handle != NULL) {
5838 			(void) ldc_mem_dring_unbind(dp->handle);
5839 			(void) ldc_mem_dring_destroy(dp->handle);
5840 		}
5841 
5842 		if (dp->data_addr != NULL) {
5843 			kmem_free(dp->data_addr, dp->data_sz);
5844 		}
5845 
5846 		mutex_exit(&dp->dlock);
5847 		mutex_destroy(&dp->dlock);
5848 		mutex_destroy(&dp->restart_lock);
5849 		kmem_free(dp, sizeof (dring_info_t));
5850 
5851 		dp = dpp;
5852 	}
5853 }
5854 
5855 /*
5856  * vsw_ldc_rx_worker -- A per LDC worker thread to receive data.
5857  * This thread is woken up by the LDC interrupt handler to process
5858  * LDC packets and receive data.
5859  */
5860 static void
5861 vsw_ldc_rx_worker(void *arg)
5862 {
5863 	callb_cpr_t	cprinfo;
5864 	vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
5865 	vsw_t *vswp = ldcp->ldc_vswp;
5866 
5867 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
5868 	CALLB_CPR_INIT(&cprinfo, &ldcp->rx_thr_lock, callb_generic_cpr,
5869 	    "vsw_rx_thread");
5870 	mutex_enter(&ldcp->rx_thr_lock);
5871 	ldcp->rx_thr_flags |= VSW_WTHR_RUNNING;
5872 	while (!(ldcp->rx_thr_flags & VSW_WTHR_STOP)) {
5873 
5874 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
5875 		/*
5876 		 * Wait until the data is received or a stop
5877 		 * request is received.
5878 		 */
5879 		while (!(ldcp->rx_thr_flags &
5880 		    (VSW_WTHR_DATARCVD | VSW_WTHR_STOP))) {
5881 			cv_wait(&ldcp->rx_thr_cv, &ldcp->rx_thr_lock);
5882 		}
5883 		CALLB_CPR_SAFE_END(&cprinfo, &ldcp->rx_thr_lock)
5884 
5885 		/*
5886 		 * First process the stop request.
5887 		 */
5888 		if (ldcp->rx_thr_flags & VSW_WTHR_STOP) {
5889 			D2(vswp, "%s(%lld):Rx thread stopped\n",
5890 			    __func__, ldcp->ldc_id);
5891 			break;
5892 		}
5893 		ldcp->rx_thr_flags &= ~VSW_WTHR_DATARCVD;
5894 		mutex_exit(&ldcp->rx_thr_lock);
5895 		D1(vswp, "%s(%lld):calling vsw_process_pkt\n",
5896 		    __func__, ldcp->ldc_id);
5897 		mutex_enter(&ldcp->ldc_cblock);
5898 		vsw_process_pkt(ldcp);
5899 		mutex_exit(&ldcp->ldc_cblock);
5900 		mutex_enter(&ldcp->rx_thr_lock);
5901 	}
5902 
5903 	/*
5904 	 * Update the run status and wakeup the thread that
5905 	 * has sent the stop request.
5906 	 */
5907 	ldcp->rx_thr_flags &= ~VSW_WTHR_RUNNING;
5908 	cv_signal(&ldcp->rx_thr_cv);
5909 	CALLB_CPR_EXIT(&cprinfo);
5910 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
5911 	thread_exit();
5912 }
5913 
5914 /* vsw_stop_rx_thread -- Co-ordinate with receive thread to stop it */
5915 static void
5916 vsw_stop_rx_thread(vsw_ldc_t *ldcp)
5917 {
5918 	vsw_t *vswp = ldcp->ldc_vswp;
5919 
5920 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
5921 	/*
5922 	 * Send a stop request by setting the stop flag and
5923 	 * wait until the receive thread stops.
5924 	 */
5925 	mutex_enter(&ldcp->rx_thr_lock);
5926 	if (ldcp->rx_thr_flags & VSW_WTHR_RUNNING) {
5927 		ldcp->rx_thr_flags |= VSW_WTHR_STOP;
5928 		cv_signal(&ldcp->rx_thr_cv);
5929 		while (ldcp->rx_thr_flags & VSW_WTHR_RUNNING) {
5930 			cv_wait(&ldcp->rx_thr_cv, &ldcp->rx_thr_lock);
5931 		}
5932 	}
5933 	mutex_exit(&ldcp->rx_thr_lock);
5934 	ldcp->rx_thread = NULL;
5935 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
5936 }
5937 
5938 /*
5939  * vsw_ldc_tx_worker -- A per LDC worker thread to transmit data.
5940  * This thread is woken up by the vsw_portsend to transmit
5941  * packets.
5942  */
5943 static void
5944 vsw_ldc_tx_worker(void *arg)
5945 {
5946 	callb_cpr_t	cprinfo;
5947 	vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
5948 	vsw_t *vswp = ldcp->ldc_vswp;
5949 	mblk_t *mp;
5950 	mblk_t *tmp;
5951 
5952 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
5953 	CALLB_CPR_INIT(&cprinfo, &ldcp->tx_thr_lock, callb_generic_cpr,
5954 	    "vnet_tx_thread");
5955 	mutex_enter(&ldcp->tx_thr_lock);
5956 	ldcp->tx_thr_flags |= VSW_WTHR_RUNNING;
5957 	while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP)) {
5958 
5959 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
5960 		/*
5961 		 * Wait until the data is received or a stop
5962 		 * request is received.
5963 		 */
5964 		while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP) &&
5965 		    (ldcp->tx_mhead == NULL)) {
5966 			cv_wait(&ldcp->tx_thr_cv, &ldcp->tx_thr_lock);
5967 		}
5968 		CALLB_CPR_SAFE_END(&cprinfo, &ldcp->tx_thr_lock)
5969 
5970 		/*
5971 		 * First process the stop request.
5972 		 */
5973 		if (ldcp->tx_thr_flags & VSW_WTHR_STOP) {
5974 			D2(vswp, "%s(%lld):tx thread stopped\n",
5975 			    __func__, ldcp->ldc_id);
5976 			break;
5977 		}
5978 		mp = ldcp->tx_mhead;
5979 		ldcp->tx_mhead = ldcp->tx_mtail = NULL;
5980 		ldcp->tx_cnt = 0;
5981 		mutex_exit(&ldcp->tx_thr_lock);
5982 		D2(vswp, "%s(%lld):calling vsw_ldcsend\n",
5983 		    __func__, ldcp->ldc_id);
5984 		while (mp != NULL) {
5985 			tmp = mp->b_next;
5986 			mp->b_next = mp->b_prev = NULL;
5987 			(void) vsw_ldcsend(ldcp, mp, vsw_ldc_tx_retries);
5988 			mp = tmp;
5989 		}
5990 		mutex_enter(&ldcp->tx_thr_lock);
5991 	}
5992 
5993 	/*
5994 	 * Update the run status and wakeup the thread that
5995 	 * has sent the stop request.
5996 	 */
5997 	ldcp->tx_thr_flags &= ~VSW_WTHR_RUNNING;
5998 	cv_signal(&ldcp->tx_thr_cv);
5999 	CALLB_CPR_EXIT(&cprinfo);
6000 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
6001 	thread_exit();
6002 }
6003 
6004 /* vsw_stop_tx_thread -- Co-ordinate with receive thread to stop it */
6005 static void
6006 vsw_stop_tx_thread(vsw_ldc_t *ldcp)
6007 {
6008 	vsw_t *vswp = ldcp->ldc_vswp;
6009 
6010 	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
6011 	/*
6012 	 * Send a stop request by setting the stop flag and
6013 	 * wait until the receive thread stops.
6014 	 */
6015 	mutex_enter(&ldcp->tx_thr_lock);
6016 	if (ldcp->tx_thr_flags & VSW_WTHR_RUNNING) {
6017 		ldcp->tx_thr_flags |= VSW_WTHR_STOP;
6018 		cv_signal(&ldcp->tx_thr_cv);
6019 		while (ldcp->tx_thr_flags & VSW_WTHR_RUNNING) {
6020 			cv_wait(&ldcp->tx_thr_cv, &ldcp->tx_thr_lock);
6021 		}
6022 	}
6023 	mutex_exit(&ldcp->tx_thr_lock);
6024 	ldcp->tx_thread = NULL;
6025 	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
6026 }
6027 
6028 /* vsw_reclaim_dring -- reclaim descriptors */
6029 static int
6030 vsw_reclaim_dring(dring_info_t *dp, int start)
6031 {
6032 	int i, j, len;
6033 	vsw_private_desc_t *priv_addr;
6034 	vnet_public_desc_t *pub_addr;
6035 
6036 	pub_addr = (vnet_public_desc_t *)dp->pub_addr;
6037 	priv_addr = (vsw_private_desc_t *)dp->priv_addr;
6038 	len = dp->num_descriptors;
6039 
6040 	D2(NULL, "%s: start index %ld\n", __func__, start);
6041 
6042 	j = 0;
6043 	for (i = start; j < len; i = (i + 1) % len, j++) {
6044 		pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
6045 		priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
6046 
6047 		mutex_enter(&priv_addr->dstate_lock);
6048 		if (pub_addr->hdr.dstate != VIO_DESC_DONE) {
6049 			mutex_exit(&priv_addr->dstate_lock);
6050 			break;
6051 		}
6052 		pub_addr->hdr.dstate = VIO_DESC_FREE;
6053 		priv_addr->dstate = VIO_DESC_FREE;
6054 		/* clear all the fields */
6055 		priv_addr->datalen = 0;
6056 		pub_addr->hdr.ack = 0;
6057 		mutex_exit(&priv_addr->dstate_lock);
6058 
6059 		D3(NULL, "claiming descp:%d pub state:0x%llx priv state 0x%llx",
6060 		    i, pub_addr->hdr.dstate, priv_addr->dstate);
6061 	}
6062 	return (j);
6063 }
6064 
6065 /*
6066  * Debugging routines
6067  */
6068 static void
6069 display_state(void)
6070 {
6071 	vsw_t		*vswp;
6072 	vsw_port_list_t	*plist;
6073 	vsw_port_t 	*port;
6074 	vsw_ldc_list_t	*ldcl;
6075 	vsw_ldc_t 	*ldcp;
6076 	extern vsw_t 	*vsw_head;
6077 
6078 	cmn_err(CE_NOTE, "***** system state *****");
6079 
6080 	for (vswp = vsw_head; vswp; vswp = vswp->next) {
6081 		plist = &vswp->plist;
6082 		READ_ENTER(&plist->lockrw);
6083 		cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n",
6084 		    vswp->instance, plist->num_ports);
6085 
6086 		for (port = plist->head; port != NULL; port = port->p_next) {
6087 			ldcl = &port->p_ldclist;
6088 			cmn_err(CE_CONT, "port %d : %d ldcs attached\n",
6089 			    port->p_instance, port->num_ldcs);
6090 			READ_ENTER(&ldcl->lockrw);
6091 			ldcp = ldcl->head;
6092 			for (; ldcp != NULL; ldcp = ldcp->ldc_next) {
6093 				cmn_err(CE_CONT, "chan %lu : dev %d : "
6094 				    "status %d : phase %u\n",
6095 				    ldcp->ldc_id, ldcp->dev_class,
6096 				    ldcp->ldc_status, ldcp->hphase);
6097 				cmn_err(CE_CONT, "chan %lu : lsession %lu : "
6098 				    "psession %lu\n", ldcp->ldc_id,
6099 				    ldcp->local_session, ldcp->peer_session);
6100 
6101 				cmn_err(CE_CONT, "Inbound lane:\n");
6102 				display_lane(&ldcp->lane_in);
6103 				cmn_err(CE_CONT, "Outbound lane:\n");
6104 				display_lane(&ldcp->lane_out);
6105 			}
6106 			RW_EXIT(&ldcl->lockrw);
6107 		}
6108 		RW_EXIT(&plist->lockrw);
6109 	}
6110 	cmn_err(CE_NOTE, "***** system state *****");
6111 }
6112 
6113 static void
6114 display_lane(lane_t *lp)
6115 {
6116 	dring_info_t	*drp;
6117 
6118 	cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n",
6119 	    lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu);
6120 	cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n",
6121 	    lp->addr_type, lp->addr, lp->xfer_mode);
6122 	cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp);
6123 
6124 	cmn_err(CE_CONT, "Dring info:\n");
6125 	for (drp = lp->dringp; drp != NULL; drp = drp->next) {
6126 		cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n",
6127 		    drp->num_descriptors, drp->descriptor_size);
6128 		cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->handle);
6129 		cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n",
6130 		    (uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr);
6131 		cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n",
6132 		    drp->ident, drp->end_idx);
6133 		display_ring(drp);
6134 	}
6135 }
6136 
6137 static void
6138 display_ring(dring_info_t *dringp)
6139 {
6140 	uint64_t		i;
6141 	uint64_t		priv_count = 0;
6142 	uint64_t		pub_count = 0;
6143 	vnet_public_desc_t	*pub_addr = NULL;
6144 	vsw_private_desc_t	*priv_addr = NULL;
6145 
6146 	for (i = 0; i < vsw_ntxds; i++) {
6147 		if (dringp->pub_addr != NULL) {
6148 			pub_addr = (vnet_public_desc_t *)dringp->pub_addr + i;
6149 
6150 			if (pub_addr->hdr.dstate == VIO_DESC_FREE)
6151 				pub_count++;
6152 		}
6153 
6154 		if (dringp->priv_addr != NULL) {
6155 			priv_addr = (vsw_private_desc_t *)dringp->priv_addr + i;
6156 
6157 			if (priv_addr->dstate == VIO_DESC_FREE)
6158 				priv_count++;
6159 		}
6160 	}
6161 	cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n",
6162 	    i, priv_count, pub_count);
6163 }
6164 
6165 static void
6166 dump_flags(uint64_t state)
6167 {
6168 	int	i;
6169 
6170 	typedef struct flag_name {
6171 		int	flag_val;
6172 		char	*flag_name;
6173 	} flag_name_t;
6174 
6175 	flag_name_t	flags[] = {
6176 		VSW_VER_INFO_SENT, "VSW_VER_INFO_SENT",
6177 		VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV",
6178 		VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV",
6179 		VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT",
6180 		VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV",
6181 		VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT",
6182 		VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT",
6183 		VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV",
6184 		VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT",
6185 		VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV",
6186 		VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT",
6187 		VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV",
6188 		VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT",
6189 		VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV",
6190 		VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT",
6191 		VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV",
6192 		VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT",
6193 		VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV",
6194 		VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT",
6195 		VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV",
6196 		VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT",
6197 		VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV",
6198 		VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT",
6199 		VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV",
6200 		VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT",
6201 		VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV",
6202 		VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT",
6203 		VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV",
6204 		VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT",
6205 		VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV",
6206 		VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE"};
6207 
6208 	DERR(NULL, "DUMP_FLAGS: %llx\n", state);
6209 	for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) {
6210 		if (state & flags[i].flag_val)
6211 			DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name);
6212 	}
6213 }
6214