xref: /titanic_51/usr/src/uts/sun4v/io/vsw.c (revision 4496171313bed39e96f21bc2f9faf2868e267ae3)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/errno.h>
31 #include <sys/debug.h>
32 #include <sys/time.h>
33 #include <sys/sysmacros.h>
34 #include <sys/systm.h>
35 #include <sys/user.h>
36 #include <sys/stropts.h>
37 #include <sys/stream.h>
38 #include <sys/strlog.h>
39 #include <sys/strsubr.h>
40 #include <sys/cmn_err.h>
41 #include <sys/cpu.h>
42 #include <sys/kmem.h>
43 #include <sys/conf.h>
44 #include <sys/ddi.h>
45 #include <sys/sunddi.h>
46 #include <sys/ksynch.h>
47 #include <sys/stat.h>
48 #include <sys/kstat.h>
49 #include <sys/vtrace.h>
50 #include <sys/strsun.h>
51 #include <sys/dlpi.h>
52 #include <sys/ethernet.h>
53 #include <net/if.h>
54 #include <sys/varargs.h>
55 #include <sys/machsystm.h>
56 #include <sys/modctl.h>
57 #include <sys/modhash.h>
58 #include <sys/mac.h>
59 #include <sys/mac_ether.h>
60 #include <sys/taskq.h>
61 #include <sys/note.h>
62 #include <sys/mach_descrip.h>
63 #include <sys/mac.h>
64 #include <sys/mdeg.h>
65 #include <sys/ldc.h>
66 #include <sys/vsw_fdb.h>
67 #include <sys/vsw.h>
68 #include <sys/vio_mailbox.h>
69 #include <sys/vnet_mailbox.h>
70 #include <sys/vnet_common.h>
71 #include <sys/vio_util.h>
72 #include <sys/sdt.h>
73 
74 /*
75  * Function prototypes.
76  */
77 static	int vsw_attach(dev_info_t *, ddi_attach_cmd_t);
78 static	int vsw_detach(dev_info_t *, ddi_detach_cmd_t);
79 static	int vsw_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
80 static	void vsw_get_md_properties(vsw_t *vswp);
81 static	int vsw_get_physaddr(vsw_t *);
82 static	int vsw_setup_layer2(vsw_t *);
83 static	int vsw_setup_layer3(vsw_t *);
84 
85 /* MAC Ring table functions. */
86 static void vsw_mac_ring_tbl_init(vsw_t *vswp);
87 static void vsw_mac_ring_tbl_destroy(vsw_t *vswp);
88 static void vsw_queue_worker(vsw_mac_ring_t *rrp);
89 static void vsw_queue_stop(vsw_queue_t *vqp);
90 static vsw_queue_t *vsw_queue_create();
91 static void vsw_queue_destroy(vsw_queue_t *vqp);
92 
93 /* MAC layer routines */
94 static mac_resource_handle_t vsw_mac_ring_add_cb(void *arg,
95 		mac_resource_t *mrp);
96 static	int vsw_get_hw_maddr(vsw_t *);
97 static	int vsw_set_hw(vsw_t *, vsw_port_t *);
98 static	int vsw_set_hw_promisc(vsw_t *, vsw_port_t *);
99 static	int vsw_unset_hw(vsw_t *, vsw_port_t *);
100 static	int vsw_unset_hw_promisc(vsw_t *, vsw_port_t *);
101 static	int vsw_reconfig_hw(vsw_t *);
102 static int vsw_mac_attach(vsw_t *vswp);
103 static void vsw_mac_detach(vsw_t *vswp);
104 
105 static void vsw_rx_queue_cb(void *, mac_resource_handle_t, mblk_t *);
106 static void vsw_rx_cb(void *, mac_resource_handle_t, mblk_t *);
107 static mblk_t *vsw_tx_msg(vsw_t *, mblk_t *);
108 static int vsw_mac_register(vsw_t *);
109 static int vsw_mac_unregister(vsw_t *);
110 static int vsw_m_stat(void *, uint_t, uint64_t *);
111 static void vsw_m_stop(void *arg);
112 static int vsw_m_start(void *arg);
113 static int vsw_m_unicst(void *arg, const uint8_t *);
114 static int vsw_m_multicst(void *arg, boolean_t, const uint8_t *);
115 static int vsw_m_promisc(void *arg, boolean_t);
116 static mblk_t *vsw_m_tx(void *arg, mblk_t *);
117 
118 /* MDEG routines */
119 static	void vsw_mdeg_register(vsw_t *vswp);
120 static	void vsw_mdeg_unregister(vsw_t *vswp);
121 static	int vsw_mdeg_cb(void *cb_argp, mdeg_result_t *);
122 
123 /* Port add/deletion routines */
124 static	int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
125 static	int vsw_port_attach(vsw_t *vswp, int p_instance,
126 	uint64_t *ldcids, int nids, struct ether_addr *macaddr);
127 static	int vsw_detach_ports(vsw_t *vswp);
128 static	int vsw_port_detach(vsw_t *vswp, int p_instance);
129 static	int vsw_port_delete(vsw_port_t *port);
130 static	int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id);
131 static	int vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id);
132 static	int vsw_init_ldcs(vsw_port_t *port);
133 static	int vsw_uninit_ldcs(vsw_port_t *port);
134 static	int vsw_ldc_init(vsw_ldc_t *ldcp);
135 static	int vsw_ldc_uninit(vsw_ldc_t *ldcp);
136 static	int vsw_drain_ldcs(vsw_port_t *port);
137 static	int vsw_drain_port_taskq(vsw_port_t *port);
138 static	void vsw_marker_task(void *);
139 static	vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);
140 static	int vsw_plist_del_node(vsw_t *, vsw_port_t *port);
141 
142 /* Interrupt routines */
143 static	uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg);
144 
145 /* Handshake routines */
146 static	void vsw_restart_ldc(vsw_ldc_t *);
147 static	void vsw_restart_handshake(vsw_ldc_t *);
148 static	void vsw_handle_reset(vsw_ldc_t *);
149 static	int vsw_check_flag(vsw_ldc_t *, int, uint64_t);
150 static	void vsw_next_milestone(vsw_ldc_t *);
151 static	int vsw_supported_version(vio_ver_msg_t *);
152 
153 /* Data processing routines */
154 static void vsw_process_pkt(void *);
155 static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t);
156 static void vsw_process_ctrl_pkt(void *);
157 static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *);
158 static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *);
159 static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *);
160 static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *);
161 static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *);
162 static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *);
163 static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t);
164 static void vsw_process_data_dring_pkt(vsw_ldc_t *, void *);
165 static void vsw_process_data_raw_pkt(vsw_ldc_t *, void *);
166 static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *);
167 static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t);
168 
169 /* Switching/data transmit routines */
170 static	void vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
171 	    vsw_port_t *port, mac_resource_handle_t);
172 static	void vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
173 	    vsw_port_t *port, mac_resource_handle_t);
174 static	int vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller,
175 	    vsw_port_t *port);
176 static	int vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller,
177 	    vsw_port_t *port);
178 static	int vsw_portsend(vsw_port_t *, mblk_t *);
179 static	int vsw_dringsend(vsw_ldc_t *, mblk_t *);
180 static	int vsw_descrsend(vsw_ldc_t *, mblk_t *);
181 
182 /* Packet creation routines */
183 static void vsw_send_ver(void *);
184 static void vsw_send_attr(vsw_ldc_t *);
185 static vio_dring_reg_msg_t *vsw_create_dring_info_pkt(vsw_ldc_t *);
186 static void vsw_send_dring_info(vsw_ldc_t *);
187 static void vsw_send_rdx(vsw_ldc_t *);
188 
189 static void vsw_send_msg(vsw_ldc_t *, void *, int);
190 
191 /* Forwarding database (FDB) routines */
192 static	int vsw_add_fdb(vsw_t *vswp, vsw_port_t *port);
193 static	int vsw_del_fdb(vsw_t *vswp, vsw_port_t *port);
194 static	vsw_port_t *vsw_lookup_fdb(vsw_t *vswp, struct ether_header *);
195 static	int vsw_add_rem_mcst(vnet_mcast_msg_t *, vsw_port_t *);
196 static	int vsw_add_mcst(vsw_t *, uint8_t, uint64_t, void *);
197 static	int vsw_del_mcst(vsw_t *, uint8_t, uint64_t, void *);
198 static	void vsw_del_addr(uint8_t, void *, uint64_t);
199 static	void vsw_del_mcst_port(vsw_port_t *);
200 static	void vsw_del_mcst_vsw(vsw_t *);
201 
202 /* Dring routines */
203 static dring_info_t *vsw_create_dring(vsw_ldc_t *);
204 static void vsw_create_privring(vsw_ldc_t *);
205 static int vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp);
206 static int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **,
207     int *);
208 static dring_info_t *vsw_ident2dring(lane_t *, uint64_t);
209 
210 static void vsw_set_lane_attr(vsw_t *, lane_t *);
211 static int vsw_check_attr(vnet_attr_msg_t *, vsw_port_t *);
212 static int vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg);
213 static int vsw_mem_cookie_match(ldc_mem_cookie_t *, ldc_mem_cookie_t *);
214 static int vsw_check_dring_info(vio_dring_reg_msg_t *);
215 
216 /* Misc support routines */
217 static	caddr_t vsw_print_ethaddr(uint8_t *addr, char *ebuf);
218 static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t);
219 static int vsw_free_ring(dring_info_t *);
220 
221 
222 /* Debugging routines */
223 static void dump_flags(uint64_t);
224 static void display_state(void);
225 static void display_lane(lane_t *);
226 static void display_ring(dring_info_t *);
227 
228 int	vsw_num_handshakes = 3;		/* # of handshake attempts */
229 int	vsw_wretries = 100;		/* # of write attempts */
230 int	vsw_chain_len = 150;		/* max # of mblks in msg chain */
231 int	vsw_desc_delay = 0;		/* delay in us */
232 int	vsw_read_attempts = 5;		/* # of reads of descriptor */
233 
234 uint32_t	vsw_mblk_size = VSW_MBLK_SIZE;
235 uint32_t	vsw_num_mblks = VSW_NUM_MBLKS;
236 
237 
238 /*
239  * mode specific frame switching function
240  */
241 void		(*vsw_switch_frame)(vsw_t *, mblk_t *, int, vsw_port_t *,
242 			mac_resource_handle_t);
243 
/*
 * MAC callback vector for the vsw network interface. The entries after
 * the leading 0 are the vsw_m_*() handlers declared above (statistics,
 * start, stop, promiscuous, multicast, unicast and transmit); the three
 * trailing slots are left unset.
 * NOTE(review): presumably handed to the MAC layer by vsw_mac_register()
 * - confirm against that routine.
 */
static	mac_callbacks_t	vsw_m_callbacks = {
	0,
	vsw_m_stat,
	vsw_m_start,
	vsw_m_stop,
	vsw_m_promisc,
	vsw_m_multicst,
	vsw_m_unicst,
	vsw_m_tx,
	NULL,
	NULL,
	NULL
};

/*
 * Character/block entry points. Open and close are no-ops (nulldev) and
 * every data path entry is rejected (nodev).
 */
static	struct	cb_ops	vsw_cb_ops = {
	nulldev,			/* cb_open */
	nulldev,			/* cb_close */
	nodev,				/* cb_strategy */
	nodev,				/* cb_print */
	nodev,				/* cb_dump */
	nodev,				/* cb_read */
	nodev,				/* cb_write */
	nodev,				/* cb_ioctl */
	nodev,				/* cb_devmap */
	nodev,				/* cb_mmap */
	nodev,				/* cb_segmap */
	nochpoll,			/* cb_chpoll */
	ddi_prop_op,			/* cb_prop_op */
	NULL,				/* cb_stream */
	D_MP,				/* cb_flag */
	CB_REV,				/* rev */
	nodev,				/* int (*cb_aread)() */
	nodev				/* int (*cb_awrite)() */
};

/*
 * Device operations: wires vsw_getinfo(), vsw_attach() and vsw_detach()
 * into the DDI framework. _init() also passes this structure to
 * mac_init_ops() before the module is installed.
 */
static	struct	dev_ops	vsw_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* devo_refcnt */
	vsw_getinfo,		/* devo_getinfo */
	nulldev,		/* devo_identify */
	nulldev,		/* devo_probe */
	vsw_attach,		/* devo_attach */
	vsw_detach,		/* devo_detach */
	nodev,			/* devo_reset */
	&vsw_cb_ops,		/* devo_cb_ops */
	(struct bus_ops *)NULL,	/* devo_bus_ops */
	ddi_power		/* devo_power */
};

/* Loadable-driver linkage element referenced from modlinkage. */
extern	struct	mod_ops	mod_driverops;
static struct modldrv vswmodldrv = {
	&mod_driverops,
	"sun4v Virtual Switch Driver %I%",
	&vsw_ops,
};
299 
/*
 * Acquire/release both per-channel locks. The callback lock is always
 * taken before the transmit lock and released after it, so the two
 * macros nest correctly.
 *
 * Each macro is wrapped in do { } while (0) so that it behaves as a
 * single statement; previously an unbraced "if (x) LDC_ENTER_LOCK(p);"
 * would only have guarded the first mutex operation.
 */
#define	LDC_ENTER_LOCK(ldcp)	\
	do {						\
		mutex_enter(&((ldcp)->ldc_cblock));	\
		mutex_enter(&((ldcp)->ldc_txlock));	\
		_NOTE(CONSTCOND)			\
	} while (0)
#define	LDC_EXIT_LOCK(ldcp)	\
	do {						\
		mutex_exit(&((ldcp)->ldc_txlock));	\
		mutex_exit(&((ldcp)->ldc_cblock));	\
		_NOTE(CONSTCOND)			\
	} while (0)
306 
307 /* Driver soft state ptr  */
308 static void	*vsw_state;
309 
310 /*
311  * Linked list of "vsw_t" structures - one per instance.
312  */
313 vsw_t		*vsw_head = NULL;
314 krwlock_t	vsw_rw;
315 
316 /*
317  * Property names
318  */
319 static char vdev_propname[] = "virtual-device";
320 static char vsw_propname[] = "virtual-network-switch";
321 static char physdev_propname[] = "vsw-phys-dev";
322 static char smode_propname[] = "vsw-switch-mode";
323 static char macaddr_propname[] = "local-mac-address";
324 static char remaddr_propname[] = "remote-mac-address";
325 static char ldcids_propname[] = "ldc-ids";
326 static char chan_propname[] = "channel-endpoint";
327 static char id_propname[] = "id";
328 static char reg_propname[] = "reg";
329 
330 /* supported versions */
331 static	ver_sup_t	vsw_versions[] = { {1, 0} };
332 
/*
 * Matching criteria passed to the MDEG to register interest
 * in changes to 'virtual-device-port' nodes identified by their
 * 'id' property.
 */
static md_prop_match_t vport_prop_match[] = {
	{ MDET_PROP_VAL,    "id"   },
	{ MDET_LIST_END,    NULL    }
};

static mdeg_node_match_t vport_match = { "virtual-device-port",
						vport_prop_match };

/*
 * Specification of an MD node passed to the MDEG to filter any
 * 'vport' nodes that do not belong to the specified node. This
 * template is copied for each vsw instance and filled in with
 * the appropriate 'cfg-handle' value before being passed to the MDEG.
 */
static mdeg_prop_spec_t vsw_prop_template[] = {
	{ MDET_PROP_STR,    "name",		vsw_propname },
	{ MDET_PROP_VAL,    "cfg-handle",	NULL	},
	{ MDET_LIST_END,    NULL,		NULL	}
};

/*
 * Store 'val' in the "cfg-handle" entry (index 1) of a copy of
 * vsw_prop_template. Note that the expansion already ends in a
 * semicolon, so "VSW_SET_MDEG_PROP_INST(p, v);" produces an extra
 * empty statement.
 */
#define	VSW_SET_MDEG_PROP_INST(specp, val)	(specp)[1].ps_val = (val);
359 
/*
 * Enable/disable thread-per-ring mode from /etc/system. The mode is
 * selected once, at vsw driver attach time.
 */
364 boolean_t vsw_multi_ring_enable = B_FALSE;
365 int vsw_mac_rx_rings = VSW_MAC_RX_RINGS;
366 
367 /*
368  * Print debug messages - set to 0x1f to enable all msgs
369  * or 0x0 to turn all off.
370  */
371 int vswdbg = 0x0;
372 
373 /*
374  * debug levels:
375  * 0x01:	Function entry/exit tracing
376  * 0x02:	Internal function messages
377  * 0x04:	Verbose internal messages
378  * 0x08:	Warning messages
379  * 0x10:	Error messages
380  */
381 
382 static void
383 vswdebug(vsw_t *vswp, const char *fmt, ...)
384 {
385 	char buf[512];
386 	va_list ap;
387 
388 	va_start(ap, fmt);
389 	(void) vsprintf(buf, fmt, ap);
390 	va_end(ap);
391 
392 	if (vswp == NULL)
393 		cmn_err(CE_CONT, "%s\n", buf);
394 	else
395 		cmn_err(CE_CONT, "vsw%d: %s\n", vswp->instance, buf);
396 }
397 
/*
 * For the moment the state dump routines have their own
 * private flag.
 *
 * With DUMP_STATE set to 0 (the default here) every DUMP_ and DISPLAY_
 * macro below expands to nothing; set it to a non-zero value at compile
 * time to get the real implementations.
 */
#define	DUMP_STATE	0

#if DUMP_STATE

/*
 * Trace the vio_msgtype, vio_subtype and vio_subtype_env fields of a
 * message tag passed by value. Output goes through D1, so it is also
 * subject to the entry/exit tracing bit in vswdbg.
 */
#define	DUMP_TAG(tag) \
{			\
	D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype); \
	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype);	\
	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env);	\
}

/* As DUMP_TAG, but takes a pointer to the tag. */
#define	DUMP_TAG_PTR(tag) \
{			\
	D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype); \
	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype);	\
	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env);	\
}

/* Thin wrappers around the debugging routines declared above. */
#define	DUMP_FLAGS(flags) dump_flags(flags);
#define	DISPLAY_STATE()	display_state()

#else

/* State dumping disabled: all of the macros compile away. */
#define	DUMP_TAG(tag)
#define	DUMP_TAG_PTR(tag)
#define	DUMP_FLAGS(state)
#define	DISPLAY_STATE()

#endif	/* DUMP_STATE */
431 
/*
 * Debug print macros, used as e.g. D1(vswp, "fmt", args...).
 *
 * In DEBUG builds each macro calls vswdebug() only when its bit is set
 * in vswdbg (see the debug level table above); in non-DEBUG builds the
 * call is never executed but the arguments are still type checked.
 *
 * The macros are written as "if (!enabled) { } else vswdebug" rather
 * than "if (enabled) vswdebug" so that the hidden 'if' always owns an
 * 'else'. With the latter form, code such as
 *
 *	if (cond)
 *		D1(vswp, "...");
 *	else
 *		do_something();
 *
 * silently attached the caller's 'else' to the macro's 'if'.
 */
#ifdef DEBUG

#define	D1			\
if (!(vswdbg & 0x01)) {		\
} else				\
	vswdebug

#define	D2			\
if (!(vswdbg & 0x02)) {		\
} else				\
	vswdebug

#define	D3			\
if (!(vswdbg & 0x04)) {		\
} else				\
	vswdebug

#define	DWARN			\
if (!(vswdbg & 0x08)) {		\
} else				\
	vswdebug

#define	DERR			\
if (!(vswdbg & 0x10)) {		\
} else				\
	vswdebug

#else

#define	DERR		if (1) { } else	vswdebug
#define	DWARN		if (1) { } else	vswdebug
#define	D1		if (1) { } else	vswdebug
#define	D2		if (1) { } else	vswdebug
#define	D3		if (1) { } else	vswdebug

#endif	/* DEBUG */
463 
/*
 * Module linkage: a single driver element (vswmodldrv), NULL terminated.
 * Passed to mod_install(), mod_remove() and mod_info() by the
 * _init(), _fini() and _info() entry points below.
 */
static struct modlinkage modlinkage = {
	MODREV_1,
	&vswmodldrv,
	NULL
};
469 
470 int
471 _init(void)
472 {
473 	int status;
474 
475 	rw_init(&vsw_rw, NULL, RW_DRIVER, NULL);
476 
477 	status = ddi_soft_state_init(&vsw_state, sizeof (vsw_t), 1);
478 	if (status != 0) {
479 		return (status);
480 	}
481 
482 	mac_init_ops(&vsw_ops, "vsw");
483 	status = mod_install(&modlinkage);
484 	if (status != 0) {
485 		ddi_soft_state_fini(&vsw_state);
486 	}
487 	return (status);
488 }
489 
490 int
491 _fini(void)
492 {
493 	int status;
494 
495 	status = mod_remove(&modlinkage);
496 	if (status != 0)
497 		return (status);
498 	mac_fini_ops(&vsw_ops);
499 	ddi_soft_state_fini(&vsw_state);
500 
501 	rw_destroy(&vsw_rw);
502 
503 	return (status);
504 }
505 
506 int
507 _info(struct modinfo *modinfop)
508 {
509 	return (mod_info(&modlinkage, modinfop));
510 }
511 
512 static int
513 vsw_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
514 {
515 	vsw_t		*vswp;
516 	int		instance, i;
517 	char		hashname[MAXNAMELEN];
518 	char		qname[TASKQ_NAMELEN];
519 	int		rv = 1;
520 	enum		{ PROG_init = 0x00,
521 				PROG_if_lock = 0x01,
522 				PROG_fdb = 0x02,
523 				PROG_mfdb = 0x04,
524 				PROG_report_dev = 0x08,
525 				PROG_plist = 0x10,
526 				PROG_taskq = 0x20}
527 			progress;
528 
529 	progress = PROG_init;
530 
531 	switch (cmd) {
532 	case DDI_ATTACH:
533 		break;
534 	case DDI_RESUME:
535 		/* nothing to do for this non-device */
536 		return (DDI_SUCCESS);
537 	case DDI_PM_RESUME:
538 	default:
539 		return (DDI_FAILURE);
540 	}
541 
542 	instance = ddi_get_instance(dip);
543 	if (ddi_soft_state_zalloc(vsw_state, instance) != DDI_SUCCESS) {
544 		DERR(NULL, "vsw%d: ddi_soft_state_zalloc failed", instance);
545 		return (DDI_FAILURE);
546 	}
547 	vswp = ddi_get_soft_state(vsw_state, instance);
548 
549 	if (vswp == NULL) {
550 		DERR(NULL, "vsw%d: ddi_get_soft_state failed", instance);
551 		goto vsw_attach_fail;
552 	}
553 
554 	vswp->dip = dip;
555 	vswp->instance = instance;
556 	ddi_set_driver_private(dip, (caddr_t)vswp);
557 
558 	rw_init(&vswp->if_lockrw, NULL, RW_DRIVER, NULL);
559 	progress |= PROG_if_lock;
560 
561 	/*
562 	 * Get the various properties such as physical device name
563 	 * (vsw-phys-dev), switch mode etc from the MD.
564 	 */
565 	vsw_get_md_properties(vswp);
566 
567 	/* setup the unicast forwarding database  */
568 	(void) snprintf(hashname, MAXNAMELEN, "vsw_unicst_table-%d",
569 							vswp->instance);
570 	D2(vswp, "creating unicast hash table (%s)...", hashname);
571 	vswp->fdb = mod_hash_create_ptrhash(hashname, VSW_NCHAINS,
572 		mod_hash_null_valdtor, sizeof (void *));
573 
574 	progress |= PROG_fdb;
575 
576 	/* setup the multicast fowarding database */
577 	(void) snprintf(hashname, MAXNAMELEN, "vsw_mcst_table-%d",
578 							vswp->instance);
579 	D2(vswp, "creating multicast hash table %s)...", hashname);
580 	rw_init(&vswp->mfdbrw, NULL, RW_DRIVER, NULL);
581 	vswp->mfdb = mod_hash_create_ptrhash(hashname, VSW_NCHAINS,
582 			mod_hash_null_valdtor, sizeof (void *));
583 
584 	progress |= PROG_mfdb;
585 
586 	/*
587 	 * create lock protecting list of multicast addresses
588 	 * which could come via m_multicst() entry point when plumbed.
589 	 */
590 	mutex_init(&vswp->mca_lock, NULL, MUTEX_DRIVER, NULL);
591 	vswp->mcap = NULL;
592 
593 	ddi_report_dev(vswp->dip);
594 
595 	progress |= PROG_report_dev;
596 
597 	WRITE_ENTER(&vsw_rw);
598 	vswp->next = vsw_head;
599 	vsw_head = vswp;
600 	RW_EXIT(&vsw_rw);
601 
602 	/* setup the port list */
603 	rw_init(&vswp->plist.lockrw, NULL, RW_DRIVER, NULL);
604 	vswp->plist.head = NULL;
605 
606 	progress |= PROG_plist;
607 
608 	/*
609 	 * Create the taskq which will process all the VIO
610 	 * control messages.
611 	 */
612 	(void) snprintf(qname, TASKQ_NAMELEN, "vsw_taskq%d", vswp->instance);
613 	if ((vswp->taskq_p = ddi_taskq_create(vswp->dip, qname, 1,
614 					TASKQ_DEFAULTPRI, 0)) == NULL) {
615 		cmn_err(CE_WARN, "Unable to create task queue");
616 		goto vsw_attach_fail;
617 	}
618 
619 	progress |= PROG_taskq;
620 
621 	/* select best switching mode */
622 	for (i = 0; i < vswp->smode_num; i++) {
623 		vswp->smode_idx = i;
624 		switch (vswp->smode[i]) {
625 		case VSW_LAYER2:
626 		case VSW_LAYER2_PROMISC:
627 			rv = vsw_setup_layer2(vswp);
628 			break;
629 
630 		case VSW_LAYER3:
631 			rv = vsw_setup_layer3(vswp);
632 			break;
633 
634 		default:
635 			DERR(vswp, "unknown switch mode");
636 			rv = 1;
637 			break;
638 		}
639 
640 		if (rv == 0)
641 			break;
642 	}
643 
644 	if (rv == 1) {
645 		cmn_err(CE_WARN, "Unable to setup switching mode");
646 		goto vsw_attach_fail;
647 	}
648 
649 	D2(vswp, "Operating in mode %d", vswp->smode[vswp->smode_idx]);
650 
651 	/*
652 	 * Register with the MAC layer as a network device so
653 	 * we can be plumbed if desired.
654 	 *
655 	 * Do this in both layer 2 and layer 3 mode.
656 	 */
657 	vswp->if_state &= ~VSW_IF_UP;
658 	if (vswp->mdprops & (VSW_MD_MACADDR | VSW_DEV_MACADDR)) {
659 		if (vsw_mac_register(vswp) != 0) {
660 			cmn_err(CE_WARN, "Unable to register as provider "
661 				" with MAC layer, continuing with attach");
662 		}
663 	}
664 
665 	/* prevent auto-detaching */
666 	if (ddi_prop_update_int(DDI_DEV_T_NONE, vswp->dip,
667 				DDI_NO_AUTODETACH, 1) != DDI_SUCCESS) {
668 		cmn_err(CE_NOTE, "Unable to set \"%s\" property for "
669 			"instance %u", DDI_NO_AUTODETACH, instance);
670 	}
671 
672 	/*
673 	 * Now we have everything setup, register for MD change
674 	 * events.
675 	 */
676 	vsw_mdeg_register(vswp);
677 
678 	return (DDI_SUCCESS);
679 
680 vsw_attach_fail:
681 	DERR(NULL, "vsw_attach: failed");
682 
683 	if (progress & PROG_taskq)
684 		ddi_taskq_destroy(vswp->taskq_p);
685 
686 	if (progress & PROG_plist)
687 		rw_destroy(&vswp->plist.lockrw);
688 
689 	if (progress & PROG_report_dev) {
690 		ddi_remove_minor_node(dip, NULL);
691 		mutex_destroy(&vswp->mca_lock);
692 	}
693 
694 	if (progress & PROG_mfdb) {
695 		mod_hash_destroy_hash(vswp->mfdb);
696 		vswp->mfdb = NULL;
697 		rw_destroy(&vswp->mfdbrw);
698 	}
699 
700 	if (progress & PROG_fdb) {
701 		mod_hash_destroy_hash(vswp->fdb);
702 		vswp->fdb = NULL;
703 	}
704 
705 	if (progress & PROG_if_lock)
706 		rw_destroy(&vswp->if_lockrw);
707 
708 	ddi_soft_state_free(vsw_state, instance);
709 	return (DDI_FAILURE);
710 }
711 
712 static int
713 vsw_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
714 {
715 	vio_mblk_pool_t		*poolp, *npoolp;
716 	vsw_t			**vswpp, *vswp;
717 	int 			instance;
718 
719 	instance = ddi_get_instance(dip);
720 	vswp = ddi_get_soft_state(vsw_state, instance);
721 
722 	if (vswp == NULL) {
723 		return (DDI_FAILURE);
724 	}
725 
726 	switch (cmd) {
727 	case DDI_DETACH:
728 		break;
729 	case DDI_SUSPEND:
730 	case DDI_PM_SUSPEND:
731 	default:
732 		return (DDI_FAILURE);
733 	}
734 
735 	D2(vswp, "detaching instance %d", instance);
736 
737 	if (vswp->mdprops & (VSW_MD_MACADDR | VSW_DEV_MACADDR)) {
738 		if (vsw_mac_unregister(vswp) != 0) {
739 			cmn_err(CE_WARN, "Unable to detach from MAC layer");
740 			return (DDI_FAILURE);
741 		}
742 		rw_destroy(&vswp->if_lockrw);
743 	}
744 
745 	vsw_mdeg_unregister(vswp);
746 
747 	/* remove mac layer callback */
748 	if ((vswp->mh != NULL) && (vswp->mrh != NULL)) {
749 		mac_rx_remove(vswp->mh, vswp->mrh);
750 		vswp->mrh = NULL;
751 	}
752 
753 	if (vsw_detach_ports(vswp) != 0) {
754 		cmn_err(CE_WARN, "Unable to detach ports");
755 		return (DDI_FAILURE);
756 	}
757 
758 	/*
759 	 * Now that the ports have been deleted, stop and close
760 	 * the physical device.
761 	 */
762 	if (vswp->mh != NULL) {
763 		if (vswp->mstarted)
764 			mac_stop(vswp->mh);
765 		if (vswp->mresources)
766 			mac_resource_set(vswp->mh, NULL, NULL);
767 		mac_close(vswp->mh);
768 
769 		vswp->mh = NULL;
770 		vswp->txinfo = NULL;
771 	}
772 
773 	/*
774 	 * Destroy any free pools that may still exist.
775 	 */
776 	poolp = vswp->rxh;
777 	while (poolp != NULL) {
778 		npoolp = vswp->rxh = poolp->nextp;
779 		if (vio_destroy_mblks(poolp) != 0) {
780 			vswp->rxh = poolp;
781 			return (DDI_FAILURE);
782 		}
783 		poolp = npoolp;
784 	}
785 
786 	/*
787 	 * Remove this instance from any entries it may be on in
788 	 * the hash table by using the list of addresses maintained
789 	 * in the vsw_t structure.
790 	 */
791 	vsw_del_mcst_vsw(vswp);
792 
793 	vswp->mcap = NULL;
794 	mutex_destroy(&vswp->mca_lock);
795 
796 	/*
797 	 * By now any pending tasks have finished and the underlying
798 	 * ldc's have been destroyed, so its safe to delete the control
799 	 * message taskq.
800 	 */
801 	if (vswp->taskq_p != NULL)
802 		ddi_taskq_destroy(vswp->taskq_p);
803 
804 	/*
805 	 * At this stage all the data pointers in the hash table
806 	 * should be NULL, as all the ports have been removed and will
807 	 * have deleted themselves from the port lists which the data
808 	 * pointers point to. Hence we can destroy the table using the
809 	 * default destructors.
810 	 */
811 	D2(vswp, "vsw_detach: destroying hash tables..");
812 	mod_hash_destroy_hash(vswp->fdb);
813 	vswp->fdb = NULL;
814 
815 	WRITE_ENTER(&vswp->mfdbrw);
816 	mod_hash_destroy_hash(vswp->mfdb);
817 	vswp->mfdb = NULL;
818 	RW_EXIT(&vswp->mfdbrw);
819 	rw_destroy(&vswp->mfdbrw);
820 
821 	ddi_remove_minor_node(dip, NULL);
822 
823 	rw_destroy(&vswp->plist.lockrw);
824 	WRITE_ENTER(&vsw_rw);
825 	for (vswpp = &vsw_head; *vswpp; vswpp = &(*vswpp)->next) {
826 		if (*vswpp == vswp) {
827 			*vswpp = vswp->next;
828 			break;
829 		}
830 	}
831 	RW_EXIT(&vsw_rw);
832 	ddi_soft_state_free(vsw_state, instance);
833 
834 	return (DDI_SUCCESS);
835 }
836 
837 static int
838 vsw_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
839 {
840 	_NOTE(ARGUNUSED(dip))
841 
842 	vsw_t	*vswp = NULL;
843 	dev_t	dev = (dev_t)arg;
844 	int	instance;
845 
846 	instance = getminor(dev);
847 
848 	switch (infocmd) {
849 	case DDI_INFO_DEVT2DEVINFO:
850 		if ((vswp = ddi_get_soft_state(vsw_state, instance)) == NULL) {
851 			*result = NULL;
852 			return (DDI_FAILURE);
853 		}
854 		*result = vswp->dip;
855 		return (DDI_SUCCESS);
856 
857 	case DDI_INFO_DEVT2INSTANCE:
858 		*result = (void *)(uintptr_t)instance;
859 		return (DDI_SUCCESS);
860 
861 	default:
862 		*result = NULL;
863 		return (DDI_FAILURE);
864 	}
865 }
866 
867 /*
868  * Get the properties from our MD node.
869  */
870 static void
871 vsw_get_md_properties(vsw_t *vswp)
872 {
873 	md_t		*mdp = NULL;
874 	int		num_nodes = 0;
875 	int		len = 0, listsz = 0;
876 	int		num_vdev = 0;
877 	int		i, idx;
878 	boolean_t	found_node = B_FALSE;
879 	char		*smode = NULL;
880 	char		*curr_mode = NULL;
881 	char		*physname = NULL;
882 	char		*node_name = NULL;
883 	char		*dev;
884 	uint64_t 	macaddr = 0;
885 	uint64_t	md_inst, obp_inst;
886 	mde_cookie_t	*listp = NULL;
887 	mde_cookie_t	rootnode;
888 
889 	D1(vswp, "%s: enter", __func__);
890 
891 	/*
892 	 * Further down we compare the obp 'reg' property to the
893 	 * 'cfg-handle' property in the vsw MD node to determine
894 	 * if the node refers to this particular instance. So if
895 	 * we can't read the obp value then there is no point
896 	 * in proceeding further.
897 	 */
898 	if (ddi_prop_exists(DDI_DEV_T_ANY, vswp->dip,
899 			DDI_PROP_DONTPASS, reg_propname) != 1) {
900 		cmn_err(CE_WARN, "Unable to read %s property "
901 			"from OBP device node", reg_propname);
902 		return;
903 	}
904 
905 	obp_inst = ddi_prop_get_int(DDI_DEV_T_ANY, vswp->dip,
906 		DDI_PROP_DONTPASS, reg_propname, 0);
907 
908 	D2(vswp, "%s: obp_inst 0x%llx", __func__, obp_inst);
909 
910 	if ((mdp = md_get_handle()) == NULL) {
911 		DERR(vswp, "%s: unable to init MD", __func__);
912 		return;
913 	}
914 
915 	if ((num_nodes = md_node_count(mdp)) <= 0) {
916 		DERR(vswp, "%s: invalid number of  nodes found %d",
917 			__func__, num_nodes);
918 		(void) md_fini_handle(mdp);
919 		return;
920 	}
921 
922 	D2(vswp, "%s: %d nodes in total in MD", __func__, num_nodes);
923 
924 	/* allocate enough space for node list */
925 	listsz = num_nodes * sizeof (mde_cookie_t);
926 	listp = kmem_zalloc(listsz, KM_SLEEP);
927 
928 	rootnode = md_root_node(mdp);
929 
930 	/* Get the list of virtual devices */
931 	num_vdev = md_scan_dag(mdp, rootnode,
932 		md_find_name(mdp, vdev_propname),
933 		md_find_name(mdp, "fwd"), listp);
934 
935 	if (num_vdev <= 0) {
936 		DERR(vswp, "%s: didn't find any virtual-device nodes in MD",
937 			__func__);
938 		goto md_prop_exit;
939 	}
940 
941 	D2(vswp, "%s: %d virtual-device nodes found", __func__, num_vdev);
942 
943 	/* Look for the virtual switch nodes in the list */
944 	for (idx = 0; idx < num_vdev; idx++) {
945 		if (md_get_prop_str(mdp, listp[idx],
946 				"name", &node_name) != 0) {
947 			DERR(vswp, "%s: unable to get node name", __func__);
948 			continue;
949 
950 		}
951 
952 		if (strcmp(node_name, vsw_propname) == 0) {
953 			/* Virtual switch node */
954 			if (md_get_prop_val(mdp, listp[idx],
955 				"cfg-handle", &md_inst) != 0) {
956 				DERR(vswp, "%s: unable to get cfg-handle from"
957 					" node %d", __func__, idx);
958 				goto md_prop_exit;
959 			} else if (md_inst == obp_inst) {
960 				D2(vswp, "%s: found matching node (%d)"
961 					" 0x%llx == 0x%llx", __func__, idx,
962 					md_inst, obp_inst);
963 				found_node = B_TRUE;
964 				break;
965 			}
966 		}
967 	}
968 
969 	if (!found_node) {
970 		DWARN(vswp, "%s: couldn't find correct vsw node", __func__);
971 		goto md_prop_exit;
972 	}
973 
974 	/*
975 	 * Now, having found the correct node, get the various properties.
976 	 */
977 
978 	if (md_get_prop_data(mdp, listp[idx], physdev_propname,
979 				(uint8_t **)(&physname), &len) != 0) {
980 		cmn_err(CE_WARN, "%s: unable to get name(s) of physical "
981 			"device(s) from MD", __func__);
982 	} else if ((strlen(physname) + 1) > LIFNAMSIZ) {
983 		cmn_err(CE_WARN, "%s is too long a device name", physname);
984 	} else {
985 		(void) strncpy(vswp->physname, physname, strlen(physname) + 1);
986 		vswp->mdprops |= VSW_MD_PHYSNAME;
987 		D2(vswp, "%s: using first device specified (%s)",
988 			__func__, vswp->physname);
989 	}
990 
991 #ifdef DEBUG
992 	/*
993 	 * As a temporary measure to aid testing we check to see if there
994 	 * is a vsw.conf file present. If there is we use the value of the
995 	 * vsw_physname property in the file as the name of the physical
996 	 * device, overriding the value from the MD.
997 	 *
998 	 * There may be multiple devices listed, but for the moment
999 	 * we just use the first one.
1000 	 */
1001 	if (ddi_prop_lookup_string(DDI_DEV_T_ANY, vswp->dip, 0,
1002 		"vsw_physname", &dev) == DDI_PROP_SUCCESS) {
1003 		if ((strlen(dev) + 1) > LIFNAMSIZ) {
1004 			cmn_err(CE_WARN, "%s is too long a device name", dev);
1005 		} else {
1006 			cmn_err(CE_NOTE, "%s: using device name (%s) from "
1007 				"config file", __func__, dev);
1008 
1009 			(void) strncpy(vswp->physname, dev, strlen(dev) + 1);
1010 			vswp->mdprops |= VSW_MD_PHYSNAME;
1011 		}
1012 
1013 		ddi_prop_free(dev);
1014 
1015 	}
1016 #endif
1017 
1018 	/* mac address for vswitch device itself */
1019 	if (md_get_prop_val(mdp, listp[idx],
1020 			macaddr_propname, &macaddr) != 0) {
1021 		cmn_err(CE_WARN, "!Unable to get MAC address from MD");
1022 
1023 		/*
1024 		 * Fallback to using the mac address of the physical
1025 		 * device.
1026 		 */
1027 		if (vsw_get_physaddr(vswp) == 0) {
1028 			cmn_err(CE_NOTE, "!Using MAC address from physical "
1029 				"device (%s)", vswp->physname);
1030 		}
1031 	} else {
1032 		READ_ENTER(&vswp->if_lockrw);
1033 		for (i = ETHERADDRL - 1; i >= 0; i--) {
1034 			vswp->if_addr.ether_addr_octet[i] = macaddr & 0xFF;
1035 			macaddr >>= 8;
1036 		}
1037 		RW_EXIT(&vswp->if_lockrw);
1038 		vswp->mdprops |= VSW_MD_MACADDR;
1039 	}
1040 
1041 	/*
1042 	 * Get the switch-mode property. The modes are listed in
1043 	 * decreasing order of preference, i.e. prefered mode is
1044 	 * first item in list.
1045 	 */
1046 	len = 0;
1047 	vswp->smode_num = 0;
1048 	if (md_get_prop_data(mdp, listp[idx], smode_propname,
1049 				(uint8_t **)(&smode), &len) != 0) {
1050 		/*
1051 		 * Unable to get switch-mode property from MD, nothing
1052 		 * more we can do.
1053 		 */
1054 		cmn_err(CE_WARN, "!unable to get switch mode property");
1055 		goto md_prop_exit;
1056 	}
1057 
1058 	curr_mode = smode;
1059 	/*
1060 	 * Modes of operation:
1061 	 * 'switched'	 - layer 2 switching, underlying HW in
1062 	 *			programmed mode.
1063 	 * 'promiscuous' - layer 2 switching, underlying HW in
1064 	 *			promiscuous mode.
1065 	 * 'routed'	 - layer 3 (i.e. IP) routing, underlying HW
1066 	 *			in non-promiscuous mode.
1067 	 */
1068 	while ((curr_mode < (smode + len)) && (vswp->smode_num < NUM_SMODES)) {
1069 		D2(vswp, "%s: curr_mode = [%s]", __func__, curr_mode);
1070 		if (strcmp(curr_mode, "switched") == 0) {
1071 			vswp->smode[vswp->smode_num++] = VSW_LAYER2;
1072 		} else if (strcmp(curr_mode, "promiscuous") == 0) {
1073 			vswp->smode[vswp->smode_num++] = VSW_LAYER2_PROMISC;
1074 		} else if (strcmp(curr_mode, "routed") == 0) {
1075 			vswp->smode[vswp->smode_num++] = VSW_LAYER3;
1076 		} else {
1077 			cmn_err(CE_WARN, "Unknown switch mode %s, setting to"
1078 				" default switched mode", curr_mode);
1079 			vswp->smode[vswp->smode_num++] = VSW_LAYER2;
1080 		}
1081 		curr_mode += strlen(curr_mode) + 1;
1082 	}
1083 
1084 	D2(vswp, "%d switching modes specified", vswp->smode_num);
1085 
1086 	if (vswp->smode_num > 0)
1087 		vswp->mdprops |= VSW_MD_SMODE;
1088 
1089 md_prop_exit:
1090 	(void) md_fini_handle(mdp);
1091 
1092 	kmem_free(listp, listsz);
1093 
1094 	D1(vswp, "%s: exit", __func__);
1095 }
1096 
1097 /*
1098  * Get the mac address of the physical device.
1099  *
1100  * Returns 0 on success, 1 on failure.
1101  */
1102 static int
1103 vsw_get_physaddr(vsw_t *vswp)
1104 {
1105 	mac_handle_t	mh;
1106 	char		drv[LIFNAMSIZ];
1107 	uint_t		ddi_instance;
1108 
1109 	D1(vswp, "%s: enter", __func__);
1110 
1111 	if (ddi_parse(vswp->physname, drv, &ddi_instance) != DDI_SUCCESS)
1112 		return (1);
1113 
1114 	if (mac_open(vswp->physname, ddi_instance, &mh) != 0) {
1115 		cmn_err(CE_WARN, "!mac_open %s failed", vswp->physname);
1116 		return (1);
1117 	}
1118 
1119 	READ_ENTER(&vswp->if_lockrw);
1120 	mac_unicst_get(mh, vswp->if_addr.ether_addr_octet);
1121 	RW_EXIT(&vswp->if_lockrw);
1122 
1123 	mac_close(mh);
1124 
1125 	vswp->mdprops |= VSW_DEV_MACADDR;
1126 
1127 	D1(vswp, "%s: exit", __func__);
1128 
1129 	return (0);
1130 }
1131 
1132 /*
1133  * Check to see if the card supports the setting of multiple unicst
1134  * addresses.
1135  *
1136  * Returns 0 if card supports the programming of multiple unicast addresses
1137  * and there are free address slots available, otherwise returns 1.
1138  */
1139 static int
1140 vsw_get_hw_maddr(vsw_t *vswp)
1141 {
1142 	D1(vswp, "%s: enter", __func__);
1143 
1144 	if (vswp->mh == NULL) {
1145 		return (1);
1146 	}
1147 
1148 	if (!mac_capab_get(vswp->mh, MAC_CAPAB_MULTIADDRESS, &vswp->maddr)) {
1149 		DWARN(vswp, "Unable to get capabilities of"
1150 			" underlying device (%s)", vswp->physname);
1151 		return (1);
1152 	}
1153 
1154 	if (vswp->maddr.maddr_naddrfree == 0) {
1155 		cmn_err(CE_WARN,
1156 			"!device %s has no free unicast address slots",
1157 			vswp->physname);
1158 		return (1);
1159 	}
1160 
1161 	D2(vswp, "%s: %d addrs : %d free", __func__,
1162 		vswp->maddr.maddr_naddr, vswp->maddr.maddr_naddrfree);
1163 
1164 	D1(vswp, "%s: exit", __func__);
1165 
1166 	return (0);
1167 }
1168 
1169 /*
1170  * Setup for layer 2 switching.
1171  *
1172  * Returns 0 on success, 1 on failure.
1173  */
1174 static int
1175 vsw_setup_layer2(vsw_t *vswp)
1176 {
1177 	D1(vswp, "%s: enter", __func__);
1178 
1179 	vsw_switch_frame = vsw_switch_l2_frame;
1180 
1181 	/*
1182 	 * Attempt to link into the MAC layer so we can get
1183 	 * and send packets out over the physical adapter.
1184 	 */
1185 	if (vswp->mdprops & VSW_MD_PHYSNAME) {
1186 		if (vsw_mac_attach(vswp) != 0) {
1187 			/*
1188 			 * Registration with the MAC layer has failed,
1189 			 * so return 1 so that can fall back to next
1190 			 * prefered switching method.
1191 			 */
1192 			cmn_err(CE_WARN, "!Unable to join as MAC layer "
1193 				"client");
1194 			return (1);
1195 		}
1196 
1197 		if (vswp->smode[vswp->smode_idx] == VSW_LAYER2) {
1198 			/*
1199 			 * Verify that underlying device can support multiple
1200 			 * unicast mac addresses, and has free capacity.
1201 			 */
1202 			if (vsw_get_hw_maddr(vswp) != 0) {
1203 				cmn_err(CE_WARN, "!unable to setup switching");
1204 				vsw_mac_detach(vswp);
1205 				return (1);
1206 			}
1207 		}
1208 
1209 	} else {
1210 		/*
1211 		 * No physical device name found in MD which is
1212 		 * required for layer 2.
1213 		 */
1214 		cmn_err(CE_WARN, "!no physical device name specified");
1215 		return (1);
1216 	}
1217 
1218 	D1(vswp, "%s: exit", __func__);
1219 
1220 	return (0);
1221 }
1222 
1223 static int
1224 vsw_setup_layer3(vsw_t *vswp)
1225 {
1226 	D1(vswp, "%s: enter", __func__);
1227 
1228 	D2(vswp, "%s: operating in layer 3 mode", __func__);
1229 	vsw_switch_frame = vsw_switch_l3_frame;
1230 
1231 	D1(vswp, "%s: exit", __func__);
1232 
1233 	return (0);
1234 }
1235 
1236 /*
1237  * Link into the MAC layer to gain access to the services provided by
1238  * the underlying physical device driver (which should also have
1239  * registered with the MAC layer).
1240  *
1241  * Only when in layer 2 mode.
1242  */
1243 static int
1244 vsw_mac_attach(vsw_t *vswp)
1245 {
1246 	char	drv[LIFNAMSIZ];
1247 	uint_t	ddi_instance;
1248 
1249 	D1(vswp, "%s: enter", __func__);
1250 
1251 	vswp->mh = NULL;
1252 	vswp->mrh = NULL;
1253 	vswp->mstarted = B_FALSE;
1254 	vswp->mresources = B_FALSE;
1255 
1256 	ASSERT(vswp->mdprops & VSW_MD_PHYSNAME);
1257 
1258 	if (ddi_parse(vswp->physname, drv, &ddi_instance) != DDI_SUCCESS) {
1259 		cmn_err(CE_WARN, "invalid device name: %s", vswp->physname);
1260 		goto mac_fail_exit;
1261 	}
1262 	if ((mac_open(vswp->physname, ddi_instance, &vswp->mh)) != 0) {
1263 		cmn_err(CE_WARN, "mac_open %s failed", vswp->physname);
1264 		goto mac_fail_exit;
1265 	}
1266 
1267 	ASSERT(vswp->mh != NULL);
1268 
1269 	D2(vswp, "vsw_mac_attach: using device %s", vswp->physname);
1270 
1271 	if (vsw_multi_ring_enable) {
1272 		vsw_mac_ring_tbl_init(vswp);
1273 
1274 		/*
1275 		 * Register our receive callback.
1276 		 */
1277 		vswp->mrh = mac_rx_add(vswp->mh,
1278 			vsw_rx_queue_cb, (void *)vswp);
1279 
1280 		/*
1281 		 * Register our mac resource callback.
1282 		 */
1283 		mac_resource_set(vswp->mh, vsw_mac_ring_add_cb, (void *)vswp);
1284 		vswp->mresources = B_TRUE;
1285 
1286 		/*
1287 		 * Get the ring resources available to us from
1288 		 * the mac below us.
1289 		 */
1290 		mac_resources(vswp->mh);
1291 	} else {
1292 		/*
1293 		 * Just register our rx callback function
1294 		 */
1295 		vswp->mrh = mac_rx_add(vswp->mh, vsw_rx_cb, (void *)vswp);
1296 	}
1297 
1298 	ASSERT(vswp->mrh != NULL);
1299 
1300 	/* Get the MAC tx fn */
1301 	vswp->txinfo = mac_tx_get(vswp->mh);
1302 
1303 	/* start the interface */
1304 	if (mac_start(vswp->mh) != 0) {
1305 		cmn_err(CE_WARN, "could not start mac interface");
1306 		goto mac_fail_exit;
1307 	}
1308 
1309 	vswp->mstarted = B_TRUE;
1310 
1311 	D1(vswp, "%s: exit", __func__);
1312 	return (0);
1313 
1314 mac_fail_exit:
1315 	vsw_mac_detach(vswp);
1316 
1317 	D1(vswp, "%s: exit", __func__);
1318 	return (1);
1319 }
1320 
1321 static void
1322 vsw_mac_detach(vsw_t *vswp)
1323 {
1324 	D1(vswp, "vsw_mac_detach: enter");
1325 
1326 	ASSERT(vswp != NULL);
1327 
1328 	if (vsw_multi_ring_enable) {
1329 		vsw_mac_ring_tbl_destroy(vswp);
1330 	}
1331 
1332 	if (vswp->mh != NULL) {
1333 		if (vswp->mstarted)
1334 			mac_stop(vswp->mh);
1335 		if (vswp->mrh != NULL)
1336 			mac_rx_remove(vswp->mh, vswp->mrh);
1337 		if (vswp->mresources)
1338 			mac_resource_set(vswp->mh, NULL, NULL);
1339 		mac_close(vswp->mh);
1340 	}
1341 
1342 	vswp->mrh = NULL;
1343 	vswp->mh = NULL;
1344 	vswp->txinfo = NULL;
1345 	vswp->mstarted = B_FALSE;
1346 
1347 	D1(vswp, "vsw_mac_detach: exit");
1348 }
1349 
1350 /*
1351  * Depending on the mode specified, the capabilites and capacity
1352  * of the underlying device setup the physical device.
1353  *
1354  * If in layer 3 mode, then do nothing.
1355  *
1356  * If in layer 2 programmed mode attempt to program the unicast address
1357  * associated with the port into the physical device. If this is not
1358  * possible due to resource exhaustion or simply because the device does
1359  * not support multiple unicast addresses then if required fallback onto
1360  * putting the card into promisc mode.
1361  *
1362  * If in promisc mode then simply set the card into promisc mode.
1363  *
1364  * Returns 0 success, 1 on failure.
1365  */
1366 static int
1367 vsw_set_hw(vsw_t *vswp, vsw_port_t *port)
1368 {
1369 	mac_multi_addr_t	mac_addr;
1370 	void			*mah;
1371 	int			err;
1372 
1373 	D1(vswp, "%s: enter", __func__);
1374 
1375 	if (vswp->smode[vswp->smode_idx] == VSW_LAYER3)
1376 		return (0);
1377 
1378 	if (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC) {
1379 		return (vsw_set_hw_promisc(vswp, port));
1380 	}
1381 
1382 	if (vswp->maddr.maddr_handle == NULL)
1383 		return (1);
1384 
1385 	mah = vswp->maddr.maddr_handle;
1386 
1387 	/*
1388 	 * Attempt to program the unicast address into the HW.
1389 	 */
1390 	mac_addr.mma_addrlen = ETHERADDRL;
1391 	ether_copy(&port->p_macaddr, &mac_addr.mma_addr);
1392 
1393 	err = vswp->maddr.maddr_add(mah, &mac_addr);
1394 	if (err != 0) {
1395 		cmn_err(CE_WARN, "!failed to program addr "
1396 			"%x:%x:%x:%x:%x:%x for port %d into device %s "
1397 			": err %d", port->p_macaddr.ether_addr_octet[0],
1398 			port->p_macaddr.ether_addr_octet[1],
1399 			port->p_macaddr.ether_addr_octet[2],
1400 			port->p_macaddr.ether_addr_octet[3],
1401 			port->p_macaddr.ether_addr_octet[4],
1402 			port->p_macaddr.ether_addr_octet[5],
1403 			port->p_instance, vswp->physname, err);
1404 
1405 		/*
1406 		 * Mark that attempt should be made to re-config sometime
1407 		 * in future if a port is deleted.
1408 		 */
1409 		vswp->recfg_reqd = B_TRUE;
1410 
1411 		/*
1412 		 * Only 1 mode specified, nothing more to do.
1413 		 */
1414 		if (vswp->smode_num == 1)
1415 			return (err);
1416 
1417 		/*
1418 		 * If promiscuous was next mode specified try to
1419 		 * set the card into that mode.
1420 		 */
1421 		if ((vswp->smode_idx <= (vswp->smode_num - 2)) &&
1422 			(vswp->smode[vswp->smode_idx + 1]
1423 					== VSW_LAYER2_PROMISC)) {
1424 			vswp->smode_idx += 1;
1425 			return (vsw_set_hw_promisc(vswp, port));
1426 		}
1427 		return (err);
1428 	}
1429 
1430 	port->addr_slot = mac_addr.mma_slot;
1431 	port->addr_set = VSW_ADDR_HW;
1432 
1433 	D2(vswp, "programmed addr %x:%x:%x:%x:%x:%x for port %d "
1434 		"into slot %d of device %s",
1435 		port->p_macaddr.ether_addr_octet[0],
1436 		port->p_macaddr.ether_addr_octet[1],
1437 		port->p_macaddr.ether_addr_octet[2],
1438 		port->p_macaddr.ether_addr_octet[3],
1439 		port->p_macaddr.ether_addr_octet[4],
1440 		port->p_macaddr.ether_addr_octet[5],
1441 		port->p_instance, port->addr_slot, vswp->physname);
1442 
1443 	D1(vswp, "%s: exit", __func__);
1444 
1445 	return (0);
1446 }
1447 
1448 /*
1449  * If in layer 3 mode do nothing.
1450  *
1451  * If in layer 2 switched mode remove the address from the physical
1452  * device.
1453  *
1454  * If in layer 2 promiscuous mode disable promisc mode.
1455  *
1456  * Returns 0 on success.
1457  */
1458 static int
1459 vsw_unset_hw(vsw_t *vswp, vsw_port_t *port)
1460 {
1461 	int		err;
1462 	void		*mah;
1463 
1464 	D1(vswp, "%s: enter", __func__);
1465 
1466 	if (vswp->smode[vswp->smode_idx] == VSW_LAYER3)
1467 		return (0);
1468 
1469 	if (port->addr_set == VSW_ADDR_PROMISC) {
1470 		return (vsw_unset_hw_promisc(vswp, port));
1471 	}
1472 
1473 	if (port->addr_set == VSW_ADDR_HW) {
1474 		if (vswp->mh == NULL)
1475 			return (1);
1476 
1477 		if (vswp->maddr.maddr_handle == NULL)
1478 			return (1);
1479 
1480 		mah = vswp->maddr.maddr_handle;
1481 
1482 		err = vswp->maddr.maddr_remove(mah, port->addr_slot);
1483 		if (err != 0) {
1484 			cmn_err(CE_WARN, "!Unable to remove addr "
1485 				"%x:%x:%x:%x:%x:%x for port %d from device %s"
1486 				" : (err %d)",
1487 				port->p_macaddr.ether_addr_octet[0],
1488 				port->p_macaddr.ether_addr_octet[1],
1489 				port->p_macaddr.ether_addr_octet[2],
1490 				port->p_macaddr.ether_addr_octet[3],
1491 				port->p_macaddr.ether_addr_octet[4],
1492 				port->p_macaddr.ether_addr_octet[5],
1493 				port->p_instance, vswp->physname, err);
1494 			return (err);
1495 		}
1496 
1497 		port->addr_set = VSW_ADDR_UNSET;
1498 
1499 		D2(vswp, "removed addr %x:%x:%x:%x:%x:%x for "
1500 			"port %d from device %s",
1501 			port->p_macaddr.ether_addr_octet[0],
1502 			port->p_macaddr.ether_addr_octet[1],
1503 			port->p_macaddr.ether_addr_octet[2],
1504 			port->p_macaddr.ether_addr_octet[3],
1505 			port->p_macaddr.ether_addr_octet[4],
1506 			port->p_macaddr.ether_addr_octet[5],
1507 			port->p_instance, vswp->physname);
1508 	}
1509 
1510 	D1(vswp, "%s: exit", __func__);
1511 	return (0);
1512 }
1513 
1514 /*
1515  * Set network card into promisc mode.
1516  *
1517  * Returns 0 on success, 1 on failure.
1518  */
1519 static int
1520 vsw_set_hw_promisc(vsw_t *vswp, vsw_port_t *port)
1521 {
1522 	D1(vswp, "%s: enter", __func__);
1523 
1524 	if (vswp->mh == NULL)
1525 		return (1);
1526 
1527 	if (vswp->promisc_cnt++ == 0) {
1528 		if (mac_promisc_set(vswp->mh, B_TRUE, MAC_DEVPROMISC) != 0) {
1529 			vswp->promisc_cnt--;
1530 			return (1);
1531 		}
1532 		cmn_err(CE_NOTE, "!switching device %s into promiscuous mode",
1533 				vswp->physname);
1534 	}
1535 	port->addr_set = VSW_ADDR_PROMISC;
1536 
1537 	D1(vswp, "%s: exit", __func__);
1538 
1539 	return (0);
1540 }
1541 
1542 /*
1543  * Turn off promiscuous mode on network card.
1544  *
1545  * Returns 0 on success, 1 on failure.
1546  */
1547 static int
1548 vsw_unset_hw_promisc(vsw_t *vswp, vsw_port_t *port)
1549 {
1550 	vsw_port_list_t 	*plist = &vswp->plist;
1551 
1552 	D1(vswp, "%s: enter", __func__);
1553 
1554 	if (vswp->mh == NULL)
1555 		return (1);
1556 
1557 	ASSERT(port->addr_set == VSW_ADDR_PROMISC);
1558 
1559 	if (--vswp->promisc_cnt == 0) {
1560 		if (mac_promisc_set(vswp->mh, B_FALSE, MAC_DEVPROMISC) != 0) {
1561 			vswp->promisc_cnt++;
1562 			return (1);
1563 		}
1564 
1565 		/*
1566 		 * We are exiting promisc mode either because we were
1567 		 * only in promisc mode because we had failed over from
1568 		 * switched mode due to HW resource issues, or the user
1569 		 * wanted the card in promisc mode for all the ports and
1570 		 * the last port is now being deleted. Tweak the message
1571 		 * accordingly.
1572 		 */
1573 		if (plist->num_ports != 0) {
1574 			cmn_err(CE_NOTE, "!switching device %s back to "
1575 				"programmed mode", vswp->physname);
1576 		} else {
1577 			cmn_err(CE_NOTE, "!switching device %s out of "
1578 				"promiscuous mode", vswp->physname);
1579 		}
1580 	}
1581 	port->addr_set = VSW_ADDR_UNSET;
1582 
1583 	D1(vswp, "%s: exit", __func__);
1584 	return (0);
1585 }
1586 
1587 /*
1588  * Determine whether or not we are operating in our prefered
1589  * mode and if not whether the physical resources now allow us
1590  * to operate in it.
1591  *
1592  * Should only be invoked after port which is being deleted has been
1593  * removed from the port list.
1594  */
1595 static int
1596 vsw_reconfig_hw(vsw_t *vswp)
1597 {
1598 	vsw_port_list_t 	*plist = &vswp->plist;
1599 	mac_multi_addr_t	mac_addr;
1600 	vsw_port_t		*tp;
1601 	void			*mah;
1602 	int			rv = 0;
1603 	int			s_idx;
1604 
1605 	D1(vswp, "%s: enter", __func__);
1606 
1607 	if (vswp->maddr.maddr_handle == NULL)
1608 		return (1);
1609 
1610 	/*
1611 	 * Check if there are now sufficient HW resources to
1612 	 * attempt a re-config.
1613 	 */
1614 	if (plist->num_ports > vswp->maddr.maddr_naddrfree)
1615 		return (1);
1616 
1617 	/*
1618 	 * If we are in layer 2 (i.e. switched) or would like to be
1619 	 * in layer 2 then check if any ports need to be programmed
1620 	 * into the HW.
1621 	 *
1622 	 * This can happen in two cases - switched was specified as
1623 	 * the prefered mode of operation but we exhausted the HW
1624 	 * resources and so failed over to the next specifed mode,
1625 	 * or switched was the only mode specified so after HW
1626 	 * resources were exhausted there was nothing more we
1627 	 * could do.
1628 	 */
1629 	if (vswp->smode_idx > 0)
1630 		s_idx = vswp->smode_idx - 1;
1631 	else
1632 		s_idx = vswp->smode_idx;
1633 
1634 	if (vswp->smode[s_idx] == VSW_LAYER2) {
1635 		mah = vswp->maddr.maddr_handle;
1636 
1637 		D2(vswp, "%s: attempting reconfig..", __func__);
1638 
1639 		/*
1640 		 * Scan the port list for any port whose address has not
1641 		 * be programmed in HW - there should be a max of one.
1642 		 */
1643 		for (tp = plist->head; tp != NULL; tp = tp->p_next) {
1644 			if (tp->addr_set != VSW_ADDR_HW) {
1645 				mac_addr.mma_addrlen = ETHERADDRL;
1646 				ether_copy(&tp->p_macaddr, &mac_addr.mma_addr);
1647 
1648 				rv = vswp->maddr.maddr_add(mah, &mac_addr);
1649 				if (rv != 0) {
1650 					DWARN(vswp, "Error setting addr in "
1651 						"HW for port %d err %d",
1652 						tp->p_instance, rv);
1653 					goto reconfig_err_exit;
1654 				}
1655 				tp->addr_slot = mac_addr.mma_slot;
1656 
1657 				D2(vswp, "re-programmed port %d "
1658 					"addr %x:%x:%x:%x:%x:%x into slot %d"
1659 					" of device %s", tp->p_instance,
1660 					tp->p_macaddr.ether_addr_octet[0],
1661 					tp->p_macaddr.ether_addr_octet[1],
1662 					tp->p_macaddr.ether_addr_octet[2],
1663 					tp->p_macaddr.ether_addr_octet[3],
1664 					tp->p_macaddr.ether_addr_octet[4],
1665 					tp->p_macaddr.ether_addr_octet[5],
1666 					tp->addr_slot, vswp->physname);
1667 
1668 				/*
1669 				 * If up to now we had to put the card into
1670 				 * promisc mode to see this address, we
1671 				 * can now safely disable promisc mode.
1672 				 */
1673 				if (tp->addr_set == VSW_ADDR_PROMISC)
1674 					(void) vsw_unset_hw_promisc(vswp, tp);
1675 
1676 				tp->addr_set = VSW_ADDR_HW;
1677 			}
1678 		}
1679 
1680 		/* no further re-config needed */
1681 		vswp->recfg_reqd = B_FALSE;
1682 
1683 		vswp->smode_idx = s_idx;
1684 
1685 		return (0);
1686 	}
1687 
1688 reconfig_err_exit:
1689 	return (rv);
1690 }
1691 
1692 static void
1693 vsw_mac_ring_tbl_entry_init(vsw_t *vswp, vsw_mac_ring_t *ringp)
1694 {
1695 	ringp->ring_state = VSW_MAC_RING_FREE;
1696 	ringp->ring_arg = NULL;
1697 	ringp->ring_blank = NULL;
1698 	ringp->ring_vqp = NULL;
1699 	ringp->ring_vswp = vswp;
1700 }
1701 
1702 static void
1703 vsw_mac_ring_tbl_init(vsw_t *vswp)
1704 {
1705 	int		i;
1706 
1707 	mutex_init(&vswp->mac_ring_lock, NULL, MUTEX_DRIVER, NULL);
1708 
1709 	vswp->mac_ring_tbl_sz = vsw_mac_rx_rings;
1710 	vswp->mac_ring_tbl  =
1711 		kmem_alloc(vsw_mac_rx_rings * sizeof (vsw_mac_ring_t),
1712 		KM_SLEEP);
1713 
1714 	for (i = 0; i < vswp->mac_ring_tbl_sz; i++)
1715 		vsw_mac_ring_tbl_entry_init(vswp, &vswp->mac_ring_tbl[i]);
1716 }
1717 
1718 static void
1719 vsw_mac_ring_tbl_destroy(vsw_t *vswp)
1720 {
1721 	int	i;
1722 
1723 	mutex_enter(&vswp->mac_ring_lock);
1724 	for (i = 0; i < vswp->mac_ring_tbl_sz; i++) {
1725 		if (vswp->mac_ring_tbl[i].ring_state != VSW_MAC_RING_FREE) {
1726 			/*
1727 			 * Destroy the queue.
1728 			 */
1729 			vsw_queue_stop(vswp->mac_ring_tbl[i].ring_vqp);
1730 			vsw_queue_destroy(vswp->mac_ring_tbl[i].ring_vqp);
1731 
1732 			/*
1733 			 * Re-initialize the structure.
1734 			 */
1735 			vsw_mac_ring_tbl_entry_init(vswp,
1736 				&vswp->mac_ring_tbl[i]);
1737 		}
1738 	}
1739 	mutex_exit(&vswp->mac_ring_lock);
1740 
1741 	mutex_destroy(&vswp->mac_ring_lock);
1742 	kmem_free(vswp->mac_ring_tbl,
1743 		vswp->mac_ring_tbl_sz * sizeof (vsw_mac_ring_t));
1744 	vswp->mac_ring_tbl_sz = 0;
1745 }
1746 
1747 /*
1748  * Handle resource add callbacks from the driver below.
1749  */
1750 static mac_resource_handle_t
1751 vsw_mac_ring_add_cb(void *arg, mac_resource_t *mrp)
1752 {
1753 	vsw_t		*vswp = (vsw_t *)arg;
1754 	mac_rx_fifo_t	*mrfp = (mac_rx_fifo_t *)mrp;
1755 	vsw_mac_ring_t	*ringp;
1756 	vsw_queue_t	*vqp;
1757 	int		i;
1758 
1759 	ASSERT(vswp != NULL);
1760 	ASSERT(mrp != NULL);
1761 	ASSERT(vswp->mac_ring_tbl != NULL);
1762 
1763 	D1(vswp, "%s: enter", __func__);
1764 
1765 	/*
1766 	 * Check to make sure we have the correct resource type.
1767 	 */
1768 	if (mrp->mr_type != MAC_RX_FIFO)
1769 		return (NULL);
1770 
1771 	/*
1772 	 * Find a open entry in the ring table.
1773 	 */
1774 	mutex_enter(&vswp->mac_ring_lock);
1775 	for (i = 0; i < vswp->mac_ring_tbl_sz; i++) {
1776 		ringp = &vswp->mac_ring_tbl[i];
1777 
1778 		/*
1779 		 * Check for an empty slot, if found, then setup queue
1780 		 * and thread.
1781 		 */
1782 		if (ringp->ring_state == VSW_MAC_RING_FREE) {
1783 			/*
1784 			 * Create the queue for this ring.
1785 			 */
1786 			vqp = vsw_queue_create();
1787 
1788 			/*
1789 			 * Initialize the ring data structure.
1790 			 */
1791 			ringp->ring_vqp = vqp;
1792 			ringp->ring_arg = mrfp->mrf_arg;
1793 			ringp->ring_blank = mrfp->mrf_blank;
1794 			ringp->ring_state = VSW_MAC_RING_INUSE;
1795 
1796 			/*
1797 			 * Create the worker thread.
1798 			 */
1799 			vqp->vq_worker = thread_create(NULL, 0,
1800 				vsw_queue_worker, ringp, 0, &p0,
1801 				TS_RUN, minclsyspri);
1802 			if (vqp->vq_worker == NULL) {
1803 				vsw_queue_destroy(vqp);
1804 				vsw_mac_ring_tbl_entry_init(vswp, ringp);
1805 				ringp = NULL;
1806 			}
1807 
1808 			mutex_exit(&vswp->mac_ring_lock);
1809 			D1(vswp, "%s: exit", __func__);
1810 			return ((mac_resource_handle_t)ringp);
1811 		}
1812 	}
1813 	mutex_exit(&vswp->mac_ring_lock);
1814 
1815 	/*
1816 	 * No slots in the ring table available.
1817 	 */
1818 	D1(vswp, "%s: exit", __func__);
1819 	return (NULL);
1820 }
1821 
1822 static void
1823 vsw_queue_stop(vsw_queue_t *vqp)
1824 {
1825 	mutex_enter(&vqp->vq_lock);
1826 
1827 	if (vqp->vq_state == VSW_QUEUE_RUNNING) {
1828 		vqp->vq_state = VSW_QUEUE_STOP;
1829 		cv_signal(&vqp->vq_cv);
1830 
1831 		while (vqp->vq_state != VSW_QUEUE_DRAINED)
1832 			cv_wait(&vqp->vq_cv, &vqp->vq_lock);
1833 	}
1834 
1835 	mutex_exit(&vqp->vq_lock);
1836 }
1837 
1838 static vsw_queue_t *
1839 vsw_queue_create()
1840 {
1841 	vsw_queue_t *vqp;
1842 
1843 	vqp = kmem_zalloc(sizeof (vsw_queue_t), KM_SLEEP);
1844 
1845 	mutex_init(&vqp->vq_lock, NULL, MUTEX_DRIVER, NULL);
1846 	cv_init(&vqp->vq_cv, NULL, CV_DRIVER, NULL);
1847 	vqp->vq_first = NULL;
1848 	vqp->vq_last = NULL;
1849 	vqp->vq_state = VSW_QUEUE_STOP;
1850 
1851 	return (vqp);
1852 }
1853 
1854 static void
1855 vsw_queue_destroy(vsw_queue_t *vqp)
1856 {
1857 	cv_destroy(&vqp->vq_cv);
1858 	mutex_destroy(&vqp->vq_lock);
1859 	kmem_free(vqp, sizeof (vsw_queue_t));
1860 }
1861 
1862 static void
1863 vsw_queue_worker(vsw_mac_ring_t *rrp)
1864 {
1865 	mblk_t		*mp;
1866 	vsw_queue_t	*vqp = rrp->ring_vqp;
1867 	vsw_t		*vswp = rrp->ring_vswp;
1868 
1869 	mutex_enter(&vqp->vq_lock);
1870 
1871 	ASSERT(vqp->vq_state == VSW_QUEUE_STOP);
1872 
1873 	/*
1874 	 * Set the state to running, since the thread is now active.
1875 	 */
1876 	vqp->vq_state = VSW_QUEUE_RUNNING;
1877 
1878 	while (vqp->vq_state == VSW_QUEUE_RUNNING) {
1879 		/*
1880 		 * Wait for work to do or the state has changed
1881 		 * to not running.
1882 		 */
1883 		while ((vqp->vq_state == VSW_QUEUE_RUNNING) &&
1884 				(vqp->vq_first == NULL)) {
1885 			cv_wait(&vqp->vq_cv, &vqp->vq_lock);
1886 		}
1887 
1888 		/*
1889 		 * Process packets that we received from the interface.
1890 		 */
1891 		if (vqp->vq_first != NULL) {
1892 			mp = vqp->vq_first;
1893 
1894 			vqp->vq_first = NULL;
1895 			vqp->vq_last = NULL;
1896 
1897 			mutex_exit(&vqp->vq_lock);
1898 
1899 			/* switch the chain of packets received */
1900 			vsw_switch_frame(vswp, mp, VSW_PHYSDEV, NULL, NULL);
1901 
1902 			mutex_enter(&vqp->vq_lock);
1903 		}
1904 	}
1905 
1906 	/*
1907 	 * We are drained and signal we are done.
1908 	 */
1909 	vqp->vq_state = VSW_QUEUE_DRAINED;
1910 	cv_signal(&vqp->vq_cv);
1911 
1912 	/*
1913 	 * Exit lock and drain the remaining packets.
1914 	 */
1915 	mutex_exit(&vqp->vq_lock);
1916 
1917 	/*
1918 	 * Exit the thread
1919 	 */
1920 	thread_exit();
1921 }
1922 
1923 /*
1924  * static void
1925  * vsw_rx_queue_cb() - Receive callback routine when
1926  *	vsw_multi_ring_enable is non-zero.  Queue the packets
1927  *	to a packet queue for a worker thread to process.
1928  */
1929 static void
1930 vsw_rx_queue_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
1931 {
1932 	vsw_mac_ring_t	*ringp = (vsw_mac_ring_t *)mrh;
1933 	vsw_t		*vswp = (vsw_t *)arg;
1934 	vsw_queue_t	*vqp;
1935 	mblk_t		*bp, *last;
1936 
1937 	ASSERT(mrh != NULL);
1938 	ASSERT(vswp != NULL);
1939 	ASSERT(mp != NULL);
1940 
1941 	D1(vswp, "%s: enter", __func__);
1942 
1943 	/*
1944 	 * Find the last element in the mblk chain.
1945 	 */
1946 	bp = mp;
1947 	do {
1948 		last = bp;
1949 		bp = bp->b_next;
1950 	} while (bp != NULL);
1951 
1952 	/* Get the queue for the packets */
1953 	vqp = ringp->ring_vqp;
1954 
1955 	/*
1956 	 * Grab the lock such we can queue the packets.
1957 	 */
1958 	mutex_enter(&vqp->vq_lock);
1959 
1960 	if (vqp->vq_state != VSW_QUEUE_RUNNING) {
1961 		freemsg(mp);
1962 		goto vsw_rx_queue_cb_exit;
1963 	}
1964 
1965 	/*
1966 	 * Add the mblk chain to the queue.  If there
1967 	 * is some mblks in the queue, then add the new
1968 	 * chain to the end.
1969 	 */
1970 	if (vqp->vq_first == NULL)
1971 		vqp->vq_first = mp;
1972 	else
1973 		vqp->vq_last->b_next = mp;
1974 
1975 	vqp->vq_last = last;
1976 
1977 	/*
1978 	 * Signal the worker thread that there is work to
1979 	 * do.
1980 	 */
1981 	cv_signal(&vqp->vq_cv);
1982 
1983 	/*
1984 	 * Let go of the lock and exit.
1985 	 */
1986 vsw_rx_queue_cb_exit:
1987 	mutex_exit(&vqp->vq_lock);
1988 	D1(vswp, "%s: exit", __func__);
1989 }
1990 
1991 /*
1992  * receive callback routine. Invoked by MAC layer when there
1993  * are pkts being passed up from physical device.
1994  *
1995  * PERF: It may be more efficient when the card is in promisc
1996  * mode to check the dest address of the pkts here (against
1997  * the FDB) rather than checking later. Needs to be investigated.
1998  */
1999 static void
2000 vsw_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
2001 {
2002 	_NOTE(ARGUNUSED(mrh))
2003 
2004 	vsw_t		*vswp = (vsw_t *)arg;
2005 
2006 	ASSERT(vswp != NULL);
2007 
2008 	D1(vswp, "vsw_rx_cb: enter");
2009 
2010 	/* switch the chain of packets received */
2011 	vsw_switch_frame(vswp, mp, VSW_PHYSDEV, NULL, NULL);
2012 
2013 	D1(vswp, "vsw_rx_cb: exit");
2014 }
2015 
2016 /*
2017  * Send a message out over the physical device via the MAC layer.
2018  *
2019  * Returns any mblks that it was unable to transmit.
2020  */
2021 static mblk_t *
2022 vsw_tx_msg(vsw_t *vswp, mblk_t *mp)
2023 {
2024 	const mac_txinfo_t	*mtp;
2025 	mblk_t			*nextp;
2026 
2027 	if (vswp->mh == NULL) {
2028 		DERR(vswp, "vsw_tx_msg: dropping pkts: no tx routine avail");
2029 		return (mp);
2030 	} else {
2031 		for (;;) {
2032 			nextp = mp->b_next;
2033 			mp->b_next = NULL;
2034 
2035 			mtp = vswp->txinfo;
2036 			if ((mp = mtp->mt_fn(mtp->mt_arg, mp)) != NULL) {
2037 				mp->b_next = nextp;
2038 				break;
2039 			}
2040 
2041 			if ((mp = nextp) == NULL)
2042 				break;
2043 
2044 		}
2045 
2046 	}
2047 
2048 	return (mp);
2049 }
2050 
2051 /*
2052  * Register with the MAC layer as a network device, so we
2053  * can be plumbed if necessary.
2054  */
2055 static int
2056 vsw_mac_register(vsw_t *vswp)
2057 {
2058 	mac_register_t	*macp;
2059 	int		rv;
2060 
2061 	D1(vswp, "%s: enter", __func__);
2062 
2063 	if ((macp = mac_alloc(MAC_VERSION)) == NULL)
2064 		return (EINVAL);
2065 	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
2066 	macp->m_driver = vswp;
2067 	macp->m_dip = vswp->dip;
2068 	macp->m_src_addr = (uint8_t *)&vswp->if_addr;
2069 	macp->m_callbacks = &vsw_m_callbacks;
2070 	macp->m_min_sdu = 0;
2071 	macp->m_max_sdu = ETHERMTU;
2072 	rv = mac_register(macp, &vswp->if_mh);
2073 	mac_free(macp);
2074 	if (rv == 0)
2075 		vswp->if_state |= VSW_IF_REG;
2076 
2077 	D1(vswp, "%s: exit", __func__);
2078 
2079 	return (rv);
2080 }
2081 
2082 static int
2083 vsw_mac_unregister(vsw_t *vswp)
2084 {
2085 	int		rv = 0;
2086 
2087 	D1(vswp, "%s: enter", __func__);
2088 
2089 	WRITE_ENTER(&vswp->if_lockrw);
2090 
2091 	if (vswp->if_state & VSW_IF_REG) {
2092 		rv = mac_unregister(vswp->if_mh);
2093 		if (rv != 0) {
2094 			DWARN(vswp, "%s: unable to unregister from MAC "
2095 				"framework", __func__);
2096 
2097 			RW_EXIT(&vswp->if_lockrw);
2098 			D1(vswp, "%s: fail exit", __func__);
2099 			return (rv);
2100 		}
2101 
2102 		/* mark i/f as down and unregistered */
2103 		vswp->if_state &= ~(VSW_IF_UP | VSW_IF_REG);
2104 	}
2105 	RW_EXIT(&vswp->if_lockrw);
2106 
2107 	vswp->mdprops &= ~(VSW_MD_MACADDR | VSW_DEV_MACADDR);
2108 
2109 	D1(vswp, "%s: exit", __func__);
2110 
2111 	return (rv);
2112 }
2113 
2114 static int
2115 vsw_m_stat(void *arg, uint_t stat, uint64_t *val)
2116 {
2117 	vsw_t			*vswp = (vsw_t *)arg;
2118 
2119 	D1(vswp, "%s: enter", __func__);
2120 
2121 	if (vswp->mh == NULL)
2122 		return (EINVAL);
2123 
2124 	/* return stats from underlying device */
2125 	*val = mac_stat_get(vswp->mh, stat);
2126 	return (0);
2127 }
2128 
2129 static void
2130 vsw_m_stop(void *arg)
2131 {
2132 	vsw_t		*vswp = (vsw_t *)arg;
2133 
2134 	D1(vswp, "%s: enter", __func__);
2135 
2136 	WRITE_ENTER(&vswp->if_lockrw);
2137 	vswp->if_state &= ~VSW_IF_UP;
2138 	RW_EXIT(&vswp->if_lockrw);
2139 
2140 	D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state);
2141 }
2142 
2143 static int
2144 vsw_m_start(void *arg)
2145 {
2146 	vsw_t		*vswp = (vsw_t *)arg;
2147 
2148 	D1(vswp, "%s: enter", __func__);
2149 
2150 	WRITE_ENTER(&vswp->if_lockrw);
2151 	vswp->if_state |= VSW_IF_UP;
2152 	RW_EXIT(&vswp->if_lockrw);
2153 
2154 	D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state);
2155 	return (0);
2156 }
2157 
2158 /*
2159  * Change the local interface address.
2160  */
2161 static int
2162 vsw_m_unicst(void *arg, const uint8_t *macaddr)
2163 {
2164 	vsw_t		*vswp = (vsw_t *)arg;
2165 
2166 	D1(vswp, "%s: enter", __func__);
2167 
2168 	WRITE_ENTER(&vswp->if_lockrw);
2169 	ether_copy(macaddr, &vswp->if_addr);
2170 	RW_EXIT(&vswp->if_lockrw);
2171 
2172 	D1(vswp, "%s: exit", __func__);
2173 
2174 	return (0);
2175 }
2176 
2177 static int
2178 vsw_m_multicst(void *arg, boolean_t add, const uint8_t *mca)
2179 {
2180 	vsw_t		*vswp = (vsw_t *)arg;
2181 	mcst_addr_t	*mcst_p = NULL;
2182 	uint64_t	addr = 0x0;
2183 	int		i, ret = 0;
2184 
2185 	D1(vswp, "%s: enter", __func__);
2186 
2187 	/*
2188 	 * Convert address into form that can be used
2189 	 * as hash table key.
2190 	 */
2191 	for (i = 0; i < ETHERADDRL; i++) {
2192 		addr = (addr << 8) | mca[i];
2193 	}
2194 
2195 	D2(vswp, "%s: addr = 0x%llx", __func__, addr);
2196 
2197 	if (add) {
2198 		D2(vswp, "%s: adding multicast", __func__);
2199 		if (vsw_add_mcst(vswp, VSW_LOCALDEV, addr, NULL) == 0) {
2200 			/*
2201 			 * Update the list of multicast addresses
2202 			 * contained within the vsw_t structure to
2203 			 * include this new one.
2204 			 */
2205 			mcst_p = kmem_zalloc(sizeof (mcst_addr_t), KM_NOSLEEP);
2206 			if (mcst_p == NULL) {
2207 				DERR(vswp, "%s unable to alloc mem", __func__);
2208 				return (1);
2209 			}
2210 			mcst_p->addr = addr;
2211 
2212 			mutex_enter(&vswp->mca_lock);
2213 			mcst_p->nextp = vswp->mcap;
2214 			vswp->mcap = mcst_p;
2215 			mutex_exit(&vswp->mca_lock);
2216 
2217 			/*
2218 			 * Call into the underlying driver to program the
2219 			 * address into HW.
2220 			 */
2221 			if (vswp->mh != NULL) {
2222 				ret = mac_multicst_add(vswp->mh, mca);
2223 				if (ret != 0) {
2224 					cmn_err(CE_WARN, "!unable to add "
2225 						"multicast address");
2226 					goto vsw_remove_addr;
2227 				}
2228 			}
2229 		} else {
2230 			cmn_err(CE_WARN, "!unable to add multicast address");
2231 		}
2232 		return (ret);
2233 	}
2234 
2235 vsw_remove_addr:
2236 
2237 	D2(vswp, "%s: removing multicast", __func__);
2238 	/*
2239 	 * Remove the address from the hash table..
2240 	 */
2241 	if (vsw_del_mcst(vswp, VSW_LOCALDEV, addr, NULL) == 0) {
2242 
2243 		/*
2244 		 * ..and then from the list maintained in the
2245 		 * vsw_t structure.
2246 		 */
2247 		vsw_del_addr(VSW_LOCALDEV, vswp, addr);
2248 
2249 		if (vswp->mh != NULL)
2250 			(void) mac_multicst_remove(vswp->mh, mca);
2251 	}
2252 
2253 	D1(vswp, "%s: exit", __func__);
2254 
2255 	return (0);
2256 }
2257 
2258 static int
2259 vsw_m_promisc(void *arg, boolean_t on)
2260 {
2261 	vsw_t		*vswp = (vsw_t *)arg;
2262 
2263 	D1(vswp, "%s: enter", __func__);
2264 
2265 	WRITE_ENTER(&vswp->if_lockrw);
2266 	if (on)
2267 		vswp->if_state |= VSW_IF_PROMISC;
2268 	else
2269 		vswp->if_state &= ~VSW_IF_PROMISC;
2270 	RW_EXIT(&vswp->if_lockrw);
2271 
2272 	D1(vswp, "%s: exit", __func__);
2273 
2274 	return (0);
2275 }
2276 
2277 static mblk_t *
2278 vsw_m_tx(void *arg, mblk_t *mp)
2279 {
2280 	vsw_t		*vswp = (vsw_t *)arg;
2281 
2282 	D1(vswp, "%s: enter", __func__);
2283 
2284 	vsw_switch_frame(vswp, mp, VSW_LOCALDEV, NULL, NULL);
2285 
2286 	D1(vswp, "%s: exit", __func__);
2287 
2288 	return (NULL);
2289 }
2290 
2291 /*
2292  * Register for machine description (MD) updates.
2293  */
2294 static void
2295 vsw_mdeg_register(vsw_t *vswp)
2296 {
2297 	mdeg_prop_spec_t	*pspecp;
2298 	mdeg_node_spec_t	*inst_specp;
2299 	mdeg_handle_t		mdeg_hdl;
2300 	size_t			templatesz;
2301 	int			inst, rv;
2302 
2303 	D1(vswp, "%s: enter", __func__);
2304 
2305 	inst = ddi_prop_get_int(DDI_DEV_T_ANY, vswp->dip,
2306 		DDI_PROP_DONTPASS, reg_propname, -1);
2307 	if (inst == -1) {
2308 		DERR(vswp, "%s: unable to get %s property",
2309 						__func__, reg_propname);
2310 		return;
2311 	}
2312 
2313 	D2(vswp, "%s: instance %d registering with mdeg", __func__, inst);
2314 
2315 	/*
2316 	 * Allocate and initialize a per-instance copy
2317 	 * of the global property spec array that will
2318 	 * uniquely identify this vsw instance.
2319 	 */
2320 	templatesz = sizeof (vsw_prop_template);
2321 	pspecp = kmem_zalloc(templatesz, KM_SLEEP);
2322 
2323 	bcopy(vsw_prop_template, pspecp, templatesz);
2324 
2325 	VSW_SET_MDEG_PROP_INST(pspecp, inst);
2326 
2327 	/* initialize the complete prop spec structure */
2328 	inst_specp = kmem_zalloc(sizeof (mdeg_node_spec_t), KM_SLEEP);
2329 	inst_specp->namep = "virtual-device";
2330 	inst_specp->specp = pspecp;
2331 
2332 	/* perform the registration */
2333 	rv = mdeg_register(inst_specp, &vport_match, vsw_mdeg_cb,
2334 	    (void *)vswp, &mdeg_hdl);
2335 
2336 	if (rv != MDEG_SUCCESS) {
2337 		DERR(vswp, "%s: mdeg_register failed (%d)\n", __func__, rv);
2338 		kmem_free(inst_specp, sizeof (mdeg_node_spec_t));
2339 		kmem_free(pspecp, templatesz);
2340 		return;
2341 	}
2342 
2343 	/* save off data that will be needed later */
2344 	vswp->inst_spec = inst_specp;
2345 	vswp->mdeg_hdl = mdeg_hdl;
2346 
2347 	D1(vswp, "%s: exit", __func__);
2348 }
2349 
2350 static void
2351 vsw_mdeg_unregister(vsw_t *vswp)
2352 {
2353 	D1(vswp, "vsw_mdeg_unregister: enter");
2354 
2355 	(void) mdeg_unregister(vswp->mdeg_hdl);
2356 
2357 	if (vswp->inst_spec->specp != NULL) {
2358 		(void) kmem_free(vswp->inst_spec->specp,
2359 			sizeof (vsw_prop_template));
2360 		vswp->inst_spec->specp = NULL;
2361 	}
2362 
2363 	if (vswp->inst_spec != NULL) {
2364 		(void) kmem_free(vswp->inst_spec,
2365 			sizeof (mdeg_node_spec_t));
2366 		vswp->inst_spec = NULL;
2367 	}
2368 
2369 	D1(vswp, "vsw_mdeg_unregister: exit");
2370 }
2371 
2372 static int
2373 vsw_mdeg_cb(void *cb_argp, mdeg_result_t *resp)
2374 {
2375 	vsw_t		*vswp;
2376 	int		idx;
2377 	md_t		*mdp;
2378 	mde_cookie_t	node;
2379 	uint64_t	inst;
2380 
2381 	if (resp == NULL)
2382 		return (MDEG_FAILURE);
2383 
2384 	vswp = (vsw_t *)cb_argp;
2385 
2386 	D1(vswp, "%s: added %d : removed %d : matched %d",
2387 		__func__, resp->added.nelem, resp->removed.nelem,
2388 		resp->match_prev.nelem);
2389 
2390 	/* process added ports */
2391 	for (idx = 0; idx < resp->added.nelem; idx++) {
2392 		mdp = resp->added.mdp;
2393 		node = resp->added.mdep[idx];
2394 
2395 		D2(vswp, "%s: adding node(%d) 0x%lx", __func__, idx, node);
2396 
2397 		if (vsw_port_add(vswp, mdp, &node) != 0) {
2398 			cmn_err(CE_WARN, "Unable to add new port (0x%lx)",
2399 					node);
2400 		}
2401 	}
2402 
2403 	/* process removed ports */
2404 	for (idx = 0; idx < resp->removed.nelem; idx++) {
2405 		mdp = resp->removed.mdp;
2406 		node = resp->removed.mdep[idx];
2407 
2408 		if (md_get_prop_val(mdp, node, id_propname, &inst)) {
2409 			DERR(vswp, "%s: prop(%s) not found port(%d)",
2410 				__func__, id_propname, idx);
2411 			continue;
2412 		}
2413 
2414 		D2(vswp, "%s: removing node(%d) 0x%lx", __func__, idx, node);
2415 
2416 		if (vsw_port_detach(vswp, inst) != 0) {
2417 			cmn_err(CE_WARN, "Unable to remove port %ld", inst);
2418 		}
2419 	}
2420 
2421 	/*
2422 	 * Currently no support for updating already active ports.
2423 	 * So, ignore the match_curr and match_priv arrays for now.
2424 	 */
2425 
2426 	D1(vswp, "%s: exit", __func__);
2427 
2428 	return (MDEG_SUCCESS);
2429 }
2430 
2431 /*
2432  * Add a new port to the system.
2433  *
2434  * Returns 0 on success, 1 on failure.
2435  */
2436 int
2437 vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node)
2438 {
2439 	uint64_t		ldc_id;
2440 	uint8_t			*addrp;
2441 	int			i, addrsz;
2442 	int			num_nodes = 0, nchan = 0;
2443 	int			listsz = 0;
2444 	mde_cookie_t		*listp = NULL;
2445 	struct ether_addr	ea;
2446 	uint64_t		macaddr;
2447 	uint64_t		inst = 0;
2448 	vsw_port_t		*port;
2449 
2450 	if (md_get_prop_val(mdp, *node, id_propname, &inst)) {
2451 		DWARN(vswp, "%s: prop(%s) not found", __func__,
2452 			id_propname);
2453 		return (1);
2454 	}
2455 
2456 	/*
2457 	 * Find the channel endpoint node(s) (which should be under this
2458 	 * port node) which contain the channel id(s).
2459 	 */
2460 	if ((num_nodes = md_node_count(mdp)) <= 0) {
2461 		DERR(vswp, "%s: invalid number of nodes found (%d)",
2462 			__func__, num_nodes);
2463 		return (1);
2464 	}
2465 
2466 	/* allocate enough space for node list */
2467 	listsz = num_nodes * sizeof (mde_cookie_t);
2468 	listp = kmem_zalloc(listsz, KM_SLEEP);
2469 
2470 	nchan = md_scan_dag(mdp, *node,
2471 		md_find_name(mdp, chan_propname),
2472 		md_find_name(mdp, "fwd"), listp);
2473 
2474 	if (nchan <= 0) {
2475 		DWARN(vswp, "%s: no %s nodes found", __func__, chan_propname);
2476 		kmem_free(listp, listsz);
2477 		return (1);
2478 	}
2479 
2480 	D2(vswp, "%s: %d %s nodes found", __func__, nchan, chan_propname);
2481 
2482 	/* use property from first node found */
2483 	if (md_get_prop_val(mdp, listp[0], id_propname, &ldc_id)) {
2484 		DWARN(vswp, "%s: prop(%s) not found\n", __func__,
2485 			id_propname);
2486 		kmem_free(listp, listsz);
2487 		return (1);
2488 	}
2489 
2490 	/* don't need list any more */
2491 	kmem_free(listp, listsz);
2492 
2493 	D2(vswp, "%s: ldc_id 0x%llx", __func__, ldc_id);
2494 
2495 	/* read mac-address property */
2496 	if (md_get_prop_data(mdp, *node, remaddr_propname,
2497 					&addrp, &addrsz)) {
2498 		DWARN(vswp, "%s: prop(%s) not found",
2499 				__func__, remaddr_propname);
2500 		return (1);
2501 	}
2502 
2503 	if (addrsz < ETHERADDRL) {
2504 		DWARN(vswp, "%s: invalid address size", __func__);
2505 		return (1);
2506 	}
2507 
2508 	macaddr = *((uint64_t *)addrp);
2509 	D2(vswp, "%s: remote mac address 0x%llx", __func__, macaddr);
2510 
2511 	for (i = ETHERADDRL - 1; i >= 0; i--) {
2512 		ea.ether_addr_octet[i] = macaddr & 0xFF;
2513 		macaddr >>= 8;
2514 	}
2515 
2516 	if (vsw_port_attach(vswp, (int)inst, &ldc_id, 1, &ea) != 0) {
2517 		DERR(vswp, "%s: failed to attach port", __func__);
2518 		return (1);
2519 	}
2520 
2521 	port = vsw_lookup_port(vswp, (int)inst);
2522 
2523 	/* just successfuly created the port, so it should exist */
2524 	ASSERT(port != NULL);
2525 
2526 	return (0);
2527 }
2528 
2529 /*
2530  * Attach the specified port.
2531  *
2532  * Returns 0 on success, 1 on failure.
2533  */
2534 static int
2535 vsw_port_attach(vsw_t *vswp, int p_instance, uint64_t *ldcids, int nids,
2536 struct ether_addr *macaddr)
2537 {
2538 	vsw_port_list_t		*plist = &vswp->plist;
2539 	vsw_port_t		*port, **prev_port;
2540 	int			i;
2541 
2542 	D1(vswp, "%s: enter : port %d", __func__, p_instance);
2543 
2544 	/* port already exists? */
2545 	READ_ENTER(&plist->lockrw);
2546 	for (port = plist->head; port != NULL; port = port->p_next) {
2547 		if (port->p_instance == p_instance) {
2548 			DWARN(vswp, "%s: port instance %d already attached",
2549 				__func__, p_instance);
2550 			RW_EXIT(&plist->lockrw);
2551 			return (1);
2552 		}
2553 	}
2554 	RW_EXIT(&plist->lockrw);
2555 
2556 	port = kmem_zalloc(sizeof (vsw_port_t), KM_SLEEP);
2557 	port->p_vswp = vswp;
2558 	port->p_instance = p_instance;
2559 	port->p_ldclist.num_ldcs = 0;
2560 	port->p_ldclist.head = NULL;
2561 	port->addr_set = VSW_ADDR_UNSET;
2562 
2563 	rw_init(&port->p_ldclist.lockrw, NULL, RW_DRIVER, NULL);
2564 
2565 	mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL);
2566 	mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL);
2567 
2568 	mutex_init(&port->ref_lock, NULL, MUTEX_DRIVER, NULL);
2569 	cv_init(&port->ref_cv, NULL, CV_DRIVER, NULL);
2570 
2571 	mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL);
2572 	cv_init(&port->state_cv, NULL, CV_DRIVER, NULL);
2573 	port->state = VSW_PORT_INIT;
2574 
2575 	if (nids > VSW_PORT_MAX_LDCS) {
2576 		D2(vswp, "%s: using first of %d ldc ids",
2577 			__func__, nids);
2578 		nids = VSW_PORT_MAX_LDCS;
2579 	}
2580 
2581 	D2(vswp, "%s: %d nids", __func__, nids);
2582 	for (i = 0; i < nids; i++) {
2583 		D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[i]);
2584 		if (vsw_ldc_attach(port, (uint64_t)ldcids[i]) != 0) {
2585 			DERR(vswp, "%s: ldc_attach failed", __func__);
2586 
2587 			rw_destroy(&port->p_ldclist.lockrw);
2588 
2589 			cv_destroy(&port->ref_cv);
2590 			mutex_destroy(&port->ref_lock);
2591 
2592 			cv_destroy(&port->state_cv);
2593 			mutex_destroy(&port->state_lock);
2594 
2595 			mutex_destroy(&port->tx_lock);
2596 			mutex_destroy(&port->mca_lock);
2597 			kmem_free(port, sizeof (vsw_port_t));
2598 			return (1);
2599 		}
2600 	}
2601 
2602 	ether_copy(macaddr, &port->p_macaddr);
2603 
2604 	WRITE_ENTER(&plist->lockrw);
2605 
2606 	/* create the fdb entry for this port/mac address */
2607 	(void) vsw_add_fdb(vswp, port);
2608 
2609 	(void) vsw_set_hw(vswp, port);
2610 
2611 	/* link it into the list of ports for this vsw instance */
2612 	prev_port = (vsw_port_t **)(&plist->head);
2613 	port->p_next = *prev_port;
2614 	*prev_port = port;
2615 	plist->num_ports++;
2616 	RW_EXIT(&plist->lockrw);
2617 
2618 	/*
2619 	 * Initialise the port and any ldc's under it.
2620 	 */
2621 	(void) vsw_init_ldcs(port);
2622 
2623 	D1(vswp, "%s: exit", __func__);
2624 	return (0);
2625 }
2626 
2627 /*
2628  * Detach the specified port.
2629  *
2630  * Returns 0 on success, 1 on failure.
2631  */
2632 static int
2633 vsw_port_detach(vsw_t *vswp, int p_instance)
2634 {
2635 	vsw_port_t	*port = NULL;
2636 	vsw_port_list_t	*plist = &vswp->plist;
2637 
2638 	D1(vswp, "%s: enter: port id %d", __func__, p_instance);
2639 
2640 	WRITE_ENTER(&plist->lockrw);
2641 
2642 	if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) {
2643 		RW_EXIT(&plist->lockrw);
2644 		return (1);
2645 	}
2646 
2647 	if (vsw_plist_del_node(vswp, port)) {
2648 		RW_EXIT(&plist->lockrw);
2649 		return (1);
2650 	}
2651 
2652 	/* Remove address if was programmed into HW. */
2653 	(void) vsw_unset_hw(vswp, port);
2654 
2655 	/* Remove the fdb entry for this port/mac address */
2656 	(void) vsw_del_fdb(vswp, port);
2657 
2658 	/* Remove any multicast addresses.. */
2659 	vsw_del_mcst_port(port);
2660 
2661 	/*
2662 	 * No longer need to hold writer lock on port list now
2663 	 * that we have unlinked the target port from the list.
2664 	 */
2665 	RW_EXIT(&plist->lockrw);
2666 
2667 	READ_ENTER(&plist->lockrw);
2668 
2669 	if (vswp->recfg_reqd)
2670 		(void) vsw_reconfig_hw(vswp);
2671 
2672 	RW_EXIT(&plist->lockrw);
2673 
2674 	if (vsw_port_delete(port)) {
2675 		return (1);
2676 	}
2677 
2678 	D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance);
2679 	return (0);
2680 }
2681 
2682 /*
2683  * Detach all active ports.
2684  *
2685  * Returns 0 on success, 1 on failure.
2686  */
2687 static int
2688 vsw_detach_ports(vsw_t *vswp)
2689 {
2690 	vsw_port_list_t 	*plist = &vswp->plist;
2691 	vsw_port_t		*port = NULL;
2692 
2693 	D1(vswp, "%s: enter", __func__);
2694 
2695 	WRITE_ENTER(&plist->lockrw);
2696 
2697 	while ((port = plist->head) != NULL) {
2698 		if (vsw_plist_del_node(vswp, port)) {
2699 			DERR(vswp, "%s: Error deleting port %d"
2700 				" from port list", __func__,
2701 				port->p_instance);
2702 			RW_EXIT(&plist->lockrw);
2703 			return (1);
2704 		}
2705 
2706 		/* Remove address if was programmed into HW. */
2707 		(void) vsw_unset_hw(vswp, port);
2708 
2709 		/* Remove the fdb entry for this port/mac address */
2710 		(void) vsw_del_fdb(vswp, port);
2711 
2712 		/* Remove any multicast addresses.. */
2713 		vsw_del_mcst_port(port);
2714 
2715 		/*
2716 		 * No longer need to hold the lock on the port list
2717 		 * now that we have unlinked the target port from the
2718 		 * list.
2719 		 */
2720 		RW_EXIT(&plist->lockrw);
2721 		if (vsw_port_delete(port)) {
2722 			DERR(vswp, "%s: Error deleting port %d",
2723 				__func__, port->p_instance);
2724 			return (1);
2725 		}
2726 		WRITE_ENTER(&plist->lockrw);
2727 	}
2728 	RW_EXIT(&plist->lockrw);
2729 
2730 	D1(vswp, "%s: exit", __func__);
2731 
2732 	return (0);
2733 }
2734 
2735 /*
2736  * Delete the specified port.
2737  *
2738  * Returns 0 on success, 1 on failure.
2739  */
2740 static int
2741 vsw_port_delete(vsw_port_t *port)
2742 {
2743 	vsw_ldc_list_t 		*ldcl;
2744 	vsw_t			*vswp = port->p_vswp;
2745 
2746 	D1(vswp, "%s: enter : port id %d", __func__, port->p_instance);
2747 
2748 	(void) vsw_uninit_ldcs(port);
2749 
2750 	/*
2751 	 * Wait for any pending ctrl msg tasks which reference this
2752 	 * port to finish.
2753 	 */
2754 	if (vsw_drain_port_taskq(port))
2755 		return (1);
2756 
2757 	/*
2758 	 * Wait for port reference count to hit zero.
2759 	 */
2760 	mutex_enter(&port->ref_lock);
2761 	while (port->ref_cnt != 0)
2762 		cv_wait(&port->ref_cv, &port->ref_lock);
2763 	mutex_exit(&port->ref_lock);
2764 
2765 	/*
2766 	 * Wait for any active callbacks to finish
2767 	 */
2768 	if (vsw_drain_ldcs(port))
2769 		return (1);
2770 
2771 	ldcl = &port->p_ldclist;
2772 	WRITE_ENTER(&ldcl->lockrw);
2773 	while (ldcl->num_ldcs > 0) {
2774 		if (vsw_ldc_detach(port, ldcl->head->ldc_id) != 0) {;
2775 			cmn_err(CE_WARN, "unable to detach ldc %ld",
2776 					ldcl->head->ldc_id);
2777 			RW_EXIT(&ldcl->lockrw);
2778 			return (1);
2779 		}
2780 	}
2781 	RW_EXIT(&ldcl->lockrw);
2782 
2783 	rw_destroy(&port->p_ldclist.lockrw);
2784 
2785 	mutex_destroy(&port->mca_lock);
2786 	mutex_destroy(&port->tx_lock);
2787 	cv_destroy(&port->ref_cv);
2788 	mutex_destroy(&port->ref_lock);
2789 
2790 	cv_destroy(&port->state_cv);
2791 	mutex_destroy(&port->state_lock);
2792 
2793 	kmem_free(port, sizeof (vsw_port_t));
2794 
2795 	D1(vswp, "%s: exit", __func__);
2796 
2797 	return (0);
2798 }
2799 
2800 /*
2801  * Attach a logical domain channel (ldc) under a specified port.
2802  *
2803  * Returns 0 on success, 1 on failure.
2804  */
2805 static int
2806 vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id)
2807 {
2808 	vsw_t 		*vswp = port->p_vswp;
2809 	vsw_ldc_list_t *ldcl = &port->p_ldclist;
2810 	vsw_ldc_t 	*ldcp = NULL;
2811 	ldc_attr_t 	attr;
2812 	ldc_status_t	istatus;
2813 	int 		status = DDI_FAILURE;
2814 	int		rv;
2815 	enum		{ PROG_init = 0x0, PROG_mblks = 0x1,
2816 				PROG_callback = 0x2}
2817 			progress;
2818 
2819 	progress = PROG_init;
2820 
2821 	D1(vswp, "%s: enter", __func__);
2822 
2823 	ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP);
2824 	if (ldcp == NULL) {
2825 		DERR(vswp, "%s: kmem_zalloc failed", __func__);
2826 		return (1);
2827 	}
2828 	ldcp->ldc_id = ldc_id;
2829 
2830 	/* allocate pool of receive mblks */
2831 	rv = vio_create_mblks(vsw_num_mblks, vsw_mblk_size, &(ldcp->rxh));
2832 	if (rv) {
2833 		DWARN(vswp, "%s: unable to create free mblk pool for"
2834 			" channel %ld (rv %d)", __func__, ldc_id, rv);
2835 		kmem_free(ldcp, sizeof (vsw_ldc_t));
2836 		return (1);
2837 	}
2838 
2839 	progress |= PROG_mblks;
2840 
2841 	mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL);
2842 	mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL);
2843 	mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL);
2844 	cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL);
2845 
2846 	/* required for handshake with peer */
2847 	ldcp->local_session = (uint64_t)ddi_get_lbolt();
2848 	ldcp->peer_session = 0;
2849 	ldcp->session_status = 0;
2850 
2851 	mutex_init(&ldcp->hss_lock, NULL, MUTEX_DRIVER, NULL);
2852 	ldcp->hss_id = 1;	/* Initial handshake session id */
2853 
2854 	/* only set for outbound lane, inbound set by peer */
2855 	mutex_init(&ldcp->lane_in.seq_lock, NULL, MUTEX_DRIVER, NULL);
2856 	mutex_init(&ldcp->lane_out.seq_lock, NULL, MUTEX_DRIVER, NULL);
2857 	vsw_set_lane_attr(vswp, &ldcp->lane_out);
2858 
2859 	attr.devclass = LDC_DEV_NT_SVC;
2860 	attr.instance = ddi_get_instance(vswp->dip);
2861 	attr.mode = LDC_MODE_UNRELIABLE;
2862 	attr.mtu = VSW_LDC_MTU;
2863 	status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle);
2864 	if (status != 0) {
2865 		DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)",
2866 		    __func__, ldc_id, status);
2867 		goto ldc_attach_fail;
2868 	}
2869 
2870 	status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp);
2871 	if (status != 0) {
2872 		DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)",
2873 		    __func__, ldc_id, status);
2874 		(void) ldc_fini(ldcp->ldc_handle);
2875 		goto ldc_attach_fail;
2876 	}
2877 
2878 	progress |= PROG_callback;
2879 
2880 	mutex_init(&ldcp->status_lock, NULL, MUTEX_DRIVER, NULL);
2881 
2882 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
2883 		DERR(vswp, "%s: ldc_status failed", __func__);
2884 		mutex_destroy(&ldcp->status_lock);
2885 		goto ldc_attach_fail;
2886 	}
2887 
2888 	ldcp->ldc_status = istatus;
2889 	ldcp->ldc_port = port;
2890 	ldcp->ldc_vswp = vswp;
2891 
2892 	/* link it into the list of channels for this port */
2893 	WRITE_ENTER(&ldcl->lockrw);
2894 	ldcp->ldc_next = ldcl->head;
2895 	ldcl->head = ldcp;
2896 	ldcl->num_ldcs++;
2897 	RW_EXIT(&ldcl->lockrw);
2898 
2899 	D1(vswp, "%s: exit", __func__);
2900 	return (0);
2901 
2902 ldc_attach_fail:
2903 	mutex_destroy(&ldcp->ldc_txlock);
2904 	mutex_destroy(&ldcp->ldc_cblock);
2905 
2906 	cv_destroy(&ldcp->drain_cv);
2907 
2908 	if (progress & PROG_callback) {
2909 		(void) ldc_unreg_callback(ldcp->ldc_handle);
2910 	}
2911 
2912 	if ((progress & PROG_mblks) && (ldcp->rxh != NULL)) {
2913 		if (vio_destroy_mblks(ldcp->rxh) != 0) {
2914 			/*
2915 			 * Something odd has happened, as the destroy
2916 			 * will only fail if some mblks have been allocated
2917 			 * from the pool already (which shouldn't happen)
2918 			 * and have not been returned.
2919 			 *
2920 			 * Add the pool pointer to a list maintained in
2921 			 * the device instance. Another attempt will be made
2922 			 * to free the pool when the device itself detaches.
2923 			 */
2924 			cmn_err(CE_WARN, "Creation of ldc channel %ld failed"
2925 				" and cannot destroy associated mblk pool",
2926 				ldc_id);
2927 			ldcp->rxh->nextp =  vswp->rxh;
2928 			vswp->rxh = ldcp->rxh;
2929 		}
2930 	}
2931 	mutex_destroy(&ldcp->drain_cv_lock);
2932 	mutex_destroy(&ldcp->hss_lock);
2933 
2934 	mutex_destroy(&ldcp->lane_in.seq_lock);
2935 	mutex_destroy(&ldcp->lane_out.seq_lock);
2936 	kmem_free(ldcp, sizeof (vsw_ldc_t));
2937 
2938 	return (1);
2939 }
2940 
2941 /*
2942  * Detach a logical domain channel (ldc) belonging to a
2943  * particular port.
2944  *
2945  * Returns 0 on success, 1 on failure.
2946  */
2947 static int
2948 vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id)
2949 {
2950 	vsw_t 		*vswp = port->p_vswp;
2951 	vsw_ldc_t 	*ldcp, *prev_ldcp;
2952 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
2953 	int 		rv;
2954 
2955 	prev_ldcp = ldcl->head;
2956 	for (; (ldcp = prev_ldcp) != NULL; prev_ldcp = ldcp->ldc_next) {
2957 		if (ldcp->ldc_id == ldc_id) {
2958 			break;
2959 		}
2960 	}
2961 
2962 	/* specified ldc id not found */
2963 	if (ldcp == NULL) {
2964 		DERR(vswp, "%s: ldcp = NULL", __func__);
2965 		return (1);
2966 	}
2967 
2968 	D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id);
2969 
2970 	/*
2971 	 * Before we can close the channel we must release any mapped
2972 	 * resources (e.g. drings).
2973 	 */
2974 	vsw_free_lane_resources(ldcp, INBOUND);
2975 	vsw_free_lane_resources(ldcp, OUTBOUND);
2976 
2977 	/*
2978 	 * If the close fails we are in serious trouble, as won't
2979 	 * be able to delete the parent port.
2980 	 */
2981 	if ((rv = ldc_close(ldcp->ldc_handle)) != 0) {
2982 		DERR(vswp, "%s: error %d closing channel %lld",
2983 			__func__, rv, ldcp->ldc_id);
2984 		return (1);
2985 	}
2986 
2987 	(void) ldc_fini(ldcp->ldc_handle);
2988 
2989 	ldcp->ldc_status = LDC_INIT;
2990 	ldcp->ldc_handle = NULL;
2991 	ldcp->ldc_vswp = NULL;
2992 
2993 	if (ldcp->rxh != NULL) {
2994 		if (vio_destroy_mblks(ldcp->rxh)) {
2995 			/*
2996 			 * Mostly likely some mblks are still in use and
2997 			 * have not been returned to the pool. Add the pool
2998 			 * to the list maintained in the device instance.
2999 			 * Another attempt will be made to destroy the pool
3000 			 * when the device detaches.
3001 			 */
3002 			ldcp->rxh->nextp =  vswp->rxh;
3003 			vswp->rxh = ldcp->rxh;
3004 		}
3005 	}
3006 
3007 	/* unlink it from the list */
3008 	prev_ldcp = ldcp->ldc_next;
3009 	ldcl->num_ldcs--;
3010 
3011 	mutex_destroy(&ldcp->ldc_txlock);
3012 	mutex_destroy(&ldcp->ldc_cblock);
3013 	cv_destroy(&ldcp->drain_cv);
3014 	mutex_destroy(&ldcp->drain_cv_lock);
3015 	mutex_destroy(&ldcp->hss_lock);
3016 	mutex_destroy(&ldcp->lane_in.seq_lock);
3017 	mutex_destroy(&ldcp->lane_out.seq_lock);
3018 	mutex_destroy(&ldcp->status_lock);
3019 
3020 	kmem_free(ldcp, sizeof (vsw_ldc_t));
3021 
3022 	return (0);
3023 }
3024 
3025 /*
3026  * Open and attempt to bring up the channel. Note that channel
3027  * can only be brought up if peer has also opened channel.
3028  *
3029  * Returns 0 if can open and bring up channel, otherwise
3030  * returns 1.
3031  */
3032 static int
3033 vsw_ldc_init(vsw_ldc_t *ldcp)
3034 {
3035 	vsw_t 		*vswp = ldcp->ldc_vswp;
3036 	ldc_status_t	istatus = 0;
3037 	int		rv;
3038 
3039 	D1(vswp, "%s: enter", __func__);
3040 
3041 	LDC_ENTER_LOCK(ldcp);
3042 
3043 	/* don't start at 0 in case clients don't like that */
3044 	ldcp->next_ident = 1;
3045 
3046 	rv = ldc_open(ldcp->ldc_handle);
3047 	if (rv != 0) {
3048 		DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)",
3049 		    __func__, ldcp->ldc_id, rv);
3050 		LDC_EXIT_LOCK(ldcp);
3051 		return (1);
3052 	}
3053 
3054 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
3055 		DERR(vswp, "%s: unable to get status", __func__);
3056 		LDC_EXIT_LOCK(ldcp);
3057 		return (1);
3058 
3059 	} else if (istatus != LDC_OPEN && istatus != LDC_READY) {
3060 		DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY",
3061 		    __func__, ldcp->ldc_id, istatus);
3062 		LDC_EXIT_LOCK(ldcp);
3063 		return (1);
3064 	}
3065 
3066 	mutex_enter(&ldcp->status_lock);
3067 	ldcp->ldc_status = istatus;
3068 	mutex_exit(&ldcp->status_lock);
3069 
3070 	rv = ldc_up(ldcp->ldc_handle);
3071 	if (rv != 0) {
3072 		/*
3073 		 * Not a fatal error for ldc_up() to fail, as peer
3074 		 * end point may simply not be ready yet.
3075 		 */
3076 		D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__,
3077 			ldcp->ldc_id, rv);
3078 		LDC_EXIT_LOCK(ldcp);
3079 		return (1);
3080 	}
3081 
3082 	/*
3083 	 * ldc_up() call is non-blocking so need to explicitly
3084 	 * check channel status to see if in fact the channel
3085 	 * is UP.
3086 	 */
3087 	mutex_enter(&ldcp->status_lock);
3088 	istatus = ldcp->ldc_status;
3089 	if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) {
3090 		DERR(vswp, "%s: unable to get status", __func__);
3091 		mutex_exit(&ldcp->status_lock);
3092 		LDC_EXIT_LOCK(ldcp);
3093 		return (1);
3094 
3095 	}
3096 	mutex_exit(&ldcp->status_lock);
3097 	LDC_EXIT_LOCK(ldcp);
3098 
3099 	if ((istatus != LDC_UP) && (ldcp->ldc_status == LDC_UP)) {
3100 		D2(vswp, "%s: channel %ld now UP (%ld)", __func__,
3101 			ldcp->ldc_id, istatus);
3102 		vsw_restart_handshake(ldcp);
3103 	}
3104 
3105 	D1(vswp, "%s: exit", __func__);
3106 	return (0);
3107 }
3108 
3109 /* disable callbacks on the channel */
3110 static int
3111 vsw_ldc_uninit(vsw_ldc_t *ldcp)
3112 {
3113 	vsw_t	*vswp = ldcp->ldc_vswp;
3114 	int	rv;
3115 
3116 	D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id);
3117 
3118 	LDC_ENTER_LOCK(ldcp);
3119 
3120 	rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE);
3121 	if (rv != 0) {
3122 		DERR(vswp, "vsw_ldc_uninit(%lld): error disabling "
3123 			"interrupts (rv = %d)\n", ldcp->ldc_id, rv);
3124 		LDC_EXIT_LOCK(ldcp);
3125 		return (1);
3126 	}
3127 
3128 	mutex_enter(&ldcp->status_lock);
3129 	ldcp->ldc_status = LDC_INIT;
3130 	mutex_exit(&ldcp->status_lock);
3131 
3132 	LDC_EXIT_LOCK(ldcp);
3133 
3134 	D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id);
3135 
3136 	return (0);
3137 }
3138 
3139 static int
3140 vsw_init_ldcs(vsw_port_t *port)
3141 {
3142 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
3143 	vsw_ldc_t	*ldcp;
3144 
3145 	READ_ENTER(&ldcl->lockrw);
3146 	ldcp =  ldcl->head;
3147 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
3148 		(void) vsw_ldc_init(ldcp);
3149 	}
3150 	RW_EXIT(&ldcl->lockrw);
3151 
3152 	return (0);
3153 }
3154 
3155 static int
3156 vsw_uninit_ldcs(vsw_port_t *port)
3157 {
3158 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
3159 	vsw_ldc_t	*ldcp;
3160 
3161 	D1(NULL, "vsw_uninit_ldcs: enter\n");
3162 
3163 	READ_ENTER(&ldcl->lockrw);
3164 	ldcp =  ldcl->head;
3165 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
3166 		(void) vsw_ldc_uninit(ldcp);
3167 	}
3168 	RW_EXIT(&ldcl->lockrw);
3169 
3170 	D1(NULL, "vsw_uninit_ldcs: exit\n");
3171 
3172 	return (0);
3173 }
3174 
3175 /*
3176  * Wait until the callback(s) associated with the ldcs under the specified
3177  * port have completed.
3178  *
3179  * Prior to this function being invoked each channel under this port
3180  * should have been quiesced via ldc_set_cb_mode(DISABLE).
3181  *
3182  * A short explaination of what we are doing below..
3183  *
3184  * The simplest approach would be to have a reference counter in
3185  * the ldc structure which is increment/decremented by the callbacks as
3186  * they use the channel. The drain function could then simply disable any
3187  * further callbacks and do a cv_wait for the ref to hit zero. Unfortunately
3188  * there is a tiny window here - before the callback is able to get the lock
3189  * on the channel it is interrupted and this function gets to execute. It
3190  * sees that the ref count is zero and believes its free to delete the
3191  * associated data structures.
3192  *
3193  * We get around this by taking advantage of the fact that before the ldc
3194  * framework invokes a callback it sets a flag to indicate that there is a
3195  * callback active (or about to become active). If when we attempt to
3196  * unregister a callback when this active flag is set then the unregister
3197  * will fail with EWOULDBLOCK.
3198  *
3199  * If the unregister fails we do a cv_timedwait. We will either be signaled
3200  * by the callback as it is exiting (note we have to wait a short period to
3201  * allow the callback to return fully to the ldc framework and it to clear
3202  * the active flag), or by the timer expiring. In either case we again attempt
3203  * the unregister. We repeat this until we can succesfully unregister the
3204  * callback.
3205  *
3206  * The reason we use a cv_timedwait rather than a simple cv_wait is to catch
3207  * the case where the callback has finished but the ldc framework has not yet
3208  * cleared the active flag. In this case we would never get a cv_signal.
3209  */
3210 static int
3211 vsw_drain_ldcs(vsw_port_t *port)
3212 {
3213 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
3214 	vsw_ldc_t	*ldcp;
3215 	vsw_t		*vswp = port->p_vswp;
3216 
3217 	D1(vswp, "%s: enter", __func__);
3218 
3219 	READ_ENTER(&ldcl->lockrw);
3220 
3221 	ldcp = ldcl->head;
3222 
3223 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
3224 		/*
3225 		 * If we can unregister the channel callback then we
3226 		 * know that there is no callback either running or
3227 		 * scheduled to run for this channel so move on to next
3228 		 * channel in the list.
3229 		 */
3230 		mutex_enter(&ldcp->drain_cv_lock);
3231 
3232 		/* prompt active callbacks to quit */
3233 		ldcp->drain_state = VSW_LDC_DRAINING;
3234 
3235 		if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) {
3236 			D2(vswp, "%s: unreg callback for chan %ld", __func__,
3237 				ldcp->ldc_id);
3238 			mutex_exit(&ldcp->drain_cv_lock);
3239 			continue;
3240 		} else {
3241 			/*
3242 			 * If we end up here we know that either 1) a callback
3243 			 * is currently executing, 2) is about to start (i.e.
3244 			 * the ldc framework has set the active flag but
3245 			 * has not actually invoked the callback yet, or 3)
3246 			 * has finished and has returned to the ldc framework
3247 			 * but the ldc framework has not yet cleared the
3248 			 * active bit.
3249 			 *
3250 			 * Wait for it to finish.
3251 			 */
3252 			while (ldc_unreg_callback(ldcp->ldc_handle)
3253 								== EWOULDBLOCK)
3254 				(void) cv_timedwait(&ldcp->drain_cv,
3255 					&ldcp->drain_cv_lock, lbolt + hz);
3256 
3257 			mutex_exit(&ldcp->drain_cv_lock);
3258 			D2(vswp, "%s: unreg callback for chan %ld after "
3259 				"timeout", __func__, ldcp->ldc_id);
3260 		}
3261 	}
3262 	RW_EXIT(&ldcl->lockrw);
3263 
3264 	D1(vswp, "%s: exit", __func__);
3265 	return (0);
3266 }
3267 
3268 /*
3269  * Wait until all tasks which reference this port have completed.
3270  *
3271  * Prior to this function being invoked each channel under this port
3272  * should have been quiesced via ldc_set_cb_mode(DISABLE).
3273  */
3274 static int
3275 vsw_drain_port_taskq(vsw_port_t *port)
3276 {
3277 	vsw_t		*vswp = port->p_vswp;
3278 
3279 	D1(vswp, "%s: enter", __func__);
3280 
3281 	/*
3282 	 * Mark the port as in the process of being detached, and
3283 	 * dispatch a marker task to the queue so we know when all
3284 	 * relevant tasks have completed.
3285 	 */
3286 	mutex_enter(&port->state_lock);
3287 	port->state = VSW_PORT_DETACHING;
3288 
3289 	if ((vswp->taskq_p == NULL) ||
3290 		(ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task,
3291 			port, DDI_NOSLEEP) != DDI_SUCCESS)) {
3292 		DERR(vswp, "%s: unable to dispatch marker task",
3293 			__func__);
3294 		mutex_exit(&port->state_lock);
3295 		return (1);
3296 	}
3297 
3298 	/*
3299 	 * Wait for the marker task to finish.
3300 	 */
3301 	while (port->state != VSW_PORT_DETACHABLE)
3302 		cv_wait(&port->state_cv, &port->state_lock);
3303 
3304 	mutex_exit(&port->state_lock);
3305 
3306 	D1(vswp, "%s: exit", __func__);
3307 
3308 	return (0);
3309 }
3310 
3311 static void
3312 vsw_marker_task(void *arg)
3313 {
3314 	vsw_port_t	*port = arg;
3315 	vsw_t		*vswp = port->p_vswp;
3316 
3317 	D1(vswp, "%s: enter", __func__);
3318 
3319 	mutex_enter(&port->state_lock);
3320 
3321 	/*
3322 	 * No further tasks should be dispatched which reference
3323 	 * this port so ok to mark it as safe to detach.
3324 	 */
3325 	port->state = VSW_PORT_DETACHABLE;
3326 
3327 	cv_signal(&port->state_cv);
3328 
3329 	mutex_exit(&port->state_lock);
3330 
3331 	D1(vswp, "%s: exit", __func__);
3332 }
3333 
3334 static vsw_port_t *
3335 vsw_lookup_port(vsw_t *vswp, int p_instance)
3336 {
3337 	vsw_port_list_t *plist = &vswp->plist;
3338 	vsw_port_t	*port;
3339 
3340 	for (port = plist->head; port != NULL; port = port->p_next) {
3341 		if (port->p_instance == p_instance) {
3342 			D2(vswp, "vsw_lookup_port: found p_instance\n");
3343 			return (port);
3344 		}
3345 	}
3346 
3347 	return (NULL);
3348 }
3349 
3350 /*
3351  * Search for and remove the specified port from the port
3352  * list. Returns 0 if able to locate and remove port, otherwise
3353  * returns 1.
3354  */
3355 static int
3356 vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port)
3357 {
3358 	vsw_port_list_t *plist = &vswp->plist;
3359 	vsw_port_t	*curr_p, *prev_p;
3360 
3361 	if (plist->head == NULL)
3362 		return (1);
3363 
3364 	curr_p = prev_p = plist->head;
3365 
3366 	while (curr_p != NULL) {
3367 		if (curr_p == port) {
3368 			if (prev_p == curr_p) {
3369 				plist->head = curr_p->p_next;
3370 			} else {
3371 				prev_p->p_next = curr_p->p_next;
3372 			}
3373 			plist->num_ports--;
3374 			break;
3375 		} else {
3376 			prev_p = curr_p;
3377 			curr_p = curr_p->p_next;
3378 		}
3379 	}
3380 	return (0);
3381 }
3382 
3383 /*
3384  * Interrupt handler for ldc messages.
3385  */
3386 static uint_t
3387 vsw_ldc_cb(uint64_t event, caddr_t arg)
3388 {
3389 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
3390 	vsw_t 		*vswp = ldcp->ldc_vswp;
3391 	ldc_status_t	lstatus;
3392 	int		rv;
3393 
3394 	D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
3395 
3396 	mutex_enter(&ldcp->ldc_cblock);
3397 
3398 	if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) {
3399 		mutex_exit(&ldcp->ldc_cblock);
3400 		return (LDC_SUCCESS);
3401 	}
3402 
3403 	mutex_enter(&ldcp->status_lock);
3404 	lstatus = ldcp->ldc_status;
3405 	rv = ldc_status(ldcp->ldc_handle, &ldcp->ldc_status);
3406 	mutex_exit(&ldcp->status_lock);
3407 	if (rv != 0) {
3408 		cmn_err(CE_WARN, "Unable to read channel state");
3409 		goto vsw_cb_exit;
3410 	}
3411 
3412 	if (event & LDC_EVT_UP) {
3413 		/*
3414 		 * Channel has come up, get the state and then start
3415 		 * the handshake.
3416 		 */
3417 		D2(vswp, "%s: id(%ld) event(%llx) UP: status(%ld)",
3418 			__func__, ldcp->ldc_id, event, lstatus);
3419 		D2(vswp, "%s: UP: old status %ld : cur status %ld",
3420 			__func__, lstatus, ldcp->ldc_status);
3421 		if ((ldcp->ldc_status != lstatus) &&
3422 					(ldcp->ldc_status == LDC_UP)) {
3423 				vsw_restart_handshake(ldcp);
3424 		}
3425 
3426 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
3427 	}
3428 
3429 	if (event & LDC_EVT_READ) {
3430 		/*
3431 		 * Data available for reading.
3432 		 */
3433 		D2(vswp, "%s: id(ld) event(%llx) data READ",
3434 				__func__, ldcp->ldc_id, event);
3435 
3436 		vsw_process_pkt(ldcp);
3437 
3438 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
3439 
3440 		goto vsw_cb_exit;
3441 	}
3442 
3443 	if (event & (LDC_EVT_DOWN | LDC_EVT_RESET)) {
3444 		D2(vswp, "%s: id(%ld) event(%llx) DOWN/RESET",
3445 					__func__, ldcp->ldc_id, event);
3446 
3447 		/* attempt to restart the connection */
3448 		vsw_restart_ldc(ldcp);
3449 
3450 		/*
3451 		 * vsw_restart_ldc() will attempt to bring the channel
3452 		 * back up. Check here to see if that succeeded.
3453 		 */
3454 		mutex_enter(&ldcp->status_lock);
3455 		lstatus = ldcp->ldc_status;
3456 		rv = ldc_status(ldcp->ldc_handle, &ldcp->ldc_status);
3457 		mutex_exit(&ldcp->status_lock);
3458 		if (rv != 0) {
3459 			DERR(vswp, "%s: unable to read status for channel %ld",
3460 				__func__, ldcp->ldc_id);
3461 			goto vsw_cb_exit;
3462 		}
3463 
3464 		D2(vswp, "%s: id(%ld) event(%llx) DOWN/RESET event:"
3465 			" old status %ld : cur status %ld", __func__,
3466 			ldcp->ldc_id, event, lstatus, ldcp->ldc_status);
3467 
3468 		/*
3469 		 * If channel was not previously UP then (re)start the
3470 		 * handshake.
3471 		 */
3472 		if ((ldcp->ldc_status == LDC_UP) && (lstatus != LDC_UP)) {
3473 			D2(vswp, "%s: channel %ld now UP, restarting "
3474 				"handshake", __func__, ldcp->ldc_id);
3475 			vsw_restart_handshake(ldcp);
3476 		}
3477 	}
3478 
3479 	/*
3480 	 * Catch either LDC_EVT_WRITE which we don't support or any
3481 	 * unknown event.
3482 	 */
3483 	if (event & ~(LDC_EVT_UP | LDC_EVT_RESET
3484 					| LDC_EVT_DOWN | LDC_EVT_READ)) {
3485 
3486 		DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)",
3487 			__func__, ldcp->ldc_id, event, ldcp->ldc_status);
3488 	}
3489 
3490 vsw_cb_exit:
3491 	mutex_exit(&ldcp->ldc_cblock);
3492 
3493 	/*
3494 	 * Let the drain function know we are finishing if it
3495 	 * is waiting.
3496 	 */
3497 	mutex_enter(&ldcp->drain_cv_lock);
3498 	if (ldcp->drain_state == VSW_LDC_DRAINING)
3499 		cv_signal(&ldcp->drain_cv);
3500 	mutex_exit(&ldcp->drain_cv_lock);
3501 
3502 	return (LDC_SUCCESS);
3503 }
3504 
3505 /*
3506  * Restart the connection with our peer. Free any existing
3507  * data structures and then attempt to bring channel back
3508  * up.
3509  */
3510 static void
3511 vsw_restart_ldc(vsw_ldc_t *ldcp)
3512 {
3513 	int		rv;
3514 	vsw_t		*vswp = ldcp->ldc_vswp;
3515 	vsw_port_t	*port;
3516 	vsw_ldc_list_t	*ldcl;
3517 
3518 	D1(vswp, "%s: enter", __func__);
3519 
3520 	port = ldcp->ldc_port;
3521 	ldcl = &port->p_ldclist;
3522 
3523 	READ_ENTER(&ldcl->lockrw);
3524 
3525 	D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__,
3526 		ldcp->lane_in.lstate, ldcp->lane_out.lstate);
3527 
3528 	vsw_free_lane_resources(ldcp, INBOUND);
3529 	vsw_free_lane_resources(ldcp, OUTBOUND);
3530 	RW_EXIT(&ldcl->lockrw);
3531 
3532 	ldcp->lane_in.lstate = 0;
3533 	ldcp->lane_out.lstate = 0;
3534 
3535 	/*
3536 	 * Remove parent port from any multicast groups
3537 	 * it may have registered with. Client must resend
3538 	 * multicast add command after handshake completes.
3539 	 */
3540 	(void) vsw_del_fdb(vswp, port);
3541 
3542 	vsw_del_mcst_port(port);
3543 
3544 	ldcp->peer_session = 0;
3545 	ldcp->session_status = 0;
3546 	ldcp->hcnt = 0;
3547 	ldcp->hphase = VSW_MILESTONE0;
3548 
3549 	rv = ldc_up(ldcp->ldc_handle);
3550 	if (rv != 0) {
3551 		/*
3552 		 * Not a fatal error for ldc_up() to fail, as peer
3553 		 * end point may simply not be ready yet.
3554 		 */
3555 		D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__,
3556 			ldcp->ldc_id, rv);
3557 	}
3558 
3559 	D1(vswp, "%s: exit", __func__);
3560 }
3561 
3562 /*
3563  * (Re)start a handshake with our peer by sending them
3564  * our version info.
3565  */
3566 static void
3567 vsw_restart_handshake(vsw_ldc_t *ldcp)
3568 {
3569 	vsw_t		*vswp = ldcp->ldc_vswp;
3570 
3571 	D1(vswp, "vsw_restart_handshake: enter");
3572 
3573 	if (ldcp->hphase != VSW_MILESTONE0) {
3574 		vsw_restart_ldc(ldcp);
3575 	}
3576 
3577 	/*
3578 	 * We now increment the transaction group id. This allows
3579 	 * us to identify and disard any tasks which are still pending
3580 	 * on the taskq and refer to the handshake session we are about
3581 	 * to restart. These stale messages no longer have any real
3582 	 * meaning.
3583 	 */
3584 	mutex_enter(&ldcp->hss_lock);
3585 	ldcp->hss_id++;
3586 	mutex_exit(&ldcp->hss_lock);
3587 
3588 	if (ldcp->hcnt++ > vsw_num_handshakes) {
3589 		cmn_err(CE_WARN, "exceeded number of permitted "
3590 			"handshake attempts (%d) on channel %ld",
3591 			ldcp->hcnt, ldcp->ldc_id);
3592 		return;
3593 	}
3594 
3595 	if ((vswp->taskq_p == NULL) ||
3596 		(ddi_taskq_dispatch(vswp->taskq_p, vsw_send_ver, ldcp,
3597 			DDI_NOSLEEP) != DDI_SUCCESS)) {
3598 		cmn_err(CE_WARN, "Can't dispatch version handshake task");
3599 	}
3600 
3601 	D1(vswp, "vsw_restart_handshake: exit");
3602 }
3603 
3604 /*
3605  * Deal appropriately with a ECONNRESET event encountered in a ldc_*
3606  * call.
3607  */
3608 static void
3609 vsw_handle_reset(vsw_ldc_t *ldcp)
3610 {
3611 	vsw_t		*vswp = ldcp->ldc_vswp;
3612 	ldc_status_t	lstatus;
3613 
3614 	D1(vswp, "%s: enter", __func__);
3615 
3616 	mutex_enter(&ldcp->status_lock);
3617 	lstatus = ldcp->ldc_status;
3618 	if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) {
3619 		DERR(vswp, "%s: unable to read status for channel %ld",
3620 			__func__, ldcp->ldc_id);
3621 		mutex_exit(&ldcp->status_lock);
3622 		return;
3623 	}
3624 	mutex_exit(&ldcp->status_lock);
3625 
3626 	/*
3627 	 * Check the channel's previous recorded state to
3628 	 * determine if this is the first ECONNRESET event
3629 	 * we've gotten for this particular channel (i.e. was
3630 	 * previously up but is no longer). If so, terminate
3631 	 * the channel.
3632 	 */
3633 	if ((ldcp->ldc_status != LDC_UP) && (lstatus == LDC_UP)) {
3634 		vsw_restart_ldc(ldcp);
3635 	}
3636 
3637 	/*
3638 	 * vsw_restart_ldc() will also attempt to bring channel
3639 	 * back up. Check here if that succeeds.
3640 	 */
3641 	mutex_enter(&ldcp->status_lock);
3642 	lstatus = ldcp->ldc_status;
3643 	if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) {
3644 		DERR(vswp, "%s: unable to read status for channel %ld",
3645 			__func__, ldcp->ldc_id);
3646 		mutex_exit(&ldcp->status_lock);
3647 		return;
3648 	}
3649 	mutex_exit(&ldcp->status_lock);
3650 
3651 	/*
3652 	 * If channel is now up and no one else (i.e. the callback routine)
3653 	 * has dealt with it then we restart the handshake here.
3654 	 */
3655 	if ((lstatus != LDC_UP) && (ldcp->ldc_status == LDC_UP)) {
3656 		vsw_restart_handshake(ldcp);
3657 	}
3658 
3659 	D1(vswp, "%s: exit", __func__);
3660 }
3661 
3662 /*
3663  * returns 0 if legal for event signified by flag to have
3664  * occured at the time it did. Otherwise returns 1.
3665  */
3666 int
3667 vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag)
3668 {
3669 	vsw_t		*vswp = ldcp->ldc_vswp;
3670 	uint64_t	state;
3671 	uint64_t	phase;
3672 
3673 	if (dir == INBOUND)
3674 		state = ldcp->lane_in.lstate;
3675 	else
3676 		state = ldcp->lane_out.lstate;
3677 
3678 	phase = ldcp->hphase;
3679 
3680 	switch (flag) {
3681 	case VSW_VER_INFO_RECV:
3682 		if (phase > VSW_MILESTONE0) {
3683 			DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV"
3684 				" when in state %d\n", ldcp->ldc_id, phase);
3685 			vsw_restart_handshake(ldcp);
3686 			return (1);
3687 		}
3688 		break;
3689 
3690 	case VSW_VER_ACK_RECV:
3691 	case VSW_VER_NACK_RECV:
3692 		if (!(state & VSW_VER_INFO_SENT)) {
3693 			DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK"
3694 				" or VER_NACK when in state %d\n",
3695 				ldcp->ldc_id, phase);
3696 			vsw_restart_handshake(ldcp);
3697 			return (1);
3698 		} else
3699 			state &= ~VSW_VER_INFO_SENT;
3700 		break;
3701 
3702 	case VSW_ATTR_INFO_RECV:
3703 		if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) {
3704 			DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV"
3705 				" when in state %d\n", ldcp->ldc_id, phase);
3706 			vsw_restart_handshake(ldcp);
3707 			return (1);
3708 		}
3709 		break;
3710 
3711 	case VSW_ATTR_ACK_RECV:
3712 	case VSW_ATTR_NACK_RECV:
3713 		if (!(state & VSW_ATTR_INFO_SENT)) {
3714 			DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK"
3715 				" or ATTR_NACK when in state %d\n",
3716 				ldcp->ldc_id, phase);
3717 			vsw_restart_handshake(ldcp);
3718 			return (1);
3719 		} else
3720 			state &= ~VSW_ATTR_INFO_SENT;
3721 		break;
3722 
3723 	case VSW_DRING_INFO_RECV:
3724 		if (phase < VSW_MILESTONE1) {
3725 			DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV"
3726 				" when in state %d\n", ldcp->ldc_id, phase);
3727 			vsw_restart_handshake(ldcp);
3728 			return (1);
3729 		}
3730 		break;
3731 
3732 	case VSW_DRING_ACK_RECV:
3733 	case VSW_DRING_NACK_RECV:
3734 		if (!(state & VSW_DRING_INFO_SENT)) {
3735 			DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK"
3736 				" or DRING_NACK when in state %d\n",
3737 				ldcp->ldc_id, phase);
3738 			vsw_restart_handshake(ldcp);
3739 			return (1);
3740 		} else
3741 			state &= ~VSW_DRING_INFO_SENT;
3742 		break;
3743 
3744 	case VSW_RDX_INFO_RECV:
3745 		if (phase < VSW_MILESTONE3) {
3746 			DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV"
3747 				" when in state %d\n", ldcp->ldc_id, phase);
3748 			vsw_restart_handshake(ldcp);
3749 			return (1);
3750 		}
3751 		break;
3752 
3753 	case VSW_RDX_ACK_RECV:
3754 	case VSW_RDX_NACK_RECV:
3755 		if (!(state & VSW_RDX_INFO_SENT)) {
3756 			DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK"
3757 				" or RDX_NACK when in state %d\n",
3758 				ldcp->ldc_id, phase);
3759 			vsw_restart_handshake(ldcp);
3760 			return (1);
3761 		} else
3762 			state &= ~VSW_RDX_INFO_SENT;
3763 		break;
3764 
3765 	case VSW_MCST_INFO_RECV:
3766 		if (phase < VSW_MILESTONE3) {
3767 			DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV"
3768 				" when in state %d\n", ldcp->ldc_id, phase);
3769 			vsw_restart_handshake(ldcp);
3770 			return (1);
3771 		}
3772 		break;
3773 
3774 	default:
3775 		DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)",
3776 				ldcp->ldc_id, flag);
3777 		return (1);
3778 	}
3779 
3780 	if (dir == INBOUND)
3781 		ldcp->lane_in.lstate = state;
3782 	else
3783 		ldcp->lane_out.lstate = state;
3784 
3785 	D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id);
3786 
3787 	return (0);
3788 }
3789 
3790 void
3791 vsw_next_milestone(vsw_ldc_t *ldcp)
3792 {
3793 	vsw_t		*vswp = ldcp->ldc_vswp;
3794 
3795 	D1(vswp, "%s (chan %lld): enter (phase %ld)", __func__,
3796 		ldcp->ldc_id, ldcp->hphase);
3797 
3798 	DUMP_FLAGS(ldcp->lane_in.lstate);
3799 	DUMP_FLAGS(ldcp->lane_out.lstate);
3800 
3801 	switch (ldcp->hphase) {
3802 
3803 	case VSW_MILESTONE0:
3804 		/*
3805 		 * If we haven't started to handshake with our peer,
3806 		 * start to do so now.
3807 		 */
3808 		if (ldcp->lane_out.lstate == 0) {
3809 			D2(vswp, "%s: (chan %lld) starting handshake "
3810 				"with peer", __func__, ldcp->ldc_id);
3811 			vsw_restart_handshake(ldcp);
3812 		}
3813 
3814 		/*
3815 		 * Only way to pass this milestone is to have successfully
3816 		 * negotiated version info.
3817 		 */
3818 		if ((ldcp->lane_in.lstate & VSW_VER_ACK_SENT) &&
3819 			(ldcp->lane_out.lstate & VSW_VER_ACK_RECV)) {
3820 
3821 			D2(vswp, "%s: (chan %lld) leaving milestone 0",
3822 				__func__, ldcp->ldc_id);
3823 
3824 			/*
3825 			 * Next milestone is passed when attribute
3826 			 * information has been successfully exchanged.
3827 			 */
3828 			ldcp->hphase = VSW_MILESTONE1;
3829 			vsw_send_attr(ldcp);
3830 
3831 		}
3832 		break;
3833 
3834 	case VSW_MILESTONE1:
3835 		/*
3836 		 * Only way to pass this milestone is to have successfully
3837 		 * negotiated attribute information.
3838 		 */
3839 		if (ldcp->lane_in.lstate & VSW_ATTR_ACK_SENT) {
3840 
3841 			ldcp->hphase = VSW_MILESTONE2;
3842 
3843 			/*
3844 			 * If the peer device has said it wishes to
3845 			 * use descriptor rings then we send it our ring
3846 			 * info, otherwise we just set up a private ring
3847 			 * which we use an internal buffer
3848 			 */
3849 			if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE)
3850 				vsw_send_dring_info(ldcp);
3851 		}
3852 		break;
3853 
3854 
3855 	case VSW_MILESTONE2:
3856 		/*
3857 		 * If peer has indicated in its attribute message that
3858 		 * it wishes to use descriptor rings then the only way
3859 		 * to pass this milestone is for us to have received
3860 		 * valid dring info.
3861 		 *
3862 		 * If peer is not using descriptor rings then just fall
3863 		 * through.
3864 		 */
3865 		if ((ldcp->lane_in.xfer_mode == VIO_DRING_MODE) &&
3866 			(!(ldcp->lane_in.lstate & VSW_DRING_ACK_SENT)))
3867 			break;
3868 
3869 		D2(vswp, "%s: (chan %lld) leaving milestone 2",
3870 				__func__, ldcp->ldc_id);
3871 
3872 		ldcp->hphase = VSW_MILESTONE3;
3873 		vsw_send_rdx(ldcp);
3874 		break;
3875 
3876 	case VSW_MILESTONE3:
3877 		/*
3878 		 * Pass this milestone when all paramaters have been
3879 		 * successfully exchanged and RDX sent in both directions.
3880 		 *
3881 		 * Mark outbound lane as available to transmit data.
3882 		 */
3883 		if ((ldcp->lane_in.lstate & VSW_RDX_ACK_SENT) &&
3884 			(ldcp->lane_out.lstate & VSW_RDX_ACK_RECV)) {
3885 
3886 			D2(vswp, "%s: (chan %lld) leaving milestone 3",
3887 				__func__, ldcp->ldc_id);
3888 			D2(vswp, "%s: ** handshake complete (0x%llx : "
3889 				"0x%llx) **", __func__, ldcp->lane_in.lstate,
3890 				ldcp->lane_out.lstate);
3891 			ldcp->lane_out.lstate |= VSW_LANE_ACTIVE;
3892 			ldcp->hphase = VSW_MILESTONE4;
3893 			ldcp->hcnt = 0;
3894 			DISPLAY_STATE();
3895 		} else {
3896 			D2(vswp, "%s: still in milestone 3 (0x%llx :"
3897 				" 0x%llx", __func__, ldcp->lane_in.lstate,
3898 				ldcp->lane_out.lstate);
3899 		}
3900 		break;
3901 
3902 	case VSW_MILESTONE4:
3903 		D2(vswp, "%s: (chan %lld) in milestone 4", __func__,
3904 							ldcp->ldc_id);
3905 		break;
3906 
3907 	default:
3908 		DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__,
3909 			ldcp->ldc_id, ldcp->hphase);
3910 	}
3911 
3912 	D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id,
3913 		ldcp->hphase);
3914 }
3915 
3916 /*
3917  * Check if major version is supported.
3918  *
3919  * Returns 0 if finds supported major number, and if necessary
3920  * adjusts the minor field.
3921  *
3922  * Returns 1 if can't match major number exactly. Sets mjor/minor
3923  * to next lowest support values, or to zero if no other values possible.
3924  */
3925 static int
3926 vsw_supported_version(vio_ver_msg_t *vp)
3927 {
3928 	int	i;
3929 
3930 	D1(NULL, "vsw_supported_version: enter");
3931 
3932 	for (i = 0; i < VSW_NUM_VER; i++) {
3933 		if (vsw_versions[i].ver_major == vp->ver_major) {
3934 			/*
3935 			 * Matching or lower major version found. Update
3936 			 * minor number if necessary.
3937 			 */
3938 			if (vp->ver_minor > vsw_versions[i].ver_minor) {
3939 				D2(NULL, "%s: adjusting minor value"
3940 					" from %d to %d", __func__,
3941 					vp->ver_minor,
3942 					vsw_versions[i].ver_minor);
3943 				vp->ver_minor = vsw_versions[i].ver_minor;
3944 			}
3945 
3946 			return (0);
3947 		}
3948 
3949 		if (vsw_versions[i].ver_major < vp->ver_major) {
3950 			if (vp->ver_minor > vsw_versions[i].ver_minor) {
3951 				D2(NULL, "%s: adjusting minor value"
3952 					" from %d to %d", __func__,
3953 					vp->ver_minor,
3954 					vsw_versions[i].ver_minor);
3955 				vp->ver_minor = vsw_versions[i].ver_minor;
3956 			}
3957 			return (1);
3958 		}
3959 	}
3960 
3961 	/* No match was possible, zero out fields */
3962 	vp->ver_major = 0;
3963 	vp->ver_minor = 0;
3964 
3965 	D1(NULL, "vsw_supported_version: exit");
3966 
3967 	return (1);
3968 }
3969 
3970 /*
3971  * Main routine for processing messages received over LDC.
3972  */
3973 static void
3974 vsw_process_pkt(void *arg)
3975 {
3976 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
3977 	vsw_t 		*vswp = ldcp->ldc_vswp;
3978 	size_t		msglen;
3979 	vio_msg_tag_t	tag;
3980 	def_msg_t	dmsg;
3981 	int 		rv = 0;
3982 
3983 
3984 	D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
3985 
3986 	/*
3987 	 * If channel is up read messages until channel is empty.
3988 	 */
3989 	do {
3990 		msglen = sizeof (dmsg);
3991 		rv = ldc_read(ldcp->ldc_handle, (caddr_t)&dmsg, &msglen);
3992 
3993 		if (rv != 0) {
3994 			DERR(vswp, "%s :ldc_read err id(%lld) rv(%d) "
3995 				"len(%d)\n", __func__, ldcp->ldc_id,
3996 							rv, msglen);
3997 		}
3998 
3999 		/* channel has been reset */
4000 		if (rv == ECONNRESET) {
4001 			vsw_handle_reset(ldcp);
4002 			break;
4003 		}
4004 
4005 		if (msglen == 0) {
4006 			D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__,
4007 			ldcp->ldc_id);
4008 			break;
4009 		}
4010 
4011 		D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__,
4012 		    ldcp->ldc_id, msglen);
4013 
4014 		/*
4015 		 * Figure out what sort of packet we have gotten by
4016 		 * examining the msg tag, and then switch it appropriately.
4017 		 */
4018 		bcopy(&dmsg, &tag, sizeof (vio_msg_tag_t));
4019 
4020 		switch (tag.vio_msgtype) {
4021 		case VIO_TYPE_CTRL:
4022 			vsw_dispatch_ctrl_task(ldcp, &dmsg, tag);
4023 			break;
4024 		case VIO_TYPE_DATA:
4025 			vsw_process_data_pkt(ldcp, &dmsg, tag);
4026 			break;
4027 		case VIO_TYPE_ERR:
4028 			vsw_process_err_pkt(ldcp, &dmsg, tag);
4029 			break;
4030 		default:
4031 			DERR(vswp, "%s: Unknown tag(%lx) ", __func__,
4032 				"id(%lx)\n", tag.vio_msgtype, ldcp->ldc_id);
4033 			break;
4034 		}
4035 	} while (msglen);
4036 
4037 	D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id);
4038 }
4039 
4040 /*
4041  * Dispatch a task to process a VIO control message.
4042  */
4043 static void
4044 vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t tag)
4045 {
4046 	vsw_ctrl_task_t		*ctaskp = NULL;
4047 	vsw_port_t		*port = ldcp->ldc_port;
4048 	vsw_t			*vswp = port->p_vswp;
4049 
4050 	D1(vswp, "%s: enter", __func__);
4051 
4052 	/*
4053 	 * We need to handle RDX ACK messages in-band as once they
4054 	 * are exchanged it is possible that we will get an
4055 	 * immediate (legitimate) data packet.
4056 	 */
4057 	if ((tag.vio_subtype_env == VIO_RDX) &&
4058 		(tag.vio_subtype == VIO_SUBTYPE_ACK)) {
4059 
4060 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_ACK_RECV))
4061 			return;
4062 
4063 		ldcp->lane_out.lstate |= VSW_RDX_ACK_RECV;
4064 		D2(vswp, "%s (%ld) handling RDX_ACK in place "
4065 			"(ostate 0x%llx : hphase %d)", __func__,
4066 			ldcp->ldc_id, ldcp->lane_out.lstate, ldcp->hphase);
4067 		vsw_next_milestone(ldcp);
4068 		return;
4069 	}
4070 
4071 	ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP);
4072 
4073 	if (ctaskp == NULL) {
4074 		DERR(vswp, "%s: unable to alloc space for ctrl"
4075 			" msg", __func__);
4076 		vsw_restart_handshake(ldcp);
4077 		return;
4078 	}
4079 
4080 	ctaskp->ldcp = ldcp;
4081 	bcopy((def_msg_t *)cpkt, &ctaskp->pktp, sizeof (def_msg_t));
4082 	mutex_enter(&ldcp->hss_lock);
4083 	ctaskp->hss_id = ldcp->hss_id;
4084 	mutex_exit(&ldcp->hss_lock);
4085 
4086 	/*
4087 	 * Dispatch task to processing taskq if port is not in
4088 	 * the process of being detached.
4089 	 */
4090 	mutex_enter(&port->state_lock);
4091 	if (port->state == VSW_PORT_INIT) {
4092 		if ((vswp->taskq_p == NULL) ||
4093 			(ddi_taskq_dispatch(vswp->taskq_p,
4094 			vsw_process_ctrl_pkt, ctaskp, DDI_NOSLEEP)
4095 							!= DDI_SUCCESS)) {
4096 			DERR(vswp, "%s: unable to dispatch task to taskq",
4097 				__func__);
4098 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
4099 			mutex_exit(&port->state_lock);
4100 			vsw_restart_handshake(ldcp);
4101 			return;
4102 		}
4103 	} else {
4104 		DWARN(vswp, "%s: port %d detaching, not dispatching "
4105 			"task", __func__, port->p_instance);
4106 	}
4107 
4108 	mutex_exit(&port->state_lock);
4109 
4110 	D2(vswp, "%s: dispatched task to taskq for chan %d", __func__,
4111 			ldcp->ldc_id);
4112 	D1(vswp, "%s: exit", __func__);
4113 }
4114 
4115 /*
4116  * Process a VIO ctrl message. Invoked from taskq.
4117  */
4118 static void
4119 vsw_process_ctrl_pkt(void *arg)
4120 {
4121 	vsw_ctrl_task_t	*ctaskp = (vsw_ctrl_task_t *)arg;
4122 	vsw_ldc_t	*ldcp = ctaskp->ldcp;
4123 	vsw_t 		*vswp = ldcp->ldc_vswp;
4124 	vio_msg_tag_t	tag;
4125 	uint16_t	env;
4126 
4127 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4128 
4129 	bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t));
4130 	env = tag.vio_subtype_env;
4131 
4132 	/* stale pkt check */
4133 	mutex_enter(&ldcp->hss_lock);
4134 	if (ctaskp->hss_id < ldcp->hss_id) {
4135 		DWARN(vswp, "%s: discarding stale packet belonging to"
4136 			" earlier (%ld) handshake session", __func__,
4137 			ctaskp->hss_id);
4138 		mutex_exit(&ldcp->hss_lock);
4139 		return;
4140 	}
4141 	mutex_exit(&ldcp->hss_lock);
4142 
4143 	/* session id check */
4144 	if (ldcp->session_status & VSW_PEER_SESSION) {
4145 		if (ldcp->peer_session != tag.vio_sid) {
4146 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
4147 				__func__, ldcp->ldc_id, tag.vio_sid);
4148 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
4149 			vsw_restart_handshake(ldcp);
4150 			return;
4151 		}
4152 	}
4153 
4154 	/*
4155 	 * Switch on vio_subtype envelope, then let lower routines
4156 	 * decide if its an INFO, ACK or NACK packet.
4157 	 */
4158 	switch (env) {
4159 	case VIO_VER_INFO:
4160 		vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp);
4161 		break;
4162 	case VIO_DRING_REG:
4163 		vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp);
4164 		break;
4165 	case VIO_DRING_UNREG:
4166 		vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp);
4167 		break;
4168 	case VIO_ATTR_INFO:
4169 		vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp);
4170 		break;
4171 	case VNET_MCAST_INFO:
4172 		vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp);
4173 		break;
4174 	case VIO_RDX:
4175 		vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp);
4176 		break;
4177 	default:
4178 		DERR(vswp, "%s : unknown vio_subtype_env (%x)\n",
4179 							__func__, env);
4180 	}
4181 
4182 	kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
4183 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4184 }
4185 
4186 /*
4187  * Version negotiation. We can end up here either because our peer
4188  * has responded to a handshake message we have sent it, or our peer
4189  * has initiated a handshake with us. If its the former then can only
4190  * be ACK or NACK, if its the later can only be INFO.
4191  *
4192  * If its an ACK we move to the next stage of the handshake, namely
4193  * attribute exchange. If its a NACK we see if we can specify another
4194  * version, if we can't we stop.
4195  *
4196  * If it is an INFO we reset all params associated with communication
4197  * in that direction over this channel (remember connection is
4198  * essentially 2 independent simplex channels).
4199  */
4200 void
4201 vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt)
4202 {
4203 	vio_ver_msg_t	*ver_pkt;
4204 	vsw_t 		*vswp = ldcp->ldc_vswp;
4205 
4206 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4207 
4208 	/*
4209 	 * We know this is a ctrl/version packet so
4210 	 * cast it into the correct structure.
4211 	 */
4212 	ver_pkt = (vio_ver_msg_t *)pkt;
4213 
4214 	switch (ver_pkt->tag.vio_subtype) {
4215 	case VIO_SUBTYPE_INFO:
4216 		D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n");
4217 
4218 		/*
4219 		 * Record the session id, which we will use from now
4220 		 * until we see another VER_INFO msg. Even then the
4221 		 * session id in most cases will be unchanged, execpt
4222 		 * if channel was reset.
4223 		 */
4224 		if ((ldcp->session_status & VSW_PEER_SESSION) &&
4225 			(ldcp->peer_session != ver_pkt->tag.vio_sid)) {
4226 			DERR(vswp, "%s: updating session id for chan %lld "
4227 				"from %llx to %llx", __func__, ldcp->ldc_id,
4228 				ldcp->peer_session, ver_pkt->tag.vio_sid);
4229 		}
4230 
4231 		ldcp->peer_session = ver_pkt->tag.vio_sid;
4232 		ldcp->session_status |= VSW_PEER_SESSION;
4233 
4234 		/* Legal message at this time ? */
4235 		if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV))
4236 			return;
4237 
4238 		/*
4239 		 * First check the device class. Currently only expect
4240 		 * to be talking to a network device. In the future may
4241 		 * also talk to another switch.
4242 		 */
4243 		if (ver_pkt->dev_class != VDEV_NETWORK) {
4244 			DERR(vswp, "%s: illegal device class %d", __func__,
4245 				ver_pkt->dev_class);
4246 
4247 			ver_pkt->tag.vio_sid = ldcp->local_session;
4248 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
4249 
4250 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
4251 
4252 			vsw_send_msg(ldcp, (void *)ver_pkt,
4253 					sizeof (vio_ver_msg_t));
4254 
4255 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
4256 			vsw_next_milestone(ldcp);
4257 			return;
4258 		} else {
4259 			ldcp->dev_class = ver_pkt->dev_class;
4260 		}
4261 
4262 		/*
4263 		 * Now check the version.
4264 		 */
4265 		if (vsw_supported_version(ver_pkt) == 0) {
4266 			/*
4267 			 * Support this major version and possibly
4268 			 * adjusted minor version.
4269 			 */
4270 
4271 			D2(vswp, "%s: accepted ver %d:%d", __func__,
4272 				ver_pkt->ver_major, ver_pkt->ver_minor);
4273 
4274 			/* Store accepted values */
4275 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
4276 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
4277 
4278 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
4279 
4280 			ldcp->lane_in.lstate |= VSW_VER_ACK_SENT;
4281 		} else {
4282 			/*
4283 			 * NACK back with the next lower major/minor
4284 			 * pairing we support (if don't suuport any more
4285 			 * versions then they will be set to zero.
4286 			 */
4287 
4288 			D2(vswp, "%s: replying with ver %d:%d", __func__,
4289 				ver_pkt->ver_major, ver_pkt->ver_minor);
4290 
4291 			/* Store updated values */
4292 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
4293 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
4294 
4295 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
4296 
4297 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
4298 		}
4299 
4300 		DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
4301 		ver_pkt->tag.vio_sid = ldcp->local_session;
4302 		vsw_send_msg(ldcp, (void *)ver_pkt, sizeof (vio_ver_msg_t));
4303 
4304 		vsw_next_milestone(ldcp);
4305 		break;
4306 
4307 	case VIO_SUBTYPE_ACK:
4308 		D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__);
4309 
4310 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV))
4311 			return;
4312 
4313 		/* Store updated values */
4314 		ldcp->lane_in.ver_major = ver_pkt->ver_major;
4315 		ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
4316 
4317 
4318 		ldcp->lane_out.lstate |= VSW_VER_ACK_RECV;
4319 		vsw_next_milestone(ldcp);
4320 
4321 		break;
4322 
4323 	case VIO_SUBTYPE_NACK:
4324 		D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__);
4325 
4326 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV))
4327 			return;
4328 
4329 		/*
4330 		 * If our peer sent us a NACK with the ver fields set to
4331 		 * zero then there is nothing more we can do. Otherwise see
4332 		 * if we support either the version suggested, or a lesser
4333 		 * one.
4334 		 */
4335 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
4336 			DERR(vswp, "%s: peer unable to negotiate any "
4337 				"further.", __func__);
4338 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
4339 			vsw_next_milestone(ldcp);
4340 			return;
4341 		}
4342 
4343 		/*
4344 		 * Check to see if we support this major version or
4345 		 * a lower one. If we don't then maj/min will be set
4346 		 * to zero.
4347 		 */
4348 		(void) vsw_supported_version(ver_pkt);
4349 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
4350 			/* Nothing more we can do */
4351 			DERR(vswp, "%s: version negotiation failed.\n",
4352 								__func__);
4353 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
4354 			vsw_next_milestone(ldcp);
4355 		} else {
4356 			/* found a supported major version */
4357 			ldcp->lane_out.ver_major = ver_pkt->ver_major;
4358 			ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
4359 
4360 			D2(vswp, "%s: resending with updated values (%x, %x)",
4361 				__func__, ver_pkt->ver_major,
4362 				ver_pkt->ver_minor);
4363 
4364 			ldcp->lane_out.lstate |= VSW_VER_INFO_SENT;
4365 			ver_pkt->tag.vio_sid = ldcp->local_session;
4366 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
4367 
4368 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
4369 
4370 			vsw_send_msg(ldcp, (void *)ver_pkt,
4371 					sizeof (vio_ver_msg_t));
4372 
4373 			vsw_next_milestone(ldcp);
4374 
4375 		}
4376 		break;
4377 
4378 	default:
4379 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
4380 			ver_pkt->tag.vio_subtype);
4381 	}
4382 
4383 	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
4384 }
4385 
4386 /*
4387  * Process an attribute packet. We can end up here either because our peer
4388  * has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or our
4389  * peer has sent us an attribute INFO message
4390  *
4391  * If its an ACK we then move to the next stage of the handshake which
4392  * is to send our descriptor ring info to our peer. If its a NACK then
4393  * there is nothing more we can (currently) do.
4394  *
4395  * If we get a valid/acceptable INFO packet (and we have already negotiated
4396  * a version) we ACK back and set channel state to ATTR_RECV, otherwise we
4397  * NACK back and reset channel state to INACTIV.
4398  *
4399  * FUTURE: in time we will probably negotiate over attributes, but for
4400  * the moment unacceptable attributes are regarded as a fatal error.
4401  *
4402  */
4403 void
4404 vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt)
4405 {
4406 	vnet_attr_msg_t		*attr_pkt;
4407 	vsw_t			*vswp = ldcp->ldc_vswp;
4408 	vsw_port_t		*port = ldcp->ldc_port;
4409 	uint64_t		macaddr = 0;
4410 	int			i;
4411 
4412 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
4413 
4414 	/*
4415 	 * We know this is a ctrl/attr packet so
4416 	 * cast it into the correct structure.
4417 	 */
4418 	attr_pkt = (vnet_attr_msg_t *)pkt;
4419 
4420 	switch (attr_pkt->tag.vio_subtype) {
4421 	case VIO_SUBTYPE_INFO:
4422 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
4423 
4424 		if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV))
4425 			return;
4426 
4427 		/*
4428 		 * If the attributes are unacceptable then we NACK back.
4429 		 */
4430 		if (vsw_check_attr(attr_pkt, ldcp->ldc_port)) {
4431 
4432 			DERR(vswp, "%s (chan %d): invalid attributes",
4433 				__func__, ldcp->ldc_id);
4434 
4435 			vsw_free_lane_resources(ldcp, INBOUND);
4436 
4437 			attr_pkt->tag.vio_sid = ldcp->local_session;
4438 			attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
4439 
4440 			DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
4441 			ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT;
4442 			vsw_send_msg(ldcp, (void *)attr_pkt,
4443 					sizeof (vnet_attr_msg_t));
4444 
4445 			vsw_next_milestone(ldcp);
4446 			return;
4447 		}
4448 
4449 		/*
4450 		 * Otherwise store attributes for this lane and update
4451 		 * lane state.
4452 		 */
4453 		ldcp->lane_in.mtu = attr_pkt->mtu;
4454 		ldcp->lane_in.addr = attr_pkt->addr;
4455 		ldcp->lane_in.addr_type = attr_pkt->addr_type;
4456 		ldcp->lane_in.xfer_mode = attr_pkt->xfer_mode;
4457 		ldcp->lane_in.ack_freq = attr_pkt->ack_freq;
4458 
4459 		macaddr = ldcp->lane_in.addr;
4460 		for (i = ETHERADDRL - 1; i >= 0; i--) {
4461 			port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF;
4462 			macaddr >>= 8;
4463 		}
4464 
4465 		/* create the fdb entry for this port/mac address */
4466 		(void) vsw_add_fdb(vswp, port);
4467 
4468 		/* setup device specifc xmit routines */
4469 		mutex_enter(&port->tx_lock);
4470 		if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE) {
4471 			D2(vswp, "%s: mode = VIO_DRING_MODE", __func__);
4472 			port->transmit = vsw_dringsend;
4473 		} else if (ldcp->lane_in.xfer_mode == VIO_DESC_MODE) {
4474 			D2(vswp, "%s: mode = VIO_DESC_MODE", __func__);
4475 			vsw_create_privring(ldcp);
4476 			port->transmit = vsw_descrsend;
4477 		}
4478 		mutex_exit(&port->tx_lock);
4479 
4480 		attr_pkt->tag.vio_sid = ldcp->local_session;
4481 		attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
4482 
4483 		DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
4484 
4485 		ldcp->lane_in.lstate |= VSW_ATTR_ACK_SENT;
4486 
4487 		vsw_send_msg(ldcp, (void *)attr_pkt,
4488 					sizeof (vnet_attr_msg_t));
4489 
4490 		vsw_next_milestone(ldcp);
4491 		break;
4492 
4493 	case VIO_SUBTYPE_ACK:
4494 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
4495 
4496 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV))
4497 			return;
4498 
4499 		ldcp->lane_out.lstate |= VSW_ATTR_ACK_RECV;
4500 		vsw_next_milestone(ldcp);
4501 		break;
4502 
4503 	case VIO_SUBTYPE_NACK:
4504 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
4505 
4506 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV))
4507 			return;
4508 
4509 		ldcp->lane_out.lstate |= VSW_ATTR_NACK_RECV;
4510 		vsw_next_milestone(ldcp);
4511 		break;
4512 
4513 	default:
4514 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
4515 			attr_pkt->tag.vio_subtype);
4516 	}
4517 
4518 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
4519 }
4520 
4521 /*
4522  * Process a dring info packet. We can end up here either because our peer
4523  * has ACK/NACK'ed back to an earlier DRING msg we had sent it, or our
4524  * peer has sent us a dring INFO message.
4525  *
4526  * If we get a valid/acceptable INFO packet (and we have already negotiated
4527  * a version) we ACK back and update the lane state, otherwise we NACK back.
4528  *
4529  * FUTURE: nothing to stop client from sending us info on multiple dring's
4530  * but for the moment we will just use the first one we are given.
4531  *
4532  */
4533 void
4534 vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt)
4535 {
4536 	vio_dring_reg_msg_t	*dring_pkt;
4537 	vsw_t			*vswp = ldcp->ldc_vswp;
4538 	ldc_mem_info_t		minfo;
4539 	dring_info_t		*dp, *dbp;
4540 	int			dring_found = 0;
4541 
4542 	/*
4543 	 * We know this is a ctrl/dring packet so
4544 	 * cast it into the correct structure.
4545 	 */
4546 	dring_pkt = (vio_dring_reg_msg_t *)pkt;
4547 
4548 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
4549 
4550 	switch (dring_pkt->tag.vio_subtype) {
4551 	case VIO_SUBTYPE_INFO:
4552 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
4553 
4554 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
4555 			return;
4556 
4557 		/*
4558 		 * If the dring params are unacceptable then we NACK back.
4559 		 */
4560 		if (vsw_check_dring_info(dring_pkt)) {
4561 
4562 			DERR(vswp, "%s (%lld): invalid dring info",
4563 				__func__, ldcp->ldc_id);
4564 
4565 			vsw_free_lane_resources(ldcp, INBOUND);
4566 
4567 			dring_pkt->tag.vio_sid = ldcp->local_session;
4568 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
4569 
4570 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
4571 
4572 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
4573 
4574 			vsw_send_msg(ldcp, (void *)dring_pkt,
4575 					sizeof (vio_dring_reg_msg_t));
4576 
4577 			vsw_next_milestone(ldcp);
4578 			return;
4579 		}
4580 
4581 		/*
4582 		 * Otherwise, attempt to map in the dring using the
4583 		 * cookie. If that succeeds we send back a unique dring
4584 		 * identifier that the sending side will use in future
4585 		 * to refer to this descriptor ring.
4586 		 */
4587 		dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
4588 
4589 		dp->num_descriptors = dring_pkt->num_descriptors;
4590 		dp->descriptor_size = dring_pkt->descriptor_size;
4591 		dp->options = dring_pkt->options;
4592 		dp->ncookies = dring_pkt->ncookies;
4593 
4594 		/*
4595 		 * Note: should only get one cookie. Enforced in
4596 		 * the ldc layer.
4597 		 */
4598 		bcopy(&dring_pkt->cookie[0], &dp->cookie[0],
4599 			sizeof (ldc_mem_cookie_t));
4600 
4601 		D2(vswp, "%s: num_desc %ld : desc_size %ld", __func__,
4602 			dp->num_descriptors, dp->descriptor_size);
4603 		D2(vswp, "%s: options 0x%lx: ncookies %ld", __func__,
4604 			dp->options, dp->ncookies);
4605 
4606 		if ((ldc_mem_dring_map(ldcp->ldc_handle, &dp->cookie[0],
4607 			dp->ncookies, dp->num_descriptors,
4608 			dp->descriptor_size, LDC_SHADOW_MAP,
4609 			&(dp->handle))) != 0) {
4610 
4611 			DERR(vswp, "%s: dring_map failed\n", __func__);
4612 
4613 			kmem_free(dp, sizeof (dring_info_t));
4614 			vsw_free_lane_resources(ldcp, INBOUND);
4615 
4616 			dring_pkt->tag.vio_sid = ldcp->local_session;
4617 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
4618 
4619 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
4620 
4621 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
4622 			vsw_send_msg(ldcp, (void *)dring_pkt,
4623 				sizeof (vio_dring_reg_msg_t));
4624 
4625 			vsw_next_milestone(ldcp);
4626 			return;
4627 		}
4628 
4629 		if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
4630 
4631 			DERR(vswp, "%s: dring_addr failed\n", __func__);
4632 
4633 			kmem_free(dp, sizeof (dring_info_t));
4634 			vsw_free_lane_resources(ldcp, INBOUND);
4635 
4636 			dring_pkt->tag.vio_sid = ldcp->local_session;
4637 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
4638 
4639 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
4640 
4641 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
4642 			vsw_send_msg(ldcp, (void *)dring_pkt,
4643 				sizeof (vio_dring_reg_msg_t));
4644 
4645 			vsw_next_milestone(ldcp);
4646 			return;
4647 		} else {
4648 			/* store the address of the pub part of ring */
4649 			dp->pub_addr = minfo.vaddr;
4650 		}
4651 
4652 		/* no private section as we are importing */
4653 		dp->priv_addr = NULL;
4654 
4655 		/*
4656 		 * Using simple mono increasing int for ident at
4657 		 * the moment.
4658 		 */
4659 		dp->ident = ldcp->next_ident;
4660 		ldcp->next_ident++;
4661 
4662 		dp->end_idx = 0;
4663 		dp->next = NULL;
4664 
4665 		/*
4666 		 * Link it onto the end of the list of drings
4667 		 * for this lane.
4668 		 */
4669 		if (ldcp->lane_in.dringp == NULL) {
4670 			D2(vswp, "%s: adding first INBOUND dring", __func__);
4671 			ldcp->lane_in.dringp = dp;
4672 		} else {
4673 			dbp = ldcp->lane_in.dringp;
4674 
4675 			while (dbp->next != NULL)
4676 				dbp = dbp->next;
4677 
4678 			dbp->next = dp;
4679 		}
4680 
4681 		/* acknowledge it */
4682 		dring_pkt->tag.vio_sid = ldcp->local_session;
4683 		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
4684 		dring_pkt->dring_ident = dp->ident;
4685 
4686 		vsw_send_msg(ldcp, (void *)dring_pkt,
4687 				sizeof (vio_dring_reg_msg_t));
4688 
4689 		ldcp->lane_in.lstate |= VSW_DRING_ACK_SENT;
4690 		vsw_next_milestone(ldcp);
4691 		break;
4692 
4693 	case VIO_SUBTYPE_ACK:
4694 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
4695 
4696 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV))
4697 			return;
4698 
4699 		/*
4700 		 * Peer is acknowledging our dring info and will have
4701 		 * sent us a dring identifier which we will use to
4702 		 * refer to this ring w.r.t. our peer.
4703 		 */
4704 		dp = ldcp->lane_out.dringp;
4705 		if (dp != NULL) {
4706 			/*
4707 			 * Find the ring this ident should be associated
4708 			 * with.
4709 			 */
4710 			if (vsw_dring_match(dp, dring_pkt)) {
4711 				dring_found = 1;
4712 
4713 			} else while (dp != NULL) {
4714 				if (vsw_dring_match(dp, dring_pkt)) {
4715 					dring_found = 1;
4716 					break;
4717 				}
4718 				dp = dp->next;
4719 			}
4720 
4721 			if (dring_found == 0) {
4722 				DERR(NULL, "%s: unrecognised ring cookie",
4723 					__func__);
4724 				vsw_restart_handshake(ldcp);
4725 				return;
4726 			}
4727 
4728 		} else {
4729 			DERR(vswp, "%s: DRING ACK received but no drings "
4730 				"allocated", __func__);
4731 			vsw_restart_handshake(ldcp);
4732 			return;
4733 		}
4734 
4735 		/* store ident */
4736 		dp->ident = dring_pkt->dring_ident;
4737 		ldcp->lane_out.lstate |= VSW_DRING_ACK_RECV;
4738 		vsw_next_milestone(ldcp);
4739 		break;
4740 
4741 	case VIO_SUBTYPE_NACK:
4742 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
4743 
4744 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV))
4745 			return;
4746 
4747 		ldcp->lane_out.lstate |= VSW_DRING_NACK_RECV;
4748 		vsw_next_milestone(ldcp);
4749 		break;
4750 
4751 	default:
4752 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
4753 			dring_pkt->tag.vio_subtype);
4754 	}
4755 
4756 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
4757 }
4758 
4759 /*
4760  * Process a request from peer to unregister a dring.
4761  *
4762  * For the moment we just restart the handshake if our
4763  * peer endpoint attempts to unregister a dring.
4764  */
4765 void
4766 vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt)
4767 {
4768 	vsw_t			*vswp = ldcp->ldc_vswp;
4769 	vio_dring_unreg_msg_t	*dring_pkt;
4770 
4771 	/*
4772 	 * We know this is a ctrl/dring packet so
4773 	 * cast it into the correct structure.
4774 	 */
4775 	dring_pkt = (vio_dring_unreg_msg_t *)pkt;
4776 
4777 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4778 
4779 	switch (dring_pkt->tag.vio_subtype) {
4780 	case VIO_SUBTYPE_INFO:
4781 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
4782 
4783 		DWARN(vswp, "%s: restarting handshake..", __func__);
4784 		vsw_restart_handshake(ldcp);
4785 		break;
4786 
4787 	case VIO_SUBTYPE_ACK:
4788 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
4789 
4790 		DWARN(vswp, "%s: restarting handshake..", __func__);
4791 		vsw_restart_handshake(ldcp);
4792 		break;
4793 
4794 	case VIO_SUBTYPE_NACK:
4795 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
4796 
4797 		DWARN(vswp, "%s: restarting handshake..", __func__);
4798 		vsw_restart_handshake(ldcp);
4799 		break;
4800 
4801 	default:
4802 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
4803 			dring_pkt->tag.vio_subtype);
4804 		vsw_restart_handshake(ldcp);
4805 	}
4806 
4807 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4808 }
4809 
/*
 * Send the multicast message 'pkt' back to the peer on 'ldcp' as a NACK:
 * the tag subtype is set to VIO_SUBTYPE_NACK, the session id to our local
 * session, and the message is handed to vsw_send_msg().
 *
 * The body is wrapped in do { } while (0) so that the macro expands to
 * exactly one statement and is safe under an unbraced if/else; callers
 * supply the terminating semicolon. Note that both arguments are
 * evaluated more than once, so they must be free of side effects.
 */
#define	SND_MCST_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		vsw_send_msg((ldcp), (void *)(pkt), \
		    sizeof (vnet_mcast_msg_t)); \
	} while (0)
4814 
4815 /*
4816  * Process a multicast request from a vnet.
4817  *
4818  * Vnet's specify a multicast address that they are interested in. This
4819  * address is used as a key into the hash table which forms the multicast
4820  * forwarding database (mFDB).
4821  *
4822  * The table keys are the multicast addresses, while the table entries
4823  * are pointers to lists of ports which wish to receive packets for the
4824  * specified multicast address.
4825  *
4826  * When a multicast packet is being switched we use the address as a key
4827  * into the hash table, and then walk the appropriate port list forwarding
4828  * the pkt to each port in turn.
4829  *
4830  * If a vnet is no longer interested in a particular multicast grouping
4831  * we simply find the correct location in the hash table and then delete
4832  * the relevant port from the port list.
4833  *
4834  * To deal with the case whereby a port is being deleted without first
4835  * removing itself from the lists in the hash table, we maintain a list
4836  * of multicast addresses the port has registered an interest in, within
4837  * the port structure itself. We then simply walk that list of addresses
4838  * using them as keys into the hash table and remove the port from the
4839  * appropriate lists.
4840  */
4841 static void
4842 vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt)
4843 {
4844 	vnet_mcast_msg_t	*mcst_pkt;
4845 	vsw_port_t		*port = ldcp->ldc_port;
4846 	vsw_t			*vswp = ldcp->ldc_vswp;
4847 	int			i;
4848 
4849 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4850 
4851 	/*
4852 	 * We know this is a ctrl/mcast packet so
4853 	 * cast it into the correct structure.
4854 	 */
4855 	mcst_pkt = (vnet_mcast_msg_t *)pkt;
4856 
4857 	switch (mcst_pkt->tag.vio_subtype) {
4858 	case VIO_SUBTYPE_INFO:
4859 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
4860 
4861 		/*
4862 		 * Check if in correct state to receive a multicast
4863 		 * message (i.e. handshake complete). If not reset
4864 		 * the handshake.
4865 		 */
4866 		if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV))
4867 			return;
4868 
4869 		/*
4870 		 * Before attempting to add or remove address check
4871 		 * that they are valid multicast addresses.
4872 		 * If not, then NACK back.
4873 		 */
4874 		for (i = 0; i < mcst_pkt->count; i++) {
4875 			if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) {
4876 				DERR(vswp, "%s: invalid multicast address",
4877 								__func__);
4878 				SND_MCST_NACK(ldcp, mcst_pkt);
4879 				return;
4880 			}
4881 		}
4882 
4883 		/*
4884 		 * Now add/remove the addresses. If this fails we
4885 		 * NACK back.
4886 		 */
4887 		if (vsw_add_rem_mcst(mcst_pkt, port) != 0) {
4888 			SND_MCST_NACK(ldcp, mcst_pkt);
4889 			return;
4890 		}
4891 
4892 		mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
4893 		mcst_pkt->tag.vio_sid = ldcp->local_session;
4894 
4895 		DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt);
4896 
4897 		vsw_send_msg(ldcp, (void *)mcst_pkt,
4898 					sizeof (vnet_mcast_msg_t));
4899 		break;
4900 
4901 	case VIO_SUBTYPE_ACK:
4902 		DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
4903 
4904 		/*
4905 		 * We shouldn't ever get a multicast ACK message as
4906 		 * at the moment we never request multicast addresses
4907 		 * to be set on some other device. This may change in
4908 		 * the future if we have cascading switches.
4909 		 */
4910 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV))
4911 			return;
4912 
4913 				/* Do nothing */
4914 		break;
4915 
4916 	case VIO_SUBTYPE_NACK:
4917 		DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
4918 
4919 		/*
4920 		 * We shouldn't get a multicast NACK packet for the
4921 		 * same reasons as we shouldn't get a ACK packet.
4922 		 */
4923 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV))
4924 			return;
4925 
4926 				/* Do nothing */
4927 		break;
4928 
4929 	default:
4930 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
4931 			mcst_pkt->tag.vio_subtype);
4932 	}
4933 
4934 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4935 }
4936 
4937 static void
4938 vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt)
4939 {
4940 	vio_rdx_msg_t	*rdx_pkt;
4941 	vsw_t		*vswp = ldcp->ldc_vswp;
4942 
4943 	/*
4944 	 * We know this is a ctrl/rdx packet so
4945 	 * cast it into the correct structure.
4946 	 */
4947 	rdx_pkt = (vio_rdx_msg_t *)pkt;
4948 
4949 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
4950 
4951 	switch (rdx_pkt->tag.vio_subtype) {
4952 	case VIO_SUBTYPE_INFO:
4953 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
4954 
4955 		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_INFO_RECV))
4956 			return;
4957 
4958 		rdx_pkt->tag.vio_sid = ldcp->local_session;
4959 		rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
4960 
4961 		DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt);
4962 
4963 		ldcp->lane_in.lstate |= VSW_RDX_ACK_SENT;
4964 
4965 		vsw_send_msg(ldcp, (void *)rdx_pkt,
4966 				sizeof (vio_rdx_msg_t));
4967 
4968 		vsw_next_milestone(ldcp);
4969 		break;
4970 
4971 	case VIO_SUBTYPE_ACK:
4972 		/*
4973 		 * Should be handled in-band by callback handler.
4974 		 */
4975 		DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__);
4976 		vsw_restart_handshake(ldcp);
4977 		break;
4978 
4979 	case VIO_SUBTYPE_NACK:
4980 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
4981 
4982 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_NACK_RECV))
4983 			return;
4984 
4985 		ldcp->lane_out.lstate |= VSW_RDX_NACK_RECV;
4986 		vsw_next_milestone(ldcp);
4987 		break;
4988 
4989 	default:
4990 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
4991 			rdx_pkt->tag.vio_subtype);
4992 	}
4993 
4994 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4995 }
4996 
4997 static void
4998 vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t tag)
4999 {
5000 	uint16_t	env = tag.vio_subtype_env;
5001 	vsw_t		*vswp = ldcp->ldc_vswp;
5002 
5003 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
5004 
5005 	/* session id check */
5006 	if (ldcp->session_status & VSW_PEER_SESSION) {
5007 		if (ldcp->peer_session != tag.vio_sid) {
5008 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
5009 				__func__, ldcp->ldc_id, tag.vio_sid);
5010 			vsw_restart_handshake(ldcp);
5011 			return;
5012 		}
5013 	}
5014 
5015 	/*
5016 	 * It is an error for us to be getting data packets
5017 	 * before the handshake has completed.
5018 	 */
5019 	if (ldcp->hphase != VSW_MILESTONE4) {
5020 		DERR(vswp, "%s: got data packet before handshake complete "
5021 			"hphase %d (%x: %x)", __func__, ldcp->hphase,
5022 			ldcp->lane_in.lstate, ldcp->lane_out.lstate);
5023 		DUMP_FLAGS(ldcp->lane_in.lstate);
5024 		DUMP_FLAGS(ldcp->lane_out.lstate);
5025 		vsw_restart_handshake(ldcp);
5026 		return;
5027 	}
5028 
5029 	/*
5030 	 * Switch on vio_subtype envelope, then let lower routines
5031 	 * decide if its an INFO, ACK or NACK packet.
5032 	 */
5033 	if (env == VIO_DRING_DATA) {
5034 		vsw_process_data_dring_pkt(ldcp, dpkt);
5035 	} else if (env == VIO_PKT_DATA) {
5036 		vsw_process_data_raw_pkt(ldcp, dpkt);
5037 	} else if (env == VIO_DESC_DATA) {
5038 		vsw_process_data_ibnd_pkt(ldcp, dpkt);
5039 	} else {
5040 		DERR(vswp, "%s : unknown vio_subtype_env (%x)\n",
5041 							__func__, env);
5042 	}
5043 
5044 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
5045 }
5046 
/*
 * Send the dring data message 'pkt' back to the peer on 'ldcp' as a NACK:
 * the tag subtype is set to VIO_SUBTYPE_NACK, the session id to our local
 * session, and the message is handed to vsw_send_msg().
 *
 * The body is wrapped in do { } while (0) so that the macro expands to
 * exactly one statement and is safe under an unbraced if/else; callers
 * supply the terminating semicolon. Note that both arguments are
 * evaluated more than once, so they must be free of side effects.
 */
#define	SND_DRING_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		vsw_send_msg((ldcp), (void *)(pkt), \
		    sizeof (vio_dring_msg_t)); \
	} while (0)
5051 
5052 static void
5053 vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt)
5054 {
5055 	vio_dring_msg_t		*dring_pkt;
5056 	vnet_public_desc_t	*pub_addr = NULL;
5057 	vsw_private_desc_t	*priv_addr = NULL;
5058 	dring_info_t		*dp = NULL;
5059 	vsw_t			*vswp = ldcp->ldc_vswp;
5060 	mblk_t			*mp = NULL;
5061 	mblk_t			*bp = NULL;
5062 	mblk_t			*bpt = NULL;
5063 	size_t			nbytes = 0;
5064 	size_t			off = 0;
5065 	uint64_t		ncookies = 0;
5066 	uint64_t		chain = 0;
5067 	uint64_t		j, len;
5068 	uint32_t		pos, start, datalen;
5069 	uint32_t		range_start, range_end;
5070 	int32_t			end, num, cnt = 0;
5071 	int			i, rv;
5072 	boolean_t		ack_needed = B_FALSE;
5073 	boolean_t		prev_desc_ack = B_FALSE;
5074 	int			read_attempts = 0;
5075 
5076 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
5077 
5078 	/*
5079 	 * We know this is a data/dring packet so
5080 	 * cast it into the correct structure.
5081 	 */
5082 	dring_pkt = (vio_dring_msg_t *)dpkt;
5083 
5084 	/*
5085 	 * Switch on the vio_subtype. If its INFO then we need to
5086 	 * process the data. If its an ACK we need to make sure
5087 	 * it makes sense (i.e did we send an earlier data/info),
5088 	 * and if its a NACK then we maybe attempt a retry.
5089 	 */
5090 	switch (dring_pkt->tag.vio_subtype) {
5091 	case VIO_SUBTYPE_INFO:
5092 		D2(vswp, "%s(%lld): VIO_SUBTYPE_INFO", __func__, ldcp->ldc_id);
5093 
5094 		if ((dp = vsw_ident2dring(&ldcp->lane_in,
5095 				dring_pkt->dring_ident)) == NULL) {
5096 
5097 			DERR(vswp, "%s(%lld): unable to find dring from "
5098 				"ident 0x%llx", __func__, ldcp->ldc_id,
5099 				dring_pkt->dring_ident);
5100 
5101 			SND_DRING_NACK(ldcp, dring_pkt);
5102 			return;
5103 		}
5104 
5105 		start = pos = dring_pkt->start_idx;
5106 		end = dring_pkt->end_idx;
5107 		len = dp->num_descriptors;
5108 
5109 		range_start = range_end = pos;
5110 
5111 		D2(vswp, "%s(%lld): start index %ld : end %ld\n",
5112 			__func__, ldcp->ldc_id, start, end);
5113 
5114 		if (end == -1) {
5115 			num = -1;
5116 		} else if (end >= 0) {
5117 			num = end >= pos ?
5118 				end - pos + 1: (len - pos + 1) + end;
5119 
5120 			/* basic sanity check */
5121 			if (end > len) {
5122 				DERR(vswp, "%s(%lld): endpoint %lld outside "
5123 					"ring length %lld", __func__,
5124 					ldcp->ldc_id, end, len);
5125 
5126 				SND_DRING_NACK(ldcp, dring_pkt);
5127 				return;
5128 			}
5129 		} else {
5130 			DERR(vswp, "%s(%lld): invalid endpoint %lld",
5131 				__func__, ldcp->ldc_id, end);
5132 			SND_DRING_NACK(ldcp, dring_pkt);
5133 			return;
5134 		}
5135 
5136 		while (cnt != num) {
5137 vsw_recheck_desc:
5138 			if ((rv = ldc_mem_dring_acquire(dp->handle,
5139 							pos, pos)) != 0) {
5140 				DERR(vswp, "%s(%lld): unable to acquire "
5141 					"descriptor at pos %d: err %d",
5142 					__func__, pos, ldcp->ldc_id, rv);
5143 				SND_DRING_NACK(ldcp, dring_pkt);
5144 				return;
5145 			}
5146 
5147 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + pos;
5148 
5149 			/*
5150 			 * When given a bounded range of descriptors
5151 			 * to process, its an error to hit a descriptor
5152 			 * which is not ready. In the non-bounded case
5153 			 * (end_idx == -1) this simply indicates we have
5154 			 * reached the end of the current active range.
5155 			 */
5156 			if (pub_addr->hdr.dstate != VIO_DESC_READY) {
5157 				/* unbound - no error */
5158 				if (end == -1) {
5159 					if (read_attempts == vsw_read_attempts)
5160 						break;
5161 
5162 					delay(drv_usectohz(vsw_desc_delay));
5163 					read_attempts++;
5164 					goto vsw_recheck_desc;
5165 				}
5166 
5167 				/* bounded - error - so NACK back */
5168 				DERR(vswp, "%s(%lld): descriptor not READY "
5169 					"(%d)", __func__, ldcp->ldc_id,
5170 					pub_addr->hdr.dstate);
5171 				SND_DRING_NACK(ldcp, dring_pkt);
5172 				return;
5173 			}
5174 
5175 			DTRACE_PROBE1(read_attempts, int, read_attempts);
5176 
5177 			range_end = pos;
5178 
5179 			/*
5180 			 * If we ACK'd the previous descriptor then now
5181 			 * record the new range start position for later
5182 			 * ACK's.
5183 			 */
5184 			if (prev_desc_ack) {
5185 				range_start = pos;
5186 
5187 				D2(vswp, "%s(%lld): updating range start "
5188 					"to be %d", __func__, ldcp->ldc_id,
5189 					range_start);
5190 
5191 				prev_desc_ack = B_FALSE;
5192 			}
5193 
5194 			/*
5195 			 * Data is padded to align on 8 byte boundary,
5196 			 * datalen is actual data length, i.e. minus that
5197 			 * padding.
5198 			 */
5199 			datalen = pub_addr->nbytes;
5200 
5201 			/*
5202 			 * Does peer wish us to ACK when we have finished
5203 			 * with this descriptor ?
5204 			 */
5205 			if (pub_addr->hdr.ack)
5206 				ack_needed = B_TRUE;
5207 
5208 			D2(vswp, "%s(%lld): processing desc %lld at pos"
5209 				" 0x%llx : dstate 0x%lx : datalen 0x%lx",
5210 				__func__, ldcp->ldc_id, pos, pub_addr,
5211 				pub_addr->hdr.dstate, datalen);
5212 
5213 			/*
5214 			 * Mark that we are starting to process descriptor.
5215 			 */
5216 			pub_addr->hdr.dstate = VIO_DESC_ACCEPTED;
5217 
5218 			mp = vio_allocb(ldcp->rxh);
5219 			if (mp == NULL) {
5220 				/*
5221 				 * No free receive buffers available, so
5222 				 * fallback onto allocb(9F). Make sure that
5223 				 * we get a data buffer which is a multiple
5224 				 * of 8 as this is required by ldc_mem_copy.
5225 				 */
5226 				DTRACE_PROBE(allocb);
5227 				mp = allocb(datalen + VNET_IPALIGN + 8,
5228 								BPRI_MED);
5229 			}
5230 
5231 			/*
5232 			 * Ensure that we ask ldc for an aligned
5233 			 * number of bytes.
5234 			 */
5235 			nbytes = datalen + VNET_IPALIGN;
5236 			if (nbytes & 0x7) {
5237 				off = 8 - (nbytes & 0x7);
5238 				nbytes += off;
5239 			}
5240 
5241 			ncookies = pub_addr->ncookies;
5242 			rv = ldc_mem_copy(ldcp->ldc_handle,
5243 				(caddr_t)mp->b_rptr, 0, &nbytes,
5244 				pub_addr->memcookie, ncookies,
5245 				LDC_COPY_IN);
5246 
5247 			if (rv != 0) {
5248 				DERR(vswp, "%s(%d): unable to copy in "
5249 					"data from %d cookies in desc %d"
5250 					" (rv %d)", __func__, ldcp->ldc_id,
5251 					ncookies, pos, rv);
5252 				freemsg(mp);
5253 
5254 				pub_addr->hdr.dstate = VIO_DESC_DONE;
5255 				(void) ldc_mem_dring_release(dp->handle,
5256 								pos, pos);
5257 				break;
5258 			} else {
5259 				D2(vswp, "%s(%d): copied in %ld bytes"
5260 					" using %d cookies", __func__,
5261 					ldcp->ldc_id, nbytes, ncookies);
5262 			}
5263 
5264 			/* adjust the read pointer to skip over the padding */
5265 			mp->b_rptr += VNET_IPALIGN;
5266 
5267 			/* point to the actual end of data */
5268 			mp->b_wptr = mp->b_rptr + datalen;
5269 
5270 			/* build a chain of received packets */
5271 			if (bp == NULL) {
5272 				/* first pkt */
5273 				bp = mp;
5274 				bp->b_next = bp->b_prev = NULL;
5275 				bpt = bp;
5276 				chain = 1;
5277 			} else {
5278 				mp->b_next = NULL;
5279 				mp->b_prev = bpt;
5280 				bpt->b_next = mp;
5281 				bpt = mp;
5282 				chain++;
5283 			}
5284 
5285 			/* mark we are finished with this descriptor */
5286 			pub_addr->hdr.dstate = VIO_DESC_DONE;
5287 
5288 			(void) ldc_mem_dring_release(dp->handle, pos, pos);
5289 
5290 			/*
5291 			 * Send an ACK back to peer if requested.
5292 			 */
5293 			if (ack_needed) {
5294 				ack_needed = B_FALSE;
5295 
5296 				dring_pkt->start_idx = range_start;
5297 				dring_pkt->end_idx = range_end;
5298 
5299 				DERR(vswp, "%s(%lld): processed %d %d, ACK"
5300 					" requested", __func__, ldcp->ldc_id,
5301 					dring_pkt->start_idx,
5302 					dring_pkt->end_idx);
5303 
5304 				dring_pkt->dring_process_state = VIO_DP_ACTIVE;
5305 				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
5306 				dring_pkt->tag.vio_sid = ldcp->local_session;
5307 				vsw_send_msg(ldcp, (void *)dring_pkt,
5308 					sizeof (vio_dring_msg_t));
5309 
5310 				prev_desc_ack = B_TRUE;
5311 				range_start = pos;
5312 			}
5313 
5314 			/* next descriptor */
5315 			pos = (pos + 1) % len;
5316 			cnt++;
5317 
5318 			/*
5319 			 * Break out of loop here and stop processing to
5320 			 * allow some other network device (or disk) to
5321 			 * get access to the cpu.
5322 			 */
5323 			/* send the chain of packets to be switched */
5324 			if (chain > vsw_chain_len) {
5325 				D3(vswp, "%s(%lld): switching chain of %d "
5326 					"msgs", __func__, ldcp->ldc_id, chain);
5327 				vsw_switch_frame(vswp, bp, VSW_VNETPORT,
5328 							ldcp->ldc_port, NULL);
5329 				bp = NULL;
5330 				break;
5331 			}
5332 		}
5333 
5334 		/* send the chain of packets to be switched */
5335 		if (bp != NULL) {
5336 			D3(vswp, "%s(%lld): switching chain of %d msgs",
5337 					__func__, ldcp->ldc_id, chain);
5338 			vsw_switch_frame(vswp, bp, VSW_VNETPORT,
5339 							ldcp->ldc_port, NULL);
5340 		}
5341 
5342 		DTRACE_PROBE1(msg_cnt, int, cnt);
5343 
5344 		/*
5345 		 * We are now finished so ACK back with the state
5346 		 * set to STOPPING so our peer knows we are finished
5347 		 */
5348 		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
5349 		dring_pkt->tag.vio_sid = ldcp->local_session;
5350 
5351 		dring_pkt->dring_process_state = VIO_DP_STOPPED;
5352 
5353 		DTRACE_PROBE(stop_process_sent);
5354 
5355 		/*
5356 		 * We have not processed any more descriptors beyond
5357 		 * the last one we ACK'd.
5358 		 */
5359 		if (prev_desc_ack)
5360 			range_start = range_end;
5361 
5362 		dring_pkt->start_idx = range_start;
5363 		dring_pkt->end_idx = range_end;
5364 
5365 		D2(vswp, "%s(%lld) processed : %d : %d, now stopping",
5366 			__func__, ldcp->ldc_id, dring_pkt->start_idx,
5367 			dring_pkt->end_idx);
5368 
5369 		vsw_send_msg(ldcp, (void *)dring_pkt,
5370 					sizeof (vio_dring_msg_t));
5371 		break;
5372 
5373 	case VIO_SUBTYPE_ACK:
5374 		D2(vswp, "%s(%lld): VIO_SUBTYPE_ACK", __func__, ldcp->ldc_id);
5375 		/*
5376 		 * Verify that the relevant descriptors are all
5377 		 * marked as DONE
5378 		 */
5379 		if ((dp = vsw_ident2dring(&ldcp->lane_out,
5380 			dring_pkt->dring_ident)) == NULL) {
5381 			DERR(vswp, "%s: unknown ident in ACK", __func__);
5382 			return;
5383 		}
5384 
5385 		pub_addr = (vnet_public_desc_t *)dp->pub_addr;
5386 		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
5387 
5388 		start = end = 0;
5389 		start = dring_pkt->start_idx;
5390 		end = dring_pkt->end_idx;
5391 		len = dp->num_descriptors;
5392 
5393 		j = num = 0;
5394 		/* calculate # descriptors taking into a/c wrap around */
5395 		num = end >= start ? end - start + 1: (len - start + 1) + end;
5396 
5397 		D2(vswp, "%s(%lld): start index %ld : end %ld : num %ld\n",
5398 			__func__, ldcp->ldc_id, start, end, num);
5399 
5400 		mutex_enter(&dp->dlock);
5401 		dp->last_ack_recv = end;
5402 		mutex_exit(&dp->dlock);
5403 
5404 		for (i = start; j < num; i = (i + 1) % len, j++) {
5405 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
5406 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
5407 
5408 			/*
5409 			 * If the last descriptor in a range has the ACK
5410 			 * bit set then we will get two messages from our
5411 			 * peer relating to it. The normal ACK msg and then
5412 			 * a subsequent STOP msg. The first message will have
5413 			 * resulted in the descriptor being reclaimed and
5414 			 * its state set to FREE so when we encounter a non
5415 			 * DONE descriptor we need to check to see if its
5416 			 * because we have just reclaimed it.
5417 			 */
5418 			mutex_enter(&priv_addr->dstate_lock);
5419 			if (pub_addr->hdr.dstate == VIO_DESC_DONE) {
5420 				/* clear all the fields */
5421 				bzero(priv_addr->datap, priv_addr->datalen);
5422 				priv_addr->datalen = 0;
5423 
5424 				pub_addr->hdr.dstate = VIO_DESC_FREE;
5425 				pub_addr->hdr.ack = 0;
5426 
5427 				priv_addr->dstate = VIO_DESC_FREE;
5428 				mutex_exit(&priv_addr->dstate_lock);
5429 
5430 				D3(vswp, "clearing descp %d : pub state "
5431 					"0x%llx : priv state 0x%llx", i,
5432 					pub_addr->hdr.dstate,
5433 					priv_addr->dstate);
5434 
5435 			} else {
5436 				mutex_exit(&priv_addr->dstate_lock);
5437 
5438 				if (dring_pkt->dring_process_state !=
5439 							VIO_DP_STOPPED) {
5440 					DERR(vswp, "%s: descriptor %lld at pos "
5441 						" 0x%llx not DONE (0x%lx)\n",
5442 						__func__, i, pub_addr,
5443 						pub_addr->hdr.dstate);
5444 					return;
5445 				}
5446 			}
5447 		}
5448 
5449 		/*
5450 		 * If our peer is stopping processing descriptors then
5451 		 * we check to make sure it has processed all the descriptors
5452 		 * we have updated. If not then we send it a new message
5453 		 * to prompt it to restart.
5454 		 */
5455 		if (dring_pkt->dring_process_state == VIO_DP_STOPPED) {
5456 			DTRACE_PROBE(stop_process_recv);
5457 			D2(vswp, "%s(%lld): got stopping msg : %d : %d",
5458 				__func__, ldcp->ldc_id, dring_pkt->start_idx,
5459 				dring_pkt->end_idx);
5460 
5461 			/*
5462 			 * Check next descriptor in public section of ring.
5463 			 * If its marked as READY then we need to prompt our
5464 			 * peer to start processing the ring again.
5465 			 */
5466 			i = (end + 1) % len;
5467 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
5468 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
5469 
5470 			/*
5471 			 * Hold the restart lock across all of this to
5472 			 * make sure that its not possible for us to
5473 			 * decide that a msg needs to be sent in the future
5474 			 * but the sending code having already checked is
5475 			 * about to exit.
5476 			 */
5477 			mutex_enter(&dp->restart_lock);
5478 			mutex_enter(&priv_addr->dstate_lock);
5479 			if (pub_addr->hdr.dstate == VIO_DESC_READY) {
5480 
5481 				mutex_exit(&priv_addr->dstate_lock);
5482 
5483 				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
5484 				dring_pkt->tag.vio_sid = ldcp->local_session;
5485 
5486 				mutex_enter(&ldcp->lane_out.seq_lock);
5487 				dring_pkt->seq_num = ldcp->lane_out.seq_num++;
5488 				mutex_exit(&ldcp->lane_out.seq_lock);
5489 
5490 				dring_pkt->start_idx = (end + 1) % len;
5491 				dring_pkt->end_idx = -1;
5492 
5493 				D2(vswp, "%s(%lld) : sending restart msg:"
5494 					" %d : %d", __func__, ldcp->ldc_id,
5495 					dring_pkt->start_idx,
5496 					dring_pkt->end_idx);
5497 
5498 				vsw_send_msg(ldcp, (void *)dring_pkt,
5499 						sizeof (vio_dring_msg_t));
5500 			} else {
5501 				mutex_exit(&priv_addr->dstate_lock);
5502 				dp->restart_reqd = B_TRUE;
5503 			}
5504 			mutex_exit(&dp->restart_lock);
5505 		}
5506 		break;
5507 
5508 	case VIO_SUBTYPE_NACK:
5509 		DWARN(vswp, "%s(%lld): VIO_SUBTYPE_NACK",
5510 						__func__, ldcp->ldc_id);
5511 		/*
5512 		 * Something is badly wrong if we are getting NACK's
5513 		 * for our data pkts. So reset the channel.
5514 		 */
5515 		vsw_restart_handshake(ldcp);
5516 
5517 		break;
5518 
5519 	default:
5520 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
5521 			ldcp->ldc_id, dring_pkt->tag.vio_subtype);
5522 	}
5523 
5524 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
5525 }
5526 
5527 /*
5528  * VIO_PKT_DATA (a.k.a raw data mode )
5529  *
5530  * Note - currently not supported. Do nothing.
5531  */
5532 static void
5533 vsw_process_data_raw_pkt(vsw_ldc_t *ldcp, void *dpkt)
5534 {
5535 	_NOTE(ARGUNUSED(dpkt))
5536 
5537 	D1(NULL, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
5538 
5539 	DERR(NULL, "%s (%lld): currently  not supported",
5540 						__func__, ldcp->ldc_id);
5541 
5542 	D1(NULL, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
5543 }
5544 
/*
 * Turn the message 'pkt' into a NACK carrying our session id and send
 * it back over channel 'ldcp' as a vio_ibnd_desc_t sized message.
 *
 * The body is wrapped in do { } while (0) and the arguments are
 * parenthesised so the macro expands to a single statement and is safe
 * under an unbraced if/else and with non-trivial argument expressions.
 * Invoke it with a trailing semicolon.
 *
 * NOTE(review): vsw_process_data_ibnd_pkt() reaches the tag of a
 * vio_ibnd_desc_t via 'hdr.tag', whereas this macro uses 'tag'
 * directly - confirm which message type callers actually pass.
 */
#define	SND_IBND_DESC_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		vsw_send_msg((ldcp), (void *)(pkt), \
		    sizeof (vio_ibnd_desc_t)); \
	} while (0)
5549 
5550 /*
5551  * Process an in-band descriptor message (most likely from
5552  * OBP).
5553  */
5554 static void
5555 vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt)
5556 {
5557 	vio_ibnd_desc_t		*ibnd_desc;
5558 	dring_info_t		*dp = NULL;
5559 	vsw_private_desc_t	*priv_addr = NULL;
5560 	vsw_t			*vswp = ldcp->ldc_vswp;
5561 	mblk_t			*mp = NULL;
5562 	size_t			nbytes = 0;
5563 	size_t			off = 0;
5564 	uint64_t		idx = 0;
5565 	uint32_t		num = 1, len, datalen = 0;
5566 	uint64_t		ncookies = 0;
5567 	int			i, rv;
5568 	int			j = 0;
5569 
5570 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
5571 
5572 	ibnd_desc = (vio_ibnd_desc_t *)pkt;
5573 
5574 	switch (ibnd_desc->hdr.tag.vio_subtype) {
5575 	case VIO_SUBTYPE_INFO:
5576 		D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
5577 
5578 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
5579 			return;
5580 
5581 		/*
5582 		 * Data is padded to align on a 8 byte boundary,
5583 		 * nbytes is actual data length, i.e. minus that
5584 		 * padding.
5585 		 */
5586 		datalen = ibnd_desc->nbytes;
5587 
5588 		D2(vswp, "%s(%lld): processing inband desc : "
5589 			": datalen 0x%lx", __func__, ldcp->ldc_id, datalen);
5590 
5591 		ncookies = ibnd_desc->ncookies;
5592 
5593 		/*
5594 		 * allocb(9F) returns an aligned data block. We
5595 		 * need to ensure that we ask ldc for an aligned
5596 		 * number of bytes also.
5597 		 */
5598 		nbytes = datalen;
5599 		if (nbytes & 0x7) {
5600 			off = 8 - (nbytes & 0x7);
5601 			nbytes += off;
5602 		}
5603 
5604 		mp = allocb(datalen, BPRI_MED);
5605 		if (mp == NULL) {
5606 			DERR(vswp, "%s(%lld): allocb failed",
5607 					__func__, ldcp->ldc_id);
5608 			return;
5609 		}
5610 
5611 		rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr,
5612 			0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies,
5613 			LDC_COPY_IN);
5614 
5615 		if (rv != 0) {
5616 			DERR(vswp, "%s(%d): unable to copy in data from "
5617 				"%d cookie(s)", __func__,
5618 				ldcp->ldc_id, ncookies);
5619 			freemsg(mp);
5620 			return;
5621 		} else {
5622 			D2(vswp, "%s(%d): copied in %ld bytes using %d "
5623 				"cookies", __func__, ldcp->ldc_id, nbytes,
5624 				ncookies);
5625 		}
5626 
5627 		/* point to the actual end of data */
5628 		mp->b_wptr = mp->b_rptr + datalen;
5629 
5630 		/*
5631 		 * We ACK back every in-band descriptor message we process
5632 		 */
5633 		ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK;
5634 		ibnd_desc->hdr.tag.vio_sid = ldcp->local_session;
5635 		vsw_send_msg(ldcp, (void *)ibnd_desc,
5636 				sizeof (vio_ibnd_desc_t));
5637 
5638 		/* send the packet to be switched */
5639 		vsw_switch_frame(vswp, mp, VSW_VNETPORT,
5640 					ldcp->ldc_port, NULL);
5641 
5642 		break;
5643 
5644 	case VIO_SUBTYPE_ACK:
5645 		D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
5646 
5647 		/* Verify the ACK is valid */
5648 		idx = ibnd_desc->hdr.desc_handle;
5649 
5650 		if (idx >= VSW_RING_NUM_EL) {
5651 			cmn_err(CE_WARN, "%s: corrupted ACK received "
5652 				"(idx %ld)", __func__, idx);
5653 			return;
5654 		}
5655 
5656 		if ((dp = ldcp->lane_out.dringp) == NULL) {
5657 			DERR(vswp, "%s: no dring found", __func__);
5658 			return;
5659 		}
5660 
5661 		len = dp->num_descriptors;
5662 		/*
5663 		 * If the descriptor we are being ACK'ed for is not the
5664 		 * one we expected, then pkts were lost somwhere, either
5665 		 * when we tried to send a msg, or a previous ACK msg from
5666 		 * our peer. In either case we now reclaim the descriptors
5667 		 * in the range from the last ACK we received up to the
5668 		 * current ACK.
5669 		 */
5670 		if (idx != dp->last_ack_recv) {
5671 			DWARN(vswp, "%s: dropped pkts detected, (%ld, %ld)",
5672 				__func__, dp->last_ack_recv, idx);
5673 			num = idx >= dp->last_ack_recv ?
5674 				idx - dp->last_ack_recv + 1:
5675 				(len - dp->last_ack_recv + 1) + idx;
5676 		}
5677 
5678 		/*
5679 		 * When we sent the in-band message to our peer we
5680 		 * marked the copy in our private ring as READY. We now
5681 		 * check that the descriptor we are being ACK'ed for is in
5682 		 * fact READY, i.e. it is one we have shared with our peer.
5683 		 *
5684 		 * If its not we flag an error, but still reset the descr
5685 		 * back to FREE.
5686 		 */
5687 		for (i = dp->last_ack_recv; j < num; i = (i + 1) % len, j++) {
5688 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
5689 			mutex_enter(&priv_addr->dstate_lock);
5690 			if (priv_addr->dstate != VIO_DESC_READY) {
5691 				DERR(vswp, "%s: (%ld) desc at index %ld not "
5692 					"READY (0x%lx)", __func__,
5693 					ldcp->ldc_id, idx, priv_addr->dstate);
5694 				DERR(vswp, "%s: bound %d: ncookies %ld : "
5695 					"datalen %ld", __func__,
5696 					priv_addr->bound, priv_addr->ncookies,
5697 					priv_addr->datalen);
5698 			}
5699 			D2(vswp, "%s: (%lld) freeing descp at %lld", __func__,
5700 				ldcp->ldc_id, idx);
5701 			/* release resources associated with sent msg */
5702 			bzero(priv_addr->datap, priv_addr->datalen);
5703 			priv_addr->datalen = 0;
5704 			priv_addr->dstate = VIO_DESC_FREE;
5705 			mutex_exit(&priv_addr->dstate_lock);
5706 		}
5707 		/* update to next expected value */
5708 		dp->last_ack_recv = (idx + 1) % dp->num_descriptors;
5709 
5710 		break;
5711 
5712 	case VIO_SUBTYPE_NACK:
5713 		DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
5714 
5715 		/*
5716 		 * We should only get a NACK if our peer doesn't like
5717 		 * something about a message we have sent it. If this
5718 		 * happens we just release the resources associated with
5719 		 * the message. (We are relying on higher layers to decide
5720 		 * whether or not to resend.
5721 		 */
5722 
5723 		/* limit check */
5724 		idx = ibnd_desc->hdr.desc_handle;
5725 
5726 		if (idx >= VSW_RING_NUM_EL) {
5727 			DERR(vswp, "%s: corrupted NACK received (idx %lld)",
5728 				__func__, idx);
5729 			return;
5730 		}
5731 
5732 		if ((dp = ldcp->lane_out.dringp) == NULL) {
5733 			DERR(vswp, "%s: no dring found", __func__);
5734 			return;
5735 		}
5736 
5737 		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
5738 
5739 		/* move to correct location in ring */
5740 		priv_addr += idx;
5741 
5742 		/* release resources associated with sent msg */
5743 		mutex_enter(&priv_addr->dstate_lock);
5744 		bzero(priv_addr->datap, priv_addr->datalen);
5745 		priv_addr->datalen = 0;
5746 		priv_addr->dstate = VIO_DESC_FREE;
5747 		mutex_exit(&priv_addr->dstate_lock);
5748 
5749 		break;
5750 
5751 	default:
5752 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
5753 			ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype);
5754 	}
5755 
5756 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
5757 }
5758 
5759 static void
5760 vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t tag)
5761 {
5762 	_NOTE(ARGUNUSED(epkt))
5763 
5764 	vsw_t		*vswp = ldcp->ldc_vswp;
5765 	uint16_t	env = tag.vio_subtype_env;
5766 
5767 	D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
5768 
5769 	/*
5770 	 * Error vio_subtypes have yet to be defined. So for
5771 	 * the moment we can't do anything.
5772 	 */
5773 	D2(vswp, "%s: (%x) vio_subtype env", __func__, env);
5774 
5775 	D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
5776 }
5777 
5778 /*
5779  * Switch the given ethernet frame when operating in layer 2 mode.
5780  *
5781  * vswp: pointer to the vsw instance
5782  * mp: pointer to chain of ethernet frame(s) to be switched
5783  * caller: identifies the source of this frame as:
5784  * 		1. VSW_VNETPORT - a vsw port (connected to a vnet).
5785  *		2. VSW_PHYSDEV - the physical ethernet device
5786  *		3. VSW_LOCALDEV - vsw configured as a virtual interface
5787  * arg: argument provided by the caller.
5788  *		1. for VNETPORT - pointer to the corresponding vsw_port_t.
5789  *		2. for PHYSDEV - NULL
5790  *		3. for LOCALDEV - pointer to to this vsw_t(self)
5791  */
5792 void
5793 vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
5794 			vsw_port_t *arg, mac_resource_handle_t mrh)
5795 {
5796 	struct ether_header	*ehp;
5797 	vsw_port_t		*port = NULL;
5798 	mblk_t			*bp, *ret_m;
5799 	mblk_t			*nmp = NULL;
5800 	vsw_port_list_t		*plist = &vswp->plist;
5801 
5802 	D1(vswp, "%s: enter (caller %d)", __func__, caller);
5803 
5804 	/*
5805 	 * PERF: rather than breaking up the chain here, scan it
5806 	 * to find all mblks heading to same destination and then
5807 	 * pass that sub-chain to the lower transmit functions.
5808 	 */
5809 
5810 	/* process the chain of packets */
5811 	bp = mp;
5812 	while (bp) {
5813 		mp = bp;
5814 		bp = bp->b_next;
5815 		mp->b_next = mp->b_prev = NULL;
5816 		ehp = (struct ether_header *)mp->b_rptr;
5817 
5818 		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
5819 			__func__, MBLKSIZE(mp), MBLKL(mp));
5820 
5821 		READ_ENTER(&vswp->if_lockrw);
5822 		if (ether_cmp(&ehp->ether_dhost, &vswp->if_addr) == 0) {
5823 			/*
5824 			 * If destination is VSW_LOCALDEV (vsw as an eth
5825 			 * interface) and if the device is up & running,
5826 			 * send the packet up the stack on this host.
5827 			 * If the virtual interface is down, drop the packet.
5828 			 */
5829 			if (caller != VSW_LOCALDEV) {
5830 				if (vswp->if_state & VSW_IF_UP) {
5831 					RW_EXIT(&vswp->if_lockrw);
5832 					mac_rx(vswp->if_mh, mrh, mp);
5833 				} else {
5834 					RW_EXIT(&vswp->if_lockrw);
5835 					/* Interface down, drop pkt */
5836 					freemsg(mp);
5837 				}
5838 			} else {
5839 				RW_EXIT(&vswp->if_lockrw);
5840 				freemsg(mp);
5841 			}
5842 			continue;
5843 		}
5844 		RW_EXIT(&vswp->if_lockrw);
5845 
5846 		READ_ENTER(&plist->lockrw);
5847 		port = vsw_lookup_fdb(vswp, ehp);
5848 		if (port) {
5849 			/*
5850 			 * Mark the port as in-use.
5851 			 */
5852 			mutex_enter(&port->ref_lock);
5853 			port->ref_cnt++;
5854 			mutex_exit(&port->ref_lock);
5855 			RW_EXIT(&plist->lockrw);
5856 
5857 			/*
5858 			 * If plumbed and in promisc mode then copy msg
5859 			 * and send up the stack.
5860 			 */
5861 			READ_ENTER(&vswp->if_lockrw);
5862 			if (VSW_U_P(vswp->if_state)) {
5863 				RW_EXIT(&vswp->if_lockrw);
5864 				nmp = copymsg(mp);
5865 				if (nmp)
5866 					mac_rx(vswp->if_mh, mrh, nmp);
5867 			} else {
5868 				RW_EXIT(&vswp->if_lockrw);
5869 			}
5870 
5871 			/*
5872 			 * If the destination is in FDB, the packet
5873 			 * should be forwarded to the correponding
5874 			 * vsw_port (connected to a vnet device -
5875 			 * VSW_VNETPORT)
5876 			 */
5877 			(void) vsw_portsend(port, mp);
5878 
5879 			/*
5880 			 * Decrement use count in port and check if
5881 			 * should wake delete thread.
5882 			 */
5883 			mutex_enter(&port->ref_lock);
5884 			port->ref_cnt--;
5885 			if (port->ref_cnt == 0)
5886 				cv_signal(&port->ref_cv);
5887 			mutex_exit(&port->ref_lock);
5888 		} else {
5889 			RW_EXIT(&plist->lockrw);
5890 			/*
5891 			 * Destination not in FDB.
5892 			 *
5893 			 * If the destination is broadcast or
5894 			 * multicast forward the packet to all
5895 			 * (VNETPORTs, PHYSDEV, LOCALDEV),
5896 			 * except the caller.
5897 			 */
5898 			if (IS_BROADCAST(ehp)) {
5899 				D3(vswp, "%s: BROADCAST pkt", __func__);
5900 				(void) vsw_forward_all(vswp, mp,
5901 								caller, arg);
5902 			} else if (IS_MULTICAST(ehp)) {
5903 				D3(vswp, "%s: MULTICAST pkt", __func__);
5904 				(void) vsw_forward_grp(vswp, mp,
5905 							caller, arg);
5906 			} else {
5907 				/*
5908 				 * If the destination is unicast, and came
5909 				 * from either a logical network device or
5910 				 * the switch itself when it is plumbed, then
5911 				 * send it out on the physical device and also
5912 				 * up the stack if the logical interface is
5913 				 * in promiscious mode.
5914 				 *
5915 				 * NOTE:  The assumption here is that if we
5916 				 * cannot find the destination in our fdb, its
5917 				 * a unicast address, and came from either a
5918 				 * vnet or down the stack (when plumbed) it
5919 				 * must be destinded for an ethernet device
5920 				 * outside our ldoms.
5921 				 */
5922 				if (caller == VSW_VNETPORT) {
5923 					READ_ENTER(&vswp->if_lockrw);
5924 					if (VSW_U_P(vswp->if_state)) {
5925 						RW_EXIT(&vswp->if_lockrw);
5926 						nmp = copymsg(mp);
5927 						if (nmp)
5928 							mac_rx(vswp->if_mh,
5929 								mrh, nmp);
5930 					} else {
5931 						RW_EXIT(&vswp->if_lockrw);
5932 					}
5933 					if ((ret_m = vsw_tx_msg(vswp, mp))
5934 								!= NULL) {
5935 						DERR(vswp, "%s: drop mblks to "
5936 							"phys dev", __func__);
5937 						freemsg(ret_m);
5938 					}
5939 
5940 				} else if (caller == VSW_PHYSDEV) {
5941 					/*
5942 					 * Pkt seen because card in promisc
5943 					 * mode. Send up stack if plumbed in
5944 					 * promisc mode, else drop it.
5945 					 */
5946 					READ_ENTER(&vswp->if_lockrw);
5947 					if (VSW_U_P(vswp->if_state)) {
5948 						RW_EXIT(&vswp->if_lockrw);
5949 						mac_rx(vswp->if_mh, mrh, mp);
5950 					} else {
5951 						RW_EXIT(&vswp->if_lockrw);
5952 						freemsg(mp);
5953 					}
5954 
5955 				} else if (caller == VSW_LOCALDEV) {
5956 					/*
5957 					 * Pkt came down the stack, send out
5958 					 * over physical device.
5959 					 */
5960 					if ((ret_m = vsw_tx_msg(vswp, mp))
5961 								!= NULL) {
5962 						DERR(vswp, "%s: drop mblks to "
5963 							"phys dev", __func__);
5964 						freemsg(ret_m);
5965 					}
5966 				}
5967 			}
5968 		}
5969 	}
5970 	D1(vswp, "%s: exit\n", __func__);
5971 }
5972 
5973 /*
5974  * Switch ethernet frame when in layer 3 mode (i.e. using IP
5975  * layer to do the routing).
5976  *
5977  * There is a large amount of overlap between this function and
5978  * vsw_switch_l2_frame. At some stage we need to revisit and refactor
5979  * both these functions.
5980  */
5981 void
5982 vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
5983 			vsw_port_t *arg, mac_resource_handle_t mrh)
5984 {
5985 	struct ether_header	*ehp;
5986 	vsw_port_t		*port = NULL;
5987 	mblk_t			*bp = NULL;
5988 	vsw_port_list_t		*plist = &vswp->plist;
5989 
5990 	D1(vswp, "%s: enter (caller %d)", __func__, caller);
5991 
5992 	/*
5993 	 * In layer 3 mode should only ever be switching packets
5994 	 * between IP layer and vnet devices. So make sure thats
5995 	 * who is invoking us.
5996 	 */
5997 	if ((caller != VSW_LOCALDEV) && (caller != VSW_VNETPORT)) {
5998 		DERR(vswp, "%s: unexpected caller (%d)", __func__, caller);
5999 		freemsgchain(mp);
6000 		return;
6001 	}
6002 
6003 	/* process the chain of packets */
6004 	bp = mp;
6005 	while (bp) {
6006 		mp = bp;
6007 		bp = bp->b_next;
6008 		mp->b_next = mp->b_prev = NULL;
6009 		ehp = (struct ether_header *)mp->b_rptr;
6010 
6011 		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
6012 			__func__, MBLKSIZE(mp), MBLKL(mp));
6013 
6014 		READ_ENTER(&plist->lockrw);
6015 		port = vsw_lookup_fdb(vswp, ehp);
6016 		if (port) {
6017 			/*
6018 			 * Mark port as in-use.
6019 			 */
6020 			mutex_enter(&port->ref_lock);
6021 			port->ref_cnt++;
6022 			mutex_exit(&port->ref_lock);
6023 			RW_EXIT(&plist->lockrw);
6024 
6025 			D2(vswp, "%s: sending to target port", __func__);
6026 			(void) vsw_portsend(port, mp);
6027 
6028 			/*
6029 			 * Finished with port so decrement ref count and
6030 			 * check if should wake delete thread.
6031 			 */
6032 			mutex_enter(&port->ref_lock);
6033 			port->ref_cnt--;
6034 			if (port->ref_cnt == 0)
6035 				cv_signal(&port->ref_cv);
6036 			mutex_exit(&port->ref_lock);
6037 		} else {
6038 			RW_EXIT(&plist->lockrw);
6039 			/*
6040 			 * Destination not in FDB
6041 			 *
6042 			 * If the destination is broadcast or
6043 			 * multicast forward the packet to all
6044 			 * (VNETPORTs, PHYSDEV, LOCALDEV),
6045 			 * except the caller.
6046 			 */
6047 			if (IS_BROADCAST(ehp)) {
6048 				D2(vswp, "%s: BROADCAST pkt", __func__);
6049 				(void) vsw_forward_all(vswp, mp,
6050 								caller, arg);
6051 			} else if (IS_MULTICAST(ehp)) {
6052 				D2(vswp, "%s: MULTICAST pkt", __func__);
6053 				(void) vsw_forward_grp(vswp, mp,
6054 							caller, arg);
6055 			} else {
6056 				/*
6057 				 * Unicast pkt from vnet that we don't have
6058 				 * an FDB entry for, so must be destinded for
6059 				 * the outside world. Attempt to send up to the
6060 				 * IP layer to allow it to deal with it.
6061 				 */
6062 				if (caller == VSW_VNETPORT) {
6063 					READ_ENTER(&vswp->if_lockrw);
6064 					if (vswp->if_state & VSW_IF_UP) {
6065 						RW_EXIT(&vswp->if_lockrw);
6066 						D2(vswp, "%s: sending up",
6067 							__func__);
6068 						mac_rx(vswp->if_mh, mrh, mp);
6069 					} else {
6070 						RW_EXIT(&vswp->if_lockrw);
6071 						/* Interface down, drop pkt */
6072 						D2(vswp, "%s I/F down",
6073 								__func__);
6074 						freemsg(mp);
6075 					}
6076 				}
6077 			}
6078 		}
6079 	}
6080 
6081 	D1(vswp, "%s: exit", __func__);
6082 }
6083 
6084 /*
6085  * Forward the ethernet frame to all ports (VNETPORTs, PHYSDEV, LOCALDEV),
6086  * except the caller (port on which frame arrived).
6087  */
6088 static int
6089 vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
6090 {
6091 	vsw_port_list_t	*plist = &vswp->plist;
6092 	vsw_port_t	*portp;
6093 	mblk_t		*nmp = NULL;
6094 	mblk_t		*ret_m = NULL;
6095 	int		skip_port = 0;
6096 
6097 	D1(vswp, "vsw_forward_all: enter\n");
6098 
6099 	/*
6100 	 * Broadcast message from inside ldoms so send to outside
6101 	 * world if in either of layer 2 modes.
6102 	 */
6103 	if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) ||
6104 		(vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) &&
6105 		((caller == VSW_LOCALDEV) || (caller == VSW_VNETPORT))) {
6106 
6107 		nmp = dupmsg(mp);
6108 		if (nmp) {
6109 			if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) {
6110 				DERR(vswp, "%s: dropping pkt(s) "
6111 				"consisting of %ld bytes of data for"
6112 				" physical device", __func__, MBLKL(ret_m));
6113 			freemsg(ret_m);
6114 			}
6115 		}
6116 	}
6117 
6118 	if (caller == VSW_VNETPORT)
6119 		skip_port = 1;
6120 
6121 	/*
6122 	 * Broadcast message from other vnet (layer 2 or 3) or outside
6123 	 * world (layer 2 only), send up stack if plumbed.
6124 	 */
6125 	if ((caller == VSW_PHYSDEV) || (caller == VSW_VNETPORT)) {
6126 		READ_ENTER(&vswp->if_lockrw);
6127 		if (vswp->if_state & VSW_IF_UP) {
6128 			RW_EXIT(&vswp->if_lockrw);
6129 			nmp = copymsg(mp);
6130 			if (nmp)
6131 				mac_rx(vswp->if_mh, NULL, nmp);
6132 		} else {
6133 			RW_EXIT(&vswp->if_lockrw);
6134 		}
6135 	}
6136 
6137 	/* send it to all VNETPORTs */
6138 	READ_ENTER(&plist->lockrw);
6139 	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
6140 		D2(vswp, "vsw_forward_all: port %d", portp->p_instance);
6141 		/*
6142 		 * Caution ! - don't reorder these two checks as arg
6143 		 * will be NULL if the caller is PHYSDEV. skip_port is
6144 		 * only set if caller is VNETPORT.
6145 		 */
6146 		if ((skip_port) && (portp == arg))
6147 			continue;
6148 		else {
6149 			nmp = dupmsg(mp);
6150 			if (nmp) {
6151 				(void) vsw_portsend(portp, nmp);
6152 			} else {
6153 				DERR(vswp, "vsw_forward_all: nmp NULL");
6154 			}
6155 		}
6156 	}
6157 	RW_EXIT(&plist->lockrw);
6158 
6159 	freemsg(mp);
6160 
6161 	D1(vswp, "vsw_forward_all: exit\n");
6162 	return (0);
6163 }
6164 
6165 /*
6166  * Forward pkts to any devices or interfaces which have registered
6167  * an interest in them (i.e. multicast groups).
6168  */
6169 static int
6170 vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
6171 {
6172 	struct ether_header	*ehp = (struct ether_header *)mp->b_rptr;
6173 	mfdb_ent_t		*entp = NULL;
6174 	mfdb_ent_t		*tpp = NULL;
6175 	vsw_port_t 		*port;
6176 	uint64_t		key = 0;
6177 	mblk_t			*nmp = NULL;
6178 	mblk_t			*ret_m = NULL;
6179 	boolean_t		check_if = B_TRUE;
6180 
6181 	/*
6182 	 * Convert address to hash table key
6183 	 */
6184 	KEY_HASH(key, ehp->ether_dhost);
6185 
6186 	D1(vswp, "%s: key 0x%llx", __func__, key);
6187 
6188 	/*
6189 	 * If pkt came from either a vnet or down the stack (if we are
6190 	 * plumbed) and we are in layer 2 mode, then we send the pkt out
6191 	 * over the physical adapter, and then check to see if any other
6192 	 * vnets are interested in it.
6193 	 */
6194 	if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) ||
6195 		(vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) &&
6196 		((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV))) {
6197 		nmp = dupmsg(mp);
6198 		if (nmp) {
6199 			if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) {
6200 				DERR(vswp, "%s: dropping pkt(s) "
6201 					"consisting of %ld bytes of "
6202 					"data for physical device",
6203 					__func__, MBLKL(ret_m));
6204 				freemsg(ret_m);
6205 			}
6206 		}
6207 	}
6208 
6209 	READ_ENTER(&vswp->mfdbrw);
6210 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)key,
6211 				(mod_hash_val_t *)&entp) != 0) {
6212 		D3(vswp, "%s: no table entry found for addr 0x%llx",
6213 								__func__, key);
6214 	} else {
6215 		/*
6216 		 * Send to list of devices associated with this address...
6217 		 */
6218 		for (tpp = entp; tpp != NULL; tpp = tpp->nextp) {
6219 
6220 			/* dont send to ourselves */
6221 			if ((caller == VSW_VNETPORT) &&
6222 				(tpp->d_addr == (void *)arg)) {
6223 				port = (vsw_port_t *)tpp->d_addr;
6224 				D3(vswp, "%s: not sending to ourselves"
6225 					" : port %d", __func__,
6226 					port->p_instance);
6227 				continue;
6228 
6229 			} else if ((caller == VSW_LOCALDEV) &&
6230 				(tpp->d_type == VSW_LOCALDEV)) {
6231 				D3(vswp, "%s: not sending back up stack",
6232 					__func__);
6233 				continue;
6234 			}
6235 
6236 			if (tpp->d_type == VSW_VNETPORT) {
6237 				port = (vsw_port_t *)tpp->d_addr;
6238 				D3(vswp, "%s: sending to port %ld for "
6239 					" addr 0x%llx", __func__,
6240 					port->p_instance, key);
6241 
6242 				nmp = dupmsg(mp);
6243 				if (nmp)
6244 					(void) vsw_portsend(port, nmp);
6245 			} else {
6246 				if (vswp->if_state & VSW_IF_UP) {
6247 					nmp = copymsg(mp);
6248 					if (nmp)
6249 						mac_rx(vswp->if_mh, NULL, nmp);
6250 					check_if = B_FALSE;
6251 					D3(vswp, "%s: sending up stack"
6252 						" for addr 0x%llx", __func__,
6253 						key);
6254 				}
6255 			}
6256 		}
6257 	}
6258 
6259 	RW_EXIT(&vswp->mfdbrw);
6260 
6261 	/*
6262 	 * If the pkt came from either a vnet or from physical device,
6263 	 * and if we havent already sent the pkt up the stack then we
6264 	 * check now if we can/should (i.e. the interface is plumbed
6265 	 * and in promisc mode).
6266 	 */
6267 	if ((check_if) &&
6268 		((caller == VSW_VNETPORT) || (caller == VSW_PHYSDEV))) {
6269 		READ_ENTER(&vswp->if_lockrw);
6270 		if (VSW_U_P(vswp->if_state)) {
6271 			RW_EXIT(&vswp->if_lockrw);
6272 			D3(vswp, "%s: (caller %d) finally sending up stack"
6273 				" for addr 0x%llx", __func__, caller, key);
6274 			nmp = copymsg(mp);
6275 			if (nmp)
6276 				mac_rx(vswp->if_mh, NULL, nmp);
6277 		} else {
6278 			RW_EXIT(&vswp->if_lockrw);
6279 		}
6280 	}
6281 
6282 	freemsg(mp);
6283 
6284 	D1(vswp, "%s: exit", __func__);
6285 
6286 	return (0);
6287 }
6288 
6289 /* transmit the packet over the given port */
6290 static int
6291 vsw_portsend(vsw_port_t *port, mblk_t *mp)
6292 {
6293 	vsw_ldc_list_t 	*ldcl = &port->p_ldclist;
6294 	vsw_ldc_t 	*ldcp;
6295 	int		status = 0;
6296 
6297 
6298 	READ_ENTER(&ldcl->lockrw);
6299 	/*
6300 	 * Note for now, we have a single channel.
6301 	 */
6302 	ldcp = ldcl->head;
6303 	if (ldcp == NULL) {
6304 		DERR(port->p_vswp, "vsw_portsend: no ldc: dropping packet\n");
6305 		freemsg(mp);
6306 		RW_EXIT(&ldcl->lockrw);
6307 		return (1);
6308 	}
6309 
6310 	/*
6311 	 * Send the message out using the appropriate
6312 	 * transmit function which will free mblock when it
6313 	 * is finished with it.
6314 	 */
6315 	mutex_enter(&port->tx_lock);
6316 	if (port->transmit != NULL)
6317 		status = (*port->transmit)(ldcp, mp);
6318 	else {
6319 		freemsg(mp);
6320 	}
6321 	mutex_exit(&port->tx_lock);
6322 
6323 	RW_EXIT(&ldcl->lockrw);
6324 
6325 	return (status);
6326 }
6327 
6328 /*
6329  * Send packet out via descriptor ring to a logical device.
6330  */
6331 static int
6332 vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp)
6333 {
6334 	vio_dring_msg_t		dring_pkt;
6335 	dring_info_t		*dp = NULL;
6336 	vsw_private_desc_t	*priv_desc = NULL;
6337 	vnet_public_desc_t	*pub = NULL;
6338 	vsw_t			*vswp = ldcp->ldc_vswp;
6339 	mblk_t			*bp;
6340 	size_t			n, size;
6341 	caddr_t			bufp;
6342 	int			idx;
6343 	int			status = LDC_TX_SUCCESS;
6344 
6345 	D1(vswp, "%s(%lld): enter\n", __func__, ldcp->ldc_id);
6346 
6347 	/* TODO: make test a macro */
6348 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
6349 		(ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
6350 		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
6351 			"packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
6352 			ldcp->lane_out.lstate);
6353 		freemsg(mp);
6354 		return (LDC_TX_FAILURE);
6355 	}
6356 
6357 	/*
6358 	 * Note - using first ring only, this may change
6359 	 * in the future.
6360 	 */
6361 	if ((dp = ldcp->lane_out.dringp) == NULL) {
6362 		DERR(vswp, "%s(%lld): no dring for outbound lane on"
6363 			" channel %d", __func__, ldcp->ldc_id, ldcp->ldc_id);
6364 		freemsg(mp);
6365 		return (LDC_TX_FAILURE);
6366 	}
6367 
6368 	size = msgsize(mp);
6369 	if (size > (size_t)ETHERMAX) {
6370 		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
6371 		    ldcp->ldc_id, size);
6372 		freemsg(mp);
6373 		return (LDC_TX_FAILURE);
6374 	}
6375 
6376 	/*
6377 	 * Find a free descriptor
6378 	 *
6379 	 * Note: for the moment we are assuming that we will only
6380 	 * have one dring going from the switch to each of its
6381 	 * peers. This may change in the future.
6382 	 */
6383 	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
6384 		D2(vswp, "%s(%lld): no descriptor available for ring "
6385 			"at 0x%llx", __func__, ldcp->ldc_id, dp);
6386 
6387 		/* nothing more we can do */
6388 		status = LDC_TX_NORESOURCES;
6389 		goto vsw_dringsend_free_exit;
6390 	} else {
6391 		D2(vswp, "%s(%lld): free private descriptor found at pos "
6392 			"%ld addr 0x%llx\n", __func__, ldcp->ldc_id, idx,
6393 			priv_desc);
6394 	}
6395 
6396 	/* copy data into the descriptor */
6397 	bufp = priv_desc->datap;
6398 	bufp += VNET_IPALIGN;
6399 	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
6400 		n = MBLKL(bp);
6401 		bcopy(bp->b_rptr, bufp, n);
6402 		bufp += n;
6403 	}
6404 
6405 	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
6406 
6407 	pub = priv_desc->descp;
6408 	pub->nbytes = priv_desc->datalen;
6409 
6410 	mutex_enter(&priv_desc->dstate_lock);
6411 	pub->hdr.dstate = VIO_DESC_READY;
6412 	mutex_exit(&priv_desc->dstate_lock);
6413 
6414 	/*
6415 	 * Determine whether or not we need to send a message to our
6416 	 * peer prompting them to read our newly updated descriptor(s).
6417 	 */
6418 	mutex_enter(&dp->restart_lock);
6419 	if (dp->restart_reqd) {
6420 		dp->restart_reqd = B_FALSE;
6421 		mutex_exit(&dp->restart_lock);
6422 
6423 		/*
6424 		 * Send a vio_dring_msg to peer to prompt them to read
6425 		 * the updated descriptor ring.
6426 		 */
6427 		dring_pkt.tag.vio_msgtype = VIO_TYPE_DATA;
6428 		dring_pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
6429 		dring_pkt.tag.vio_subtype_env = VIO_DRING_DATA;
6430 		dring_pkt.tag.vio_sid = ldcp->local_session;
6431 
6432 		/* Note - for now using first ring */
6433 		dring_pkt.dring_ident = dp->ident;
6434 
6435 		mutex_enter(&ldcp->lane_out.seq_lock);
6436 		dring_pkt.seq_num = ldcp->lane_out.seq_num++;
6437 		mutex_exit(&ldcp->lane_out.seq_lock);
6438 
6439 		/*
6440 		 * If last_ack_recv is -1 then we know we've not
6441 		 * received any ack's yet, so this must be the first
6442 		 * msg sent, so set the start to the begining of the ring.
6443 		 */
6444 		mutex_enter(&dp->dlock);
6445 		if (dp->last_ack_recv == -1) {
6446 			dring_pkt.start_idx = 0;
6447 		} else {
6448 			dring_pkt.start_idx = (dp->last_ack_recv + 1) %
6449 						dp->num_descriptors;
6450 		}
6451 		dring_pkt.end_idx = -1;
6452 		mutex_exit(&dp->dlock);
6453 
6454 		D3(vswp, "%s(%lld): dring 0x%llx : ident 0x%llx\n", __func__,
6455 			ldcp->ldc_id, dp, dring_pkt.dring_ident);
6456 		D3(vswp, "%s(%lld): start %lld : end %lld : seq %lld\n",
6457 			__func__, ldcp->ldc_id, dring_pkt.start_idx,
6458 			dring_pkt.end_idx, dring_pkt.seq_num);
6459 
6460 		vsw_send_msg(ldcp, (void *)&dring_pkt,
6461 						sizeof (vio_dring_msg_t));
6462 	} else {
6463 		mutex_exit(&dp->restart_lock);
6464 		D2(vswp, "%s(%lld): updating descp %d", __func__,
6465 			ldcp->ldc_id, idx);
6466 	}
6467 
6468 vsw_dringsend_free_exit:
6469 
6470 	/* free the message block */
6471 	freemsg(mp);
6472 
6473 	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
6474 	return (status);
6475 }
6476 
6477 /*
6478  * Send an in-band descriptor message over ldc.
6479  */
6480 static int
6481 vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp)
6482 {
6483 	vsw_t			*vswp = ldcp->ldc_vswp;
6484 	vio_ibnd_desc_t		ibnd_msg;
6485 	vsw_private_desc_t	*priv_desc = NULL;
6486 	dring_info_t		*dp = NULL;
6487 	size_t			n, size = 0;
6488 	caddr_t			bufp;
6489 	mblk_t			*bp;
6490 	int			idx, i;
6491 	int			status = LDC_TX_SUCCESS;
6492 	static int		warn_msg = 1;
6493 
6494 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
6495 
6496 	ASSERT(mp != NULL);
6497 
6498 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
6499 		(ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
6500 		DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt",
6501 			__func__, ldcp->ldc_id, ldcp->ldc_status,
6502 			ldcp->lane_out.lstate);
6503 		freemsg(mp);
6504 		return (LDC_TX_FAILURE);
6505 	}
6506 
6507 	/*
6508 	 * only expect single dring to exist, which we use
6509 	 * as an internal buffer, rather than a transfer channel.
6510 	 */
6511 	if ((dp = ldcp->lane_out.dringp) == NULL) {
6512 		DERR(vswp, "%s(%lld): no dring for outbound lane",
6513 			__func__, ldcp->ldc_id);
6514 		DERR(vswp, "%s(%lld) status(%d) state (0x%llx)",
6515 			__func__, ldcp->ldc_id, ldcp->ldc_status,
6516 			ldcp->lane_out.lstate);
6517 		freemsg(mp);
6518 		return (LDC_TX_FAILURE);
6519 	}
6520 
6521 	size = msgsize(mp);
6522 	if (size > (size_t)ETHERMAX) {
6523 		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
6524 		    ldcp->ldc_id, size);
6525 		freemsg(mp);
6526 		return (LDC_TX_FAILURE);
6527 	}
6528 
6529 	/*
6530 	 * Find a free descriptor in our buffer ring
6531 	 */
6532 	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
6533 		if (warn_msg) {
6534 			DERR(vswp, "%s(%lld): no descriptor available for ring "
6535 			"at 0x%llx", __func__, ldcp->ldc_id, dp);
6536 			warn_msg = 0;
6537 		}
6538 
6539 		/* nothing more we can do */
6540 		status = LDC_TX_NORESOURCES;
6541 		goto vsw_descrsend_free_exit;
6542 	} else {
6543 		D2(vswp, "%s(%lld): free private descriptor found at pos "
6544 			"%ld addr 0x%x\n", __func__, ldcp->ldc_id, idx,
6545 			priv_desc);
6546 		warn_msg = 1;
6547 	}
6548 
6549 	/* copy data into the descriptor */
6550 	bufp = priv_desc->datap;
6551 	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
6552 		n = MBLKL(bp);
6553 		bcopy(bp->b_rptr, bufp, n);
6554 		bufp += n;
6555 	}
6556 
6557 	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
6558 
6559 	/* create and send the in-band descp msg */
6560 	ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA;
6561 	ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO;
6562 	ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA;
6563 	ibnd_msg.hdr.tag.vio_sid = ldcp->local_session;
6564 
6565 	mutex_enter(&ldcp->lane_out.seq_lock);
6566 	ibnd_msg.hdr.seq_num = ldcp->lane_out.seq_num++;
6567 	mutex_exit(&ldcp->lane_out.seq_lock);
6568 
6569 	/*
6570 	 * Copy the mem cookies describing the data from the
6571 	 * private region of the descriptor ring into the inband
6572 	 * descriptor.
6573 	 */
6574 	for (i = 0; i < priv_desc->ncookies; i++) {
6575 		bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i],
6576 			sizeof (ldc_mem_cookie_t));
6577 	}
6578 
6579 	ibnd_msg.hdr.desc_handle = idx;
6580 	ibnd_msg.ncookies = priv_desc->ncookies;
6581 	ibnd_msg.nbytes = size;
6582 
6583 	vsw_send_msg(ldcp, (void *)&ibnd_msg, sizeof (vio_ibnd_desc_t));
6584 
6585 vsw_descrsend_free_exit:
6586 
6587 	/* free the allocated message blocks */
6588 	freemsg(mp);
6589 
6590 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
6591 	return (status);
6592 }
6593 
6594 static void
6595 vsw_send_ver(void *arg)
6596 {
6597 	vsw_ldc_t	*ldcp = (vsw_ldc_t *)arg;
6598 	vsw_t		*vswp = ldcp->ldc_vswp;
6599 	lane_t		*lp = &ldcp->lane_out;
6600 	vio_ver_msg_t	ver_msg;
6601 
6602 	D1(vswp, "%s enter", __func__);
6603 
6604 	ver_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
6605 	ver_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
6606 	ver_msg.tag.vio_subtype_env = VIO_VER_INFO;
6607 	ver_msg.tag.vio_sid = ldcp->local_session;
6608 
6609 	ver_msg.ver_major = vsw_versions[0].ver_major;
6610 	ver_msg.ver_minor = vsw_versions[0].ver_minor;
6611 	ver_msg.dev_class = VDEV_NETWORK_SWITCH;
6612 
6613 	lp->lstate |= VSW_VER_INFO_SENT;
6614 	lp->ver_major = ver_msg.ver_major;
6615 	lp->ver_minor = ver_msg.ver_minor;
6616 
6617 	DUMP_TAG(ver_msg.tag);
6618 
6619 	vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t));
6620 
6621 	D1(vswp, "%s (%d): exit", __func__, ldcp->ldc_id);
6622 }
6623 
6624 static void
6625 vsw_send_attr(vsw_ldc_t *ldcp)
6626 {
6627 	vsw_t			*vswp = ldcp->ldc_vswp;
6628 	lane_t			*lp = &ldcp->lane_out;
6629 	vnet_attr_msg_t		attr_msg;
6630 
6631 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
6632 
6633 	/*
6634 	 * Subtype is set to INFO by default
6635 	 */
6636 	attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
6637 	attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
6638 	attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO;
6639 	attr_msg.tag.vio_sid = ldcp->local_session;
6640 
6641 	/* payload copied from default settings for lane */
6642 	attr_msg.mtu = lp->mtu;
6643 	attr_msg.addr_type = lp->addr_type;
6644 	attr_msg.xfer_mode = lp->xfer_mode;
6645 	attr_msg.ack_freq = lp->xfer_mode;
6646 
6647 	READ_ENTER(&vswp->if_lockrw);
6648 	bcopy(&(vswp->if_addr), &(attr_msg.addr), ETHERADDRL);
6649 	RW_EXIT(&vswp->if_lockrw);
6650 
6651 	ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT;
6652 
6653 	DUMP_TAG(attr_msg.tag);
6654 
6655 	vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t));
6656 
6657 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
6658 }
6659 
6660 /*
6661  * Create dring info msg (which also results in the creation of
6662  * a dring).
6663  */
6664 static vio_dring_reg_msg_t *
6665 vsw_create_dring_info_pkt(vsw_ldc_t *ldcp)
6666 {
6667 	vio_dring_reg_msg_t	*mp;
6668 	dring_info_t		*dp;
6669 	vsw_t			*vswp = ldcp->ldc_vswp;
6670 
6671 	D1(vswp, "vsw_create_dring_info_pkt enter\n");
6672 
6673 	/*
6674 	 * If we can't create a dring, obviously no point sending
6675 	 * a message.
6676 	 */
6677 	if ((dp = vsw_create_dring(ldcp)) == NULL)
6678 		return (NULL);
6679 
6680 	mp = kmem_zalloc(sizeof (vio_dring_reg_msg_t), KM_SLEEP);
6681 
6682 	mp->tag.vio_msgtype = VIO_TYPE_CTRL;
6683 	mp->tag.vio_subtype = VIO_SUBTYPE_INFO;
6684 	mp->tag.vio_subtype_env = VIO_DRING_REG;
6685 	mp->tag.vio_sid = ldcp->local_session;
6686 
6687 	/* payload */
6688 	mp->num_descriptors = dp->num_descriptors;
6689 	mp->descriptor_size = dp->descriptor_size;
6690 	mp->options = dp->options;
6691 	mp->ncookies = dp->ncookies;
6692 	bcopy(&dp->cookie[0], &mp->cookie[0], sizeof (ldc_mem_cookie_t));
6693 
6694 	mp->dring_ident = 0;
6695 
6696 	D1(vswp, "vsw_create_dring_info_pkt exit\n");
6697 
6698 	return (mp);
6699 }
6700 
6701 static void
6702 vsw_send_dring_info(vsw_ldc_t *ldcp)
6703 {
6704 	vio_dring_reg_msg_t	*dring_msg;
6705 	vsw_t			*vswp = ldcp->ldc_vswp;
6706 
6707 	D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id);
6708 
6709 	dring_msg = vsw_create_dring_info_pkt(ldcp);
6710 	if (dring_msg == NULL) {
6711 		cmn_err(CE_WARN, "vsw_send_dring_info: error creating msg");
6712 		return;
6713 	}
6714 
6715 	ldcp->lane_out.lstate |= VSW_DRING_INFO_SENT;
6716 
6717 	DUMP_TAG_PTR((vio_msg_tag_t *)dring_msg);
6718 
6719 	vsw_send_msg(ldcp, dring_msg,
6720 		sizeof (vio_dring_reg_msg_t));
6721 
6722 	kmem_free(dring_msg, sizeof (vio_dring_reg_msg_t));
6723 
6724 	D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id);
6725 }
6726 
6727 static void
6728 vsw_send_rdx(vsw_ldc_t *ldcp)
6729 {
6730 	vsw_t		*vswp = ldcp->ldc_vswp;
6731 	vio_rdx_msg_t	rdx_msg;
6732 
6733 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
6734 
6735 	rdx_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
6736 	rdx_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
6737 	rdx_msg.tag.vio_subtype_env = VIO_RDX;
6738 	rdx_msg.tag.vio_sid = ldcp->local_session;
6739 
6740 	ldcp->lane_out.lstate |= VSW_RDX_INFO_SENT;
6741 
6742 	DUMP_TAG(rdx_msg.tag);
6743 
6744 	vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t));
6745 
6746 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
6747 }
6748 
6749 /*
6750  * Generic routine to send message out over ldc channel.
6751  */
6752 static void
6753 vsw_send_msg(vsw_ldc_t *ldcp, void *msgp, int size)
6754 {
6755 	int		rv;
6756 	size_t		msglen = size;
6757 	vio_msg_tag_t	*tag = (vio_msg_tag_t *)msgp;
6758 	vsw_t		*vswp = ldcp->ldc_vswp;
6759 
6760 	D1(vswp, "vsw_send_msg (%lld) enter : sending %d bytes",
6761 			ldcp->ldc_id, size);
6762 
6763 	D2(vswp, "send_msg: type 0x%llx", tag->vio_msgtype);
6764 	D2(vswp, "send_msg: stype 0x%llx", tag->vio_subtype);
6765 	D2(vswp, "send_msg: senv 0x%llx", tag->vio_subtype_env);
6766 
6767 	mutex_enter(&ldcp->ldc_txlock);
6768 	do {
6769 		msglen = size;
6770 		rv = ldc_write(ldcp->ldc_handle, (caddr_t)msgp, &msglen);
6771 	} while (rv == EWOULDBLOCK && --vsw_wretries > 0);
6772 
6773 	if ((rv != 0) || (msglen != size)) {
6774 		DERR(vswp, "vsw_send_msg:ldc_write failed: chan(%lld) "
6775 			"rv(%d) size (%d) msglen(%d)\n", ldcp->ldc_id,
6776 			rv, size, msglen);
6777 	}
6778 	mutex_exit(&ldcp->ldc_txlock);
6779 
6780 	/* channel has been reset */
6781 	if (rv == ECONNRESET) {
6782 		vsw_handle_reset(ldcp);
6783 	}
6784 
6785 	D1(vswp, "vsw_send_msg (%lld) exit : sent %d bytes",
6786 			ldcp->ldc_id, msglen);
6787 }
6788 
6789 /*
6790  * Add an entry into FDB, for the given mac address and port_id.
6791  * Returns 0 on success, 1 on failure.
6792  *
6793  * Lock protecting FDB must be held by calling process.
6794  */
6795 static int
6796 vsw_add_fdb(vsw_t *vswp, vsw_port_t *port)
6797 {
6798 	uint64_t	addr = 0;
6799 
6800 	D1(vswp, "%s: enter", __func__);
6801 
6802 	KEY_HASH(addr, port->p_macaddr);
6803 
6804 	D2(vswp, "%s: key = 0x%llx", __func__, addr);
6805 
6806 	/*
6807 	 * Note: duplicate keys will be rejected by mod_hash.
6808 	 */
6809 	if (mod_hash_insert(vswp->fdb, (mod_hash_key_t)addr,
6810 				(mod_hash_val_t)port) != 0) {
6811 		DERR(vswp, "%s: unable to add entry into fdb.", __func__);
6812 		return (1);
6813 	}
6814 
6815 	D1(vswp, "%s: exit", __func__);
6816 	return (0);
6817 }
6818 
6819 /*
6820  * Remove an entry from FDB.
6821  * Returns 0 on success, 1 on failure.
6822  */
6823 static int
6824 vsw_del_fdb(vsw_t *vswp, vsw_port_t *port)
6825 {
6826 	uint64_t	addr = 0;
6827 
6828 	D1(vswp, "%s: enter", __func__);
6829 
6830 	KEY_HASH(addr, port->p_macaddr);
6831 
6832 	D2(vswp, "%s: key = 0x%llx", __func__, addr);
6833 
6834 	(void) mod_hash_destroy(vswp->fdb, (mod_hash_val_t)addr);
6835 
6836 	D1(vswp, "%s: enter", __func__);
6837 
6838 	return (0);
6839 }
6840 
6841 /*
6842  * Search fdb for a given mac address.
6843  * Returns pointer to the entry if found, else returns NULL.
6844  */
6845 static vsw_port_t *
6846 vsw_lookup_fdb(vsw_t *vswp, struct ether_header *ehp)
6847 {
6848 	uint64_t	key = 0;
6849 	vsw_port_t	*port = NULL;
6850 
6851 	D1(vswp, "%s: enter", __func__);
6852 
6853 	KEY_HASH(key, ehp->ether_dhost);
6854 
6855 	D2(vswp, "%s: key = 0x%llx", __func__, key);
6856 
6857 	if (mod_hash_find(vswp->fdb, (mod_hash_key_t)key,
6858 				(mod_hash_val_t *)&port) != 0) {
6859 		return (NULL);
6860 	}
6861 
6862 	D1(vswp, "%s: exit", __func__);
6863 
6864 	return (port);
6865 }
6866 
6867 /*
6868  * Add or remove multicast address(es).
6869  *
6870  * Returns 0 on success, 1 on failure.
6871  */
6872 static int
6873 vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port)
6874 {
6875 	mcst_addr_t		*mcst_p = NULL;
6876 	vsw_t			*vswp = port->p_vswp;
6877 	uint64_t		addr = 0x0;
6878 	int			i, ret;
6879 
6880 	D1(vswp, "%s: enter", __func__);
6881 
6882 	D2(vswp, "%s: %d addresses", __func__, mcst_pkt->count);
6883 
6884 	if (vswp->mh == NULL)
6885 		return (1);
6886 
6887 	for (i = 0; i < mcst_pkt->count; i++) {
6888 		/*
6889 		 * Convert address into form that can be used
6890 		 * as hash table key.
6891 		 */
6892 		KEY_HASH(addr, mcst_pkt->mca[i]);
6893 
6894 		/*
6895 		 * Add or delete the specified address/port combination.
6896 		 */
6897 		if (mcst_pkt->set == 0x1) {
6898 			D3(vswp, "%s: adding multicast address 0x%llx for "
6899 				"port %ld", __func__, addr, port->p_instance);
6900 			if (vsw_add_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
6901 				/*
6902 				 * Update the list of multicast
6903 				 * addresses contained within the
6904 				 * port structure to include this new
6905 				 * one.
6906 				 */
6907 				mcst_p = kmem_alloc(sizeof (mcst_addr_t),
6908 								KM_NOSLEEP);
6909 				if (mcst_p == NULL) {
6910 					DERR(vswp, "%s: unable to alloc mem",
6911 						__func__);
6912 					return (1);
6913 				}
6914 
6915 				mcst_p->nextp = NULL;
6916 				mcst_p->addr = addr;
6917 
6918 				mutex_enter(&port->mca_lock);
6919 				mcst_p->nextp = port->mcap;
6920 				port->mcap = mcst_p;
6921 				mutex_exit(&port->mca_lock);
6922 
6923 				/*
6924 				 * Program the address into HW. If the addr
6925 				 * has already been programmed then the MAC
6926 				 * just increments a ref counter (which is
6927 				 * used when the address is being deleted)
6928 				 */
6929 				ret = mac_multicst_add(vswp->mh,
6930 						(uchar_t *)&mcst_pkt->mca[i]);
6931 				if (ret) {
6932 					cmn_err(CE_WARN, "!unable to add "
6933 						"multicast address");
6934 					(void) vsw_del_mcst(vswp, VSW_VNETPORT,
6935 						addr, port);
6936 					vsw_del_addr(VSW_VNETPORT, port, addr);
6937 					return (ret);
6938 				}
6939 
6940 			} else {
6941 				DERR(vswp, "%s: error adding multicast "
6942 					"address 0x%llx for port %ld",
6943 					__func__, addr, port->p_instance);
6944 				return (1);
6945 			}
6946 		} else {
6947 			/*
6948 			 * Delete an entry from the multicast hash
6949 			 * table and update the address list
6950 			 * appropriately.
6951 			 */
6952 			if (vsw_del_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
6953 				D3(vswp, "%s: deleting multicast address "
6954 					"0x%llx for port %ld", __func__, addr,
6955 					port->p_instance);
6956 
6957 				vsw_del_addr(VSW_VNETPORT, port, addr);
6958 
6959 				/*
6960 				 * Remove the address from HW. The address
6961 				 * will actually only be removed once the ref
6962 				 * count within the MAC layer has dropped to
6963 				 * zero. I.e. we can safely call this fn even
6964 				 * if other ports are interested in this
6965 				 * address.
6966 				 */
6967 				(void) mac_multicst_remove(vswp->mh,
6968 						(uchar_t *)&mcst_pkt->mca[i]);
6969 
6970 			} else {
6971 				DERR(vswp, "%s: error deleting multicast "
6972 					"addr 0x%llx for port %ld",
6973 					__func__, addr, port->p_instance);
6974 				return (1);
6975 			}
6976 		}
6977 	}
6978 	D1(vswp, "%s: exit", __func__);
6979 	return (0);
6980 }
6981 
6982 /*
6983  * Add a new multicast entry.
6984  *
6985  * Search hash table based on address. If match found then
6986  * update associated val (which is chain of ports), otherwise
6987  * create new key/val (addr/port) pair and insert into table.
6988  */
6989 static int
6990 vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
6991 {
6992 	int		dup = 0;
6993 	int		rv = 0;
6994 	mfdb_ent_t	*ment = NULL;
6995 	mfdb_ent_t	*tmp_ent = NULL;
6996 	mfdb_ent_t	*new_ent = NULL;
6997 	void		*tgt = NULL;
6998 
6999 	if (devtype == VSW_VNETPORT) {
7000 		/*
7001 		 * Being invoked from a vnet.
7002 		 */
7003 		ASSERT(arg != NULL);
7004 		tgt = arg;
7005 		D2(NULL, "%s: port %d : address 0x%llx", __func__,
7006 			((vsw_port_t *)arg)->p_instance, addr);
7007 	} else {
7008 		/*
7009 		 * We are being invoked via the m_multicst mac entry
7010 		 * point.
7011 		 */
7012 		D2(NULL, "%s: address 0x%llx", __func__, addr);
7013 		tgt = (void *)vswp;
7014 	}
7015 
7016 	WRITE_ENTER(&vswp->mfdbrw);
7017 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
7018 				(mod_hash_val_t *)&ment) != 0) {
7019 
7020 		/* address not currently in table */
7021 		ment = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
7022 		ment->d_addr = (void *)tgt;
7023 		ment->d_type = devtype;
7024 		ment->nextp = NULL;
7025 
7026 		if (mod_hash_insert(vswp->mfdb, (mod_hash_key_t)addr,
7027 			(mod_hash_val_t)ment) != 0) {
7028 			DERR(vswp, "%s: hash table insertion failed", __func__);
7029 			kmem_free(ment, sizeof (mfdb_ent_t));
7030 			rv = 1;
7031 		} else {
7032 			D2(vswp, "%s: added initial entry for 0x%llx to "
7033 				"table", __func__, addr);
7034 		}
7035 	} else {
7036 		/*
7037 		 * Address in table. Check to see if specified port
7038 		 * is already associated with the address. If not add
7039 		 * it now.
7040 		 */
7041 		tmp_ent = ment;
7042 		while (tmp_ent != NULL) {
7043 			if (tmp_ent->d_addr == (void *)tgt) {
7044 				if (devtype == VSW_VNETPORT) {
7045 					DERR(vswp, "%s: duplicate port entry "
7046 						"found for portid %ld and key "
7047 						"0x%llx", __func__,
7048 						((vsw_port_t *)arg)->p_instance,
7049 						addr);
7050 				} else {
7051 					DERR(vswp, "%s: duplicate entry found"
7052 						"for key 0x%llx",
7053 						__func__, addr);
7054 				}
7055 				rv = 1;
7056 				dup = 1;
7057 				break;
7058 			}
7059 			tmp_ent = tmp_ent->nextp;
7060 		}
7061 
7062 		/*
7063 		 * Port not on list so add it to end now.
7064 		 */
7065 		if (0 == dup) {
7066 			D2(vswp, "%s: added entry for 0x%llx to table",
7067 				__func__, addr);
7068 			new_ent = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
7069 			new_ent->d_addr = (void *)tgt;
7070 			new_ent->d_type = devtype;
7071 			new_ent->nextp = NULL;
7072 
7073 			tmp_ent = ment;
7074 			while (tmp_ent->nextp != NULL)
7075 				tmp_ent = tmp_ent->nextp;
7076 
7077 			tmp_ent->nextp = new_ent;
7078 		}
7079 	}
7080 
7081 	RW_EXIT(&vswp->mfdbrw);
7082 	return (rv);
7083 }
7084 
7085 /*
7086  * Remove a multicast entry from the hashtable.
7087  *
7088  * Search hash table based on address. If match found, scan
7089  * list of ports associated with address. If specified port
7090  * found remove it from list.
7091  */
7092 static int
7093 vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
7094 {
7095 	mfdb_ent_t	*ment = NULL;
7096 	mfdb_ent_t	*curr_p, *prev_p;
7097 	void		*tgt = NULL;
7098 
7099 	D1(vswp, "%s: enter", __func__);
7100 
7101 	if (devtype == VSW_VNETPORT) {
7102 		tgt = (vsw_port_t *)arg;
7103 		D2(vswp, "%s: removing port %d from mFDB for address"
7104 			" 0x%llx", __func__, ((vsw_port_t *)tgt)->p_instance,
7105 			addr);
7106 	} else {
7107 		D2(vswp, "%s: removing entry", __func__);
7108 		tgt = (void *)vswp;
7109 	}
7110 
7111 	WRITE_ENTER(&vswp->mfdbrw);
7112 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
7113 				(mod_hash_val_t *)&ment) != 0) {
7114 		D2(vswp, "%s: address 0x%llx not in table", __func__, addr);
7115 		RW_EXIT(&vswp->mfdbrw);
7116 		return (1);
7117 	}
7118 
7119 	prev_p = curr_p = ment;
7120 
7121 	while (curr_p != NULL) {
7122 		if (curr_p->d_addr == (void *)tgt) {
7123 			if (devtype == VSW_VNETPORT) {
7124 				D2(vswp, "%s: port %d found", __func__,
7125 					((vsw_port_t *)tgt)->p_instance);
7126 			} else {
7127 				D2(vswp, "%s: instance found", __func__);
7128 			}
7129 
7130 			if (prev_p == curr_p) {
7131 				/*
7132 				 * head of list, if no other element is in
7133 				 * list then destroy this entry, otherwise
7134 				 * just replace it with updated value.
7135 				 */
7136 				ment = curr_p->nextp;
7137 				kmem_free(curr_p, sizeof (mfdb_ent_t));
7138 				if (ment == NULL) {
7139 					(void) mod_hash_destroy(vswp->mfdb,
7140 							(mod_hash_val_t)addr);
7141 				} else {
7142 					(void) mod_hash_replace(vswp->mfdb,
7143 							(mod_hash_key_t)addr,
7144 							(mod_hash_val_t)ment);
7145 				}
7146 			} else {
7147 				/*
7148 				 * Not head of list, no need to do
7149 				 * replacement, just adjust list pointers.
7150 				 */
7151 				prev_p->nextp = curr_p->nextp;
7152 				kmem_free(curr_p, sizeof (mfdb_ent_t));
7153 			}
7154 			break;
7155 		}
7156 
7157 		prev_p = curr_p;
7158 		curr_p = curr_p->nextp;
7159 	}
7160 
7161 	RW_EXIT(&vswp->mfdbrw);
7162 
7163 	D1(vswp, "%s: exit", __func__);
7164 
7165 	return (0);
7166 }
7167 
7168 /*
7169  * Port is being deleted, but has registered an interest in one
7170  * or more multicast groups. Using the list of addresses maintained
7171  * within the port structure find the appropriate entry in the hash
7172  * table and remove this port from the list of interested ports.
7173  */
7174 static void
7175 vsw_del_mcst_port(vsw_port_t *port)
7176 {
7177 	mcst_addr_t	*mcst_p = NULL;
7178 	vsw_t		*vswp = port->p_vswp;
7179 
7180 	D1(vswp, "%s: enter", __func__);
7181 
7182 	mutex_enter(&port->mca_lock);
7183 	while (port->mcap != NULL) {
7184 		(void) vsw_del_mcst(vswp, VSW_VNETPORT,
7185 					port->mcap->addr, port);
7186 
7187 		mcst_p = port->mcap->nextp;
7188 		kmem_free(port->mcap, sizeof (mcst_addr_t));
7189 		port->mcap = mcst_p;
7190 	}
7191 	mutex_exit(&port->mca_lock);
7192 
7193 	D1(vswp, "%s: exit", __func__);
7194 }
7195 
7196 /*
7197  * This vsw instance is detaching, but has registered an interest in one
7198  * or more multicast groups. Using the list of addresses maintained
7199  * within the vsw structure find the appropriate entry in the hash
7200  * table and remove this instance from the list of interested ports.
7201  */
7202 static void
7203 vsw_del_mcst_vsw(vsw_t *vswp)
7204 {
7205 	mcst_addr_t	*next_p = NULL;
7206 
7207 	D1(vswp, "%s: enter", __func__);
7208 
7209 	mutex_enter(&vswp->mca_lock);
7210 
7211 	while (vswp->mcap != NULL) {
7212 		DERR(vswp, "%s: deleting addr 0x%llx",
7213 			__func__, vswp->mcap->addr);
7214 		(void) vsw_del_mcst(vswp, VSW_LOCALDEV,
7215 				vswp->mcap->addr, NULL);
7216 
7217 		next_p = vswp->mcap->nextp;
7218 		kmem_free(vswp->mcap, sizeof (mcst_addr_t));
7219 		vswp->mcap = next_p;
7220 	}
7221 
7222 	vswp->mcap = NULL;
7223 	mutex_exit(&vswp->mca_lock);
7224 
7225 	D1(vswp, "%s: exit", __func__);
7226 }
7227 
7228 
7229 /*
7230  * Remove the specified address from the list of address maintained
7231  * in this port node.
7232  */
7233 static void
7234 vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr)
7235 {
7236 	vsw_t		*vswp = NULL;
7237 	vsw_port_t	*port = NULL;
7238 	mcst_addr_t	*prev_p = NULL;
7239 	mcst_addr_t	*curr_p = NULL;
7240 
7241 	D1(NULL, "%s: enter : devtype %d : addr 0x%llx",
7242 		__func__, devtype, addr);
7243 
7244 	if (devtype == VSW_VNETPORT) {
7245 		port = (vsw_port_t *)arg;
7246 		mutex_enter(&port->mca_lock);
7247 		prev_p = curr_p = port->mcap;
7248 	} else {
7249 		vswp = (vsw_t *)arg;
7250 		mutex_enter(&vswp->mca_lock);
7251 		prev_p = curr_p = vswp->mcap;
7252 	}
7253 
7254 	while (curr_p != NULL) {
7255 		if (curr_p->addr == addr) {
7256 			D2(NULL, "%s: address found", __func__);
7257 			/* match found */
7258 			if (prev_p == curr_p) {
7259 				/* list head */
7260 				if (devtype == VSW_VNETPORT)
7261 					port->mcap = curr_p->nextp;
7262 				else
7263 					vswp->mcap = curr_p->nextp;
7264 			} else {
7265 				prev_p->nextp = curr_p->nextp;
7266 			}
7267 			kmem_free(curr_p, sizeof (mcst_addr_t));
7268 			break;
7269 		} else {
7270 			prev_p = curr_p;
7271 			curr_p = curr_p->nextp;
7272 		}
7273 	}
7274 
7275 	if (devtype == VSW_VNETPORT)
7276 		mutex_exit(&port->mca_lock);
7277 	else
7278 		mutex_exit(&vswp->mca_lock);
7279 
7280 	D1(NULL, "%s: exit", __func__);
7281 }
7282 
7283 /*
7284  * Creates a descriptor ring (dring) and links it into the
7285  * link of outbound drings for this channel.
7286  *
7287  * Returns NULL if creation failed.
7288  */
7289 static dring_info_t *
7290 vsw_create_dring(vsw_ldc_t *ldcp)
7291 {
7292 	vsw_private_desc_t	*priv_addr = NULL;
7293 	vsw_t			*vswp = ldcp->ldc_vswp;
7294 	ldc_mem_info_t		minfo;
7295 	dring_info_t		*dp, *tp;
7296 	int			i;
7297 
7298 	dp = (dring_info_t *)kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
7299 
7300 	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
7301 
7302 	/* create public section of ring */
7303 	if ((ldc_mem_dring_create(VSW_RING_NUM_EL,
7304 			VSW_PUB_SIZE, &dp->handle)) != 0) {
7305 
7306 		DERR(vswp, "vsw_create_dring(%lld): ldc dring create "
7307 			"failed", ldcp->ldc_id);
7308 		goto create_fail_exit;
7309 	}
7310 
7311 	ASSERT(dp->handle != NULL);
7312 
7313 	/*
7314 	 * Get the base address of the public section of the ring.
7315 	 */
7316 	if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
7317 		DERR(vswp, "vsw_create_dring(%lld): dring info failed\n",
7318 			ldcp->ldc_id);
7319 		goto dring_fail_exit;
7320 	} else {
7321 		ASSERT(minfo.vaddr != 0);
7322 		dp->pub_addr = minfo.vaddr;
7323 	}
7324 
7325 	dp->num_descriptors = VSW_RING_NUM_EL;
7326 	dp->descriptor_size = VSW_PUB_SIZE;
7327 	dp->options = VIO_TX_DRING;
7328 	dp->ncookies = 1;	/* guaranteed by ldc */
7329 
7330 	/*
7331 	 * create private portion of ring
7332 	 */
7333 	dp->priv_addr = (vsw_private_desc_t *)kmem_zalloc(
7334 		(sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL), KM_SLEEP);
7335 
7336 	if (vsw_setup_ring(ldcp, dp)) {
7337 		DERR(vswp, "%s: unable to setup ring", __func__);
7338 		goto dring_fail_exit;
7339 	}
7340 
7341 	/* haven't used any descriptors yet */
7342 	dp->end_idx = 0;
7343 	dp->last_ack_recv = -1;
7344 
7345 	/* bind dring to the channel */
7346 	if ((ldc_mem_dring_bind(ldcp->ldc_handle, dp->handle,
7347 		LDC_SHADOW_MAP, LDC_MEM_RW,
7348 		&dp->cookie[0], &dp->ncookies)) != 0) {
7349 		DERR(vswp, "vsw_create_dring: unable to bind to channel "
7350 			"%lld", ldcp->ldc_id);
7351 		goto dring_fail_exit;
7352 	}
7353 
7354 	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
7355 	dp->restart_reqd = B_TRUE;
7356 
7357 	/*
7358 	 * Only ever create rings for outgoing lane. Link it onto
7359 	 * end of list.
7360 	 */
7361 	if (ldcp->lane_out.dringp == NULL) {
7362 		D2(vswp, "vsw_create_dring: adding first outbound ring");
7363 		ldcp->lane_out.dringp = dp;
7364 	} else {
7365 		tp = ldcp->lane_out.dringp;
7366 		while (tp->next != NULL)
7367 			tp = tp->next;
7368 
7369 		tp->next = dp;
7370 	}
7371 
7372 	return (dp);
7373 
7374 dring_fail_exit:
7375 	(void) ldc_mem_dring_destroy(dp->handle);
7376 
7377 create_fail_exit:
7378 	if (dp->priv_addr != NULL) {
7379 		priv_addr = dp->priv_addr;
7380 		for (i = 0; i < VSW_RING_NUM_EL; i++) {
7381 			if (priv_addr->memhandle != NULL)
7382 				(void) ldc_mem_free_handle(
7383 						priv_addr->memhandle);
7384 			priv_addr++;
7385 		}
7386 		kmem_free(dp->priv_addr,
7387 			(sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL));
7388 	}
7389 	mutex_destroy(&dp->dlock);
7390 
7391 	kmem_free(dp, sizeof (dring_info_t));
7392 	return (NULL);
7393 }
7394 
7395 /*
7396  * Create a ring consisting of just a private portion and link
7397  * it into the list of rings for the outbound lane.
7398  *
7399  * These type of rings are used primarily for temporary data
7400  * storage (i.e. as data buffers).
7401  */
7402 void
7403 vsw_create_privring(vsw_ldc_t *ldcp)
7404 {
7405 	dring_info_t		*dp, *tp;
7406 	vsw_t			*vswp = ldcp->ldc_vswp;
7407 
7408 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
7409 
7410 	dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
7411 
7412 	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
7413 
7414 	/* no public section */
7415 	dp->pub_addr = NULL;
7416 
7417 	dp->priv_addr = kmem_zalloc((sizeof (vsw_private_desc_t) *
7418 					VSW_RING_NUM_EL), KM_SLEEP);
7419 
7420 	dp->num_descriptors = VSW_RING_NUM_EL;
7421 
7422 	if (vsw_setup_ring(ldcp, dp)) {
7423 		DERR(vswp, "%s: setup of ring failed", __func__);
7424 		kmem_free(dp->priv_addr,
7425 			(sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL));
7426 		mutex_destroy(&dp->dlock);
7427 		kmem_free(dp, sizeof (dring_info_t));
7428 		return;
7429 	}
7430 
7431 	/* haven't used any descriptors yet */
7432 	dp->end_idx = 0;
7433 
7434 	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
7435 	dp->restart_reqd = B_TRUE;
7436 
7437 	/*
7438 	 * Only ever create rings for outgoing lane. Link it onto
7439 	 * end of list.
7440 	 */
7441 	if (ldcp->lane_out.dringp == NULL) {
7442 		D2(vswp, "%s: adding first outbound privring", __func__);
7443 		ldcp->lane_out.dringp = dp;
7444 	} else {
7445 		tp = ldcp->lane_out.dringp;
7446 		while (tp->next != NULL)
7447 			tp = tp->next;
7448 
7449 		tp->next = dp;
7450 	}
7451 
7452 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
7453 }
7454 
7455 /*
7456  * Setup the descriptors in the dring. Returns 0 on success, 1 on
7457  * failure.
7458  */
7459 int
7460 vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp)
7461 {
7462 	vnet_public_desc_t	*pub_addr = NULL;
7463 	vsw_private_desc_t	*priv_addr = NULL;
7464 	vsw_t			*vswp = ldcp->ldc_vswp;
7465 	uint64_t		*tmpp;
7466 	uint64_t		offset = 0;
7467 	uint32_t		ncookies = 0;
7468 	static char		*name = "vsw_setup_ring";
7469 	int			i, j, nc, rv;
7470 
7471 	priv_addr = dp->priv_addr;
7472 	pub_addr = dp->pub_addr;
7473 
7474 	/* public section may be null but private should never be */
7475 	ASSERT(priv_addr != NULL);
7476 
7477 	/*
7478 	 * Allocate the region of memory which will be used to hold
7479 	 * the data the descriptors will refer to.
7480 	 */
7481 	dp->data_sz = (VSW_RING_NUM_EL * VSW_RING_EL_DATA_SZ);
7482 	dp->data_addr = kmem_alloc(dp->data_sz, KM_SLEEP);
7483 
7484 	D2(vswp, "%s: allocated %lld bytes at 0x%llx\n", name,
7485 		dp->data_sz, dp->data_addr);
7486 
7487 	tmpp = (uint64_t *)dp->data_addr;
7488 	offset = VSW_RING_EL_DATA_SZ / sizeof (tmpp);
7489 
7490 	/*
7491 	 * Initialise some of the private and public (if they exist)
7492 	 * descriptor fields.
7493 	 */
7494 	for (i = 0; i < VSW_RING_NUM_EL; i++) {
7495 		mutex_init(&priv_addr->dstate_lock, NULL, MUTEX_DRIVER, NULL);
7496 
7497 		if ((ldc_mem_alloc_handle(ldcp->ldc_handle,
7498 			&priv_addr->memhandle)) != 0) {
7499 			DERR(vswp, "%s: alloc mem handle failed", name);
7500 			goto setup_ring_cleanup;
7501 		}
7502 
7503 		priv_addr->datap = (void *)tmpp;
7504 
7505 		rv = ldc_mem_bind_handle(priv_addr->memhandle,
7506 			(caddr_t)priv_addr->datap, VSW_RING_EL_DATA_SZ,
7507 			LDC_SHADOW_MAP, LDC_MEM_R|LDC_MEM_W,
7508 			&(priv_addr->memcookie[0]), &ncookies);
7509 		if (rv != 0) {
7510 			DERR(vswp, "%s(%lld): ldc_mem_bind_handle failed "
7511 				"(rv %d)", name, ldcp->ldc_id, rv);
7512 			goto setup_ring_cleanup;
7513 		}
7514 		priv_addr->bound = 1;
7515 
7516 		D2(vswp, "%s: %d: memcookie 0 : addr 0x%llx : size 0x%llx",
7517 			name, i, priv_addr->memcookie[0].addr,
7518 			priv_addr->memcookie[0].size);
7519 
7520 		if (ncookies >= (uint32_t)(VSW_MAX_COOKIES + 1)) {
7521 			DERR(vswp, "%s(%lld) ldc_mem_bind_handle returned "
7522 				"invalid num of cookies (%d) for size 0x%llx",
7523 				name, ldcp->ldc_id, ncookies,
7524 				VSW_RING_EL_DATA_SZ);
7525 
7526 			goto setup_ring_cleanup;
7527 		} else {
7528 			for (j = 1; j < ncookies; j++) {
7529 				rv = ldc_mem_nextcookie(priv_addr->memhandle,
7530 					&(priv_addr->memcookie[j]));
7531 				if (rv != 0) {
7532 					DERR(vswp, "%s: ldc_mem_nextcookie "
7533 						"failed rv (%d)", name, rv);
7534 					goto setup_ring_cleanup;
7535 				}
7536 				D3(vswp, "%s: memcookie %d : addr 0x%llx : "
7537 					"size 0x%llx", name, j,
7538 					priv_addr->memcookie[j].addr,
7539 					priv_addr->memcookie[j].size);
7540 			}
7541 
7542 		}
7543 		priv_addr->ncookies = ncookies;
7544 		priv_addr->dstate = VIO_DESC_FREE;
7545 
7546 		if (pub_addr != NULL) {
7547 
7548 			/* link pub and private sides */
7549 			priv_addr->descp = pub_addr;
7550 
7551 			pub_addr->ncookies = priv_addr->ncookies;
7552 
7553 			for (nc = 0; nc < pub_addr->ncookies; nc++) {
7554 				bcopy(&priv_addr->memcookie[nc],
7555 					&pub_addr->memcookie[nc],
7556 					sizeof (ldc_mem_cookie_t));
7557 			}
7558 
7559 			pub_addr->hdr.dstate = VIO_DESC_FREE;
7560 			pub_addr++;
7561 		}
7562 
7563 		/*
7564 		 * move to next element in the dring and the next
7565 		 * position in the data buffer.
7566 		 */
7567 		priv_addr++;
7568 		tmpp += offset;
7569 	}
7570 
7571 	return (0);
7572 
7573 setup_ring_cleanup:
7574 	priv_addr = dp->priv_addr;
7575 
7576 	for (j = 0; j < i; j++) {
7577 		(void) ldc_mem_unbind_handle(priv_addr->memhandle);
7578 		(void) ldc_mem_free_handle(priv_addr->memhandle);
7579 
7580 		mutex_destroy(&priv_addr->dstate_lock);
7581 
7582 		priv_addr++;
7583 	}
7584 	kmem_free(dp->data_addr, dp->data_sz);
7585 
7586 	return (1);
7587 }
7588 
7589 /*
7590  * Searches the private section of a ring for a free descriptor,
7591  * starting at the location of the last free descriptor found
7592  * previously.
7593  *
7594  * Returns 0 if free descriptor is available, and updates state
7595  * of private descriptor to VIO_DESC_READY,  otherwise returns 1.
7596  *
7597  * FUTURE: might need to return contiguous range of descriptors
7598  * as dring info msg assumes all will be contiguous.
7599  */
7600 static int
7601 vsw_dring_find_free_desc(dring_info_t *dringp,
7602 		vsw_private_desc_t **priv_p, int *idx)
7603 {
7604 	vsw_private_desc_t	*addr = NULL;
7605 	int			num = VSW_RING_NUM_EL;
7606 	int			ret = 1;
7607 
7608 	D1(NULL, "%s enter\n", __func__);
7609 
7610 	ASSERT(dringp->priv_addr != NULL);
7611 
7612 	D2(NULL, "%s: searching ring, dringp 0x%llx : start pos %lld",
7613 			__func__, dringp, dringp->end_idx);
7614 
7615 	addr = (vsw_private_desc_t *)dringp->priv_addr + dringp->end_idx;
7616 
7617 	mutex_enter(&addr->dstate_lock);
7618 	if (addr->dstate == VIO_DESC_FREE) {
7619 		addr->dstate = VIO_DESC_READY;
7620 		*priv_p = addr;
7621 		*idx = dringp->end_idx;
7622 		dringp->end_idx = (dringp->end_idx + 1) % num;
7623 		ret = 0;
7624 
7625 	}
7626 	mutex_exit(&addr->dstate_lock);
7627 
7628 	/* ring full */
7629 	if (ret == 1) {
7630 		D2(NULL, "%s: no desp free: started at %d", __func__,
7631 			dringp->end_idx);
7632 	}
7633 
7634 	D1(NULL, "%s: exit\n", __func__);
7635 
7636 	return (ret);
7637 }
7638 
7639 /*
7640  * Map from a dring identifier to the ring itself. Returns
7641  * pointer to ring or NULL if no match found.
7642  */
7643 static dring_info_t *
7644 vsw_ident2dring(lane_t *lane, uint64_t ident)
7645 {
7646 	dring_info_t	*dp = NULL;
7647 
7648 	if ((dp = lane->dringp) == NULL) {
7649 		return (NULL);
7650 	} else {
7651 		if (dp->ident == ident)
7652 			return (dp);
7653 
7654 		while (dp != NULL) {
7655 			if (dp->ident == ident)
7656 				break;
7657 			dp = dp->next;
7658 		}
7659 	}
7660 
7661 	return (dp);
7662 }
7663 
7664 /*
7665  * Set the default lane attributes. These are copied into
7666  * the attr msg we send to our peer. If they are not acceptable
7667  * then (currently) the handshake ends.
7668  */
7669 static void
7670 vsw_set_lane_attr(vsw_t *vswp, lane_t *lp)
7671 {
7672 	bzero(lp, sizeof (lane_t));
7673 
7674 	READ_ENTER(&vswp->if_lockrw);
7675 	ether_copy(&(vswp->if_addr), &(lp->addr));
7676 	RW_EXIT(&vswp->if_lockrw);
7677 
7678 	lp->mtu = VSW_MTU;
7679 	lp->addr_type = ADDR_TYPE_MAC;
7680 	lp->xfer_mode = VIO_DRING_MODE;
7681 	lp->ack_freq = 0;	/* for shared mode */
7682 
7683 	mutex_enter(&lp->seq_lock);
7684 	lp->seq_num = VNET_ISS;
7685 	mutex_exit(&lp->seq_lock);
7686 }
7687 
7688 /*
7689  * Verify that the attributes are acceptable.
7690  *
7691  * FUTURE: If some attributes are not acceptable, change them
7692  * our desired values.
7693  */
7694 static int
7695 vsw_check_attr(vnet_attr_msg_t *pkt, vsw_port_t *port)
7696 {
7697 	int	ret = 0;
7698 
7699 	D1(NULL, "vsw_check_attr enter\n");
7700 
7701 	/*
7702 	 * Note we currently only support in-band descriptors
7703 	 * and descriptor rings, not packet based transfer (VIO_PKT_MODE)
7704 	 */
7705 	if ((pkt->xfer_mode != VIO_DESC_MODE) &&
7706 			(pkt->xfer_mode != VIO_DRING_MODE)) {
7707 		D2(NULL, "vsw_check_attr: unknown mode %x\n",
7708 			pkt->xfer_mode);
7709 		ret = 1;
7710 	}
7711 
7712 	/* Only support MAC addresses at moment. */
7713 	if ((pkt->addr_type != ADDR_TYPE_MAC) || (pkt->addr == 0)) {
7714 		D2(NULL, "vsw_check_attr: invalid addr_type %x, "
7715 			"or address 0x%llx\n", pkt->addr_type,
7716 			pkt->addr);
7717 		ret = 1;
7718 	}
7719 
7720 	/*
7721 	 * MAC address supplied by device should match that stored
7722 	 * in the vsw-port OBP node. Need to decide what to do if they
7723 	 * don't match, for the moment just warn but don't fail.
7724 	 */
7725 	if (bcmp(&pkt->addr, &port->p_macaddr, ETHERADDRL) != 0) {
7726 		DERR(NULL, "vsw_check_attr: device supplied address "
7727 			"0x%llx doesn't match node address 0x%llx\n",
7728 			pkt->addr, port->p_macaddr);
7729 	}
7730 
7731 	/*
7732 	 * Ack freq only makes sense in pkt mode, in shared
7733 	 * mode the ring descriptors say whether or not to
7734 	 * send back an ACK.
7735 	 */
7736 	if ((pkt->xfer_mode == VIO_DRING_MODE) &&
7737 				(pkt->ack_freq > 0)) {
7738 		D2(NULL, "vsw_check_attr: non zero ack freq "
7739 			" in SHM mode\n");
7740 		ret = 1;
7741 	}
7742 
7743 	/*
7744 	 * Note: for the moment we only support ETHER
7745 	 * frames. This may change in the future.
7746 	 */
7747 	if ((pkt->mtu > VSW_MTU) || (pkt->mtu <= 0)) {
7748 		D2(NULL, "vsw_check_attr: invalid MTU (0x%llx)\n",
7749 			pkt->mtu);
7750 		ret = 1;
7751 	}
7752 
7753 	D1(NULL, "vsw_check_attr exit\n");
7754 
7755 	return (ret);
7756 }
7757 
7758 /*
7759  * Returns 1 if there is a problem, 0 otherwise.
7760  */
7761 static int
7762 vsw_check_dring_info(vio_dring_reg_msg_t *pkt)
7763 {
7764 	_NOTE(ARGUNUSED(pkt))
7765 
7766 	int	ret = 0;
7767 
7768 	D1(NULL, "vsw_check_dring_info enter\n");
7769 
7770 	if ((pkt->num_descriptors == 0) ||
7771 		(pkt->descriptor_size == 0) ||
7772 		(pkt->ncookies != 1)) {
7773 		DERR(NULL, "vsw_check_dring_info: invalid dring msg");
7774 		ret = 1;
7775 	}
7776 
7777 	D1(NULL, "vsw_check_dring_info exit\n");
7778 
7779 	return (ret);
7780 }
7781 
7782 /*
7783  * Returns 1 if two memory cookies match. Otherwise returns 0.
7784  */
7785 static int
7786 vsw_mem_cookie_match(ldc_mem_cookie_t *m1, ldc_mem_cookie_t *m2)
7787 {
7788 	if ((m1->addr != m2->addr) ||
7789 		(m2->size != m2->size)) {
7790 		return (0);
7791 	} else {
7792 		return (1);
7793 	}
7794 }
7795 
7796 /*
7797  * Returns 1 if ring described in reg message matches that
7798  * described by dring_info structure. Otherwise returns 0.
7799  */
7800 static int
7801 vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg)
7802 {
7803 	if ((msg->descriptor_size != dp->descriptor_size) ||
7804 		(msg->num_descriptors != dp->num_descriptors) ||
7805 		(msg->ncookies != dp->ncookies) ||
7806 		!(vsw_mem_cookie_match(&msg->cookie[0], &dp->cookie[0]))) {
7807 		return (0);
7808 	} else {
7809 		return (1);
7810 	}
7811 
7812 }
7813 
7814 static caddr_t
7815 vsw_print_ethaddr(uint8_t *a, char *ebuf)
7816 {
7817 	(void) sprintf(ebuf, "%x:%x:%x:%x:%x:%x",
7818 	    a[0], a[1], a[2], a[3], a[4], a[5]);
7819 	return (ebuf);
7820 }
7821 
7822 /*
7823  * Reset and free all the resources associated with
7824  * the channel.
7825  */
7826 static void
7827 vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir)
7828 {
7829 	dring_info_t		*dp, *dpp;
7830 	lane_t			*lp = NULL;
7831 	int			rv = 0;
7832 
7833 	ASSERT(ldcp != NULL);
7834 
7835 	D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id);
7836 
7837 	if (dir == INBOUND) {
7838 		D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane"
7839 			" of channel %lld", __func__, ldcp->ldc_id);
7840 		lp = &ldcp->lane_in;
7841 	} else {
7842 		D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane"
7843 			" of channel %lld", __func__, ldcp->ldc_id);
7844 		lp = &ldcp->lane_out;
7845 	}
7846 
7847 	lp->lstate = VSW_LANE_INACTIV;
7848 	mutex_enter(&lp->seq_lock);
7849 	lp->seq_num = VNET_ISS;
7850 	mutex_exit(&lp->seq_lock);
7851 	if (lp->dringp) {
7852 		if (dir == INBOUND) {
7853 			dp = lp->dringp;
7854 			while (dp != NULL) {
7855 				dpp = dp->next;
7856 				if (dp->handle != NULL)
7857 					(void) ldc_mem_dring_unmap(dp->handle);
7858 				kmem_free(dp, sizeof (dring_info_t));
7859 				dp = dpp;
7860 			}
7861 		} else {
7862 			/*
7863 			 * unbind, destroy exported dring, free dring struct
7864 			 */
7865 			dp = lp->dringp;
7866 			rv = vsw_free_ring(dp);
7867 		}
7868 		if (rv == 0) {
7869 			lp->dringp = NULL;
7870 		}
7871 	}
7872 
7873 	D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id);
7874 }
7875 
7876 /*
7877  * Free ring and all associated resources.
7878  */
7879 static int
7880 vsw_free_ring(dring_info_t *dp)
7881 {
7882 	vsw_private_desc_t	*paddr = NULL;
7883 	dring_info_t		*dpp;
7884 	int			i, rv = 1;
7885 
7886 	while (dp != NULL) {
7887 		mutex_enter(&dp->dlock);
7888 		dpp = dp->next;
7889 		if (dp->priv_addr != NULL) {
7890 			/*
7891 			 * First unbind and free the memory handles
7892 			 * stored in each descriptor within the ring.
7893 			 */
7894 			for (i = 0; i < VSW_RING_NUM_EL; i++) {
7895 				paddr = (vsw_private_desc_t *)
7896 						dp->priv_addr + i;
7897 				if (paddr->memhandle != NULL) {
7898 					if (paddr->bound == 1) {
7899 						rv = ldc_mem_unbind_handle(
7900 							paddr->memhandle);
7901 
7902 						if (rv != 0) {
7903 							DERR(NULL, "error "
7904 							"unbinding handle for "
7905 							"ring 0x%llx at pos %d",
7906 							dp, i);
7907 							mutex_exit(&dp->dlock);
7908 							return (rv);
7909 						}
7910 						paddr->bound = 0;
7911 					}
7912 
7913 					rv = ldc_mem_free_handle(
7914 							paddr->memhandle);
7915 					if (rv != 0) {
7916 						DERR(NULL, "error freeing "
7917 							"handle for ring "
7918 							"0x%llx at pos %d",
7919 							dp, i);
7920 						mutex_exit(&dp->dlock);
7921 						return (rv);
7922 					}
7923 					paddr->memhandle = NULL;
7924 				}
7925 				mutex_destroy(&paddr->dstate_lock);
7926 			}
7927 			kmem_free(dp->priv_addr, (sizeof (vsw_private_desc_t)
7928 					* VSW_RING_NUM_EL));
7929 		}
7930 
7931 		/*
7932 		 * Now unbind and destroy the ring itself.
7933 		 */
7934 		if (dp->handle != NULL) {
7935 			(void) ldc_mem_dring_unbind(dp->handle);
7936 			(void) ldc_mem_dring_destroy(dp->handle);
7937 		}
7938 
7939 		if (dp->data_addr != NULL) {
7940 			kmem_free(dp->data_addr, dp->data_sz);
7941 		}
7942 
7943 		mutex_exit(&dp->dlock);
7944 		mutex_destroy(&dp->dlock);
7945 		mutex_destroy(&dp->restart_lock);
7946 		kmem_free(dp, sizeof (dring_info_t));
7947 
7948 		dp = dpp;
7949 	}
7950 	return (0);
7951 }
7952 
7953 /*
7954  * Debugging routines
7955  */
7956 static void
7957 display_state(void)
7958 {
7959 	vsw_t		*vswp;
7960 	vsw_port_list_t	*plist;
7961 	vsw_port_t 	*port;
7962 	vsw_ldc_list_t	*ldcl;
7963 	vsw_ldc_t 	*ldcp;
7964 
7965 	cmn_err(CE_NOTE, "***** system state *****");
7966 
7967 	for (vswp = vsw_head; vswp; vswp = vswp->next) {
7968 		plist = &vswp->plist;
7969 		READ_ENTER(&plist->lockrw);
7970 		cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n",
7971 			vswp->instance, plist->num_ports);
7972 
7973 		for (port = plist->head; port != NULL; port = port->p_next) {
7974 			ldcl = &port->p_ldclist;
7975 			cmn_err(CE_CONT, "port %d : %d ldcs attached\n",
7976 				port->p_instance, ldcl->num_ldcs);
7977 			READ_ENTER(&ldcl->lockrw);
7978 			ldcp = ldcl->head;
7979 			for (; ldcp != NULL; ldcp = ldcp->ldc_next) {
7980 				cmn_err(CE_CONT, "chan %lu : dev %d : "
7981 					"status %d : phase %u\n",
7982 					ldcp->ldc_id, ldcp->dev_class,
7983 					ldcp->ldc_status, ldcp->hphase);
7984 				cmn_err(CE_CONT, "chan %lu : lsession %lu : "
7985 					"psession %lu\n",
7986 					ldcp->ldc_id,
7987 					ldcp->local_session,
7988 					ldcp->peer_session);
7989 
7990 				cmn_err(CE_CONT, "Inbound lane:\n");
7991 				display_lane(&ldcp->lane_in);
7992 				cmn_err(CE_CONT, "Outbound lane:\n");
7993 				display_lane(&ldcp->lane_out);
7994 			}
7995 			RW_EXIT(&ldcl->lockrw);
7996 		}
7997 		RW_EXIT(&plist->lockrw);
7998 	}
7999 	cmn_err(CE_NOTE, "***** system state *****");
8000 }
8001 
8002 static void
8003 display_lane(lane_t *lp)
8004 {
8005 	dring_info_t	*drp;
8006 
8007 	cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n",
8008 		lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu);
8009 	cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n",
8010 		lp->addr_type, lp->addr, lp->xfer_mode);
8011 	cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp);
8012 
8013 	cmn_err(CE_CONT, "Dring info:\n");
8014 	for (drp = lp->dringp; drp != NULL; drp = drp->next) {
8015 		cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n",
8016 			drp->num_descriptors, drp->descriptor_size);
8017 		cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->handle);
8018 		cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n",
8019 			(uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr);
8020 		cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n",
8021 			drp->ident, drp->end_idx);
8022 		display_ring(drp);
8023 	}
8024 }
8025 
8026 static void
8027 display_ring(dring_info_t *dringp)
8028 {
8029 	uint64_t		i;
8030 	uint64_t		priv_count = 0;
8031 	uint64_t		pub_count = 0;
8032 	vnet_public_desc_t	*pub_addr = NULL;
8033 	vsw_private_desc_t	*priv_addr = NULL;
8034 
8035 	for (i = 0; i < VSW_RING_NUM_EL; i++) {
8036 		if (dringp->pub_addr != NULL) {
8037 			pub_addr = (vnet_public_desc_t *)dringp->pub_addr + i;
8038 
8039 			if (pub_addr->hdr.dstate == VIO_DESC_FREE)
8040 				pub_count++;
8041 		}
8042 
8043 		if (dringp->priv_addr != NULL) {
8044 			priv_addr =
8045 				(vsw_private_desc_t *)dringp->priv_addr + i;
8046 
8047 			if (priv_addr->dstate == VIO_DESC_FREE)
8048 				priv_count++;
8049 		}
8050 	}
8051 	cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n",
8052 			i, priv_count, pub_count);
8053 }
8054 
8055 static void
8056 dump_flags(uint64_t state)
8057 {
8058 	int	i;
8059 
8060 	typedef struct flag_name {
8061 		int	flag_val;
8062 		char	*flag_name;
8063 	} flag_name_t;
8064 
8065 	flag_name_t	flags[] = {
8066 		VSW_VER_INFO_SENT, "VSW_VER_INFO_SENT",
8067 		VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV",
8068 		VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV",
8069 		VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT",
8070 		VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV",
8071 		VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT",
8072 		VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT",
8073 		VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV",
8074 		VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT",
8075 		VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV",
8076 		VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT",
8077 		VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV",
8078 		VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT",
8079 		VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV",
8080 		VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT",
8081 		VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV",
8082 		VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT",
8083 		VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV",
8084 		VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT",
8085 		VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV",
8086 		VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT",
8087 		VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV",
8088 		VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT",
8089 		VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV",
8090 		VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT",
8091 		VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV",
8092 		VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT",
8093 		VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV",
8094 		VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT",
8095 		VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV",
8096 		VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE"};
8097 
8098 	DERR(NULL, "DUMP_FLAGS: %llx\n", state);
8099 	for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) {
8100 		if (state & flags[i].flag_val)
8101 			DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name);
8102 	}
8103 }
8104