xref: /illumos-gate/usr/src/uts/sun4v/io/vsw.c (revision 0f5dca202932d0332edc1fe814b977277a94fe5c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/errno.h>
31 #include <sys/debug.h>
32 #include <sys/time.h>
33 #include <sys/sysmacros.h>
34 #include <sys/systm.h>
35 #include <sys/user.h>
36 #include <sys/stropts.h>
37 #include <sys/stream.h>
38 #include <sys/strlog.h>
39 #include <sys/strsubr.h>
40 #include <sys/cmn_err.h>
41 #include <sys/cpu.h>
42 #include <sys/kmem.h>
43 #include <sys/conf.h>
44 #include <sys/ddi.h>
45 #include <sys/sunddi.h>
46 #include <sys/ksynch.h>
47 #include <sys/stat.h>
48 #include <sys/kstat.h>
49 #include <sys/vtrace.h>
50 #include <sys/strsun.h>
51 #include <sys/dlpi.h>
52 #include <sys/ethernet.h>
53 #include <net/if.h>
54 #include <sys/varargs.h>
55 #include <sys/machsystm.h>
56 #include <sys/modctl.h>
57 #include <sys/modhash.h>
58 #include <sys/mac.h>
59 #include <sys/mac_ether.h>
60 #include <sys/taskq.h>
61 #include <sys/note.h>
62 #include <sys/mach_descrip.h>
63 #include <sys/mac.h>
64 #include <sys/mdeg.h>
65 #include <sys/ldc.h>
66 #include <sys/vsw_fdb.h>
67 #include <sys/vsw.h>
68 #include <sys/vio_mailbox.h>
69 #include <sys/vnet_mailbox.h>
70 #include <sys/vnet_common.h>
71 #include <sys/vio_util.h>
72 #include <sys/sdt.h>
73 
74 /*
75  * Function prototypes.
76  */
77 static	int vsw_attach(dev_info_t *, ddi_attach_cmd_t);
78 static	int vsw_detach(dev_info_t *, ddi_detach_cmd_t);
79 static	int vsw_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
80 static	void vsw_get_md_properties(vsw_t *vswp);
81 static	int vsw_get_physaddr(vsw_t *);
82 static	int vsw_setup_layer2(vsw_t *);
83 static	int vsw_setup_layer3(vsw_t *);
84 
85 /* MAC Ring table functions. */
86 static void vsw_mac_ring_tbl_init(vsw_t *vswp);
87 static void vsw_mac_ring_tbl_destroy(vsw_t *vswp);
88 static void vsw_queue_worker(vsw_mac_ring_t *rrp);
89 static void vsw_queue_stop(vsw_queue_t *vqp);
90 static vsw_queue_t *vsw_queue_create();
91 static void vsw_queue_destroy(vsw_queue_t *vqp);
92 
93 /* MAC layer routines */
94 static mac_resource_handle_t vsw_mac_ring_add_cb(void *arg,
95 		mac_resource_t *mrp);
96 static	int vsw_get_hw_maddr(vsw_t *);
97 static	int vsw_set_hw(vsw_t *, vsw_port_t *);
98 static	int vsw_set_hw_promisc(vsw_t *, vsw_port_t *);
99 static	int vsw_unset_hw(vsw_t *, vsw_port_t *);
100 static	int vsw_unset_hw_promisc(vsw_t *, vsw_port_t *);
101 static	int vsw_reconfig_hw(vsw_t *);
102 static int vsw_mac_attach(vsw_t *vswp);
103 static void vsw_mac_detach(vsw_t *vswp);
104 
105 static void vsw_rx_queue_cb(void *, mac_resource_handle_t, mblk_t *);
106 static void vsw_rx_cb(void *, mac_resource_handle_t, mblk_t *);
107 static mblk_t *vsw_tx_msg(vsw_t *, mblk_t *);
108 static int vsw_mac_register(vsw_t *);
109 static int vsw_mac_unregister(vsw_t *);
110 static int vsw_m_stat(void *, uint_t, uint64_t *);
111 static void vsw_m_stop(void *arg);
112 static int vsw_m_start(void *arg);
113 static int vsw_m_unicst(void *arg, const uint8_t *);
114 static int vsw_m_multicst(void *arg, boolean_t, const uint8_t *);
115 static int vsw_m_promisc(void *arg, boolean_t);
116 static mblk_t *vsw_m_tx(void *arg, mblk_t *);
117 
118 /* MDEG routines */
119 static	void vsw_mdeg_register(vsw_t *vswp);
120 static	void vsw_mdeg_unregister(vsw_t *vswp);
121 static	int vsw_mdeg_cb(void *cb_argp, mdeg_result_t *);
122 
123 /* Port add/deletion routines */
124 static	int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
125 static	int vsw_port_attach(vsw_t *vswp, int p_instance,
126 	uint64_t *ldcids, int nids, struct ether_addr *macaddr);
127 static	int vsw_detach_ports(vsw_t *vswp);
128 static	int vsw_port_detach(vsw_t *vswp, int p_instance);
129 static	int vsw_port_delete(vsw_port_t *port);
130 static	int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id);
131 static	int vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id);
132 static	int vsw_init_ldcs(vsw_port_t *port);
133 static	int vsw_uninit_ldcs(vsw_port_t *port);
134 static	int vsw_ldc_init(vsw_ldc_t *ldcp);
135 static	int vsw_ldc_uninit(vsw_ldc_t *ldcp);
136 static	int vsw_drain_ldcs(vsw_port_t *port);
137 static	int vsw_drain_port_taskq(vsw_port_t *port);
138 static	void vsw_marker_task(void *);
139 static	vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);
140 static	int vsw_plist_del_node(vsw_t *, vsw_port_t *port);
141 
142 /* Interrupt routines */
143 static	uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg);
144 
145 /* Handshake routines */
146 static	void vsw_restart_handshake(vsw_ldc_t *);
147 static	int vsw_check_flag(vsw_ldc_t *, int, uint64_t);
148 static	void vsw_next_milestone(vsw_ldc_t *);
149 static	int vsw_supported_version(vio_ver_msg_t *);
150 
151 /* Data processing routines */
152 static void vsw_process_pkt(void *);
153 static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t);
154 static void vsw_process_ctrl_pkt(void *);
155 static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *);
156 static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *);
157 static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *);
158 static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *);
159 static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *);
160 static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *);
161 static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t);
162 static void vsw_process_data_dring_pkt(vsw_ldc_t *, void *);
163 static void vsw_process_data_raw_pkt(vsw_ldc_t *, void *);
164 static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *);
165 static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t);
166 
167 /* Switching/data transmit routines */
168 static	void vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
169 	    vsw_port_t *port, mac_resource_handle_t);
170 static	void vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
171 	    vsw_port_t *port, mac_resource_handle_t);
172 static	int vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller,
173 	    vsw_port_t *port);
174 static	int vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller,
175 	    vsw_port_t *port);
176 static	int vsw_portsend(vsw_port_t *, mblk_t *);
177 static	int vsw_dringsend(vsw_ldc_t *, mblk_t *);
178 static	int vsw_descrsend(vsw_ldc_t *, mblk_t *);
179 
180 /* Packet creation routines */
181 static void vsw_send_ver(vsw_ldc_t *);
182 static void vsw_send_attr(vsw_ldc_t *);
183 static vio_dring_reg_msg_t *vsw_create_dring_info_pkt(vsw_ldc_t *);
184 static void vsw_send_dring_info(vsw_ldc_t *);
185 static void vsw_send_rdx(vsw_ldc_t *);
186 
187 static void vsw_send_msg(vsw_ldc_t *, void *, int);
188 
189 /* Forwarding database (FDB) routines */
190 static	int vsw_add_fdb(vsw_t *vswp, vsw_port_t *port);
191 static	int vsw_del_fdb(vsw_t *vswp, vsw_port_t *port);
192 static	vsw_port_t *vsw_lookup_fdb(vsw_t *vswp, struct ether_header *);
193 static	int vsw_add_rem_mcst(vnet_mcast_msg_t *, vsw_port_t *);
194 static	int vsw_add_mcst(vsw_t *, uint8_t, uint64_t, void *);
195 static	int vsw_del_mcst(vsw_t *, uint8_t, uint64_t, void *);
196 static	void vsw_del_addr(uint8_t, void *, uint64_t);
197 static	void vsw_del_mcst_port(vsw_port_t *);
198 static	void vsw_del_mcst_vsw(vsw_t *);
199 
200 /* Dring routines */
201 static dring_info_t *vsw_create_dring(vsw_ldc_t *);
202 static void vsw_create_privring(vsw_ldc_t *);
203 static int vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp);
204 static int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **,
205     int *);
206 static dring_info_t *vsw_ident2dring(lane_t *, uint64_t);
207 
208 static void vsw_set_lane_attr(vsw_t *, lane_t *);
209 static int vsw_check_attr(vnet_attr_msg_t *, vsw_port_t *);
210 static int vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg);
211 static int vsw_mem_cookie_match(ldc_mem_cookie_t *, ldc_mem_cookie_t *);
212 static int vsw_check_dring_info(vio_dring_reg_msg_t *);
213 
214 /* Misc support routines */
215 static	caddr_t vsw_print_ethaddr(uint8_t *addr, char *ebuf);
216 static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t);
217 static int vsw_free_ring(dring_info_t *);
218 
219 
220 /* Debugging routines */
221 static void dump_flags(uint64_t);
222 static void display_state(void);
223 static void display_lane(lane_t *);
224 static void display_ring(dring_info_t *);
225 
226 int	vsw_num_handshakes = 3;		/* # of handshake attempts */
227 int	vsw_wretries = 100;		/* # of write attempts */
228 int	vsw_chain_len = 150;		/* max # of mblks in msg chain */
229 int	vsw_desc_delay = 0;		/* delay in us */
230 int	vsw_read_attempts = 5;		/* # of reads of descriptor */
231 
232 uint32_t	vsw_mblk_size = VSW_MBLK_SIZE;
233 uint32_t	vsw_num_mblks = VSW_NUM_MBLKS;
234 
235 
236 /*
237  * mode specific frame switching function
238  */
239 void		(*vsw_switch_frame)(vsw_t *, mblk_t *, int, vsw_port_t *,
240 			mac_resource_handle_t);
241 
242 static	mac_callbacks_t	vsw_m_callbacks = {
243 	0,
244 	vsw_m_stat,
245 	vsw_m_start,
246 	vsw_m_stop,
247 	vsw_m_promisc,
248 	vsw_m_multicst,
249 	vsw_m_unicst,
250 	vsw_m_tx,
251 	NULL,
252 	NULL,
253 	NULL
254 };
255 
256 static	struct	cb_ops	vsw_cb_ops = {
257 	nulldev,			/* cb_open */
258 	nulldev,			/* cb_close */
259 	nodev,				/* cb_strategy */
260 	nodev,				/* cb_print */
261 	nodev,				/* cb_dump */
262 	nodev,				/* cb_read */
263 	nodev,				/* cb_write */
264 	nodev,				/* cb_ioctl */
265 	nodev,				/* cb_devmap */
266 	nodev,				/* cb_mmap */
267 	nodev,				/* cb_segmap */
268 	nochpoll,			/* cb_chpoll */
269 	ddi_prop_op,			/* cb_prop_op */
270 	NULL,				/* cb_stream */
271 	D_MP,				/* cb_flag */
272 	CB_REV,				/* rev */
273 	nodev,				/* int (*cb_aread)() */
274 	nodev				/* int (*cb_awrite)() */
275 };
276 
277 static	struct	dev_ops	vsw_ops = {
278 	DEVO_REV,		/* devo_rev */
279 	0,			/* devo_refcnt */
280 	vsw_getinfo,		/* devo_getinfo */
281 	nulldev,		/* devo_identify */
282 	nulldev,		/* devo_probe */
283 	vsw_attach,		/* devo_attach */
284 	vsw_detach,		/* devo_detach */
285 	nodev,			/* devo_reset */
286 	&vsw_cb_ops,		/* devo_cb_ops */
287 	(struct bus_ops *)NULL,	/* devo_bus_ops */
288 	ddi_power		/* devo_power */
289 };
290 
291 extern	struct	mod_ops	mod_driverops;
292 static struct modldrv vswmodldrv = {
293 	&mod_driverops,
294 	"sun4v Virtual Switch Driver %I%",
295 	&vsw_ops,
296 };
297 
/*
 * Take / drop both per-channel locks of an ldc.
 *
 * LDC_ENTER_LOCK acquires ldc_cblock first and then ldc_txlock;
 * LDC_EXIT_LOCK releases them in the reverse order.
 *
 * Each macro is wrapped in do { } while (0) so that it expands to a
 * single statement and is therefore safe as the body of an unbraced
 * if/else or loop. Callers must supply the terminating semicolon.
 */
#define	LDC_ENTER_LOCK(ldcp)	\
	do {						\
		mutex_enter(&((ldcp)->ldc_cblock));	\
		mutex_enter(&((ldcp)->ldc_txlock));	\
	} while (0)
#define	LDC_EXIT_LOCK(ldcp)	\
	do {						\
		mutex_exit(&((ldcp)->ldc_txlock));	\
		mutex_exit(&((ldcp)->ldc_cblock));	\
	} while (0)
304 
305 /* Driver soft state ptr  */
306 static void	*vsw_state;
307 
308 /*
309  * Linked list of "vsw_t" structures - one per instance.
310  */
311 vsw_t		*vsw_head = NULL;
312 krwlock_t	vsw_rw;
313 
314 /*
315  * Property names
316  */
317 static char vdev_propname[] = "virtual-device";
318 static char vsw_propname[] = "virtual-network-switch";
319 static char physdev_propname[] = "vsw-phys-dev";
320 static char smode_propname[] = "vsw-switch-mode";
321 static char macaddr_propname[] = "local-mac-address";
322 static char remaddr_propname[] = "remote-mac-address";
323 static char ldcids_propname[] = "ldc-ids";
324 static char chan_propname[] = "channel-endpoint";
325 static char id_propname[] = "id";
326 static char reg_propname[] = "reg";
327 
328 /* supported versions */
329 static	ver_sup_t	vsw_versions[] = { {1, 0} };
330 
331 /*
332  * Matching criteria passed to the MDEG to register interest
333  * in changes to 'virtual-device-port' nodes identified by their
334  * 'id' property.
335  */
336 static md_prop_match_t vport_prop_match[] = {
337 	{ MDET_PROP_VAL,    "id"   },
338 	{ MDET_LIST_END,    NULL    }
339 };
340 
341 static mdeg_node_match_t vport_match = { "virtual-device-port",
342 						vport_prop_match };
343 
344 /*
345  * Specification of an MD node passed to the MDEG to filter any
346  * 'vport' nodes that do not belong to the specified node. This
347  * template is copied for each vsw instance and filled in with
348  * the appropriate 'cfg-handle' value before being passed to the MDEG.
349  */
350 static mdeg_prop_spec_t vsw_prop_template[] = {
351 	{ MDET_PROP_STR,    "name",		vsw_propname },
352 	{ MDET_PROP_VAL,    "cfg-handle",	NULL	},
353 	{ MDET_LIST_END,    NULL,		NULL	}
354 };
355 
356 #define	VSW_SET_MDEG_PROP_INST(specp, val)	(specp)[1].ps_val = (val);
357 
358 /*
 * From /etc/system enable/disable thread per ring. This is a mode
 * selection that is done at vsw driver attach time.
361  */
362 boolean_t vsw_multi_ring_enable = B_FALSE;
363 int vsw_mac_rx_rings = VSW_MAC_RX_RINGS;
364 
365 /*
366  * Print debug messages - set to 0x1f to enable all msgs
367  * or 0x0 to turn all off.
368  */
369 int vswdbg = 0x0;
370 
371 /*
372  * debug levels:
373  * 0x01:	Function entry/exit tracing
374  * 0x02:	Internal function messages
375  * 0x04:	Verbose internal messages
376  * 0x08:	Warning messages
377  * 0x10:	Error messages
378  */
379 
380 static void
381 vswdebug(vsw_t *vswp, const char *fmt, ...)
382 {
383 	char buf[512];
384 	va_list ap;
385 
386 	va_start(ap, fmt);
387 	(void) vsprintf(buf, fmt, ap);
388 	va_end(ap);
389 
390 	if (vswp == NULL)
391 		cmn_err(CE_CONT, "%s\n", buf);
392 	else
393 		cmn_err(CE_CONT, "vsw%d: %s\n", vswp->instance, buf);
394 }
395 
396 /*
397  * For the moment the state dump routines have their own
398  * private flag.
399  */
400 #define	DUMP_STATE	0
401 
402 #if DUMP_STATE
403 
404 #define	DUMP_TAG(tag) \
405 {			\
406 	D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype); \
407 	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype);	\
408 	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env);	\
409 }
410 
411 #define	DUMP_TAG_PTR(tag) \
412 {			\
413 	D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype); \
414 	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype);	\
415 	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env);	\
416 }
417 
418 #define	DUMP_FLAGS(flags) dump_flags(flags);
419 #define	DISPLAY_STATE()	display_state()
420 
421 #else
422 
423 #define	DUMP_TAG(tag)
424 #define	DUMP_TAG_PTR(tag)
425 #define	DUMP_FLAGS(state)
426 #define	DISPLAY_STATE()
427 
428 #endif	/* DUMP_STATE */
429 
430 #ifdef DEBUG
431 
432 #define	D1		\
433 if (vswdbg & 0x01)	\
434 	vswdebug
435 
436 #define	D2		\
437 if (vswdbg & 0x02)	\
438 	vswdebug
439 
440 #define	D3		\
441 if (vswdbg & 0x04)	\
442 	vswdebug
443 
444 #define	DWARN		\
445 if (vswdbg & 0x08)	\
446 	vswdebug
447 
448 #define	DERR		\
449 if (vswdbg & 0x10)	\
450 	vswdebug
451 
452 #else
453 
454 #define	DERR		if (0)	vswdebug
455 #define	DWARN		if (0)	vswdebug
456 #define	D1		if (0)	vswdebug
457 #define	D2		if (0)	vswdebug
458 #define	D3		if (0)	vswdebug
459 
460 #endif	/* DEBUG */
461 
462 static struct modlinkage modlinkage = {
463 	MODREV_1,
464 	&vswmodldrv,
465 	NULL
466 };
467 
468 int
469 _init(void)
470 {
471 	int status;
472 
473 	rw_init(&vsw_rw, NULL, RW_DRIVER, NULL);
474 
475 	status = ddi_soft_state_init(&vsw_state, sizeof (vsw_t), 1);
476 	if (status != 0) {
477 		return (status);
478 	}
479 
480 	mac_init_ops(&vsw_ops, "vsw");
481 	status = mod_install(&modlinkage);
482 	if (status != 0) {
483 		ddi_soft_state_fini(&vsw_state);
484 	}
485 	return (status);
486 }
487 
488 int
489 _fini(void)
490 {
491 	int status;
492 
493 	status = mod_remove(&modlinkage);
494 	if (status != 0)
495 		return (status);
496 	mac_fini_ops(&vsw_ops);
497 	ddi_soft_state_fini(&vsw_state);
498 
499 	rw_destroy(&vsw_rw);
500 
501 	return (status);
502 }
503 
504 int
505 _info(struct modinfo *modinfop)
506 {
507 	return (mod_info(&modlinkage, modinfop));
508 }
509 
510 static int
511 vsw_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
512 {
513 	vsw_t		*vswp;
514 	int		instance, i;
515 	char		hashname[MAXNAMELEN];
516 	char		qname[TASKQ_NAMELEN];
517 	int		rv = 1;
518 	enum		{ PROG_init = 0x00,
519 				PROG_if_lock = 0x01,
520 				PROG_fdb = 0x02,
521 				PROG_mfdb = 0x04,
522 				PROG_report_dev = 0x08,
523 				PROG_plist = 0x10,
524 				PROG_taskq = 0x20}
525 			progress;
526 
527 	progress = PROG_init;
528 
529 	switch (cmd) {
530 	case DDI_ATTACH:
531 		break;
532 	case DDI_RESUME:
533 		/* nothing to do for this non-device */
534 		return (DDI_SUCCESS);
535 	case DDI_PM_RESUME:
536 	default:
537 		return (DDI_FAILURE);
538 	}
539 
540 	instance = ddi_get_instance(dip);
541 	if (ddi_soft_state_zalloc(vsw_state, instance) != DDI_SUCCESS) {
542 		DERR(NULL, "vsw%d: ddi_soft_state_zalloc failed", instance);
543 		return (DDI_FAILURE);
544 	}
545 	vswp = ddi_get_soft_state(vsw_state, instance);
546 
547 	if (vswp == NULL) {
548 		DERR(NULL, "vsw%d: ddi_get_soft_state failed", instance);
549 		goto vsw_attach_fail;
550 	}
551 
552 	vswp->dip = dip;
553 	vswp->instance = instance;
554 	ddi_set_driver_private(dip, (caddr_t)vswp);
555 
556 	rw_init(&vswp->if_lockrw, NULL, RW_DRIVER, NULL);
557 	progress |= PROG_if_lock;
558 
559 	/*
560 	 * Get the various properties such as physical device name
561 	 * (vsw-phys-dev), switch mode etc from the MD.
562 	 */
563 	vsw_get_md_properties(vswp);
564 
565 	/* setup the unicast forwarding database  */
566 	(void) snprintf(hashname, MAXNAMELEN, "vsw_unicst_table-%d",
567 							vswp->instance);
568 	D2(vswp, "creating unicast hash table (%s)...", hashname);
569 	vswp->fdb = mod_hash_create_ptrhash(hashname, VSW_NCHAINS,
570 		mod_hash_null_valdtor, sizeof (void *));
571 
572 	progress |= PROG_fdb;
573 
574 	/* setup the multicast fowarding database */
575 	(void) snprintf(hashname, MAXNAMELEN, "vsw_mcst_table-%d",
576 							vswp->instance);
577 	D2(vswp, "creating multicast hash table %s)...", hashname);
578 	rw_init(&vswp->mfdbrw, NULL, RW_DRIVER, NULL);
579 	vswp->mfdb = mod_hash_create_ptrhash(hashname, VSW_NCHAINS,
580 			mod_hash_null_valdtor, sizeof (void *));
581 
582 	progress |= PROG_mfdb;
583 
584 	/*
585 	 * create lock protecting list of multicast addresses
586 	 * which could come via m_multicst() entry point when plumbed.
587 	 */
588 	mutex_init(&vswp->mca_lock, NULL, MUTEX_DRIVER, NULL);
589 	vswp->mcap = NULL;
590 
591 	ddi_report_dev(vswp->dip);
592 
593 	progress |= PROG_report_dev;
594 
595 	WRITE_ENTER(&vsw_rw);
596 	vswp->next = vsw_head;
597 	vsw_head = vswp;
598 	RW_EXIT(&vsw_rw);
599 
600 	/* setup the port list */
601 	rw_init(&vswp->plist.lockrw, NULL, RW_DRIVER, NULL);
602 	vswp->plist.head = NULL;
603 
604 	progress |= PROG_plist;
605 
606 	/*
607 	 * Create the taskq which will process all the VIO
608 	 * control messages.
609 	 */
610 	(void) snprintf(qname, TASKQ_NAMELEN, "vsw_taskq%d", vswp->instance);
611 	if ((vswp->taskq_p = ddi_taskq_create(vswp->dip, qname, 1,
612 					TASKQ_DEFAULTPRI, 0)) == NULL) {
613 		cmn_err(CE_WARN, "Unable to create task queue");
614 		goto vsw_attach_fail;
615 	}
616 
617 	progress |= PROG_taskq;
618 
619 	/* select best switching mode */
620 	for (i = 0; i < vswp->smode_num; i++) {
621 		vswp->smode_idx = i;
622 		switch (vswp->smode[i]) {
623 		case VSW_LAYER2:
624 		case VSW_LAYER2_PROMISC:
625 			rv = vsw_setup_layer2(vswp);
626 			break;
627 
628 		case VSW_LAYER3:
629 			rv = vsw_setup_layer3(vswp);
630 			break;
631 
632 		default:
633 			DERR(vswp, "unknown switch mode");
634 			rv = 1;
635 			break;
636 		}
637 
638 		if (rv == 0)
639 			break;
640 	}
641 
642 	if (rv == 1) {
643 		cmn_err(CE_WARN, "Unable to setup switching mode");
644 		goto vsw_attach_fail;
645 	}
646 
647 	D2(vswp, "Operating in mode %d", vswp->smode[vswp->smode_idx]);
648 
649 	/*
650 	 * Register with the MAC layer as a network device so
651 	 * we can be plumbed if desired.
652 	 *
653 	 * Do this in both layer 2 and layer 3 mode.
654 	 */
655 	vswp->if_state &= ~VSW_IF_UP;
656 	if (vswp->mdprops & (VSW_MD_MACADDR | VSW_DEV_MACADDR)) {
657 		if (vsw_mac_register(vswp) != 0) {
658 			cmn_err(CE_WARN, "Unable to register as provider "
659 				" with MAC layer, continuing with attach");
660 		}
661 	}
662 
663 	/* prevent auto-detaching */
664 	if (ddi_prop_update_int(DDI_DEV_T_NONE, vswp->dip,
665 				DDI_NO_AUTODETACH, 1) != DDI_SUCCESS) {
666 		cmn_err(CE_NOTE, "Unable to set \"%s\" property for "
667 			"instance %u", DDI_NO_AUTODETACH, instance);
668 	}
669 
670 	/*
671 	 * Now we have everything setup, register for MD change
672 	 * events.
673 	 */
674 	vsw_mdeg_register(vswp);
675 
676 	return (DDI_SUCCESS);
677 
678 vsw_attach_fail:
679 	DERR(NULL, "vsw_attach: failed");
680 
681 	if (progress & PROG_taskq)
682 		ddi_taskq_destroy(vswp->taskq_p);
683 
684 	if (progress & PROG_plist)
685 		rw_destroy(&vswp->plist.lockrw);
686 
687 	if (progress & PROG_report_dev) {
688 		ddi_remove_minor_node(dip, NULL);
689 		mutex_destroy(&vswp->mca_lock);
690 	}
691 
692 	if (progress & PROG_mfdb) {
693 		mod_hash_destroy_hash(vswp->mfdb);
694 		vswp->mfdb = NULL;
695 		rw_destroy(&vswp->mfdbrw);
696 	}
697 
698 	if (progress & PROG_fdb) {
699 		mod_hash_destroy_hash(vswp->fdb);
700 		vswp->fdb = NULL;
701 	}
702 
703 	if (progress & PROG_if_lock)
704 		rw_destroy(&vswp->if_lockrw);
705 
706 	ddi_soft_state_free(vsw_state, instance);
707 	return (DDI_FAILURE);
708 }
709 
710 static int
711 vsw_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
712 {
713 	vio_mblk_pool_t		*poolp, *npoolp;
714 	vsw_t			**vswpp, *vswp;
715 	int 			instance;
716 
717 	instance = ddi_get_instance(dip);
718 	vswp = ddi_get_soft_state(vsw_state, instance);
719 
720 	if (vswp == NULL) {
721 		return (DDI_FAILURE);
722 	}
723 
724 	switch (cmd) {
725 	case DDI_DETACH:
726 		break;
727 	case DDI_SUSPEND:
728 	case DDI_PM_SUSPEND:
729 	default:
730 		return (DDI_FAILURE);
731 	}
732 
733 	D2(vswp, "detaching instance %d", instance);
734 
735 	if (vswp->mdprops & (VSW_MD_MACADDR | VSW_DEV_MACADDR)) {
736 		if (vsw_mac_unregister(vswp) != 0) {
737 			cmn_err(CE_WARN, "Unable to detach from MAC layer");
738 			return (DDI_FAILURE);
739 		}
740 		rw_destroy(&vswp->if_lockrw);
741 	}
742 
743 	vsw_mdeg_unregister(vswp);
744 
745 	/* remove mac layer callback */
746 	if ((vswp->mh != NULL) && (vswp->mrh != NULL)) {
747 		mac_rx_remove(vswp->mh, vswp->mrh);
748 		vswp->mrh = NULL;
749 	}
750 
751 	if (vsw_detach_ports(vswp) != 0) {
752 		cmn_err(CE_WARN, "Unable to detach ports");
753 		return (DDI_FAILURE);
754 	}
755 
756 	/*
757 	 * Now that the ports have been deleted, stop and close
758 	 * the physical device.
759 	 */
760 	if (vswp->mh != NULL) {
761 		if (vswp->mstarted)
762 			mac_stop(vswp->mh);
763 		if (vswp->mresources)
764 			mac_resource_set(vswp->mh, NULL, NULL);
765 		mac_close(vswp->mh);
766 
767 		vswp->mh = NULL;
768 		vswp->txinfo = NULL;
769 	}
770 
771 	/*
772 	 * Destroy any free pools that may still exist.
773 	 */
774 	poolp = vswp->rxh;
775 	while (poolp != NULL) {
776 		npoolp = vswp->rxh = poolp->nextp;
777 		if (vio_destroy_mblks(poolp) != 0) {
778 			vswp->rxh = poolp;
779 			return (DDI_FAILURE);
780 		}
781 		poolp = npoolp;
782 	}
783 
784 	/*
785 	 * Remove this instance from any entries it may be on in
786 	 * the hash table by using the list of addresses maintained
787 	 * in the vsw_t structure.
788 	 */
789 	vsw_del_mcst_vsw(vswp);
790 
791 	vswp->mcap = NULL;
792 	mutex_destroy(&vswp->mca_lock);
793 
794 	/*
795 	 * By now any pending tasks have finished and the underlying
796 	 * ldc's have been destroyed, so its safe to delete the control
797 	 * message taskq.
798 	 */
799 	if (vswp->taskq_p != NULL)
800 		ddi_taskq_destroy(vswp->taskq_p);
801 
802 	/*
803 	 * At this stage all the data pointers in the hash table
804 	 * should be NULL, as all the ports have been removed and will
805 	 * have deleted themselves from the port lists which the data
806 	 * pointers point to. Hence we can destroy the table using the
807 	 * default destructors.
808 	 */
809 	D2(vswp, "vsw_detach: destroying hash tables..");
810 	mod_hash_destroy_hash(vswp->fdb);
811 	vswp->fdb = NULL;
812 
813 	WRITE_ENTER(&vswp->mfdbrw);
814 	mod_hash_destroy_hash(vswp->mfdb);
815 	vswp->mfdb = NULL;
816 	RW_EXIT(&vswp->mfdbrw);
817 	rw_destroy(&vswp->mfdbrw);
818 
819 	ddi_remove_minor_node(dip, NULL);
820 
821 	rw_destroy(&vswp->plist.lockrw);
822 	WRITE_ENTER(&vsw_rw);
823 	for (vswpp = &vsw_head; *vswpp; vswpp = &(*vswpp)->next) {
824 		if (*vswpp == vswp) {
825 			*vswpp = vswp->next;
826 			break;
827 		}
828 	}
829 	RW_EXIT(&vsw_rw);
830 	ddi_soft_state_free(vsw_state, instance);
831 
832 	return (DDI_SUCCESS);
833 }
834 
835 static int
836 vsw_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
837 {
838 	_NOTE(ARGUNUSED(dip))
839 
840 	vsw_t	*vswp = NULL;
841 	dev_t	dev = (dev_t)arg;
842 	int	instance;
843 
844 	instance = getminor(dev);
845 
846 	switch (infocmd) {
847 	case DDI_INFO_DEVT2DEVINFO:
848 		if ((vswp = ddi_get_soft_state(vsw_state, instance)) == NULL) {
849 			*result = NULL;
850 			return (DDI_FAILURE);
851 		}
852 		*result = vswp->dip;
853 		return (DDI_SUCCESS);
854 
855 	case DDI_INFO_DEVT2INSTANCE:
856 		*result = (void *)(uintptr_t)instance;
857 		return (DDI_SUCCESS);
858 
859 	default:
860 		*result = NULL;
861 		return (DDI_FAILURE);
862 	}
863 }
864 
865 /*
866  * Get the properties from our MD node.
867  */
868 static void
869 vsw_get_md_properties(vsw_t *vswp)
870 {
871 	md_t		*mdp = NULL;
872 	int		num_nodes = 0;
873 	int		len = 0, listsz = 0;
874 	int		num_vdev = 0;
875 	int		i, idx;
876 	boolean_t	found_node = B_FALSE;
877 	char		*smode = NULL;
878 	char		*curr_mode = NULL;
879 	char		*physname = NULL;
880 	char		*node_name = NULL;
881 	char		*dev;
882 	uint64_t 	macaddr = 0;
883 	uint64_t	md_inst, obp_inst;
884 	mde_cookie_t	*listp = NULL;
885 	mde_cookie_t	rootnode;
886 
887 	D1(vswp, "%s: enter", __func__);
888 
889 	/*
890 	 * Further down we compare the obp 'reg' property to the
891 	 * 'cfg-handle' property in the vsw MD node to determine
892 	 * if the node refers to this particular instance. So if
893 	 * we can't read the obp value then there is no point
894 	 * in proceeding further.
895 	 */
896 	if (ddi_prop_exists(DDI_DEV_T_ANY, vswp->dip,
897 			DDI_PROP_DONTPASS, reg_propname) != 1) {
898 		cmn_err(CE_WARN, "Unable to read %s property "
899 			"from OBP device node", reg_propname);
900 		return;
901 	}
902 
903 	obp_inst = ddi_prop_get_int(DDI_DEV_T_ANY, vswp->dip,
904 		DDI_PROP_DONTPASS, reg_propname, 0);
905 
906 	D2(vswp, "%s: obp_inst 0x%llx", __func__, obp_inst);
907 
908 	if ((mdp = md_get_handle()) == NULL) {
909 		DERR(vswp, "%s: unable to init MD", __func__);
910 		return;
911 	}
912 
913 	if ((num_nodes = md_node_count(mdp)) <= 0) {
914 		DERR(vswp, "%s: invalid number of  nodes found %d",
915 			__func__, num_nodes);
916 		(void) md_fini_handle(mdp);
917 		return;
918 	}
919 
920 	D2(vswp, "%s: %d nodes in total in MD", __func__, num_nodes);
921 
922 	/* allocate enough space for node list */
923 	listsz = num_nodes * sizeof (mde_cookie_t);
924 	listp = kmem_zalloc(listsz, KM_SLEEP);
925 
926 	rootnode = md_root_node(mdp);
927 
928 	/* Get the list of virtual devices */
929 	num_vdev = md_scan_dag(mdp, rootnode,
930 		md_find_name(mdp, vdev_propname),
931 		md_find_name(mdp, "fwd"), listp);
932 
933 	if (num_vdev <= 0) {
934 		DERR(vswp, "%s: didn't find any virtual-device nodes in MD",
935 			__func__);
936 		goto md_prop_exit;
937 	}
938 
939 	D2(vswp, "%s: %d virtual-device nodes found", __func__, num_vdev);
940 
941 	/* Look for the virtual switch nodes in the list */
942 	for (idx = 0; idx < num_vdev; idx++) {
943 		if (md_get_prop_str(mdp, listp[idx],
944 				"name", &node_name) != 0) {
945 			DERR(vswp, "%s: unable to get node name", __func__);
946 			continue;
947 
948 		}
949 
950 		if (strcmp(node_name, vsw_propname) == 0) {
951 			/* Virtual switch node */
952 			if (md_get_prop_val(mdp, listp[idx],
953 				"cfg-handle", &md_inst) != 0) {
954 				DERR(vswp, "%s: unable to get cfg-handle from"
955 					" node %d", __func__, idx);
956 				goto md_prop_exit;
957 			} else if (md_inst == obp_inst) {
958 				D2(vswp, "%s: found matching node (%d)"
959 					" 0x%llx == 0x%llx", __func__, idx,
960 					md_inst, obp_inst);
961 				found_node = B_TRUE;
962 				break;
963 			}
964 		}
965 	}
966 
967 	if (!found_node) {
968 		DWARN(vswp, "%s: couldn't find correct vsw node", __func__);
969 		goto md_prop_exit;
970 	}
971 
972 	/*
973 	 * Now, having found the correct node, get the various properties.
974 	 */
975 
976 	if (md_get_prop_data(mdp, listp[idx], physdev_propname,
977 				(uint8_t **)(&physname), &len) != 0) {
978 		cmn_err(CE_WARN, "%s: unable to get name(s) of physical "
979 			"device(s) from MD", __func__);
980 	} else if ((strlen(physname) + 1) > LIFNAMSIZ) {
981 		cmn_err(CE_WARN, "%s is too long a device name", physname);
982 	} else {
983 		(void) strncpy(vswp->physname, physname, strlen(physname) + 1);
984 		vswp->mdprops |= VSW_MD_PHYSNAME;
985 		D2(vswp, "%s: using first device specified (%s)",
986 			__func__, vswp->physname);
987 	}
988 
989 #ifdef DEBUG
990 	/*
991 	 * As a temporary measure to aid testing we check to see if there
992 	 * is a vsw.conf file present. If there is we use the value of the
993 	 * vsw_physname property in the file as the name of the physical
994 	 * device, overriding the value from the MD.
995 	 *
996 	 * There may be multiple devices listed, but for the moment
997 	 * we just use the first one.
998 	 */
999 	if (ddi_prop_lookup_string(DDI_DEV_T_ANY, vswp->dip, 0,
1000 		"vsw_physname", &dev) == DDI_PROP_SUCCESS) {
1001 		if ((strlen(dev) + 1) > LIFNAMSIZ) {
1002 			cmn_err(CE_WARN, "%s is too long a device name", dev);
1003 		} else {
1004 			cmn_err(CE_NOTE, "%s: using device name (%s) from "
1005 				"config file", __func__, dev);
1006 
1007 			(void) strncpy(vswp->physname, dev, strlen(dev) + 1);
1008 			vswp->mdprops |= VSW_MD_PHYSNAME;
1009 		}
1010 
1011 		ddi_prop_free(dev);
1012 
1013 	}
1014 #endif
1015 
1016 	/* mac address for vswitch device itself */
1017 	if (md_get_prop_val(mdp, listp[idx],
1018 			macaddr_propname, &macaddr) != 0) {
1019 		cmn_err(CE_WARN, "!Unable to get MAC address from MD");
1020 
1021 		/*
1022 		 * Fallback to using the mac address of the physical
1023 		 * device.
1024 		 */
1025 		if (vsw_get_physaddr(vswp) == 0) {
1026 			cmn_err(CE_NOTE, "!Using MAC address from physical "
1027 				"device (%s)", vswp->physname);
1028 		}
1029 	} else {
1030 		READ_ENTER(&vswp->if_lockrw);
1031 		for (i = ETHERADDRL - 1; i >= 0; i--) {
1032 			vswp->if_addr.ether_addr_octet[i] = macaddr & 0xFF;
1033 			macaddr >>= 8;
1034 		}
1035 		RW_EXIT(&vswp->if_lockrw);
1036 		vswp->mdprops |= VSW_MD_MACADDR;
1037 	}
1038 
1039 	/*
1040 	 * Get the switch-mode property. The modes are listed in
1041 	 * decreasing order of preference, i.e. prefered mode is
1042 	 * first item in list.
1043 	 */
1044 	len = 0;
1045 	vswp->smode_num = 0;
1046 	if (md_get_prop_data(mdp, listp[idx], smode_propname,
1047 				(uint8_t **)(&smode), &len) != 0) {
1048 		/*
1049 		 * Unable to get switch-mode property from MD, nothing
1050 		 * more we can do.
1051 		 */
1052 		cmn_err(CE_WARN, "!unable to get switch mode property");
1053 		goto md_prop_exit;
1054 	}
1055 
1056 	curr_mode = smode;
1057 	/*
1058 	 * Modes of operation:
1059 	 * 'switched'	 - layer 2 switching, underlying HW in
1060 	 *			programmed mode.
1061 	 * 'promiscuous' - layer 2 switching, underlying HW in
1062 	 *			promiscuous mode.
1063 	 * 'routed'	 - layer 3 (i.e. IP) routing, underlying HW
1064 	 *			in non-promiscuous mode.
1065 	 */
1066 	while ((curr_mode < (smode + len)) && (vswp->smode_num < NUM_SMODES)) {
1067 		D2(vswp, "%s: curr_mode = [%s]", __func__, curr_mode);
1068 		if (strcmp(curr_mode, "switched") == 0) {
1069 			vswp->smode[vswp->smode_num++] = VSW_LAYER2;
1070 		} else if (strcmp(curr_mode, "promiscuous") == 0) {
1071 			vswp->smode[vswp->smode_num++] = VSW_LAYER2_PROMISC;
1072 		} else if (strcmp(curr_mode, "routed") == 0) {
1073 			vswp->smode[vswp->smode_num++] = VSW_LAYER3;
1074 		} else {
1075 			cmn_err(CE_WARN, "Unknown switch mode %s, setting to"
1076 				" default switched mode", curr_mode);
1077 			vswp->smode[vswp->smode_num++] = VSW_LAYER2;
1078 		}
1079 		curr_mode += strlen(curr_mode) + 1;
1080 	}
1081 
1082 	D2(vswp, "%d switching modes specified", vswp->smode_num);
1083 
1084 	if (vswp->smode_num > 0)
1085 		vswp->mdprops |= VSW_MD_SMODE;
1086 
1087 md_prop_exit:
1088 	(void) md_fini_handle(mdp);
1089 
1090 	kmem_free(listp, listsz);
1091 
1092 	D1(vswp, "%s: exit", __func__);
1093 }
1094 
1095 /*
1096  * Get the mac address of the physical device.
1097  *
1098  * Returns 0 on success, 1 on failure.
1099  */
1100 static int
1101 vsw_get_physaddr(vsw_t *vswp)
1102 {
1103 	mac_handle_t	mh;
1104 	char		drv[LIFNAMSIZ];
1105 	uint_t		ddi_instance;
1106 
1107 	D1(vswp, "%s: enter", __func__);
1108 
1109 	if (ddi_parse(vswp->physname, drv, &ddi_instance) != DDI_SUCCESS)
1110 		return (1);
1111 
1112 	if (mac_open(vswp->physname, ddi_instance, &mh) != 0) {
1113 		cmn_err(CE_WARN, "!mac_open %s failed", vswp->physname);
1114 		return (1);
1115 	}
1116 
1117 	READ_ENTER(&vswp->if_lockrw);
1118 	mac_unicst_get(mh, vswp->if_addr.ether_addr_octet);
1119 	RW_EXIT(&vswp->if_lockrw);
1120 
1121 	mac_close(mh);
1122 
1123 	vswp->mdprops |= VSW_DEV_MACADDR;
1124 
1125 	D1(vswp, "%s: exit", __func__);
1126 
1127 	return (0);
1128 }
1129 
1130 /*
1131  * Check to see if the card supports the setting of multiple unicst
1132  * addresses.
1133  *
1134  * Returns 0 if card supports the programming of multiple unicast addresses
1135  * and there are free address slots available, otherwise returns 1.
1136  */
1137 static int
1138 vsw_get_hw_maddr(vsw_t *vswp)
1139 {
1140 	D1(vswp, "%s: enter", __func__);
1141 
1142 	if (vswp->mh == NULL) {
1143 		return (1);
1144 	}
1145 
1146 	if (!mac_capab_get(vswp->mh, MAC_CAPAB_MULTIADDRESS, &vswp->maddr)) {
1147 		DWARN(vswp, "Unable to get capabilities of"
1148 			" underlying device (%s)", vswp->physname);
1149 		return (1);
1150 	}
1151 
1152 	if (vswp->maddr.maddr_naddrfree == 0) {
1153 		cmn_err(CE_WARN,
1154 			"!device %s has no free unicast address slots",
1155 			vswp->physname);
1156 		return (1);
1157 	}
1158 
1159 	D2(vswp, "%s: %d addrs : %d free", __func__,
1160 		vswp->maddr.maddr_naddr, vswp->maddr.maddr_naddrfree);
1161 
1162 	D1(vswp, "%s: exit", __func__);
1163 
1164 	return (0);
1165 }
1166 
1167 /*
1168  * Setup for layer 2 switching.
1169  *
1170  * Returns 0 on success, 1 on failure.
1171  */
1172 static int
1173 vsw_setup_layer2(vsw_t *vswp)
1174 {
1175 	D1(vswp, "%s: enter", __func__);
1176 
1177 	vsw_switch_frame = vsw_switch_l2_frame;
1178 
1179 	/*
1180 	 * Attempt to link into the MAC layer so we can get
1181 	 * and send packets out over the physical adapter.
1182 	 */
1183 	if (vswp->mdprops & VSW_MD_PHYSNAME) {
1184 		if (vsw_mac_attach(vswp) != 0) {
1185 			/*
1186 			 * Registration with the MAC layer has failed,
1187 			 * so return 1 so that can fall back to next
1188 			 * prefered switching method.
1189 			 */
1190 			cmn_err(CE_WARN, "!Unable to join as MAC layer "
1191 				"client");
1192 			return (1);
1193 		}
1194 
1195 		if (vswp->smode[vswp->smode_idx] == VSW_LAYER2) {
1196 			/*
1197 			 * Verify that underlying device can support multiple
1198 			 * unicast mac addresses, and has free capacity.
1199 			 */
1200 			if (vsw_get_hw_maddr(vswp) != 0) {
1201 				cmn_err(CE_WARN, "!unable to setup switching");
1202 				vsw_mac_detach(vswp);
1203 				return (1);
1204 			}
1205 		}
1206 
1207 	} else {
1208 		/*
1209 		 * No physical device name found in MD which is
1210 		 * required for layer 2.
1211 		 */
1212 		cmn_err(CE_WARN, "!no physical device name specified");
1213 		return (1);
1214 	}
1215 
1216 	D1(vswp, "%s: exit", __func__);
1217 
1218 	return (0);
1219 }
1220 
1221 static int
1222 vsw_setup_layer3(vsw_t *vswp)
1223 {
1224 	D1(vswp, "%s: enter", __func__);
1225 
1226 	D2(vswp, "%s: operating in layer 3 mode", __func__);
1227 	vsw_switch_frame = vsw_switch_l3_frame;
1228 
1229 	D1(vswp, "%s: exit", __func__);
1230 
1231 	return (0);
1232 }
1233 
1234 /*
1235  * Link into the MAC layer to gain access to the services provided by
1236  * the underlying physical device driver (which should also have
1237  * registered with the MAC layer).
1238  *
1239  * Only when in layer 2 mode.
1240  */
1241 static int
1242 vsw_mac_attach(vsw_t *vswp)
1243 {
1244 	char	drv[LIFNAMSIZ];
1245 	uint_t	ddi_instance;
1246 
1247 	D1(vswp, "%s: enter", __func__);
1248 
1249 	vswp->mh = NULL;
1250 	vswp->mrh = NULL;
1251 	vswp->mstarted = B_FALSE;
1252 	vswp->mresources = B_FALSE;
1253 
1254 	ASSERT(vswp->mdprops & VSW_MD_PHYSNAME);
1255 
1256 	if (ddi_parse(vswp->physname, drv, &ddi_instance) != DDI_SUCCESS) {
1257 		cmn_err(CE_WARN, "invalid device name: %s", vswp->physname);
1258 		goto mac_fail_exit;
1259 	}
1260 	if ((mac_open(vswp->physname, ddi_instance, &vswp->mh)) != 0) {
1261 		cmn_err(CE_WARN, "mac_open %s failed", vswp->physname);
1262 		goto mac_fail_exit;
1263 	}
1264 
1265 	ASSERT(vswp->mh != NULL);
1266 
1267 	D2(vswp, "vsw_mac_attach: using device %s", vswp->physname);
1268 
1269 	if (vsw_multi_ring_enable) {
1270 		vsw_mac_ring_tbl_init(vswp);
1271 
1272 		/*
1273 		 * Register our receive callback.
1274 		 */
1275 		vswp->mrh = mac_rx_add(vswp->mh,
1276 			vsw_rx_queue_cb, (void *)vswp);
1277 
1278 		/*
1279 		 * Register our mac resource callback.
1280 		 */
1281 		mac_resource_set(vswp->mh, vsw_mac_ring_add_cb, (void *)vswp);
1282 		vswp->mresources = B_TRUE;
1283 
1284 		/*
1285 		 * Get the ring resources available to us from
1286 		 * the mac below us.
1287 		 */
1288 		mac_resources(vswp->mh);
1289 	} else {
1290 		/*
1291 		 * Just register our rx callback function
1292 		 */
1293 		vswp->mrh = mac_rx_add(vswp->mh, vsw_rx_cb, (void *)vswp);
1294 	}
1295 
1296 	ASSERT(vswp->mrh != NULL);
1297 
1298 	/* Get the MAC tx fn */
1299 	vswp->txinfo = mac_tx_get(vswp->mh);
1300 
1301 	/* start the interface */
1302 	if (mac_start(vswp->mh) != 0) {
1303 		cmn_err(CE_WARN, "could not start mac interface");
1304 		goto mac_fail_exit;
1305 	}
1306 
1307 	vswp->mstarted = B_TRUE;
1308 
1309 	D1(vswp, "%s: exit", __func__);
1310 	return (0);
1311 
1312 mac_fail_exit:
1313 	vsw_mac_detach(vswp);
1314 
1315 	D1(vswp, "%s: exit", __func__);
1316 	return (1);
1317 }
1318 
1319 static void
1320 vsw_mac_detach(vsw_t *vswp)
1321 {
1322 	D1(vswp, "vsw_mac_detach: enter");
1323 
1324 	ASSERT(vswp != NULL);
1325 	ASSERT(vswp->mh != NULL);
1326 
1327 	if (vsw_multi_ring_enable) {
1328 		vsw_mac_ring_tbl_destroy(vswp);
1329 	}
1330 
1331 	if (vswp->mstarted)
1332 		mac_stop(vswp->mh);
1333 	if (vswp->mrh != NULL)
1334 		mac_rx_remove(vswp->mh, vswp->mrh);
1335 	if (vswp->mresources)
1336 		mac_resource_set(vswp->mh, NULL, NULL);
1337 	mac_close(vswp->mh);
1338 
1339 	vswp->mrh = NULL;
1340 	vswp->mh = NULL;
1341 	vswp->txinfo = NULL;
1342 	vswp->mstarted = B_FALSE;
1343 
1344 	D1(vswp, "vsw_mac_detach: exit");
1345 }
1346 
1347 /*
1348  * Depending on the mode specified, the capabilites and capacity
1349  * of the underlying device setup the physical device.
1350  *
1351  * If in layer 3 mode, then do nothing.
1352  *
1353  * If in layer 2 programmed mode attempt to program the unicast address
1354  * associated with the port into the physical device. If this is not
1355  * possible due to resource exhaustion or simply because the device does
1356  * not support multiple unicast addresses then if required fallback onto
1357  * putting the card into promisc mode.
1358  *
1359  * If in promisc mode then simply set the card into promisc mode.
1360  *
1361  * Returns 0 success, 1 on failure.
1362  */
1363 static int
1364 vsw_set_hw(vsw_t *vswp, vsw_port_t *port)
1365 {
1366 	mac_multi_addr_t	mac_addr;
1367 	void			*mah;
1368 	int			err;
1369 
1370 	D1(vswp, "%s: enter", __func__);
1371 
1372 	if (vswp->smode[vswp->smode_idx] == VSW_LAYER3)
1373 		return (0);
1374 
1375 	if (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC) {
1376 		return (vsw_set_hw_promisc(vswp, port));
1377 	}
1378 
1379 	if (vswp->maddr.maddr_handle == NULL)
1380 		return (1);
1381 
1382 	mah = vswp->maddr.maddr_handle;
1383 
1384 	/*
1385 	 * Attempt to program the unicast address into the HW.
1386 	 */
1387 	mac_addr.mma_addrlen = ETHERADDRL;
1388 	ether_copy(&port->p_macaddr, &mac_addr.mma_addr);
1389 
1390 	err = vswp->maddr.maddr_add(mah, &mac_addr);
1391 	if (err != 0) {
1392 		cmn_err(CE_WARN, "!failed to program addr "
1393 			"%x:%x:%x:%x:%x:%x for port %d into device %s "
1394 			": err %d", port->p_macaddr.ether_addr_octet[0],
1395 			port->p_macaddr.ether_addr_octet[1],
1396 			port->p_macaddr.ether_addr_octet[2],
1397 			port->p_macaddr.ether_addr_octet[3],
1398 			port->p_macaddr.ether_addr_octet[4],
1399 			port->p_macaddr.ether_addr_octet[5],
1400 			port->p_instance, vswp->physname, err);
1401 
1402 		/*
1403 		 * Mark that attempt should be made to re-config sometime
1404 		 * in future if a port is deleted.
1405 		 */
1406 		vswp->recfg_reqd = B_TRUE;
1407 
1408 		/*
1409 		 * Only 1 mode specified, nothing more to do.
1410 		 */
1411 		if (vswp->smode_num == 1)
1412 			return (err);
1413 
1414 		/*
1415 		 * If promiscuous was next mode specified try to
1416 		 * set the card into that mode.
1417 		 */
1418 		if ((vswp->smode_idx <= (vswp->smode_num - 2)) &&
1419 			(vswp->smode[vswp->smode_idx + 1]
1420 					== VSW_LAYER2_PROMISC)) {
1421 			vswp->smode_idx += 1;
1422 			return (vsw_set_hw_promisc(vswp, port));
1423 		}
1424 		return (err);
1425 	}
1426 
1427 	port->addr_slot = mac_addr.mma_slot;
1428 	port->addr_set = VSW_ADDR_HW;
1429 
1430 	D2(vswp, "programmed addr %x:%x:%x:%x:%x:%x for port %d "
1431 		"into slot %d of device %s",
1432 		port->p_macaddr.ether_addr_octet[0],
1433 		port->p_macaddr.ether_addr_octet[1],
1434 		port->p_macaddr.ether_addr_octet[2],
1435 		port->p_macaddr.ether_addr_octet[3],
1436 		port->p_macaddr.ether_addr_octet[4],
1437 		port->p_macaddr.ether_addr_octet[5],
1438 		port->p_instance, port->addr_slot, vswp->physname);
1439 
1440 	D1(vswp, "%s: exit", __func__);
1441 
1442 	return (0);
1443 }
1444 
1445 /*
1446  * If in layer 3 mode do nothing.
1447  *
1448  * If in layer 2 switched mode remove the address from the physical
1449  * device.
1450  *
1451  * If in layer 2 promiscuous mode disable promisc mode.
1452  *
1453  * Returns 0 on success.
1454  */
1455 static int
1456 vsw_unset_hw(vsw_t *vswp, vsw_port_t *port)
1457 {
1458 	int		err;
1459 	void		*mah;
1460 
1461 	D1(vswp, "%s: enter", __func__);
1462 
1463 	if (vswp->smode[vswp->smode_idx] == VSW_LAYER3)
1464 		return (0);
1465 
1466 	if (port->addr_set == VSW_ADDR_PROMISC) {
1467 		return (vsw_unset_hw_promisc(vswp, port));
1468 	}
1469 
1470 	if (port->addr_set == VSW_ADDR_HW) {
1471 		if (vswp->mh == NULL)
1472 			return (1);
1473 
1474 		if (vswp->maddr.maddr_handle == NULL)
1475 			return (1);
1476 
1477 		mah = vswp->maddr.maddr_handle;
1478 
1479 		err = vswp->maddr.maddr_remove(mah, port->addr_slot);
1480 		if (err != 0) {
1481 			cmn_err(CE_WARN, "!Unable to remove addr "
1482 				"%x:%x:%x:%x:%x:%x for port %d from device %s"
1483 				" : (err %d)",
1484 				port->p_macaddr.ether_addr_octet[0],
1485 				port->p_macaddr.ether_addr_octet[1],
1486 				port->p_macaddr.ether_addr_octet[2],
1487 				port->p_macaddr.ether_addr_octet[3],
1488 				port->p_macaddr.ether_addr_octet[4],
1489 				port->p_macaddr.ether_addr_octet[5],
1490 				port->p_instance, vswp->physname, err);
1491 			return (err);
1492 		}
1493 
1494 		port->addr_set = VSW_ADDR_UNSET;
1495 
1496 		D2(vswp, "removed addr %x:%x:%x:%x:%x:%x for "
1497 			"port %d from device %s",
1498 			port->p_macaddr.ether_addr_octet[0],
1499 			port->p_macaddr.ether_addr_octet[1],
1500 			port->p_macaddr.ether_addr_octet[2],
1501 			port->p_macaddr.ether_addr_octet[3],
1502 			port->p_macaddr.ether_addr_octet[4],
1503 			port->p_macaddr.ether_addr_octet[5],
1504 			port->p_instance, vswp->physname);
1505 	}
1506 
1507 	D1(vswp, "%s: exit", __func__);
1508 	return (0);
1509 }
1510 
1511 /*
1512  * Set network card into promisc mode.
1513  *
1514  * Returns 0 on success, 1 on failure.
1515  */
1516 static int
1517 vsw_set_hw_promisc(vsw_t *vswp, vsw_port_t *port)
1518 {
1519 	D1(vswp, "%s: enter", __func__);
1520 
1521 	if (vswp->mh == NULL)
1522 		return (1);
1523 
1524 	if (vswp->promisc_cnt++ == 0) {
1525 		if (mac_promisc_set(vswp->mh, B_TRUE, MAC_DEVPROMISC) != 0) {
1526 			vswp->promisc_cnt--;
1527 			return (1);
1528 		}
1529 		cmn_err(CE_NOTE, "!switching device %s into promiscuous mode",
1530 				vswp->physname);
1531 	}
1532 	port->addr_set = VSW_ADDR_PROMISC;
1533 
1534 	D1(vswp, "%s: exit", __func__);
1535 
1536 	return (0);
1537 }
1538 
1539 /*
1540  * Turn off promiscuous mode on network card.
1541  *
1542  * Returns 0 on success, 1 on failure.
1543  */
1544 static int
1545 vsw_unset_hw_promisc(vsw_t *vswp, vsw_port_t *port)
1546 {
1547 	vsw_port_list_t 	*plist = &vswp->plist;
1548 
1549 	D1(vswp, "%s: enter", __func__);
1550 
1551 	if (vswp->mh == NULL)
1552 		return (1);
1553 
1554 	ASSERT(port->addr_set == VSW_ADDR_PROMISC);
1555 
1556 	if (--vswp->promisc_cnt == 0) {
1557 		if (mac_promisc_set(vswp->mh, B_FALSE, MAC_DEVPROMISC) != 0) {
1558 			vswp->promisc_cnt++;
1559 			return (1);
1560 		}
1561 
1562 		/*
1563 		 * We are exiting promisc mode either because we were
1564 		 * only in promisc mode because we had failed over from
1565 		 * switched mode due to HW resource issues, or the user
1566 		 * wanted the card in promisc mode for all the ports and
1567 		 * the last port is now being deleted. Tweak the message
1568 		 * accordingly.
1569 		 */
1570 		if (plist->num_ports != 0) {
1571 			cmn_err(CE_NOTE, "!switching device %s back to "
1572 				"programmed mode", vswp->physname);
1573 		} else {
1574 			cmn_err(CE_NOTE, "!switching device %s out of "
1575 				"promiscuous mode", vswp->physname);
1576 		}
1577 	}
1578 	port->addr_set = VSW_ADDR_UNSET;
1579 
1580 	D1(vswp, "%s: exit", __func__);
1581 	return (0);
1582 }
1583 
1584 /*
1585  * Determine whether or not we are operating in our prefered
1586  * mode and if not whether the physical resources now allow us
1587  * to operate in it.
1588  *
1589  * Should only be invoked after port which is being deleted has been
1590  * removed from the port list.
1591  */
1592 static int
1593 vsw_reconfig_hw(vsw_t *vswp)
1594 {
1595 	vsw_port_list_t 	*plist = &vswp->plist;
1596 	mac_multi_addr_t	mac_addr;
1597 	vsw_port_t		*tp;
1598 	void			*mah;
1599 	int			rv = 0;
1600 	int			s_idx;
1601 
1602 	D1(vswp, "%s: enter", __func__);
1603 
1604 	if (vswp->maddr.maddr_handle == NULL)
1605 		return (1);
1606 
1607 	/*
1608 	 * Check if there are now sufficient HW resources to
1609 	 * attempt a re-config.
1610 	 */
1611 	if (plist->num_ports > vswp->maddr.maddr_naddrfree)
1612 		return (1);
1613 
1614 	/*
1615 	 * If we are in layer 2 (i.e. switched) or would like to be
1616 	 * in layer 2 then check if any ports need to be programmed
1617 	 * into the HW.
1618 	 *
1619 	 * This can happen in two cases - switched was specified as
1620 	 * the prefered mode of operation but we exhausted the HW
1621 	 * resources and so failed over to the next specifed mode,
1622 	 * or switched was the only mode specified so after HW
1623 	 * resources were exhausted there was nothing more we
1624 	 * could do.
1625 	 */
1626 	if (vswp->smode_idx > 0)
1627 		s_idx = vswp->smode_idx - 1;
1628 	else
1629 		s_idx = vswp->smode_idx;
1630 
1631 	if (vswp->smode[s_idx] == VSW_LAYER2) {
1632 		mah = vswp->maddr.maddr_handle;
1633 
1634 		D2(vswp, "%s: attempting reconfig..", __func__);
1635 
1636 		/*
1637 		 * Scan the port list for any port whose address has not
1638 		 * be programmed in HW - there should be a max of one.
1639 		 */
1640 		for (tp = plist->head; tp != NULL; tp = tp->p_next) {
1641 			if (tp->addr_set != VSW_ADDR_HW) {
1642 				mac_addr.mma_addrlen = ETHERADDRL;
1643 				ether_copy(&tp->p_macaddr, &mac_addr.mma_addr);
1644 
1645 				rv = vswp->maddr.maddr_add(mah, &mac_addr);
1646 				if (rv != 0) {
1647 					DWARN(vswp, "Error setting addr in "
1648 						"HW for port %d err %d",
1649 						tp->p_instance, rv);
1650 					goto reconfig_err_exit;
1651 				}
1652 				tp->addr_slot = mac_addr.mma_slot;
1653 
1654 				D2(vswp, "re-programmed port %d "
1655 					"addr %x:%x:%x:%x:%x:%x into slot %d"
1656 					" of device %s", tp->p_instance,
1657 					tp->p_macaddr.ether_addr_octet[0],
1658 					tp->p_macaddr.ether_addr_octet[1],
1659 					tp->p_macaddr.ether_addr_octet[2],
1660 					tp->p_macaddr.ether_addr_octet[3],
1661 					tp->p_macaddr.ether_addr_octet[4],
1662 					tp->p_macaddr.ether_addr_octet[5],
1663 					tp->addr_slot, vswp->physname);
1664 
1665 				/*
1666 				 * If up to now we had to put the card into
1667 				 * promisc mode to see this address, we
1668 				 * can now safely disable promisc mode.
1669 				 */
1670 				if (tp->addr_set == VSW_ADDR_PROMISC)
1671 					(void) vsw_unset_hw_promisc(vswp, tp);
1672 
1673 				tp->addr_set = VSW_ADDR_HW;
1674 			}
1675 		}
1676 
1677 		/* no further re-config needed */
1678 		vswp->recfg_reqd = B_FALSE;
1679 
1680 		vswp->smode_idx = s_idx;
1681 
1682 		return (0);
1683 	}
1684 
1685 reconfig_err_exit:
1686 	return (rv);
1687 }
1688 
1689 static void
1690 vsw_mac_ring_tbl_entry_init(vsw_t *vswp, vsw_mac_ring_t *ringp)
1691 {
1692 	ringp->ring_state = VSW_MAC_RING_FREE;
1693 	ringp->ring_arg = NULL;
1694 	ringp->ring_blank = NULL;
1695 	ringp->ring_vqp = NULL;
1696 	ringp->ring_vswp = vswp;
1697 }
1698 
1699 static void
1700 vsw_mac_ring_tbl_init(vsw_t *vswp)
1701 {
1702 	int		i;
1703 
1704 	mutex_init(&vswp->mac_ring_lock, NULL, MUTEX_DRIVER, NULL);
1705 
1706 	vswp->mac_ring_tbl_sz = vsw_mac_rx_rings;
1707 	vswp->mac_ring_tbl  =
1708 		kmem_alloc(vsw_mac_rx_rings * sizeof (vsw_mac_ring_t),
1709 		KM_SLEEP);
1710 
1711 	for (i = 0; i < vswp->mac_ring_tbl_sz; i++)
1712 		vsw_mac_ring_tbl_entry_init(vswp, &vswp->mac_ring_tbl[i]);
1713 }
1714 
1715 static void
1716 vsw_mac_ring_tbl_destroy(vsw_t *vswp)
1717 {
1718 	int	i;
1719 
1720 	mutex_enter(&vswp->mac_ring_lock);
1721 	for (i = 0; i < vswp->mac_ring_tbl_sz; i++) {
1722 		if (vswp->mac_ring_tbl[i].ring_state != VSW_MAC_RING_FREE) {
1723 			/*
1724 			 * Destroy the queue.
1725 			 */
1726 			vsw_queue_stop(vswp->mac_ring_tbl[i].ring_vqp);
1727 			vsw_queue_destroy(vswp->mac_ring_tbl[i].ring_vqp);
1728 
1729 			/*
1730 			 * Re-initialize the structure.
1731 			 */
1732 			vsw_mac_ring_tbl_entry_init(vswp,
1733 				&vswp->mac_ring_tbl[i]);
1734 		}
1735 	}
1736 	mutex_exit(&vswp->mac_ring_lock);
1737 
1738 	mutex_destroy(&vswp->mac_ring_lock);
1739 	kmem_free(vswp->mac_ring_tbl,
1740 		vswp->mac_ring_tbl_sz * sizeof (vsw_mac_ring_t));
1741 	vswp->mac_ring_tbl_sz = 0;
1742 }
1743 
1744 /*
1745  * Handle resource add callbacks from the driver below.
1746  */
1747 static mac_resource_handle_t
1748 vsw_mac_ring_add_cb(void *arg, mac_resource_t *mrp)
1749 {
1750 	vsw_t		*vswp = (vsw_t *)arg;
1751 	mac_rx_fifo_t	*mrfp = (mac_rx_fifo_t *)mrp;
1752 	vsw_mac_ring_t	*ringp;
1753 	vsw_queue_t	*vqp;
1754 	int		i;
1755 
1756 	ASSERT(vswp != NULL);
1757 	ASSERT(mrp != NULL);
1758 	ASSERT(vswp->mac_ring_tbl != NULL);
1759 
1760 	D1(vswp, "%s: enter", __func__);
1761 
1762 	/*
1763 	 * Check to make sure we have the correct resource type.
1764 	 */
1765 	if (mrp->mr_type != MAC_RX_FIFO)
1766 		return (NULL);
1767 
1768 	/*
1769 	 * Find a open entry in the ring table.
1770 	 */
1771 	mutex_enter(&vswp->mac_ring_lock);
1772 	for (i = 0; i < vswp->mac_ring_tbl_sz; i++) {
1773 		ringp = &vswp->mac_ring_tbl[i];
1774 
1775 		/*
1776 		 * Check for an empty slot, if found, then setup queue
1777 		 * and thread.
1778 		 */
1779 		if (ringp->ring_state == VSW_MAC_RING_FREE) {
1780 			/*
1781 			 * Create the queue for this ring.
1782 			 */
1783 			vqp = vsw_queue_create();
1784 
1785 			/*
1786 			 * Initialize the ring data structure.
1787 			 */
1788 			ringp->ring_vqp = vqp;
1789 			ringp->ring_arg = mrfp->mrf_arg;
1790 			ringp->ring_blank = mrfp->mrf_blank;
1791 			ringp->ring_state = VSW_MAC_RING_INUSE;
1792 
1793 			/*
1794 			 * Create the worker thread.
1795 			 */
1796 			vqp->vq_worker = thread_create(NULL, 0,
1797 				vsw_queue_worker, ringp, 0, &p0,
1798 				TS_RUN, minclsyspri);
1799 			if (vqp->vq_worker == NULL) {
1800 				vsw_queue_destroy(vqp);
1801 				vsw_mac_ring_tbl_entry_init(vswp, ringp);
1802 				ringp = NULL;
1803 			}
1804 
1805 			mutex_exit(&vswp->mac_ring_lock);
1806 			D1(vswp, "%s: exit", __func__);
1807 			return ((mac_resource_handle_t)ringp);
1808 		}
1809 	}
1810 	mutex_exit(&vswp->mac_ring_lock);
1811 
1812 	/*
1813 	 * No slots in the ring table available.
1814 	 */
1815 	D1(vswp, "%s: exit", __func__);
1816 	return (NULL);
1817 }
1818 
1819 static void
1820 vsw_queue_stop(vsw_queue_t *vqp)
1821 {
1822 	mutex_enter(&vqp->vq_lock);
1823 
1824 	if (vqp->vq_state == VSW_QUEUE_RUNNING) {
1825 		vqp->vq_state = VSW_QUEUE_STOP;
1826 		cv_signal(&vqp->vq_cv);
1827 
1828 		while (vqp->vq_state != VSW_QUEUE_DRAINED)
1829 			cv_wait(&vqp->vq_cv, &vqp->vq_lock);
1830 	}
1831 
1832 	mutex_exit(&vqp->vq_lock);
1833 }
1834 
1835 static vsw_queue_t *
1836 vsw_queue_create()
1837 {
1838 	vsw_queue_t *vqp;
1839 
1840 	vqp = kmem_zalloc(sizeof (vsw_queue_t), KM_SLEEP);
1841 
1842 	mutex_init(&vqp->vq_lock, NULL, MUTEX_DRIVER, NULL);
1843 	cv_init(&vqp->vq_cv, NULL, CV_DRIVER, NULL);
1844 	vqp->vq_first = NULL;
1845 	vqp->vq_last = NULL;
1846 	vqp->vq_state = VSW_QUEUE_STOP;
1847 
1848 	return (vqp);
1849 }
1850 
1851 static void
1852 vsw_queue_destroy(vsw_queue_t *vqp)
1853 {
1854 	cv_destroy(&vqp->vq_cv);
1855 	mutex_destroy(&vqp->vq_lock);
1856 	kmem_free(vqp, sizeof (vsw_queue_t));
1857 }
1858 
1859 static void
1860 vsw_queue_worker(vsw_mac_ring_t *rrp)
1861 {
1862 	mblk_t		*mp;
1863 	vsw_queue_t	*vqp = rrp->ring_vqp;
1864 	vsw_t		*vswp = rrp->ring_vswp;
1865 
1866 	mutex_enter(&vqp->vq_lock);
1867 
1868 	ASSERT(vqp->vq_state == VSW_QUEUE_STOP);
1869 
1870 	/*
1871 	 * Set the state to running, since the thread is now active.
1872 	 */
1873 	vqp->vq_state = VSW_QUEUE_RUNNING;
1874 
1875 	while (vqp->vq_state == VSW_QUEUE_RUNNING) {
1876 		/*
1877 		 * Wait for work to do or the state has changed
1878 		 * to not running.
1879 		 */
1880 		while ((vqp->vq_state == VSW_QUEUE_RUNNING) &&
1881 				(vqp->vq_first == NULL)) {
1882 			cv_wait(&vqp->vq_cv, &vqp->vq_lock);
1883 		}
1884 
1885 		/*
1886 		 * Process packets that we received from the interface.
1887 		 */
1888 		if (vqp->vq_first != NULL) {
1889 			mp = vqp->vq_first;
1890 
1891 			vqp->vq_first = NULL;
1892 			vqp->vq_last = NULL;
1893 
1894 			mutex_exit(&vqp->vq_lock);
1895 
1896 			/* switch the chain of packets received */
1897 			vsw_switch_frame(vswp, mp, VSW_PHYSDEV, NULL, NULL);
1898 
1899 			mutex_enter(&vqp->vq_lock);
1900 		}
1901 	}
1902 
1903 	/*
1904 	 * We are drained and signal we are done.
1905 	 */
1906 	vqp->vq_state = VSW_QUEUE_DRAINED;
1907 	cv_signal(&vqp->vq_cv);
1908 
1909 	/*
1910 	 * Exit lock and drain the remaining packets.
1911 	 */
1912 	mutex_exit(&vqp->vq_lock);
1913 
1914 	/*
1915 	 * Exit the thread
1916 	 */
1917 	thread_exit();
1918 }
1919 
1920 /*
1921  * static void
1922  * vsw_rx_queue_cb() - Receive callback routine when
1923  *	vsw_multi_ring_enable is non-zero.  Queue the packets
1924  *	to a packet queue for a worker thread to process.
1925  */
1926 static void
1927 vsw_rx_queue_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
1928 {
1929 	vsw_mac_ring_t	*ringp = (vsw_mac_ring_t *)mrh;
1930 	vsw_t		*vswp = (vsw_t *)arg;
1931 	vsw_queue_t	*vqp;
1932 	mblk_t		*bp, *last;
1933 
1934 	ASSERT(mrh != NULL);
1935 	ASSERT(vswp != NULL);
1936 	ASSERT(mp != NULL);
1937 
1938 	D1(vswp, "%s: enter", __func__);
1939 
1940 	/*
1941 	 * Find the last element in the mblk chain.
1942 	 */
1943 	bp = mp;
1944 	do {
1945 		last = bp;
1946 		bp = bp->b_next;
1947 	} while (bp != NULL);
1948 
1949 	/* Get the queue for the packets */
1950 	vqp = ringp->ring_vqp;
1951 
1952 	/*
1953 	 * Grab the lock such we can queue the packets.
1954 	 */
1955 	mutex_enter(&vqp->vq_lock);
1956 
1957 	if (vqp->vq_state != VSW_QUEUE_RUNNING) {
1958 		freemsg(mp);
1959 		goto vsw_rx_queue_cb_exit;
1960 	}
1961 
1962 	/*
1963 	 * Add the mblk chain to the queue.  If there
1964 	 * is some mblks in the queue, then add the new
1965 	 * chain to the end.
1966 	 */
1967 	if (vqp->vq_first == NULL)
1968 		vqp->vq_first = mp;
1969 	else
1970 		vqp->vq_last->b_next = mp;
1971 
1972 	vqp->vq_last = last;
1973 
1974 	/*
1975 	 * Signal the worker thread that there is work to
1976 	 * do.
1977 	 */
1978 	cv_signal(&vqp->vq_cv);
1979 
1980 	/*
1981 	 * Let go of the lock and exit.
1982 	 */
1983 vsw_rx_queue_cb_exit:
1984 	mutex_exit(&vqp->vq_lock);
1985 	D1(vswp, "%s: exit", __func__);
1986 }
1987 
1988 /*
1989  * receive callback routine. Invoked by MAC layer when there
1990  * are pkts being passed up from physical device.
1991  *
1992  * PERF: It may be more efficient when the card is in promisc
1993  * mode to check the dest address of the pkts here (against
1994  * the FDB) rather than checking later. Needs to be investigated.
1995  */
1996 static void
1997 vsw_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
1998 {
1999 	_NOTE(ARGUNUSED(mrh))
2000 
2001 	vsw_t		*vswp = (vsw_t *)arg;
2002 
2003 	ASSERT(vswp != NULL);
2004 
2005 	D1(vswp, "vsw_rx_cb: enter");
2006 
2007 	/* switch the chain of packets received */
2008 	vsw_switch_frame(vswp, mp, VSW_PHYSDEV, NULL, NULL);
2009 
2010 	D1(vswp, "vsw_rx_cb: exit");
2011 }
2012 
2013 /*
2014  * Send a message out over the physical device via the MAC layer.
2015  *
2016  * Returns any mblks that it was unable to transmit.
2017  */
2018 static mblk_t *
2019 vsw_tx_msg(vsw_t *vswp, mblk_t *mp)
2020 {
2021 	const mac_txinfo_t	*mtp;
2022 	mblk_t			*nextp;
2023 
2024 	if (vswp->mh == NULL) {
2025 		DERR(vswp, "vsw_tx_msg: dropping pkts: no tx routine avail");
2026 		return (mp);
2027 	} else {
2028 		for (;;) {
2029 			nextp = mp->b_next;
2030 			mp->b_next = NULL;
2031 
2032 			mtp = vswp->txinfo;
2033 			if ((mp = mtp->mt_fn(mtp->mt_arg, mp)) != NULL) {
2034 				mp->b_next = nextp;
2035 				break;
2036 			}
2037 
2038 			if ((mp = nextp) == NULL)
2039 				break;
2040 
2041 		}
2042 
2043 	}
2044 
2045 	return (mp);
2046 }
2047 
2048 /*
2049  * Register with the MAC layer as a network device, so we
2050  * can be plumbed if necessary.
2051  */
2052 static int
2053 vsw_mac_register(vsw_t *vswp)
2054 {
2055 	mac_register_t	*macp;
2056 	int		rv;
2057 
2058 	D1(vswp, "%s: enter", __func__);
2059 
2060 	if ((macp = mac_alloc(MAC_VERSION)) == NULL)
2061 		return (EINVAL);
2062 	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
2063 	macp->m_driver = vswp;
2064 	macp->m_dip = vswp->dip;
2065 	macp->m_src_addr = (uint8_t *)&vswp->if_addr;
2066 	macp->m_callbacks = &vsw_m_callbacks;
2067 	macp->m_min_sdu = 0;
2068 	macp->m_max_sdu = ETHERMTU;
2069 	rv = mac_register(macp, &vswp->if_mh);
2070 	mac_free(macp);
2071 	if (rv == 0)
2072 		vswp->if_state |= VSW_IF_REG;
2073 
2074 	D1(vswp, "%s: exit", __func__);
2075 
2076 	return (rv);
2077 }
2078 
2079 static int
2080 vsw_mac_unregister(vsw_t *vswp)
2081 {
2082 	int		rv = 0;
2083 
2084 	D1(vswp, "%s: enter", __func__);
2085 
2086 	WRITE_ENTER(&vswp->if_lockrw);
2087 
2088 	if (vswp->if_state & VSW_IF_REG) {
2089 		rv = mac_unregister(vswp->if_mh);
2090 		if (rv != 0) {
2091 			DWARN(vswp, "%s: unable to unregister from MAC "
2092 				"framework", __func__);
2093 
2094 			RW_EXIT(&vswp->if_lockrw);
2095 			D1(vswp, "%s: fail exit", __func__);
2096 			return (rv);
2097 		}
2098 
2099 		/* mark i/f as down and unregistered */
2100 		vswp->if_state &= ~(VSW_IF_UP | VSW_IF_REG);
2101 	}
2102 	RW_EXIT(&vswp->if_lockrw);
2103 
2104 	vswp->mdprops &= ~(VSW_MD_MACADDR | VSW_DEV_MACADDR);
2105 
2106 	D1(vswp, "%s: exit", __func__);
2107 
2108 	return (rv);
2109 }
2110 
2111 static int
2112 vsw_m_stat(void *arg, uint_t stat, uint64_t *val)
2113 {
2114 	vsw_t			*vswp = (vsw_t *)arg;
2115 
2116 	D1(vswp, "%s: enter", __func__);
2117 
2118 	if (vswp->mh == NULL)
2119 		return (EINVAL);
2120 
2121 	/* return stats from underlying device */
2122 	*val = mac_stat_get(vswp->mh, stat);
2123 	return (0);
2124 }
2125 
2126 static void
2127 vsw_m_stop(void *arg)
2128 {
2129 	vsw_t		*vswp = (vsw_t *)arg;
2130 
2131 	D1(vswp, "%s: enter", __func__);
2132 
2133 	WRITE_ENTER(&vswp->if_lockrw);
2134 	vswp->if_state &= ~VSW_IF_UP;
2135 	RW_EXIT(&vswp->if_lockrw);
2136 
2137 	D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state);
2138 }
2139 
2140 static int
2141 vsw_m_start(void *arg)
2142 {
2143 	vsw_t		*vswp = (vsw_t *)arg;
2144 
2145 	D1(vswp, "%s: enter", __func__);
2146 
2147 	WRITE_ENTER(&vswp->if_lockrw);
2148 	vswp->if_state |= VSW_IF_UP;
2149 	RW_EXIT(&vswp->if_lockrw);
2150 
2151 	D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state);
2152 	return (0);
2153 }
2154 
2155 /*
2156  * Change the local interface address.
2157  */
2158 static int
2159 vsw_m_unicst(void *arg, const uint8_t *macaddr)
2160 {
2161 	vsw_t		*vswp = (vsw_t *)arg;
2162 
2163 	D1(vswp, "%s: enter", __func__);
2164 
2165 	WRITE_ENTER(&vswp->if_lockrw);
2166 	ether_copy(macaddr, &vswp->if_addr);
2167 	RW_EXIT(&vswp->if_lockrw);
2168 
2169 	D1(vswp, "%s: exit", __func__);
2170 
2171 	return (0);
2172 }
2173 
2174 static int
2175 vsw_m_multicst(void *arg, boolean_t add, const uint8_t *mca)
2176 {
2177 	vsw_t		*vswp = (vsw_t *)arg;
2178 	mcst_addr_t	*mcst_p = NULL;
2179 	uint64_t	addr = 0x0;
2180 	int		i, ret = 0;
2181 
2182 	D1(vswp, "%s: enter", __func__);
2183 
2184 	/*
2185 	 * Convert address into form that can be used
2186 	 * as hash table key.
2187 	 */
2188 	for (i = 0; i < ETHERADDRL; i++) {
2189 		addr = (addr << 8) | mca[i];
2190 	}
2191 
2192 	D2(vswp, "%s: addr = 0x%llx", __func__, addr);
2193 
2194 	if (add) {
2195 		D2(vswp, "%s: adding multicast", __func__);
2196 		if (vsw_add_mcst(vswp, VSW_LOCALDEV, addr, NULL) == 0) {
2197 			/*
2198 			 * Update the list of multicast addresses
2199 			 * contained within the vsw_t structure to
2200 			 * include this new one.
2201 			 */
2202 			mcst_p = kmem_zalloc(sizeof (mcst_addr_t), KM_NOSLEEP);
2203 			if (mcst_p == NULL) {
2204 				DERR(vswp, "%s unable to alloc mem", __func__);
2205 				return (1);
2206 			}
2207 			mcst_p->addr = addr;
2208 
2209 			mutex_enter(&vswp->mca_lock);
2210 			mcst_p->nextp = vswp->mcap;
2211 			vswp->mcap = mcst_p;
2212 			mutex_exit(&vswp->mca_lock);
2213 
2214 			/*
2215 			 * Call into the underlying driver to program the
2216 			 * address into HW.
2217 			 */
2218 			if (vswp->mh != NULL) {
2219 				ret = mac_multicst_add(vswp->mh, mca);
2220 				if (ret != 0) {
2221 					cmn_err(CE_WARN, "!unable to add "
2222 						"multicast address");
2223 					goto vsw_remove_addr;
2224 				}
2225 			}
2226 		} else {
2227 			cmn_err(CE_WARN, "!unable to add multicast address");
2228 		}
2229 		return (ret);
2230 	}
2231 
2232 vsw_remove_addr:
2233 
2234 	D2(vswp, "%s: removing multicast", __func__);
2235 	/*
2236 	 * Remove the address from the hash table..
2237 	 */
2238 	if (vsw_del_mcst(vswp, VSW_LOCALDEV, addr, NULL) == 0) {
2239 
2240 		/*
2241 		 * ..and then from the list maintained in the
2242 		 * vsw_t structure.
2243 		 */
2244 		vsw_del_addr(VSW_LOCALDEV, vswp, addr);
2245 
2246 		if (vswp->mh != NULL)
2247 			(void) mac_multicst_remove(vswp->mh, mca);
2248 	}
2249 
2250 	D1(vswp, "%s: exit", __func__);
2251 
2252 	return (0);
2253 }
2254 
2255 static int
2256 vsw_m_promisc(void *arg, boolean_t on)
2257 {
2258 	vsw_t		*vswp = (vsw_t *)arg;
2259 
2260 	D1(vswp, "%s: enter", __func__);
2261 
2262 	WRITE_ENTER(&vswp->if_lockrw);
2263 	if (on)
2264 		vswp->if_state |= VSW_IF_PROMISC;
2265 	else
2266 		vswp->if_state &= ~VSW_IF_PROMISC;
2267 	RW_EXIT(&vswp->if_lockrw);
2268 
2269 	D1(vswp, "%s: exit", __func__);
2270 
2271 	return (0);
2272 }
2273 
2274 static mblk_t *
2275 vsw_m_tx(void *arg, mblk_t *mp)
2276 {
2277 	vsw_t		*vswp = (vsw_t *)arg;
2278 
2279 	D1(vswp, "%s: enter", __func__);
2280 
2281 	vsw_switch_frame(vswp, mp, VSW_LOCALDEV, NULL, NULL);
2282 
2283 	D1(vswp, "%s: exit", __func__);
2284 
2285 	return (NULL);
2286 }
2287 
2288 /*
2289  * Register for machine description (MD) updates.
2290  */
2291 static void
2292 vsw_mdeg_register(vsw_t *vswp)
2293 {
2294 	mdeg_prop_spec_t	*pspecp;
2295 	mdeg_node_spec_t	*inst_specp;
2296 	mdeg_handle_t		mdeg_hdl;
2297 	size_t			templatesz;
2298 	int			inst, rv;
2299 
2300 	D1(vswp, "%s: enter", __func__);
2301 
2302 	inst = ddi_prop_get_int(DDI_DEV_T_ANY, vswp->dip,
2303 		DDI_PROP_DONTPASS, reg_propname, -1);
2304 	if (inst == -1) {
2305 		DERR(vswp, "%s: unable to get %s property",
2306 						__func__, reg_propname);
2307 		return;
2308 	}
2309 
2310 	D2(vswp, "%s: instance %d registering with mdeg", __func__, inst);
2311 
2312 	/*
2313 	 * Allocate and initialize a per-instance copy
2314 	 * of the global property spec array that will
2315 	 * uniquely identify this vsw instance.
2316 	 */
2317 	templatesz = sizeof (vsw_prop_template);
2318 	pspecp = kmem_zalloc(templatesz, KM_SLEEP);
2319 
2320 	bcopy(vsw_prop_template, pspecp, templatesz);
2321 
2322 	VSW_SET_MDEG_PROP_INST(pspecp, inst);
2323 
2324 	/* initialize the complete prop spec structure */
2325 	inst_specp = kmem_zalloc(sizeof (mdeg_node_spec_t), KM_SLEEP);
2326 	inst_specp->namep = "virtual-device";
2327 	inst_specp->specp = pspecp;
2328 
2329 	/* perform the registration */
2330 	rv = mdeg_register(inst_specp, &vport_match, vsw_mdeg_cb,
2331 	    (void *)vswp, &mdeg_hdl);
2332 
2333 	if (rv != MDEG_SUCCESS) {
2334 		DERR(vswp, "%s: mdeg_register failed (%d)\n", __func__, rv);
2335 		kmem_free(inst_specp, sizeof (mdeg_node_spec_t));
2336 		kmem_free(pspecp, templatesz);
2337 		return;
2338 	}
2339 
2340 	/* save off data that will be needed later */
2341 	vswp->inst_spec = inst_specp;
2342 	vswp->mdeg_hdl = mdeg_hdl;
2343 
2344 	D1(vswp, "%s: exit", __func__);
2345 }
2346 
2347 static void
2348 vsw_mdeg_unregister(vsw_t *vswp)
2349 {
2350 	D1(vswp, "vsw_mdeg_unregister: enter");
2351 
2352 	(void) mdeg_unregister(vswp->mdeg_hdl);
2353 
2354 	if (vswp->inst_spec->specp != NULL) {
2355 		(void) kmem_free(vswp->inst_spec->specp,
2356 			sizeof (vsw_prop_template));
2357 		vswp->inst_spec->specp = NULL;
2358 	}
2359 
2360 	if (vswp->inst_spec != NULL) {
2361 		(void) kmem_free(vswp->inst_spec,
2362 			sizeof (mdeg_node_spec_t));
2363 		vswp->inst_spec = NULL;
2364 	}
2365 
2366 	D1(vswp, "vsw_mdeg_unregister: exit");
2367 }
2368 
2369 static int
2370 vsw_mdeg_cb(void *cb_argp, mdeg_result_t *resp)
2371 {
2372 	vsw_t		*vswp;
2373 	int		idx;
2374 	md_t		*mdp;
2375 	mde_cookie_t	node;
2376 	uint64_t	inst;
2377 
2378 	if (resp == NULL)
2379 		return (MDEG_FAILURE);
2380 
2381 	vswp = (vsw_t *)cb_argp;
2382 
2383 	D1(vswp, "%s: added %d : removed %d : matched %d",
2384 		__func__, resp->added.nelem, resp->removed.nelem,
2385 		resp->match_prev.nelem);
2386 
2387 	/* process added ports */
2388 	for (idx = 0; idx < resp->added.nelem; idx++) {
2389 		mdp = resp->added.mdp;
2390 		node = resp->added.mdep[idx];
2391 
2392 		D2(vswp, "%s: adding node(%d) 0x%lx", __func__, idx, node);
2393 
2394 		if (vsw_port_add(vswp, mdp, &node) != 0) {
2395 			cmn_err(CE_WARN, "Unable to add new port (0x%lx)",
2396 					node);
2397 		}
2398 	}
2399 
2400 	/* process removed ports */
2401 	for (idx = 0; idx < resp->removed.nelem; idx++) {
2402 		mdp = resp->removed.mdp;
2403 		node = resp->removed.mdep[idx];
2404 
2405 		if (md_get_prop_val(mdp, node, id_propname, &inst)) {
2406 			DERR(vswp, "%s: prop(%s) not found port(%d)",
2407 				__func__, id_propname, idx);
2408 			continue;
2409 		}
2410 
2411 		D2(vswp, "%s: removing node(%d) 0x%lx", __func__, idx, node);
2412 
2413 		if (vsw_port_detach(vswp, inst) != 0) {
2414 			cmn_err(CE_WARN, "Unable to remove port %ld", inst);
2415 		}
2416 	}
2417 
2418 	/*
2419 	 * Currently no support for updating already active ports.
2420 	 * So, ignore the match_curr and match_priv arrays for now.
2421 	 */
2422 
2423 	D1(vswp, "%s: exit", __func__);
2424 
2425 	return (MDEG_SUCCESS);
2426 }
2427 
2428 /*
2429  * Add a new port to the system.
2430  *
2431  * Returns 0 on success, 1 on failure.
2432  */
2433 int
2434 vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node)
2435 {
2436 	uint64_t		ldc_id;
2437 	uint8_t			*addrp;
2438 	int			i, addrsz;
2439 	int			num_nodes = 0, nchan = 0;
2440 	int			listsz = 0;
2441 	mde_cookie_t		*listp = NULL;
2442 	struct ether_addr	ea;
2443 	uint64_t		macaddr;
2444 	uint64_t		inst = 0;
2445 	vsw_port_t		*port;
2446 
2447 	if (md_get_prop_val(mdp, *node, id_propname, &inst)) {
2448 		DWARN(vswp, "%s: prop(%s) not found", __func__,
2449 			id_propname);
2450 		return (1);
2451 	}
2452 
2453 	/*
2454 	 * Find the channel endpoint node(s) (which should be under this
2455 	 * port node) which contain the channel id(s).
2456 	 */
2457 	if ((num_nodes = md_node_count(mdp)) <= 0) {
2458 		DERR(vswp, "%s: invalid number of nodes found (%d)",
2459 			__func__, num_nodes);
2460 		return (1);
2461 	}
2462 
2463 	/* allocate enough space for node list */
2464 	listsz = num_nodes * sizeof (mde_cookie_t);
2465 	listp = kmem_zalloc(listsz, KM_SLEEP);
2466 
2467 	nchan = md_scan_dag(mdp, *node,
2468 		md_find_name(mdp, chan_propname),
2469 		md_find_name(mdp, "fwd"), listp);
2470 
2471 	if (nchan <= 0) {
2472 		DWARN(vswp, "%s: no %s nodes found", __func__, chan_propname);
2473 		kmem_free(listp, listsz);
2474 		return (1);
2475 	}
2476 
2477 	D2(vswp, "%s: %d %s nodes found", __func__, nchan, chan_propname);
2478 
2479 	/* use property from first node found */
2480 	if (md_get_prop_val(mdp, listp[0], id_propname, &ldc_id)) {
2481 		DWARN(vswp, "%s: prop(%s) not found\n", __func__,
2482 			id_propname);
2483 		kmem_free(listp, listsz);
2484 		return (1);
2485 	}
2486 
2487 	/* don't need list any more */
2488 	kmem_free(listp, listsz);
2489 
2490 	D2(vswp, "%s: ldc_id 0x%llx", __func__, ldc_id);
2491 
2492 	/* read mac-address property */
2493 	if (md_get_prop_data(mdp, *node, remaddr_propname,
2494 					&addrp, &addrsz)) {
2495 		DWARN(vswp, "%s: prop(%s) not found",
2496 				__func__, remaddr_propname);
2497 		return (1);
2498 	}
2499 
2500 	if (addrsz < ETHERADDRL) {
2501 		DWARN(vswp, "%s: invalid address size", __func__);
2502 		return (1);
2503 	}
2504 
2505 	macaddr = *((uint64_t *)addrp);
2506 	D2(vswp, "%s: remote mac address 0x%llx", __func__, macaddr);
2507 
2508 	for (i = ETHERADDRL - 1; i >= 0; i--) {
2509 		ea.ether_addr_octet[i] = macaddr & 0xFF;
2510 		macaddr >>= 8;
2511 	}
2512 
2513 	if (vsw_port_attach(vswp, (int)inst, &ldc_id, 1, &ea) != 0) {
2514 		DERR(vswp, "%s: failed to attach port", __func__);
2515 		return (1);
2516 	}
2517 
2518 	port = vsw_lookup_port(vswp, (int)inst);
2519 
2520 	/* just successfuly created the port, so it should exist */
2521 	ASSERT(port != NULL);
2522 
2523 	return (0);
2524 }
2525 
/*
 * Attach the specified port.
 *
 * Allocates and initialises a vsw_port_t for instance p_instance,
 * attaches at most VSW_PORT_MAX_LDCS of the nids channel ids in
 * ldcids[], records macaddr as the port's MAC address, links the port
 * onto the head of vswp->plist and finally attempts to bring up the
 * channels beneath it.
 *
 * Returns 0 on success, 1 on failure (the instance is already on the
 * port list, or a channel could not be attached).
 */
static int
vsw_port_attach(vsw_t *vswp, int p_instance, uint64_t *ldcids, int nids,
struct ether_addr *macaddr)
{
	vsw_port_list_t		*plist = &vswp->plist;
	vsw_port_t		*port, **prev_port;
	int			i;

	D1(vswp, "%s: enter : port %d", __func__, p_instance);

	/* port already exists? */
	READ_ENTER(&plist->lockrw);
	for (port = plist->head; port != NULL; port = port->p_next) {
		if (port->p_instance == p_instance) {
			DWARN(vswp, "%s: port instance %d already attached",
				__func__, p_instance);
			RW_EXIT(&plist->lockrw);
			return (1);
		}
	}
	RW_EXIT(&plist->lockrw);

	/*
	 * NOTE(review): the list lock is dropped here and only retaken as
	 * writer further down, so the duplicate check above and the
	 * insertion below are not atomic with respect to each other.
	 */
	port = kmem_zalloc(sizeof (vsw_port_t), KM_SLEEP);
	port->p_vswp = vswp;
	port->p_instance = p_instance;
	port->p_ldclist.num_ldcs = 0;
	port->p_ldclist.head = NULL;
	port->addr_set = VSW_ADDR_UNSET;

	rw_init(&port->p_ldclist.lockrw, NULL, RW_DRIVER, NULL);

	mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL);

	mutex_init(&port->ref_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&port->ref_cv, NULL, CV_DRIVER, NULL);

	mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&port->state_cv, NULL, CV_DRIVER, NULL);
	port->state = VSW_PORT_INIT;

	/* silently ignore any channel ids beyond the supported maximum */
	if (nids > VSW_PORT_MAX_LDCS) {
		D2(vswp, "%s: using first of %d ldc ids",
			__func__, nids);
		nids = VSW_PORT_MAX_LDCS;
	}

	D2(vswp, "%s: %d nids", __func__, nids);
	for (i = 0; i < nids; i++) {
		D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[i]);
		if (vsw_ldc_attach(port, (uint64_t)ldcids[i]) != 0) {
			DERR(vswp, "%s: ldc_attach failed", __func__);

			/*
			 * Tear down everything initialised above.
			 * NOTE(review): channels attached by earlier
			 * iterations of this loop are not detached on
			 * this path.
			 */
			rw_destroy(&port->p_ldclist.lockrw);

			cv_destroy(&port->ref_cv);
			mutex_destroy(&port->ref_lock);

			cv_destroy(&port->state_cv);
			mutex_destroy(&port->state_lock);

			mutex_destroy(&port->tx_lock);
			mutex_destroy(&port->mca_lock);
			kmem_free(port, sizeof (vsw_port_t));
			return (1);
		}
	}

	ether_copy(macaddr, &port->p_macaddr);

	WRITE_ENTER(&plist->lockrw);

	/* create the fdb entry for this port/mac address */
	(void) vsw_add_fdb(vswp, port);

	/* result deliberately ignored; the port is attached regardless */
	(void) vsw_set_hw(vswp, port);

	/* link it into the list of ports for this vsw instance */
	prev_port = (vsw_port_t **)(&plist->head);
	port->p_next = *prev_port;
	*prev_port = port;
	plist->num_ports++;
	RW_EXIT(&plist->lockrw);

	/*
	 * Initialise the port and any ldc's under it.
	 */
	(void) vsw_init_ldcs(port);

	D1(vswp, "%s: exit", __func__);
	return (0);
}
2623 
/*
 * Detach the specified port.
 *
 * Looks up port p_instance on vswp->plist, unlinks it, removes its
 * hardware address, fdb entry and multicast addresses, and then deletes
 * the port itself.
 *
 * Returns 0 on success, 1 on failure (no such port, the port could not
 * be unlinked, or vsw_port_delete() failed).
 */
static int
vsw_port_detach(vsw_t *vswp, int p_instance)
{
	vsw_port_t	*port = NULL;
	vsw_port_list_t	*plist = &vswp->plist;

	D1(vswp, "%s: enter: port id %d", __func__, p_instance);

	/* writer lock covers both the lookup and the unlink */
	WRITE_ENTER(&plist->lockrw);

	if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) {
		RW_EXIT(&plist->lockrw);
		return (1);
	}

	if (vsw_plist_del_node(vswp, port)) {
		RW_EXIT(&plist->lockrw);
		return (1);
	}

	/* Remove address if was programmed into HW. */
	(void) vsw_unset_hw(vswp, port);

	/* Remove the fdb entry for this port/mac address */
	(void) vsw_del_fdb(vswp, port);

	/* Remove any multicast addresses.. */
	vsw_del_mcst_port(port);

	/*
	 * No longer need to hold writer lock on port list now
	 * that we have unlinked the target port from the list.
	 */
	RW_EXIT(&plist->lockrw);

	/* the reconfiguration check is made with the list held as reader */
	READ_ENTER(&plist->lockrw);

	if (vswp->recfg_reqd)
		(void) vsw_reconfig_hw(vswp);

	RW_EXIT(&plist->lockrw);

	/*
	 * The port is no longer on the list, so it is deleted without
	 * the list lock held.
	 */
	if (vsw_port_delete(port)) {
		return (1);
	}

	D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance);
	return (0);
}
2678 
/*
 * Detach all active ports.
 *
 * Repeatedly takes the port at the head of vswp->plist, unlinks it,
 * removes its hardware address, fdb entry and multicast addresses, and
 * deletes it, until the list is empty. Processing stops at the first
 * port that cannot be unlinked or deleted.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_detach_ports(vsw_t *vswp)
{
	vsw_port_list_t 	*plist = &vswp->plist;
	vsw_port_t		*port = NULL;

	D1(vswp, "%s: enter", __func__);

	WRITE_ENTER(&plist->lockrw);

	/* the writer lock is held at the top of every iteration */
	while ((port = plist->head) != NULL) {
		if (vsw_plist_del_node(vswp, port)) {
			DERR(vswp, "%s: Error deleting port %d"
				" from port list", __func__,
				port->p_instance);
			RW_EXIT(&plist->lockrw);
			return (1);
		}

		/* Remove address if was programmed into HW. */
		(void) vsw_unset_hw(vswp, port);

		/* Remove the fdb entry for this port/mac address */
		(void) vsw_del_fdb(vswp, port);

		/* Remove any multicast addresses.. */
		vsw_del_mcst_port(port);

		/*
		 * No longer need to hold the lock on the port list
		 * now that we have unlinked the target port from the
		 * list.
		 */
		RW_EXIT(&plist->lockrw);
		if (vsw_port_delete(port)) {
			DERR(vswp, "%s: Error deleting port %d",
				__func__, port->p_instance);
			return (1);
		}
		/* retake the lock before looking at the list head again */
		WRITE_ENTER(&plist->lockrw);
	}
	RW_EXIT(&plist->lockrw);

	D1(vswp, "%s: exit", __func__);

	return (0);
}
2731 
2732 /*
2733  * Delete the specified port.
2734  *
2735  * Returns 0 on success, 1 on failure.
2736  */
2737 static int
2738 vsw_port_delete(vsw_port_t *port)
2739 {
2740 	vsw_ldc_list_t 		*ldcl;
2741 	vsw_t			*vswp = port->p_vswp;
2742 
2743 	D1(vswp, "%s: enter : port id %d", __func__, port->p_instance);
2744 
2745 	(void) vsw_uninit_ldcs(port);
2746 
2747 	/*
2748 	 * Wait for any pending ctrl msg tasks which reference this
2749 	 * port to finish.
2750 	 */
2751 	if (vsw_drain_port_taskq(port))
2752 		return (1);
2753 
2754 	/*
2755 	 * Wait for port reference count to hit zero.
2756 	 */
2757 	mutex_enter(&port->ref_lock);
2758 	while (port->ref_cnt != 0)
2759 		cv_wait(&port->ref_cv, &port->ref_lock);
2760 	mutex_exit(&port->ref_lock);
2761 
2762 	/*
2763 	 * Wait for any active callbacks to finish
2764 	 */
2765 	if (vsw_drain_ldcs(port))
2766 		return (1);
2767 
2768 	ldcl = &port->p_ldclist;
2769 	WRITE_ENTER(&ldcl->lockrw);
2770 	while (ldcl->num_ldcs > 0) {
2771 		if (vsw_ldc_detach(port, ldcl->head->ldc_id) != 0) {;
2772 			cmn_err(CE_WARN, "unable to detach ldc %ld",
2773 					ldcl->head->ldc_id);
2774 			RW_EXIT(&ldcl->lockrw);
2775 			return (1);
2776 		}
2777 	}
2778 	RW_EXIT(&ldcl->lockrw);
2779 
2780 	rw_destroy(&port->p_ldclist.lockrw);
2781 
2782 	mutex_destroy(&port->mca_lock);
2783 	mutex_destroy(&port->tx_lock);
2784 	cv_destroy(&port->ref_cv);
2785 	mutex_destroy(&port->ref_lock);
2786 
2787 	cv_destroy(&port->state_cv);
2788 	mutex_destroy(&port->state_lock);
2789 
2790 	kmem_free(port, sizeof (vsw_port_t));
2791 
2792 	D1(vswp, "%s: exit", __func__);
2793 
2794 	return (0);
2795 }
2796 
2797 /*
2798  * Attach a logical domain channel (ldc) under a specified port.
2799  *
2800  * Returns 0 on success, 1 on failure.
2801  */
2802 static int
2803 vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id)
2804 {
2805 	vsw_t 		*vswp = port->p_vswp;
2806 	vsw_ldc_list_t *ldcl = &port->p_ldclist;
2807 	vsw_ldc_t 	*ldcp = NULL;
2808 	ldc_attr_t 	attr;
2809 	ldc_status_t	istatus;
2810 	int 		status = DDI_FAILURE;
2811 	int		rv;
2812 
2813 	D1(vswp, "%s: enter", __func__);
2814 
2815 	ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP);
2816 	if (ldcp == NULL) {
2817 		DERR(vswp, "%s: kmem_zalloc failed", __func__);
2818 		return (1);
2819 	}
2820 	ldcp->ldc_id = ldc_id;
2821 
2822 	/* allocate pool of receive mblks */
2823 	rv = vio_create_mblks(vsw_num_mblks, vsw_mblk_size, &(ldcp->rxh));
2824 	if (rv) {
2825 		DWARN(vswp, "%s: unable to create free mblk pool for"
2826 			" channel %ld (rv %d)", __func__, ldc_id, rv);
2827 		kmem_free(ldcp, sizeof (vsw_ldc_t));
2828 		return (1);
2829 	}
2830 
2831 	mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL);
2832 	mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL);
2833 	mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL);
2834 	cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL);
2835 
2836 	/* required for handshake with peer */
2837 	ldcp->local_session = (uint64_t)ddi_get_lbolt();
2838 	ldcp->peer_session = 0;
2839 	ldcp->session_status = 0;
2840 
2841 	mutex_init(&ldcp->hss_lock, NULL, MUTEX_DRIVER, NULL);
2842 	ldcp->hss_id = 1;	/* Initial handshake session id */
2843 
2844 	/* only set for outbound lane, inbound set by peer */
2845 	mutex_init(&ldcp->lane_in.seq_lock, NULL, MUTEX_DRIVER, NULL);
2846 	mutex_init(&ldcp->lane_out.seq_lock, NULL, MUTEX_DRIVER, NULL);
2847 	vsw_set_lane_attr(vswp, &ldcp->lane_out);
2848 
2849 	attr.devclass = LDC_DEV_NT_SVC;
2850 	attr.instance = ddi_get_instance(vswp->dip);
2851 	attr.mode = LDC_MODE_UNRELIABLE;
2852 	attr.mtu = VSW_LDC_MTU;
2853 	status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle);
2854 	if (status != 0) {
2855 		DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)",
2856 		    __func__, ldc_id, status);
2857 		goto ldc_attach_fail;
2858 	}
2859 
2860 	status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp);
2861 	if (status != 0) {
2862 		DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)",
2863 		    __func__, ldc_id, status);
2864 		(void) ldc_fini(ldcp->ldc_handle);
2865 		goto ldc_attach_fail;
2866 	}
2867 
2868 
2869 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
2870 		DERR(vswp, "%s: ldc_status failed", __func__);
2871 		return (1);
2872 	}
2873 
2874 	ldcp->ldc_status = istatus;
2875 	ldcp->ldc_port = port;
2876 	ldcp->ldc_vswp = vswp;
2877 
2878 	/* link it into the list of channels for this port */
2879 	WRITE_ENTER(&ldcl->lockrw);
2880 	ldcp->ldc_next = ldcl->head;
2881 	ldcl->head = ldcp;
2882 	ldcl->num_ldcs++;
2883 	RW_EXIT(&ldcl->lockrw);
2884 
2885 	D1(vswp, "%s: exit", __func__);
2886 	return (0);
2887 
2888 ldc_attach_fail:
2889 	mutex_destroy(&ldcp->ldc_txlock);
2890 	mutex_destroy(&ldcp->ldc_cblock);
2891 
2892 	cv_destroy(&ldcp->drain_cv);
2893 
2894 	if (ldcp->rxh != NULL) {
2895 		if (vio_destroy_mblks(ldcp->rxh) != 0) {
2896 			/*
2897 			 * Something odd has happened, as the destroy
2898 			 * will only fail if some mblks have been allocated
2899 			 * from the pool already (which shouldn't happen)
2900 			 * and have not been returned.
2901 			 *
2902 			 * Add the pool pointer to a list maintained in
2903 			 * the device instance. Another attempt will be made
2904 			 * to free the pool when the device itself detaches.
2905 			 */
2906 			cmn_err(CE_WARN, "Creation of ldc channel %ld failed"
2907 				" and cannot destroy associated mblk pool",
2908 				ldc_id);
2909 			ldcp->rxh->nextp =  vswp->rxh;
2910 			vswp->rxh = ldcp->rxh;
2911 		}
2912 	}
2913 	mutex_destroy(&ldcp->drain_cv_lock);
2914 	mutex_destroy(&ldcp->hss_lock);
2915 
2916 	mutex_destroy(&ldcp->lane_in.seq_lock);
2917 	mutex_destroy(&ldcp->lane_out.seq_lock);
2918 	kmem_free(ldcp, sizeof (vsw_ldc_t));
2919 
2920 	return (1);
2921 }
2922 
2923 /*
2924  * Detach a logical domain channel (ldc) belonging to a
2925  * particular port.
2926  *
2927  * Returns 0 on success, 1 on failure.
2928  */
2929 static int
2930 vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id)
2931 {
2932 	vsw_t 		*vswp = port->p_vswp;
2933 	vsw_ldc_t 	*ldcp, *prev_ldcp;
2934 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
2935 	int 		rv;
2936 
2937 	prev_ldcp = ldcl->head;
2938 	for (; (ldcp = prev_ldcp) != NULL; prev_ldcp = ldcp->ldc_next) {
2939 		if (ldcp->ldc_id == ldc_id) {
2940 			break;
2941 		}
2942 	}
2943 
2944 	/* specified ldc id not found */
2945 	if (ldcp == NULL) {
2946 		DERR(vswp, "%s: ldcp = NULL", __func__);
2947 		return (1);
2948 	}
2949 
2950 	D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id);
2951 
2952 	/*
2953 	 * Before we can close the channel we must release any mapped
2954 	 * resources (e.g. drings).
2955 	 */
2956 	vsw_free_lane_resources(ldcp, INBOUND);
2957 	vsw_free_lane_resources(ldcp, OUTBOUND);
2958 
2959 	/*
2960 	 * If the close fails we are in serious trouble, as won't
2961 	 * be able to delete the parent port.
2962 	 */
2963 	if ((rv = ldc_close(ldcp->ldc_handle)) != 0) {
2964 		DERR(vswp, "%s: error %d closing channel %lld",
2965 			__func__, rv, ldcp->ldc_id);
2966 		return (1);
2967 	}
2968 
2969 	(void) ldc_fini(ldcp->ldc_handle);
2970 
2971 	ldcp->ldc_status = LDC_INIT;
2972 	ldcp->ldc_handle = NULL;
2973 	ldcp->ldc_vswp = NULL;
2974 
2975 	if (ldcp->rxh != NULL) {
2976 		if (vio_destroy_mblks(ldcp->rxh)) {
2977 			/*
2978 			 * Mostly likely some mblks are still in use and
2979 			 * have not been returned to the pool. Add the pool
2980 			 * to the list maintained in the device instance.
2981 			 * Another attempt will be made to destroy the pool
2982 			 * when the device detaches.
2983 			 */
2984 			ldcp->rxh->nextp =  vswp->rxh;
2985 			vswp->rxh = ldcp->rxh;
2986 		}
2987 	}
2988 
2989 	mutex_destroy(&ldcp->ldc_txlock);
2990 	mutex_destroy(&ldcp->ldc_cblock);
2991 	cv_destroy(&ldcp->drain_cv);
2992 	mutex_destroy(&ldcp->drain_cv_lock);
2993 	mutex_destroy(&ldcp->hss_lock);
2994 	mutex_destroy(&ldcp->lane_in.seq_lock);
2995 	mutex_destroy(&ldcp->lane_out.seq_lock);
2996 
2997 	/* unlink it from the list */
2998 	prev_ldcp = ldcp->ldc_next;
2999 	ldcl->num_ldcs--;
3000 	kmem_free(ldcp, sizeof (vsw_ldc_t));
3001 
3002 	return (0);
3003 }
3004 
3005 /*
3006  * Open and attempt to bring up the channel. Note that channel
3007  * can only be brought up if peer has also opened channel.
3008  *
3009  * Returns 0 if can open and bring up channel, otherwise
3010  * returns 1.
3011  */
3012 static int
3013 vsw_ldc_init(vsw_ldc_t *ldcp)
3014 {
3015 	vsw_t 		*vswp = ldcp->ldc_vswp;
3016 	ldc_status_t	istatus = 0;
3017 	int		rv;
3018 
3019 	D1(vswp, "%s: enter", __func__);
3020 
3021 	LDC_ENTER_LOCK(ldcp);
3022 
3023 	/* don't start at 0 in case clients don't like that */
3024 	ldcp->next_ident = 1;
3025 
3026 	rv = ldc_open(ldcp->ldc_handle);
3027 	if (rv != 0) {
3028 		DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)",
3029 		    __func__, ldcp->ldc_id, rv);
3030 		LDC_EXIT_LOCK(ldcp);
3031 		return (1);
3032 	}
3033 
3034 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
3035 		DERR(vswp, "%s: unable to get status", __func__);
3036 		LDC_EXIT_LOCK(ldcp);
3037 		return (1);
3038 
3039 	} else if (istatus != LDC_OPEN && istatus != LDC_READY) {
3040 		DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY",
3041 		    __func__, ldcp->ldc_id, istatus);
3042 		LDC_EXIT_LOCK(ldcp);
3043 		return (1);
3044 	}
3045 
3046 	ldcp->ldc_status = istatus;
3047 	rv = ldc_up(ldcp->ldc_handle);
3048 	if (rv != 0) {
3049 		/*
3050 		 * Not a fatal error for ldc_up() to fail, as peer
3051 		 * end point may simply not be ready yet.
3052 		 */
3053 		D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__,
3054 			ldcp->ldc_id, rv);
3055 		LDC_EXIT_LOCK(ldcp);
3056 		return (1);
3057 	}
3058 
3059 	/*
3060 	 * ldc_up() call is non-blocking so need to explicitly
3061 	 * check channel status to see if in fact the channel
3062 	 * is UP.
3063 	 */
3064 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
3065 		DERR(vswp, "%s: unable to get status", __func__);
3066 		LDC_EXIT_LOCK(ldcp);
3067 		return (1);
3068 
3069 	} else if (istatus != LDC_UP) {
3070 		DERR(vswp, "%s: id(%lld) status(%d) is not UP",
3071 		    __func__, ldcp->ldc_id, istatus);
3072 	} else {
3073 		ldcp->ldc_status = istatus;
3074 	}
3075 
3076 	LDC_EXIT_LOCK(ldcp);
3077 
3078 	D1(vswp, "%s: exit", __func__);
3079 	return (0);
3080 }
3081 
3082 /* disable callbacks on the channel */
3083 static int
3084 vsw_ldc_uninit(vsw_ldc_t *ldcp)
3085 {
3086 	vsw_t	*vswp = ldcp->ldc_vswp;
3087 	int	rv;
3088 
3089 	D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id);
3090 
3091 	LDC_ENTER_LOCK(ldcp);
3092 
3093 	rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE);
3094 	if (rv != 0) {
3095 		DERR(vswp, "vsw_ldc_uninit(%lld): error disabling "
3096 			"interrupts (rv = %d)\n", ldcp->ldc_id, rv);
3097 		LDC_EXIT_LOCK(ldcp);
3098 		return (1);
3099 	}
3100 
3101 	ldcp->ldc_status = LDC_INIT;
3102 
3103 	LDC_EXIT_LOCK(ldcp);
3104 
3105 	D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id);
3106 
3107 	return (0);
3108 }
3109 
3110 static int
3111 vsw_init_ldcs(vsw_port_t *port)
3112 {
3113 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
3114 	vsw_ldc_t	*ldcp;
3115 
3116 	READ_ENTER(&ldcl->lockrw);
3117 	ldcp =  ldcl->head;
3118 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
3119 		(void) vsw_ldc_init(ldcp);
3120 	}
3121 	RW_EXIT(&ldcl->lockrw);
3122 
3123 	return (0);
3124 }
3125 
3126 static int
3127 vsw_uninit_ldcs(vsw_port_t *port)
3128 {
3129 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
3130 	vsw_ldc_t	*ldcp;
3131 
3132 	D1(NULL, "vsw_uninit_ldcs: enter\n");
3133 
3134 	READ_ENTER(&ldcl->lockrw);
3135 	ldcp =  ldcl->head;
3136 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
3137 		(void) vsw_ldc_uninit(ldcp);
3138 	}
3139 	RW_EXIT(&ldcl->lockrw);
3140 
3141 	D1(NULL, "vsw_uninit_ldcs: exit\n");
3142 
3143 	return (0);
3144 }
3145 
/*
 * Wait until the callback(s) associated with the ldcs under the specified
 * port have completed.
 *
 * Prior to this function being invoked each channel under this port
 * should have been quiesced via ldc_set_cb_mode(DISABLE).
 *
 * A short explanation of what we are doing below..
 *
 * The simplest approach would be to have a reference counter in
 * the ldc structure which is incremented/decremented by the callbacks as
 * they use the channel. The drain function could then simply disable any
 * further callbacks and do a cv_wait for the ref to hit zero. Unfortunately
 * there is a tiny window here - before the callback is able to get the lock
 * on the channel it is interrupted and this function gets to execute. It
 * sees that the ref count is zero and believes it is free to delete the
 * associated data structures.
 *
 * We get around this by taking advantage of the fact that before the ldc
 * framework invokes a callback it sets a flag to indicate that there is a
 * callback active (or about to become active). If we attempt to
 * unregister a callback while this active flag is set then the unregister
 * will fail with EWOULDBLOCK.
 *
 * If the unregister fails we do a cv_timedwait. We will either be signaled
 * by the callback as it is exiting (note we have to wait a short period to
 * allow the callback to return fully to the ldc framework and it to clear
 * the active flag), or by the timer expiring. In either case we again attempt
 * the unregister. We repeat this until we can successfully unregister the
 * callback.
 *
 * The reason we use a cv_timedwait rather than a simple cv_wait is to catch
 * the case where the callback has finished but the ldc framework has not yet
 * cleared the active flag. In this case we would never get a cv_signal.
 *
 * Always returns 0.
 */
static int
vsw_drain_ldcs(vsw_port_t *port)
{
	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
	vsw_ldc_t	*ldcp;
	vsw_t		*vswp = port->p_vswp;

	D1(vswp, "%s: enter", __func__);

	READ_ENTER(&ldcl->lockrw);

	ldcp = ldcl->head;

	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
		/*
		 * If we can unregister the channel callback then we
		 * know that there is no callback either running or
		 * scheduled to run for this channel so move on to next
		 * channel in the list.
		 */
		mutex_enter(&ldcp->drain_cv_lock);

		/* prompt active callbacks to quit */
		ldcp->drain_state = VSW_LDC_DRAINING;

		if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) {
			D2(vswp, "%s: unreg callback for chan %ld", __func__,
				ldcp->ldc_id);
			mutex_exit(&ldcp->drain_cv_lock);
			continue;
		} else {
			/*
			 * If we end up here we know that either 1) a callback
			 * is currently executing, 2) is about to start (i.e.
			 * the ldc framework has set the active flag but
			 * has not actually invoked the callback yet, or 3)
			 * has finished and has returned to the ldc framework
			 * but the ldc framework has not yet cleared the
			 * active bit.
			 *
			 * Wait for it to finish.
			 *
			 * NOTE(review): the loop only repeats while the
			 * unregister returns EWOULDBLOCK; any other non-zero
			 * return also ends the wait and the channel is then
			 * treated as drained.
			 */
			while (ldc_unreg_callback(ldcp->ldc_handle)
								== EWOULDBLOCK)
				(void) cv_timedwait(&ldcp->drain_cv,
					&ldcp->drain_cv_lock, lbolt + hz);

			mutex_exit(&ldcp->drain_cv_lock);
			D2(vswp, "%s: unreg callback for chan %ld after "
				"timeout", __func__, ldcp->ldc_id);
		}
	}
	RW_EXIT(&ldcl->lockrw);

	D1(vswp, "%s: exit", __func__);
	return (0);
}
3238 
3239 /*
3240  * Wait until all tasks which reference this port have completed.
3241  *
3242  * Prior to this function being invoked each channel under this port
3243  * should have been quiesced via ldc_set_cb_mode(DISABLE).
3244  */
3245 static int
3246 vsw_drain_port_taskq(vsw_port_t *port)
3247 {
3248 	vsw_t		*vswp = port->p_vswp;
3249 
3250 	D1(vswp, "%s: enter", __func__);
3251 
3252 	/*
3253 	 * Mark the port as in the process of being detached, and
3254 	 * dispatch a marker task to the queue so we know when all
3255 	 * relevant tasks have completed.
3256 	 */
3257 	mutex_enter(&port->state_lock);
3258 	port->state = VSW_PORT_DETACHING;
3259 
3260 	if ((vswp->taskq_p == NULL) ||
3261 		(ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task,
3262 			port, DDI_NOSLEEP) != DDI_SUCCESS)) {
3263 		DERR(vswp, "%s: unable to dispatch marker task",
3264 			__func__);
3265 		mutex_exit(&port->state_lock);
3266 		return (1);
3267 	}
3268 
3269 	/*
3270 	 * Wait for the marker task to finish.
3271 	 */
3272 	while (port->state != VSW_PORT_DETACHABLE)
3273 		cv_wait(&port->state_cv, &port->state_lock);
3274 
3275 	mutex_exit(&port->state_lock);
3276 
3277 	D1(vswp, "%s: exit", __func__);
3278 
3279 	return (0);
3280 }
3281 
3282 static void
3283 vsw_marker_task(void *arg)
3284 {
3285 	vsw_port_t	*port = arg;
3286 	vsw_t		*vswp = port->p_vswp;
3287 
3288 	D1(vswp, "%s: enter", __func__);
3289 
3290 	mutex_enter(&port->state_lock);
3291 
3292 	/*
3293 	 * No further tasks should be dispatched which reference
3294 	 * this port so ok to mark it as safe to detach.
3295 	 */
3296 	port->state = VSW_PORT_DETACHABLE;
3297 
3298 	cv_signal(&port->state_cv);
3299 
3300 	mutex_exit(&port->state_lock);
3301 
3302 	D1(vswp, "%s: exit", __func__);
3303 }
3304 
3305 static vsw_port_t *
3306 vsw_lookup_port(vsw_t *vswp, int p_instance)
3307 {
3308 	vsw_port_list_t *plist = &vswp->plist;
3309 	vsw_port_t	*port;
3310 
3311 	for (port = plist->head; port != NULL; port = port->p_next) {
3312 		if (port->p_instance == p_instance) {
3313 			D2(vswp, "vsw_lookup_port: found p_instance\n");
3314 			return (port);
3315 		}
3316 	}
3317 
3318 	return (NULL);
3319 }
3320 
3321 /*
3322  * Search for and remove the specified port from the port
3323  * list. Returns 0 if able to locate and remove port, otherwise
3324  * returns 1.
3325  */
3326 static int
3327 vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port)
3328 {
3329 	vsw_port_list_t *plist = &vswp->plist;
3330 	vsw_port_t	*curr_p, *prev_p;
3331 
3332 	if (plist->head == NULL)
3333 		return (1);
3334 
3335 	curr_p = prev_p = plist->head;
3336 
3337 	while (curr_p != NULL) {
3338 		if (curr_p == port) {
3339 			if (prev_p == curr_p) {
3340 				plist->head = curr_p->p_next;
3341 			} else {
3342 				prev_p->p_next = curr_p->p_next;
3343 			}
3344 			plist->num_ports--;
3345 			break;
3346 		} else {
3347 			prev_p = curr_p;
3348 			curr_p = curr_p->p_next;
3349 		}
3350 	}
3351 	return (0);
3352 }
3353 
3354 /*
3355  * Interrupt handler for ldc messages.
3356  */
3357 static uint_t
3358 vsw_ldc_cb(uint64_t event, caddr_t arg)
3359 {
3360 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
3361 	vsw_t 		*vswp = ldcp->ldc_vswp;
3362 	ldc_status_t	lstatus;
3363 	int		rv;
3364 
3365 	D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
3366 
3367 	mutex_enter(&ldcp->ldc_cblock);
3368 
3369 	if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) {
3370 		mutex_exit(&ldcp->ldc_cblock);
3371 		return (LDC_SUCCESS);
3372 	}
3373 
3374 	if (event & LDC_EVT_UP) {
3375 		/*
3376 		 * Channel has come up, get the state and then start
3377 		 * the handshake.
3378 		 */
3379 		rv = ldc_status(ldcp->ldc_handle, &lstatus);
3380 		if (rv != 0) {
3381 			cmn_err(CE_WARN, "Unable to read channel state");
3382 		}
3383 		ldcp->ldc_status = lstatus;
3384 
3385 		D2(vswp, "%s: id(%ld) event(%llx) UP:  status(%ld)",
3386 			__func__, ldcp->ldc_id, event, ldcp->ldc_status);
3387 
3388 		vsw_restart_handshake(ldcp);
3389 
3390 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
3391 	}
3392 
3393 	if (event & LDC_EVT_READ) {
3394 		/*
3395 		 * Data available for reading.
3396 		 */
3397 		D2(vswp, "%s: id(ld) event(%llx) data READ",
3398 				__func__, ldcp->ldc_id, event);
3399 
3400 		vsw_process_pkt(ldcp);
3401 
3402 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
3403 
3404 		goto vsw_cb_exit;
3405 	}
3406 
3407 	if (event & LDC_EVT_RESET) {
3408 		rv = ldc_status(ldcp->ldc_handle, &lstatus);
3409 		if (rv != 0) {
3410 			cmn_err(CE_WARN, "Unable to read channel state");
3411 		} else {
3412 			ldcp->ldc_status = lstatus;
3413 		}
3414 		D2(vswp, "%s: id(%ld) event(%llx) RESET:  status (%ld)",
3415 			__func__, ldcp->ldc_id, event, ldcp->ldc_status);
3416 	}
3417 
3418 	if (event & LDC_EVT_DOWN) {
3419 		rv = ldc_status(ldcp->ldc_handle, &lstatus);
3420 		if (rv != 0) {
3421 			cmn_err(CE_WARN, "Unable to read channel state");
3422 		} else {
3423 			ldcp->ldc_status = lstatus;
3424 		}
3425 
3426 		D2(vswp, "%s: id(%ld) event(%llx) DOWN:  status (%ld)",
3427 			__func__, ldcp->ldc_id, event, ldcp->ldc_status);
3428 
3429 	}
3430 
3431 	/*
3432 	 * Catch either LDC_EVT_WRITE which we don't support or any
3433 	 * unknown event.
3434 	 */
3435 	if (event & ~(LDC_EVT_UP | LDC_EVT_RESET
3436 					| LDC_EVT_DOWN | LDC_EVT_READ)) {
3437 
3438 		DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)",
3439 			__func__, ldcp->ldc_id, event, ldcp->ldc_status);
3440 	}
3441 
3442 vsw_cb_exit:
3443 	mutex_exit(&ldcp->ldc_cblock);
3444 
3445 	/*
3446 	 * Let the drain function know we are finishing if it
3447 	 * is waiting.
3448 	 */
3449 	mutex_enter(&ldcp->drain_cv_lock);
3450 	if (ldcp->drain_state == VSW_LDC_DRAINING)
3451 		cv_signal(&ldcp->drain_cv);
3452 	mutex_exit(&ldcp->drain_cv_lock);
3453 
3454 	return (LDC_SUCCESS);
3455 }
3456 
3457 /*
3458  * (Re)start a handshake with our peer by sending them
3459  * our version info.
3460  */
3461 static void
3462 vsw_restart_handshake(vsw_ldc_t *ldcp)
3463 {
3464 	vsw_t		*vswp = ldcp->ldc_vswp;
3465 	vsw_port_t	*port;
3466 	vsw_ldc_list_t	*ldcl;
3467 
3468 	D1(vswp, "vsw_restart_handshake: enter");
3469 
3470 	port = ldcp->ldc_port;
3471 	ldcl = &port->p_ldclist;
3472 
3473 	WRITE_ENTER(&ldcl->lockrw);
3474 
3475 	D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__,
3476 		ldcp->lane_in.lstate, ldcp->lane_out.lstate);
3477 
3478 	vsw_free_lane_resources(ldcp, INBOUND);
3479 	vsw_free_lane_resources(ldcp, OUTBOUND);
3480 	RW_EXIT(&ldcl->lockrw);
3481 
3482 	ldcp->lane_in.lstate = 0;
3483 	ldcp->lane_out.lstate = 0;
3484 
3485 	/*
3486 	 * Remove parent port from any multicast groups
3487 	 * it may have registered with. Client must resend
3488 	 * multicast add command after handshake completes.
3489 	 */
3490 	(void) vsw_del_fdb(vswp, port);
3491 
3492 	vsw_del_mcst_port(port);
3493 
3494 	ldcp->hphase = VSW_MILESTONE0;
3495 
3496 	ldcp->peer_session = 0;
3497 	ldcp->session_status = 0;
3498 
3499 	/*
3500 	 * We now increment the transaction group id. This allows
3501 	 * us to identify and disard any tasks which are still pending
3502 	 * on the taskq and refer to the handshake session we are about
3503 	 * to restart. These stale messages no longer have any real
3504 	 * meaning.
3505 	 */
3506 	mutex_enter(&ldcp->hss_lock);
3507 	ldcp->hss_id++;
3508 	mutex_exit(&ldcp->hss_lock);
3509 
3510 	if (ldcp->hcnt++ > vsw_num_handshakes) {
3511 		cmn_err(CE_WARN, "exceeded number of permitted "
3512 			"handshake attempts (%d) on channel %ld",
3513 			ldcp->hcnt, ldcp->ldc_id);
3514 		return;
3515 	}
3516 
3517 	vsw_send_ver(ldcp);
3518 
3519 	D1(vswp, "vsw_restart_handshake: exit");
3520 }
3521 
3522 /*
3523  * returns 0 if legal for event signified by flag to have
3524  * occured at the time it did. Otherwise returns 1.
3525  */
3526 int
3527 vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag)
3528 {
3529 	vsw_t		*vswp = ldcp->ldc_vswp;
3530 	uint64_t	state;
3531 	uint64_t	phase;
3532 
3533 	if (dir == INBOUND)
3534 		state = ldcp->lane_in.lstate;
3535 	else
3536 		state = ldcp->lane_out.lstate;
3537 
3538 	phase = ldcp->hphase;
3539 
3540 	switch (flag) {
3541 	case VSW_VER_INFO_RECV:
3542 		if (phase > VSW_MILESTONE0) {
3543 			DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV"
3544 				" when in state %d\n", ldcp->ldc_id, phase);
3545 			vsw_restart_handshake(ldcp);
3546 			return (1);
3547 		}
3548 		break;
3549 
3550 	case VSW_VER_ACK_RECV:
3551 	case VSW_VER_NACK_RECV:
3552 		if (!(state & VSW_VER_INFO_SENT)) {
3553 			DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK"
3554 				" or VER_NACK when in state %d\n",
3555 				ldcp->ldc_id, phase);
3556 			vsw_restart_handshake(ldcp);
3557 			return (1);
3558 		} else
3559 			state &= ~VSW_VER_INFO_SENT;
3560 		break;
3561 
3562 	case VSW_ATTR_INFO_RECV:
3563 		if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) {
3564 			DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV"
3565 				" when in state %d\n", ldcp->ldc_id, phase);
3566 			vsw_restart_handshake(ldcp);
3567 			return (1);
3568 		}
3569 		break;
3570 
3571 	case VSW_ATTR_ACK_RECV:
3572 	case VSW_ATTR_NACK_RECV:
3573 		if (!(state & VSW_ATTR_INFO_SENT)) {
3574 			DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK"
3575 				" or ATTR_NACK when in state %d\n",
3576 				ldcp->ldc_id, phase);
3577 			vsw_restart_handshake(ldcp);
3578 			return (1);
3579 		} else
3580 			state &= ~VSW_ATTR_INFO_SENT;
3581 		break;
3582 
3583 	case VSW_DRING_INFO_RECV:
3584 		if (phase < VSW_MILESTONE1) {
3585 			DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV"
3586 				" when in state %d\n", ldcp->ldc_id, phase);
3587 			vsw_restart_handshake(ldcp);
3588 			return (1);
3589 		}
3590 		break;
3591 
3592 	case VSW_DRING_ACK_RECV:
3593 	case VSW_DRING_NACK_RECV:
3594 		if (!(state & VSW_DRING_INFO_SENT)) {
3595 			DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK"
3596 				" or DRING_NACK when in state %d\n",
3597 				ldcp->ldc_id, phase);
3598 			vsw_restart_handshake(ldcp);
3599 			return (1);
3600 		} else
3601 			state &= ~VSW_DRING_INFO_SENT;
3602 		break;
3603 
3604 	case VSW_RDX_INFO_RECV:
3605 		if (phase < VSW_MILESTONE3) {
3606 			DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV"
3607 				" when in state %d\n", ldcp->ldc_id, phase);
3608 			vsw_restart_handshake(ldcp);
3609 			return (1);
3610 		}
3611 		break;
3612 
3613 	case VSW_RDX_ACK_RECV:
3614 	case VSW_RDX_NACK_RECV:
3615 		if (!(state & VSW_RDX_INFO_SENT)) {
3616 			DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK"
3617 				" or RDX_NACK when in state %d\n",
3618 				ldcp->ldc_id, phase);
3619 			vsw_restart_handshake(ldcp);
3620 			return (1);
3621 		} else
3622 			state &= ~VSW_RDX_INFO_SENT;
3623 		break;
3624 
3625 	case VSW_MCST_INFO_RECV:
3626 		if (phase < VSW_MILESTONE3) {
3627 			DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV"
3628 				" when in state %d\n", ldcp->ldc_id, phase);
3629 			vsw_restart_handshake(ldcp);
3630 			return (1);
3631 		}
3632 		break;
3633 
3634 	default:
3635 		DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)",
3636 				ldcp->ldc_id, flag);
3637 		return (1);
3638 	}
3639 
3640 	if (dir == INBOUND)
3641 		ldcp->lane_in.lstate = state;
3642 	else
3643 		ldcp->lane_out.lstate = state;
3644 
3645 	D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id);
3646 
3647 	return (0);
3648 }
3649 
3650 void
3651 vsw_next_milestone(vsw_ldc_t *ldcp)
3652 {
3653 	vsw_t		*vswp = ldcp->ldc_vswp;
3654 
3655 	D1(vswp, "%s (chan %lld): enter (phase %ld)", __func__,
3656 		ldcp->ldc_id, ldcp->hphase);
3657 
3658 	DUMP_FLAGS(ldcp->lane_in.lstate);
3659 	DUMP_FLAGS(ldcp->lane_out.lstate);
3660 
3661 	switch (ldcp->hphase) {
3662 
3663 	case VSW_MILESTONE0:
3664 		/*
3665 		 * If we haven't started to handshake with our peer,
3666 		 * start to do so now.
3667 		 */
3668 		if (ldcp->lane_out.lstate == 0) {
3669 			D2(vswp, "%s: (chan %lld) starting handshake "
3670 				"with peer", __func__, ldcp->ldc_id);
3671 			vsw_restart_handshake(ldcp);
3672 		}
3673 
3674 		/*
3675 		 * Only way to pass this milestone is to have successfully
3676 		 * negotiated version info.
3677 		 */
3678 		if ((ldcp->lane_in.lstate & VSW_VER_ACK_SENT) &&
3679 			(ldcp->lane_out.lstate & VSW_VER_ACK_RECV)) {
3680 
3681 			D2(vswp, "%s: (chan %lld) leaving milestone 0",
3682 				__func__, ldcp->ldc_id);
3683 
3684 			/*
3685 			 * Next milestone is passed when attribute
3686 			 * information has been successfully exchanged.
3687 			 */
3688 			ldcp->hphase = VSW_MILESTONE1;
3689 			vsw_send_attr(ldcp);
3690 
3691 		}
3692 		break;
3693 
3694 	case VSW_MILESTONE1:
3695 		/*
3696 		 * Only way to pass this milestone is to have successfully
3697 		 * negotiated attribute information.
3698 		 */
3699 		if (ldcp->lane_in.lstate & VSW_ATTR_ACK_SENT) {
3700 
3701 			ldcp->hphase = VSW_MILESTONE2;
3702 
3703 			/*
3704 			 * If the peer device has said it wishes to
3705 			 * use descriptor rings then we send it our ring
3706 			 * info, otherwise we just set up a private ring
3707 			 * which we use an internal buffer
3708 			 */
3709 			if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE)
3710 				vsw_send_dring_info(ldcp);
3711 		}
3712 		break;
3713 
3714 
3715 	case VSW_MILESTONE2:
3716 		/*
3717 		 * If peer has indicated in its attribute message that
3718 		 * it wishes to use descriptor rings then the only way
3719 		 * to pass this milestone is for us to have received
3720 		 * valid dring info.
3721 		 *
3722 		 * If peer is not using descriptor rings then just fall
3723 		 * through.
3724 		 */
3725 		if ((ldcp->lane_in.xfer_mode == VIO_DRING_MODE) &&
3726 			(!(ldcp->lane_in.lstate & VSW_DRING_ACK_SENT)))
3727 			break;
3728 
3729 		D2(vswp, "%s: (chan %lld) leaving milestone 2",
3730 				__func__, ldcp->ldc_id);
3731 
3732 		ldcp->hphase = VSW_MILESTONE3;
3733 		vsw_send_rdx(ldcp);
3734 		break;
3735 
3736 	case VSW_MILESTONE3:
3737 		/*
3738 		 * Pass this milestone when all paramaters have been
3739 		 * successfully exchanged and RDX sent in both directions.
3740 		 *
3741 		 * Mark outbound lane as available to transmit data.
3742 		 */
3743 		if ((ldcp->lane_in.lstate & VSW_RDX_ACK_SENT) &&
3744 			(ldcp->lane_out.lstate & VSW_RDX_ACK_RECV)) {
3745 
3746 			D2(vswp, "%s: (chan %lld) leaving milestone 3",
3747 				__func__, ldcp->ldc_id);
3748 			D2(vswp, "%s: ** handshake complete **", __func__);
3749 			ldcp->lane_out.lstate |= VSW_LANE_ACTIVE;
3750 			ldcp->hphase = VSW_MILESTONE4;
3751 			ldcp->hcnt = 0;
3752 			DISPLAY_STATE();
3753 		}
3754 		break;
3755 
3756 	case VSW_MILESTONE4:
3757 		D2(vswp, "%s: (chan %lld) in milestone 4", __func__,
3758 							ldcp->ldc_id);
3759 		break;
3760 
3761 	default:
3762 		DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__,
3763 			ldcp->ldc_id, ldcp->hphase);
3764 	}
3765 
3766 	D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id,
3767 		ldcp->hphase);
3768 }
3769 
3770 /*
3771  * Check if major version is supported.
3772  *
3773  * Returns 0 if finds supported major number, and if necessary
3774  * adjusts the minor field.
3775  *
3776  * Returns 1 if can't match major number exactly. Sets mjor/minor
3777  * to next lowest support values, or to zero if no other values possible.
3778  */
3779 static int
3780 vsw_supported_version(vio_ver_msg_t *vp)
3781 {
3782 	int	i;
3783 
3784 	D1(NULL, "vsw_supported_version: enter");
3785 
3786 	for (i = 0; i < VSW_NUM_VER; i++) {
3787 		if (vsw_versions[i].ver_major == vp->ver_major) {
3788 			/*
3789 			 * Matching or lower major version found. Update
3790 			 * minor number if necessary.
3791 			 */
3792 			if (vp->ver_minor > vsw_versions[i].ver_minor) {
3793 				D2(NULL, "%s: adjusting minor value"
3794 					" from %d to %d", __func__,
3795 					vp->ver_minor,
3796 					vsw_versions[i].ver_minor);
3797 				vp->ver_minor = vsw_versions[i].ver_minor;
3798 			}
3799 
3800 			return (0);
3801 		}
3802 
3803 		if (vsw_versions[i].ver_major < vp->ver_major) {
3804 			if (vp->ver_minor > vsw_versions[i].ver_minor) {
3805 				D2(NULL, "%s: adjusting minor value"
3806 					" from %d to %d", __func__,
3807 					vp->ver_minor,
3808 					vsw_versions[i].ver_minor);
3809 				vp->ver_minor = vsw_versions[i].ver_minor;
3810 			}
3811 			return (1);
3812 		}
3813 	}
3814 
3815 	/* No match was possible, zero out fields */
3816 	vp->ver_major = 0;
3817 	vp->ver_minor = 0;
3818 
3819 	D1(NULL, "vsw_supported_version: exit");
3820 
3821 	return (1);
3822 }
3823 
3824 /*
3825  * Main routine for processing messages received over LDC.
3826  */
3827 static void
3828 vsw_process_pkt(void *arg)
3829 {
3830 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
3831 	vsw_t 		*vswp = ldcp->ldc_vswp;
3832 	size_t		msglen;
3833 	vio_msg_tag_t	tag;
3834 	def_msg_t	dmsg;
3835 	int 		rv = 0;
3836 
3837 	D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
3838 
3839 	/*
3840 	 * If channel is up read messages until channel is empty.
3841 	 */
3842 	do {
3843 		msglen = sizeof (dmsg);
3844 		rv = ldc_read(ldcp->ldc_handle, (caddr_t)&dmsg, &msglen);
3845 
3846 		if (rv != 0) {
3847 			DERR(vswp, "%s :ldc_read err id(%lld) rv(%d) "
3848 				"len(%d)\n", __func__, ldcp->ldc_id,
3849 							rv, msglen);
3850 			break;
3851 		}
3852 
3853 		if (msglen == 0) {
3854 			D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__,
3855 			ldcp->ldc_id);
3856 			break;
3857 		}
3858 
3859 		D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__,
3860 		    ldcp->ldc_id, msglen);
3861 
3862 		/*
3863 		 * Figure out what sort of packet we have gotten by
3864 		 * examining the msg tag, and then switch it appropriately.
3865 		 */
3866 		bcopy(&dmsg, &tag, sizeof (vio_msg_tag_t));
3867 
3868 		switch (tag.vio_msgtype) {
3869 		case VIO_TYPE_CTRL:
3870 			vsw_dispatch_ctrl_task(ldcp, &dmsg, tag);
3871 			break;
3872 		case VIO_TYPE_DATA:
3873 			vsw_process_data_pkt(ldcp, &dmsg, tag);
3874 			break;
3875 		case VIO_TYPE_ERR:
3876 			vsw_process_err_pkt(ldcp, &dmsg, tag);
3877 			break;
3878 		default:
3879 			DERR(vswp, "%s: Unknown tag(%lx) ", __func__,
3880 				"id(%lx)\n", tag.vio_msgtype, ldcp->ldc_id);
3881 			break;
3882 		}
3883 	} while (msglen);
3884 
3885 	D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id);
3886 }
3887 
3888 /*
3889  * Dispatch a task to process a VIO control message.
3890  */
3891 static void
3892 vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t tag)
3893 {
3894 	vsw_ctrl_task_t		*ctaskp = NULL;
3895 	vsw_port_t		*port = ldcp->ldc_port;
3896 	vsw_t			*vswp = port->p_vswp;
3897 
3898 	D1(vswp, "%s: enter", __func__);
3899 
3900 	/*
3901 	 * We need to handle RDX ACK messages in-band as once they
3902 	 * are exchanged it is possible that we will get an
3903 	 * immediate (legitimate) data packet.
3904 	 */
3905 	if ((tag.vio_subtype_env == VIO_RDX) &&
3906 		(tag.vio_subtype == VIO_SUBTYPE_ACK)) {
3907 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_ACK_RECV))
3908 			return;
3909 
3910 		ldcp->lane_out.lstate |= VSW_RDX_ACK_RECV;
3911 		vsw_next_milestone(ldcp);
3912 		D2(vswp, "%s (%ld) handling RDX_ACK in place", __func__,
3913 			ldcp->ldc_id);
3914 		return;
3915 	}
3916 
3917 	ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP);
3918 
3919 	if (ctaskp == NULL) {
3920 		DERR(vswp, "%s: unable to alloc space for ctrl"
3921 			" msg", __func__);
3922 		vsw_restart_handshake(ldcp);
3923 		return;
3924 	}
3925 
3926 	ctaskp->ldcp = ldcp;
3927 	bcopy((def_msg_t *)cpkt, &ctaskp->pktp, sizeof (def_msg_t));
3928 	mutex_enter(&ldcp->hss_lock);
3929 	ctaskp->hss_id = ldcp->hss_id;
3930 	mutex_exit(&ldcp->hss_lock);
3931 
3932 	/*
3933 	 * Dispatch task to processing taskq if port is not in
3934 	 * the process of being detached.
3935 	 */
3936 	mutex_enter(&port->state_lock);
3937 	if (port->state == VSW_PORT_INIT) {
3938 		if ((vswp->taskq_p == NULL) ||
3939 			(ddi_taskq_dispatch(vswp->taskq_p,
3940 			vsw_process_ctrl_pkt, ctaskp, DDI_NOSLEEP)
3941 							!= DDI_SUCCESS)) {
3942 			DERR(vswp, "%s: unable to dispatch task to taskq",
3943 				__func__);
3944 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
3945 			mutex_exit(&port->state_lock);
3946 			vsw_restart_handshake(ldcp);
3947 			return;
3948 		}
3949 	} else {
3950 		DWARN(vswp, "%s: port %d detaching, not dispatching "
3951 			"task", __func__, port->p_instance);
3952 	}
3953 
3954 	mutex_exit(&port->state_lock);
3955 
3956 	D2(vswp, "%s: dispatched task to taskq for chan %d", __func__,
3957 			ldcp->ldc_id);
3958 	D1(vswp, "%s: exit", __func__);
3959 }
3960 
3961 /*
3962  * Process a VIO ctrl message. Invoked from taskq.
3963  */
3964 static void
3965 vsw_process_ctrl_pkt(void *arg)
3966 {
3967 	vsw_ctrl_task_t	*ctaskp = (vsw_ctrl_task_t *)arg;
3968 	vsw_ldc_t	*ldcp = ctaskp->ldcp;
3969 	vsw_t 		*vswp = ldcp->ldc_vswp;
3970 	vio_msg_tag_t	tag;
3971 	uint16_t	env;
3972 
3973 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3974 
3975 	bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t));
3976 	env = tag.vio_subtype_env;
3977 
3978 	/* stale pkt check */
3979 	mutex_enter(&ldcp->hss_lock);
3980 	if (ctaskp->hss_id < ldcp->hss_id) {
3981 		DWARN(vswp, "%s: discarding stale packet belonging to"
3982 			" earlier (%ld) handshake session", __func__,
3983 			ctaskp->hss_id);
3984 		mutex_exit(&ldcp->hss_lock);
3985 		return;
3986 	}
3987 	mutex_exit(&ldcp->hss_lock);
3988 
3989 	/* session id check */
3990 	if (ldcp->session_status & VSW_PEER_SESSION) {
3991 		if (ldcp->peer_session != tag.vio_sid) {
3992 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
3993 				__func__, ldcp->ldc_id, tag.vio_sid);
3994 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
3995 			vsw_restart_handshake(ldcp);
3996 			return;
3997 		}
3998 	}
3999 
4000 	/*
4001 	 * Switch on vio_subtype envelope, then let lower routines
4002 	 * decide if its an INFO, ACK or NACK packet.
4003 	 */
4004 	switch (env) {
4005 	case VIO_VER_INFO:
4006 		vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp);
4007 		break;
4008 	case VIO_DRING_REG:
4009 		vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp);
4010 		break;
4011 	case VIO_DRING_UNREG:
4012 		vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp);
4013 		break;
4014 	case VIO_ATTR_INFO:
4015 		vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp);
4016 		break;
4017 	case VNET_MCAST_INFO:
4018 		vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp);
4019 		break;
4020 	case VIO_RDX:
4021 		vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp);
4022 		break;
4023 	default:
4024 		DERR(vswp, "%s : unknown vio_subtype_env (%x)\n",
4025 							__func__, env);
4026 	}
4027 
4028 	kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
4029 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4030 }
4031 
4032 /*
4033  * Version negotiation. We can end up here either because our peer
4034  * has responded to a handshake message we have sent it, or our peer
4035  * has initiated a handshake with us. If its the former then can only
4036  * be ACK or NACK, if its the later can only be INFO.
4037  *
4038  * If its an ACK we move to the next stage of the handshake, namely
4039  * attribute exchange. If its a NACK we see if we can specify another
4040  * version, if we can't we stop.
4041  *
4042  * If it is an INFO we reset all params associated with communication
4043  * in that direction over this channel (remember connection is
4044  * essentially 2 independent simplex channels).
4045  */
4046 void
4047 vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt)
4048 {
4049 	vio_ver_msg_t	*ver_pkt;
4050 	vsw_t 		*vswp = ldcp->ldc_vswp;
4051 
4052 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4053 
4054 	/*
4055 	 * We know this is a ctrl/version packet so
4056 	 * cast it into the correct structure.
4057 	 */
4058 	ver_pkt = (vio_ver_msg_t *)pkt;
4059 
4060 	switch (ver_pkt->tag.vio_subtype) {
4061 	case VIO_SUBTYPE_INFO:
4062 		D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n");
4063 
4064 		/*
4065 		 * Record the session id, which we will use from now
4066 		 * until we see another VER_INFO msg. Even then the
4067 		 * session id in most cases will be unchanged, execpt
4068 		 * if channel was reset.
4069 		 */
4070 		if ((ldcp->session_status & VSW_PEER_SESSION) &&
4071 			(ldcp->peer_session != ver_pkt->tag.vio_sid)) {
4072 			DERR(vswp, "%s: updating session id for chan %lld "
4073 				"from %llx to %llx", __func__, ldcp->ldc_id,
4074 				ldcp->peer_session, ver_pkt->tag.vio_sid);
4075 		}
4076 
4077 		ldcp->peer_session = ver_pkt->tag.vio_sid;
4078 		ldcp->session_status |= VSW_PEER_SESSION;
4079 
4080 		/* Legal message at this time ? */
4081 		if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV))
4082 			return;
4083 
4084 		/*
4085 		 * First check the device class. Currently only expect
4086 		 * to be talking to a network device. In the future may
4087 		 * also talk to another switch.
4088 		 */
4089 		if (ver_pkt->dev_class != VDEV_NETWORK) {
4090 			DERR(vswp, "%s: illegal device class %d", __func__,
4091 				ver_pkt->dev_class);
4092 
4093 			ver_pkt->tag.vio_sid = ldcp->local_session;
4094 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
4095 
4096 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
4097 
4098 			vsw_send_msg(ldcp, (void *)ver_pkt,
4099 					sizeof (vio_ver_msg_t));
4100 
4101 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
4102 			vsw_next_milestone(ldcp);
4103 			return;
4104 		} else {
4105 			ldcp->dev_class = ver_pkt->dev_class;
4106 		}
4107 
4108 		/*
4109 		 * Now check the version.
4110 		 */
4111 		if (vsw_supported_version(ver_pkt) == 0) {
4112 			/*
4113 			 * Support this major version and possibly
4114 			 * adjusted minor version.
4115 			 */
4116 
4117 			D2(vswp, "%s: accepted ver %d:%d", __func__,
4118 				ver_pkt->ver_major, ver_pkt->ver_minor);
4119 
4120 			/* Store accepted values */
4121 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
4122 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
4123 
4124 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
4125 
4126 			ldcp->lane_in.lstate |= VSW_VER_ACK_SENT;
4127 		} else {
4128 			/*
4129 			 * NACK back with the next lower major/minor
4130 			 * pairing we support (if don't suuport any more
4131 			 * versions then they will be set to zero.
4132 			 */
4133 
4134 			D2(vswp, "%s: replying with ver %d:%d", __func__,
4135 				ver_pkt->ver_major, ver_pkt->ver_minor);
4136 
4137 			/* Store updated values */
4138 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
4139 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
4140 
4141 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
4142 
4143 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
4144 		}
4145 
4146 		DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
4147 		ver_pkt->tag.vio_sid = ldcp->local_session;
4148 		vsw_send_msg(ldcp, (void *)ver_pkt, sizeof (vio_ver_msg_t));
4149 
4150 		vsw_next_milestone(ldcp);
4151 		break;
4152 
4153 	case VIO_SUBTYPE_ACK:
4154 		D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__);
4155 
4156 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV))
4157 			return;
4158 
4159 		/* Store updated values */
4160 		ldcp->lane_in.ver_major = ver_pkt->ver_major;
4161 		ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
4162 
4163 
4164 		ldcp->lane_out.lstate |= VSW_VER_ACK_RECV;
4165 		vsw_next_milestone(ldcp);
4166 
4167 		break;
4168 
4169 	case VIO_SUBTYPE_NACK:
4170 		D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__);
4171 
4172 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV))
4173 			return;
4174 
4175 		/*
4176 		 * If our peer sent us a NACK with the ver fields set to
4177 		 * zero then there is nothing more we can do. Otherwise see
4178 		 * if we support either the version suggested, or a lesser
4179 		 * one.
4180 		 */
4181 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
4182 			DERR(vswp, "%s: peer unable to negotiate any "
4183 				"further.", __func__);
4184 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
4185 			vsw_next_milestone(ldcp);
4186 			return;
4187 		}
4188 
4189 		/*
4190 		 * Check to see if we support this major version or
4191 		 * a lower one. If we don't then maj/min will be set
4192 		 * to zero.
4193 		 */
4194 		(void) vsw_supported_version(ver_pkt);
4195 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
4196 			/* Nothing more we can do */
4197 			DERR(vswp, "%s: version negotiation failed.\n",
4198 								__func__);
4199 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
4200 			vsw_next_milestone(ldcp);
4201 		} else {
4202 			/* found a supported major version */
4203 			ldcp->lane_out.ver_major = ver_pkt->ver_major;
4204 			ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
4205 
4206 			D2(vswp, "%s: resending with updated values (%x, %x)",
4207 				__func__, ver_pkt->ver_major,
4208 				ver_pkt->ver_minor);
4209 
4210 			ldcp->lane_out.lstate |= VSW_VER_INFO_SENT;
4211 			ver_pkt->tag.vio_sid = ldcp->local_session;
4212 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
4213 
4214 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
4215 
4216 			vsw_send_msg(ldcp, (void *)ver_pkt,
4217 					sizeof (vio_ver_msg_t));
4218 
4219 			vsw_next_milestone(ldcp);
4220 
4221 		}
4222 		break;
4223 
4224 	default:
4225 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
4226 			ver_pkt->tag.vio_subtype);
4227 	}
4228 
4229 	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
4230 }
4231 
4232 /*
4233  * Process an attribute packet. We can end up here either because our peer
4234  * has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or our
4235  * peer has sent us an attribute INFO message
4236  *
4237  * If its an ACK we then move to the next stage of the handshake which
4238  * is to send our descriptor ring info to our peer. If its a NACK then
4239  * there is nothing more we can (currently) do.
4240  *
4241  * If we get a valid/acceptable INFO packet (and we have already negotiated
4242  * a version) we ACK back and set channel state to ATTR_RECV, otherwise we
4243  * NACK back and reset channel state to INACTIV.
4244  *
4245  * FUTURE: in time we will probably negotiate over attributes, but for
4246  * the moment unacceptable attributes are regarded as a fatal error.
4247  *
4248  */
4249 void
4250 vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt)
4251 {
4252 	vnet_attr_msg_t		*attr_pkt;
4253 	vsw_t			*vswp = ldcp->ldc_vswp;
4254 	vsw_port_t		*port = ldcp->ldc_port;
4255 	uint64_t		macaddr = 0;
4256 	int			i;
4257 
4258 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
4259 
4260 	/*
4261 	 * We know this is a ctrl/attr packet so
4262 	 * cast it into the correct structure.
4263 	 */
4264 	attr_pkt = (vnet_attr_msg_t *)pkt;
4265 
4266 	switch (attr_pkt->tag.vio_subtype) {
4267 	case VIO_SUBTYPE_INFO:
4268 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
4269 
4270 		if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV))
4271 			return;
4272 
4273 		/*
4274 		 * If the attributes are unacceptable then we NACK back.
4275 		 */
4276 		if (vsw_check_attr(attr_pkt, ldcp->ldc_port)) {
4277 
4278 			DERR(vswp, "%s (chan %d): invalid attributes",
4279 				__func__, ldcp->ldc_id);
4280 
4281 			vsw_free_lane_resources(ldcp, INBOUND);
4282 
4283 			attr_pkt->tag.vio_sid = ldcp->local_session;
4284 			attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
4285 
4286 			DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
4287 			ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT;
4288 			vsw_send_msg(ldcp, (void *)attr_pkt,
4289 					sizeof (vnet_attr_msg_t));
4290 
4291 			vsw_next_milestone(ldcp);
4292 			return;
4293 		}
4294 
4295 		/*
4296 		 * Otherwise store attributes for this lane and update
4297 		 * lane state.
4298 		 */
4299 		ldcp->lane_in.mtu = attr_pkt->mtu;
4300 		ldcp->lane_in.addr = attr_pkt->addr;
4301 		ldcp->lane_in.addr_type = attr_pkt->addr_type;
4302 		ldcp->lane_in.xfer_mode = attr_pkt->xfer_mode;
4303 		ldcp->lane_in.ack_freq = attr_pkt->ack_freq;
4304 
4305 		macaddr = ldcp->lane_in.addr;
4306 		for (i = ETHERADDRL - 1; i >= 0; i--) {
4307 			port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF;
4308 			macaddr >>= 8;
4309 		}
4310 
4311 		/* create the fdb entry for this port/mac address */
4312 		(void) vsw_add_fdb(vswp, port);
4313 
4314 		/* setup device specifc xmit routines */
4315 		mutex_enter(&port->tx_lock);
4316 		if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE) {
4317 			D2(vswp, "%s: mode = VIO_DRING_MODE", __func__);
4318 			port->transmit = vsw_dringsend;
4319 		} else if (ldcp->lane_in.xfer_mode == VIO_DESC_MODE) {
4320 			D2(vswp, "%s: mode = VIO_DESC_MODE", __func__);
4321 			vsw_create_privring(ldcp);
4322 			port->transmit = vsw_descrsend;
4323 		}
4324 		mutex_exit(&port->tx_lock);
4325 
4326 		attr_pkt->tag.vio_sid = ldcp->local_session;
4327 		attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
4328 
4329 		DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
4330 
4331 		ldcp->lane_in.lstate |= VSW_ATTR_ACK_SENT;
4332 
4333 		vsw_send_msg(ldcp, (void *)attr_pkt,
4334 					sizeof (vnet_attr_msg_t));
4335 
4336 		vsw_next_milestone(ldcp);
4337 		break;
4338 
4339 	case VIO_SUBTYPE_ACK:
4340 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
4341 
4342 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV))
4343 			return;
4344 
4345 		ldcp->lane_out.lstate |= VSW_ATTR_ACK_RECV;
4346 		vsw_next_milestone(ldcp);
4347 		break;
4348 
4349 	case VIO_SUBTYPE_NACK:
4350 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
4351 
4352 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV))
4353 			return;
4354 
4355 		ldcp->lane_out.lstate |= VSW_ATTR_NACK_RECV;
4356 		vsw_next_milestone(ldcp);
4357 		break;
4358 
4359 	default:
4360 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
4361 			attr_pkt->tag.vio_subtype);
4362 	}
4363 
4364 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
4365 }
4366 
4367 /*
4368  * Process a dring info packet. We can end up here either because our peer
4369  * has ACK/NACK'ed back to an earlier DRING msg we had sent it, or our
4370  * peer has sent us a dring INFO message.
4371  *
4372  * If we get a valid/acceptable INFO packet (and we have already negotiated
4373  * a version) we ACK back and update the lane state, otherwise we NACK back.
4374  *
4375  * FUTURE: nothing to stop client from sending us info on multiple dring's
4376  * but for the moment we will just use the first one we are given.
4377  *
4378  */
4379 void
4380 vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt)
4381 {
4382 	vio_dring_reg_msg_t	*dring_pkt;
4383 	vsw_t			*vswp = ldcp->ldc_vswp;
4384 	ldc_mem_info_t		minfo;
4385 	dring_info_t		*dp, *dbp;
4386 	int			dring_found = 0;
4387 
4388 	/*
4389 	 * We know this is a ctrl/dring packet so
4390 	 * cast it into the correct structure.
4391 	 */
4392 	dring_pkt = (vio_dring_reg_msg_t *)pkt;
4393 
4394 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
4395 
4396 	switch (dring_pkt->tag.vio_subtype) {
4397 	case VIO_SUBTYPE_INFO:
4398 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
4399 
4400 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
4401 			return;
4402 
4403 		/*
4404 		 * If the dring params are unacceptable then we NACK back.
4405 		 */
4406 		if (vsw_check_dring_info(dring_pkt)) {
4407 
4408 			DERR(vswp, "%s (%lld): invalid dring info",
4409 				__func__, ldcp->ldc_id);
4410 
4411 			vsw_free_lane_resources(ldcp, INBOUND);
4412 
4413 			dring_pkt->tag.vio_sid = ldcp->local_session;
4414 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
4415 
4416 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
4417 
4418 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
4419 
4420 			vsw_send_msg(ldcp, (void *)dring_pkt,
4421 					sizeof (vio_dring_reg_msg_t));
4422 
4423 			vsw_next_milestone(ldcp);
4424 			return;
4425 		}
4426 
4427 		/*
4428 		 * Otherwise, attempt to map in the dring using the
4429 		 * cookie. If that succeeds we send back a unique dring
4430 		 * identifier that the sending side will use in future
4431 		 * to refer to this descriptor ring.
4432 		 */
4433 		dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
4434 
4435 		dp->num_descriptors = dring_pkt->num_descriptors;
4436 		dp->descriptor_size = dring_pkt->descriptor_size;
4437 		dp->options = dring_pkt->options;
4438 		dp->ncookies = dring_pkt->ncookies;
4439 
4440 		/*
4441 		 * Note: should only get one cookie. Enforced in
4442 		 * the ldc layer.
4443 		 */
4444 		bcopy(&dring_pkt->cookie[0], &dp->cookie[0],
4445 			sizeof (ldc_mem_cookie_t));
4446 
4447 		D2(vswp, "%s: num_desc %ld : desc_size %ld", __func__,
4448 			dp->num_descriptors, dp->descriptor_size);
4449 		D2(vswp, "%s: options 0x%lx: ncookies %ld", __func__,
4450 			dp->options, dp->ncookies);
4451 
4452 		if ((ldc_mem_dring_map(ldcp->ldc_handle, &dp->cookie[0],
4453 			dp->ncookies, dp->num_descriptors,
4454 			dp->descriptor_size, LDC_SHADOW_MAP,
4455 			&(dp->handle))) != 0) {
4456 
4457 			DERR(vswp, "%s: dring_map failed\n", __func__);
4458 
4459 			kmem_free(dp, sizeof (dring_info_t));
4460 			vsw_free_lane_resources(ldcp, INBOUND);
4461 
4462 			dring_pkt->tag.vio_sid = ldcp->local_session;
4463 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
4464 
4465 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
4466 
4467 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
4468 			vsw_send_msg(ldcp, (void *)dring_pkt,
4469 				sizeof (vio_dring_reg_msg_t));
4470 
4471 			vsw_next_milestone(ldcp);
4472 			return;
4473 		}
4474 
4475 		if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
4476 
4477 			DERR(vswp, "%s: dring_addr failed\n", __func__);
4478 
4479 			kmem_free(dp, sizeof (dring_info_t));
4480 			vsw_free_lane_resources(ldcp, INBOUND);
4481 
4482 			dring_pkt->tag.vio_sid = ldcp->local_session;
4483 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
4484 
4485 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
4486 
4487 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
4488 			vsw_send_msg(ldcp, (void *)dring_pkt,
4489 				sizeof (vio_dring_reg_msg_t));
4490 
4491 			vsw_next_milestone(ldcp);
4492 			return;
4493 		} else {
4494 			/* store the address of the pub part of ring */
4495 			dp->pub_addr = minfo.vaddr;
4496 		}
4497 
4498 		/* no private section as we are importing */
4499 		dp->priv_addr = NULL;
4500 
4501 		/*
4502 		 * Using simple mono increasing int for ident at
4503 		 * the moment.
4504 		 */
4505 		dp->ident = ldcp->next_ident;
4506 		ldcp->next_ident++;
4507 
4508 		dp->end_idx = 0;
4509 		dp->next = NULL;
4510 
4511 		/*
4512 		 * Link it onto the end of the list of drings
4513 		 * for this lane.
4514 		 */
4515 		if (ldcp->lane_in.dringp == NULL) {
4516 			D2(vswp, "%s: adding first INBOUND dring", __func__);
4517 			ldcp->lane_in.dringp = dp;
4518 		} else {
4519 			dbp = ldcp->lane_in.dringp;
4520 
4521 			while (dbp->next != NULL)
4522 				dbp = dbp->next;
4523 
4524 			dbp->next = dp;
4525 		}
4526 
4527 		/* acknowledge it */
4528 		dring_pkt->tag.vio_sid = ldcp->local_session;
4529 		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
4530 		dring_pkt->dring_ident = dp->ident;
4531 
4532 		vsw_send_msg(ldcp, (void *)dring_pkt,
4533 				sizeof (vio_dring_reg_msg_t));
4534 
4535 		ldcp->lane_in.lstate |= VSW_DRING_ACK_SENT;
4536 		vsw_next_milestone(ldcp);
4537 		break;
4538 
4539 	case VIO_SUBTYPE_ACK:
4540 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
4541 
4542 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV))
4543 			return;
4544 
4545 		/*
4546 		 * Peer is acknowledging our dring info and will have
4547 		 * sent us a dring identifier which we will use to
4548 		 * refer to this ring w.r.t. our peer.
4549 		 */
4550 		dp = ldcp->lane_out.dringp;
4551 		if (dp != NULL) {
4552 			/*
4553 			 * Find the ring this ident should be associated
4554 			 * with.
4555 			 */
4556 			if (vsw_dring_match(dp, dring_pkt)) {
4557 				dring_found = 1;
4558 
4559 			} else while (dp != NULL) {
4560 				if (vsw_dring_match(dp, dring_pkt)) {
4561 					dring_found = 1;
4562 					break;
4563 				}
4564 				dp = dp->next;
4565 			}
4566 
4567 			if (dring_found == 0) {
4568 				DERR(NULL, "%s: unrecognised ring cookie",
4569 					__func__);
4570 				vsw_restart_handshake(ldcp);
4571 				return;
4572 			}
4573 
4574 		} else {
4575 			DERR(vswp, "%s: DRING ACK received but no drings "
4576 				"allocated", __func__);
4577 			vsw_restart_handshake(ldcp);
4578 			return;
4579 		}
4580 
4581 		/* store ident */
4582 		dp->ident = dring_pkt->dring_ident;
4583 		ldcp->lane_out.lstate |= VSW_DRING_ACK_RECV;
4584 		vsw_next_milestone(ldcp);
4585 		break;
4586 
4587 	case VIO_SUBTYPE_NACK:
4588 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
4589 
4590 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV))
4591 			return;
4592 
4593 		ldcp->lane_out.lstate |= VSW_DRING_NACK_RECV;
4594 		vsw_next_milestone(ldcp);
4595 		break;
4596 
4597 	default:
4598 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
4599 			dring_pkt->tag.vio_subtype);
4600 	}
4601 
4602 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
4603 }
4604 
4605 /*
4606  * Process a request from peer to unregister a dring.
4607  *
4608  * For the moment we just restart the handshake if our
4609  * peer endpoint attempts to unregister a dring.
4610  */
4611 void
4612 vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt)
4613 {
4614 	vsw_t			*vswp = ldcp->ldc_vswp;
4615 	vio_dring_unreg_msg_t	*dring_pkt;
4616 
4617 	/*
4618 	 * We know this is a ctrl/dring packet so
4619 	 * cast it into the correct structure.
4620 	 */
4621 	dring_pkt = (vio_dring_unreg_msg_t *)pkt;
4622 
4623 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4624 
4625 	switch (dring_pkt->tag.vio_subtype) {
4626 	case VIO_SUBTYPE_INFO:
4627 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
4628 
4629 		DWARN(vswp, "%s: restarting handshake..", __func__);
4630 		vsw_restart_handshake(ldcp);
4631 		break;
4632 
4633 	case VIO_SUBTYPE_ACK:
4634 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
4635 
4636 		DWARN(vswp, "%s: restarting handshake..", __func__);
4637 		vsw_restart_handshake(ldcp);
4638 		break;
4639 
4640 	case VIO_SUBTYPE_NACK:
4641 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
4642 
4643 		DWARN(vswp, "%s: restarting handshake..", __func__);
4644 		vsw_restart_handshake(ldcp);
4645 		break;
4646 
4647 	default:
4648 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
4649 			dring_pkt->tag.vio_subtype);
4650 		vsw_restart_handshake(ldcp);
4651 	}
4652 
4653 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4654 }
4655 
/*
 * Turn the multicast message 'pkt' into a NACK carrying our local session
 * id and send it back on channel 'ldcp'.
 *
 * Wrapped in do { } while (0) so that the macro expands to exactly one
 * statement and can safely be used as the body of an unbraced if/else;
 * callers supply the terminating semicolon. Each argument is evaluated
 * more than once, so it must be free of side effects.
 */
#define	SND_MCST_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		vsw_send_msg((ldcp), (void *)(pkt), \
		    sizeof (vnet_mcast_msg_t)); \
	_NOTE(CONSTCOND) } while (0)
4660 
4661 /*
4662  * Process a multicast request from a vnet.
4663  *
4664  * Vnet's specify a multicast address that they are interested in. This
4665  * address is used as a key into the hash table which forms the multicast
4666  * forwarding database (mFDB).
4667  *
4668  * The table keys are the multicast addresses, while the table entries
4669  * are pointers to lists of ports which wish to receive packets for the
4670  * specified multicast address.
4671  *
4672  * When a multicast packet is being switched we use the address as a key
4673  * into the hash table, and then walk the appropriate port list forwarding
4674  * the pkt to each port in turn.
4675  *
4676  * If a vnet is no longer interested in a particular multicast grouping
4677  * we simply find the correct location in the hash table and then delete
4678  * the relevant port from the port list.
4679  *
4680  * To deal with the case whereby a port is being deleted without first
4681  * removing itself from the lists in the hash table, we maintain a list
4682  * of multicast addresses the port has registered an interest in, within
4683  * the port structure itself. We then simply walk that list of addresses
4684  * using them as keys into the hash table and remove the port from the
4685  * appropriate lists.
4686  */
4687 static void
4688 vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt)
4689 {
4690 	vnet_mcast_msg_t	*mcst_pkt;
4691 	vsw_port_t		*port = ldcp->ldc_port;
4692 	vsw_t			*vswp = ldcp->ldc_vswp;
4693 	int			i;
4694 
4695 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4696 
4697 	/*
4698 	 * We know this is a ctrl/mcast packet so
4699 	 * cast it into the correct structure.
4700 	 */
4701 	mcst_pkt = (vnet_mcast_msg_t *)pkt;
4702 
4703 	switch (mcst_pkt->tag.vio_subtype) {
4704 	case VIO_SUBTYPE_INFO:
4705 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
4706 
4707 		/*
4708 		 * Check if in correct state to receive a multicast
4709 		 * message (i.e. handshake complete). If not reset
4710 		 * the handshake.
4711 		 */
4712 		if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV))
4713 			return;
4714 
4715 		/*
4716 		 * Before attempting to add or remove address check
4717 		 * that they are valid multicast addresses.
4718 		 * If not, then NACK back.
4719 		 */
4720 		for (i = 0; i < mcst_pkt->count; i++) {
4721 			if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) {
4722 				DERR(vswp, "%s: invalid multicast address",
4723 								__func__);
4724 				SND_MCST_NACK(ldcp, mcst_pkt);
4725 				return;
4726 			}
4727 		}
4728 
4729 		/*
4730 		 * Now add/remove the addresses. If this fails we
4731 		 * NACK back.
4732 		 */
4733 		if (vsw_add_rem_mcst(mcst_pkt, port) != 0) {
4734 			SND_MCST_NACK(ldcp, mcst_pkt);
4735 			return;
4736 		}
4737 
4738 		mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
4739 		mcst_pkt->tag.vio_sid = ldcp->local_session;
4740 
4741 		DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt);
4742 
4743 		vsw_send_msg(ldcp, (void *)mcst_pkt,
4744 					sizeof (vnet_mcast_msg_t));
4745 		break;
4746 
4747 	case VIO_SUBTYPE_ACK:
4748 		DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
4749 
4750 		/*
4751 		 * We shouldn't ever get a multicast ACK message as
4752 		 * at the moment we never request multicast addresses
4753 		 * to be set on some other device. This may change in
4754 		 * the future if we have cascading switches.
4755 		 */
4756 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV))
4757 			return;
4758 
4759 				/* Do nothing */
4760 		break;
4761 
4762 	case VIO_SUBTYPE_NACK:
4763 		DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
4764 
4765 		/*
4766 		 * We shouldn't get a multicast NACK packet for the
4767 		 * same reasons as we shouldn't get a ACK packet.
4768 		 */
4769 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV))
4770 			return;
4771 
4772 				/* Do nothing */
4773 		break;
4774 
4775 	default:
4776 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
4777 			mcst_pkt->tag.vio_subtype);
4778 	}
4779 
4780 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4781 }
4782 
4783 static void
4784 vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt)
4785 {
4786 	vio_rdx_msg_t	*rdx_pkt;
4787 	vsw_t		*vswp = ldcp->ldc_vswp;
4788 
4789 	/*
4790 	 * We know this is a ctrl/rdx packet so
4791 	 * cast it into the correct structure.
4792 	 */
4793 	rdx_pkt = (vio_rdx_msg_t *)pkt;
4794 
4795 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
4796 
4797 	switch (rdx_pkt->tag.vio_subtype) {
4798 	case VIO_SUBTYPE_INFO:
4799 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
4800 
4801 		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_INFO_RECV))
4802 			return;
4803 
4804 		rdx_pkt->tag.vio_sid = ldcp->local_session;
4805 		rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
4806 
4807 		DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt);
4808 
4809 		ldcp->lane_in.lstate |= VSW_RDX_ACK_SENT;
4810 
4811 		vsw_send_msg(ldcp, (void *)rdx_pkt,
4812 				sizeof (vio_rdx_msg_t));
4813 
4814 		vsw_next_milestone(ldcp);
4815 		break;
4816 
4817 	case VIO_SUBTYPE_ACK:
4818 		/*
4819 		 * Should be handled in-band by callback handler.
4820 		 */
4821 		DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__);
4822 		vsw_restart_handshake(ldcp);
4823 		break;
4824 
4825 	case VIO_SUBTYPE_NACK:
4826 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
4827 
4828 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_NACK_RECV))
4829 			return;
4830 
4831 		ldcp->lane_out.lstate |= VSW_RDX_NACK_RECV;
4832 		vsw_next_milestone(ldcp);
4833 		break;
4834 
4835 	default:
4836 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
4837 			rdx_pkt->tag.vio_subtype);
4838 	}
4839 
4840 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4841 }
4842 
4843 static void
4844 vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t tag)
4845 {
4846 	uint16_t	env = tag.vio_subtype_env;
4847 	vsw_t		*vswp = ldcp->ldc_vswp;
4848 
4849 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4850 
4851 	/* session id check */
4852 	if (ldcp->session_status & VSW_PEER_SESSION) {
4853 		if (ldcp->peer_session != tag.vio_sid) {
4854 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
4855 				__func__, ldcp->ldc_id, tag.vio_sid);
4856 			vsw_restart_handshake(ldcp);
4857 			return;
4858 		}
4859 	}
4860 
4861 	/*
4862 	 * It is an error for us to be getting data packets
4863 	 * before the handshake has completed.
4864 	 */
4865 	if (ldcp->hphase != VSW_MILESTONE4) {
4866 		DERR(vswp, "%s: got data packet before handshake complete "
4867 			"hphase %d (%x: %x)", __func__, ldcp->hphase,
4868 			ldcp->lane_in.lstate, ldcp->lane_out.lstate);
4869 		DUMP_FLAGS(ldcp->lane_in.lstate);
4870 		DUMP_FLAGS(ldcp->lane_out.lstate);
4871 		vsw_restart_handshake(ldcp);
4872 		return;
4873 	}
4874 
4875 	/*
4876 	 * Switch on vio_subtype envelope, then let lower routines
4877 	 * decide if its an INFO, ACK or NACK packet.
4878 	 */
4879 	if (env == VIO_DRING_DATA) {
4880 		vsw_process_data_dring_pkt(ldcp, dpkt);
4881 	} else if (env == VIO_PKT_DATA) {
4882 		vsw_process_data_raw_pkt(ldcp, dpkt);
4883 	} else if (env == VIO_DESC_DATA) {
4884 		vsw_process_data_ibnd_pkt(ldcp, dpkt);
4885 	} else {
4886 		DERR(vswp, "%s : unknown vio_subtype_env (%x)\n",
4887 							__func__, env);
4888 	}
4889 
4890 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4891 }
4892 
/*
 * Turn the dring data message 'pkt' into a NACK carrying our local
 * session id and send it back on channel 'ldcp'.
 *
 * Wrapped in do { } while (0) so that the macro expands to exactly one
 * statement and can safely be used as the body of an unbraced if/else;
 * callers supply the terminating semicolon. Each argument is evaluated
 * more than once, so it must be free of side effects.
 */
#define	SND_DRING_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		vsw_send_msg((ldcp), (void *)(pkt), \
		    sizeof (vio_dring_msg_t)); \
	_NOTE(CONSTCOND) } while (0)
4897 
4898 static void
4899 vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt)
4900 {
4901 	vio_dring_msg_t		*dring_pkt;
4902 	vnet_public_desc_t	*pub_addr = NULL;
4903 	vsw_private_desc_t	*priv_addr = NULL;
4904 	dring_info_t		*dp = NULL;
4905 	vsw_t			*vswp = ldcp->ldc_vswp;
4906 	mblk_t			*mp = NULL;
4907 	mblk_t			*bp = NULL;
4908 	mblk_t			*bpt = NULL;
4909 	size_t			nbytes = 0;
4910 	size_t			off = 0;
4911 	uint64_t		ncookies = 0;
4912 	uint64_t		chain = 0;
4913 	uint64_t		j, len;
4914 	uint32_t		pos, start, datalen;
4915 	uint32_t		range_start, range_end;
4916 	int32_t			end, num, cnt = 0;
4917 	int			i, rv;
4918 	boolean_t		ack_needed = B_FALSE;
4919 	boolean_t		prev_desc_ack = B_FALSE;
4920 	int			read_attempts = 0;
4921 
4922 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4923 
4924 	/*
4925 	 * We know this is a data/dring packet so
4926 	 * cast it into the correct structure.
4927 	 */
4928 	dring_pkt = (vio_dring_msg_t *)dpkt;
4929 
4930 	/*
4931 	 * Switch on the vio_subtype. If its INFO then we need to
4932 	 * process the data. If its an ACK we need to make sure
4933 	 * it makes sense (i.e did we send an earlier data/info),
4934 	 * and if its a NACK then we maybe attempt a retry.
4935 	 */
4936 	switch (dring_pkt->tag.vio_subtype) {
4937 	case VIO_SUBTYPE_INFO:
4938 		D2(vswp, "%s(%lld): VIO_SUBTYPE_INFO", __func__, ldcp->ldc_id);
4939 
4940 		if ((dp = vsw_ident2dring(&ldcp->lane_in,
4941 				dring_pkt->dring_ident)) == NULL) {
4942 
4943 			DERR(vswp, "%s(%lld): unable to find dring from "
4944 				"ident 0x%llx", __func__, ldcp->ldc_id,
4945 				dring_pkt->dring_ident);
4946 
4947 			SND_DRING_NACK(ldcp, dring_pkt);
4948 			return;
4949 		}
4950 
4951 		start = pos = dring_pkt->start_idx;
4952 		end = dring_pkt->end_idx;
4953 		len = dp->num_descriptors;
4954 
4955 		range_start = range_end = pos;
4956 
4957 		D2(vswp, "%s(%lld): start index %ld : end %ld\n",
4958 			__func__, ldcp->ldc_id, start, end);
4959 
4960 		if (end == -1) {
4961 			num = -1;
4962 		} else if (end >= 0) {
4963 			num = end >= pos ?
4964 				end - pos + 1: (len - pos + 1) + end;
4965 
4966 			/* basic sanity check */
4967 			if (end > len) {
4968 				DERR(vswp, "%s(%lld): endpoint %lld outside "
4969 					"ring length %lld", __func__,
4970 					ldcp->ldc_id, end, len);
4971 
4972 				SND_DRING_NACK(ldcp, dring_pkt);
4973 				return;
4974 			}
4975 		} else {
4976 			DERR(vswp, "%s(%lld): invalid endpoint %lld",
4977 				__func__, ldcp->ldc_id, end);
4978 			SND_DRING_NACK(ldcp, dring_pkt);
4979 			return;
4980 		}
4981 
4982 		while (cnt != num) {
4983 vsw_recheck_desc:
4984 			if ((rv = ldc_mem_dring_acquire(dp->handle,
4985 							pos, pos)) != 0) {
4986 				DERR(vswp, "%s(%lld): unable to acquire "
4987 					"descriptor at pos %d: err %d",
4988 					__func__, pos, ldcp->ldc_id, rv);
4989 				SND_DRING_NACK(ldcp, dring_pkt);
4990 				return;
4991 			}
4992 
4993 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + pos;
4994 
4995 			/*
4996 			 * When given a bounded range of descriptors
4997 			 * to process, its an error to hit a descriptor
4998 			 * which is not ready. In the non-bounded case
4999 			 * (end_idx == -1) this simply indicates we have
5000 			 * reached the end of the current active range.
5001 			 */
5002 			if (pub_addr->hdr.dstate != VIO_DESC_READY) {
5003 				/* unbound - no error */
5004 				if (end == -1) {
5005 					if (read_attempts == vsw_read_attempts)
5006 						break;
5007 
5008 					delay(drv_usectohz(vsw_desc_delay));
5009 					read_attempts++;
5010 					goto vsw_recheck_desc;
5011 				}
5012 
5013 				/* bounded - error - so NACK back */
5014 				DERR(vswp, "%s(%lld): descriptor not READY "
5015 					"(%d)", __func__, ldcp->ldc_id,
5016 					pub_addr->hdr.dstate);
5017 				SND_DRING_NACK(ldcp, dring_pkt);
5018 				return;
5019 			}
5020 
5021 			DTRACE_PROBE1(read_attempts, int, read_attempts);
5022 
5023 			range_end = pos;
5024 
5025 			/*
5026 			 * If we ACK'd the previous descriptor then now
5027 			 * record the new range start position for later
5028 			 * ACK's.
5029 			 */
5030 			if (prev_desc_ack) {
5031 				range_start = pos;
5032 
5033 				D2(vswp, "%s(%lld): updating range start "
5034 					"to be %d", __func__, ldcp->ldc_id,
5035 					range_start);
5036 
5037 				prev_desc_ack = B_FALSE;
5038 			}
5039 
5040 			/*
5041 			 * Data is padded to align on 8 byte boundary,
5042 			 * datalen is actual data length, i.e. minus that
5043 			 * padding.
5044 			 */
5045 			datalen = pub_addr->nbytes;
5046 
5047 			/*
5048 			 * Does peer wish us to ACK when we have finished
5049 			 * with this descriptor ?
5050 			 */
5051 			if (pub_addr->hdr.ack)
5052 				ack_needed = B_TRUE;
5053 
5054 			D2(vswp, "%s(%lld): processing desc %lld at pos"
5055 				" 0x%llx : dstate 0x%lx : datalen 0x%lx",
5056 				__func__, ldcp->ldc_id, pos, pub_addr,
5057 				pub_addr->hdr.dstate, datalen);
5058 
5059 			/*
5060 			 * Mark that we are starting to process descriptor.
5061 			 */
5062 			pub_addr->hdr.dstate = VIO_DESC_ACCEPTED;
5063 
5064 			mp = vio_allocb(ldcp->rxh);
5065 			if (mp == NULL) {
5066 				/*
5067 				 * No free receive buffers available, so
5068 				 * fallback onto allocb(9F). Make sure that
5069 				 * we get a data buffer which is a multiple
5070 				 * of 8 as this is required by ldc_mem_copy.
5071 				 */
5072 				DTRACE_PROBE(allocb);
5073 				mp = allocb(datalen + VNET_IPALIGN + 8,
5074 								BPRI_MED);
5075 			}
5076 
5077 			/*
5078 			 * Ensure that we ask ldc for an aligned
5079 			 * number of bytes.
5080 			 */
5081 			nbytes = datalen + VNET_IPALIGN;
5082 			if (nbytes & 0x7) {
5083 				off = 8 - (nbytes & 0x7);
5084 				nbytes += off;
5085 			}
5086 
5087 			ncookies = pub_addr->ncookies;
5088 			rv = ldc_mem_copy(ldcp->ldc_handle,
5089 				(caddr_t)mp->b_rptr, 0, &nbytes,
5090 				pub_addr->memcookie, ncookies,
5091 				LDC_COPY_IN);
5092 
5093 			if (rv != 0) {
5094 				DERR(vswp, "%s(%d): unable to copy in "
5095 					"data from %d cookies in desc %d"
5096 					" (rv %d)", __func__, ldcp->ldc_id,
5097 					ncookies, pos, rv);
5098 				freemsg(mp);
5099 
5100 				pub_addr->hdr.dstate = VIO_DESC_DONE;
5101 				(void) ldc_mem_dring_release(dp->handle,
5102 								pos, pos);
5103 				break;
5104 			} else {
5105 				D2(vswp, "%s(%d): copied in %ld bytes"
5106 					" using %d cookies", __func__,
5107 					ldcp->ldc_id, nbytes, ncookies);
5108 			}
5109 
5110 			/* adjust the read pointer to skip over the padding */
5111 			mp->b_rptr += VNET_IPALIGN;
5112 
5113 			/* point to the actual end of data */
5114 			mp->b_wptr = mp->b_rptr + datalen;
5115 
5116 			/* build a chain of received packets */
5117 			if (bp == NULL) {
5118 				/* first pkt */
5119 				bp = mp;
5120 				bp->b_next = bp->b_prev = NULL;
5121 				bpt = bp;
5122 				chain = 1;
5123 			} else {
5124 				mp->b_next = NULL;
5125 				mp->b_prev = bpt;
5126 				bpt->b_next = mp;
5127 				bpt = mp;
5128 				chain++;
5129 			}
5130 
5131 			/* mark we are finished with this descriptor */
5132 			pub_addr->hdr.dstate = VIO_DESC_DONE;
5133 
5134 			(void) ldc_mem_dring_release(dp->handle, pos, pos);
5135 
5136 			/*
5137 			 * Send an ACK back to peer if requested.
5138 			 */
5139 			if (ack_needed) {
5140 				ack_needed = B_FALSE;
5141 
5142 				dring_pkt->start_idx = range_start;
5143 				dring_pkt->end_idx = range_end;
5144 
5145 				DERR(vswp, "%s(%lld): processed %d %d, ACK"
5146 					" requested", __func__, ldcp->ldc_id,
5147 					dring_pkt->start_idx,
5148 					dring_pkt->end_idx);
5149 
5150 				dring_pkt->dring_process_state = VIO_DP_ACTIVE;
5151 				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
5152 				dring_pkt->tag.vio_sid = ldcp->local_session;
5153 				vsw_send_msg(ldcp, (void *)dring_pkt,
5154 					sizeof (vio_dring_msg_t));
5155 
5156 				prev_desc_ack = B_TRUE;
5157 				range_start = pos;
5158 			}
5159 
5160 			/* next descriptor */
5161 			pos = (pos + 1) % len;
5162 			cnt++;
5163 
5164 			/*
5165 			 * Break out of loop here and stop processing to
5166 			 * allow some other network device (or disk) to
5167 			 * get access to the cpu.
5168 			 */
5169 			/* send the chain of packets to be switched */
5170 			if (chain > vsw_chain_len) {
5171 				D3(vswp, "%s(%lld): switching chain of %d "
5172 					"msgs", __func__, ldcp->ldc_id, chain);
5173 				vsw_switch_frame(vswp, bp, VSW_VNETPORT,
5174 							ldcp->ldc_port, NULL);
5175 				bp = NULL;
5176 				break;
5177 			}
5178 		}
5179 
5180 		/* send the chain of packets to be switched */
5181 		if (bp != NULL) {
5182 			D3(vswp, "%s(%lld): switching chain of %d msgs",
5183 					__func__, ldcp->ldc_id, chain);
5184 			vsw_switch_frame(vswp, bp, VSW_VNETPORT,
5185 							ldcp->ldc_port, NULL);
5186 		}
5187 
5188 		DTRACE_PROBE1(msg_cnt, int, cnt);
5189 
5190 		/*
5191 		 * We are now finished so ACK back with the state
5192 		 * set to STOPPING so our peer knows we are finished
5193 		 */
5194 		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
5195 		dring_pkt->tag.vio_sid = ldcp->local_session;
5196 
5197 		dring_pkt->dring_process_state = VIO_DP_STOPPED;
5198 
5199 		DTRACE_PROBE(stop_process_sent);
5200 
5201 		/*
5202 		 * We have not processed any more descriptors beyond
5203 		 * the last one we ACK'd.
5204 		 */
5205 		if (prev_desc_ack)
5206 			range_start = range_end;
5207 
5208 		dring_pkt->start_idx = range_start;
5209 		dring_pkt->end_idx = range_end;
5210 
5211 		D2(vswp, "%s(%lld) processed : %d : %d, now stopping",
5212 			__func__, ldcp->ldc_id, dring_pkt->start_idx,
5213 			dring_pkt->end_idx);
5214 
5215 		vsw_send_msg(ldcp, (void *)dring_pkt,
5216 					sizeof (vio_dring_msg_t));
5217 		break;
5218 
5219 	case VIO_SUBTYPE_ACK:
5220 		D2(vswp, "%s(%lld): VIO_SUBTYPE_ACK", __func__, ldcp->ldc_id);
5221 		/*
5222 		 * Verify that the relevant descriptors are all
5223 		 * marked as DONE
5224 		 */
5225 		if ((dp = vsw_ident2dring(&ldcp->lane_out,
5226 			dring_pkt->dring_ident)) == NULL) {
5227 			DERR(vswp, "%s: unknown ident in ACK", __func__);
5228 			return;
5229 		}
5230 
5231 		pub_addr = (vnet_public_desc_t *)dp->pub_addr;
5232 		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
5233 
5234 		start = end = 0;
5235 		start = dring_pkt->start_idx;
5236 		end = dring_pkt->end_idx;
5237 		len = dp->num_descriptors;
5238 
5239 		j = num = 0;
5240 		/* calculate # descriptors taking into a/c wrap around */
5241 		num = end >= start ? end - start + 1: (len - start + 1) + end;
5242 
5243 		D2(vswp, "%s(%lld): start index %ld : end %ld : num %ld\n",
5244 			__func__, ldcp->ldc_id, start, end, num);
5245 
5246 		mutex_enter(&dp->dlock);
5247 		dp->last_ack_recv = end;
5248 		mutex_exit(&dp->dlock);
5249 
5250 		for (i = start; j < num; i = (i + 1) % len, j++) {
5251 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
5252 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
5253 
5254 			/*
5255 			 * If the last descriptor in a range has the ACK
5256 			 * bit set then we will get two messages from our
5257 			 * peer relating to it. The normal ACK msg and then
5258 			 * a subsequent STOP msg. The first message will have
5259 			 * resulted in the descriptor being reclaimed and
5260 			 * its state set to FREE so when we encounter a non
5261 			 * DONE descriptor we need to check to see if its
5262 			 * because we have just reclaimed it.
5263 			 */
5264 			mutex_enter(&priv_addr->dstate_lock);
5265 			if (pub_addr->hdr.dstate == VIO_DESC_DONE) {
5266 				/* clear all the fields */
5267 				bzero(priv_addr->datap, priv_addr->datalen);
5268 				priv_addr->datalen = 0;
5269 
5270 				pub_addr->hdr.dstate = VIO_DESC_FREE;
5271 				pub_addr->hdr.ack = 0;
5272 
5273 				priv_addr->dstate = VIO_DESC_FREE;
5274 				mutex_exit(&priv_addr->dstate_lock);
5275 
5276 				D3(vswp, "clearing descp %d : pub state "
5277 					"0x%llx : priv state 0x%llx", i,
5278 					pub_addr->hdr.dstate,
5279 					priv_addr->dstate);
5280 
5281 			} else {
5282 				mutex_exit(&priv_addr->dstate_lock);
5283 
5284 				if (dring_pkt->dring_process_state !=
5285 							VIO_DP_STOPPED) {
5286 					DERR(vswp, "%s: descriptor %lld at pos "
5287 						" 0x%llx not DONE (0x%lx)\n",
5288 						__func__, i, pub_addr,
5289 						pub_addr->hdr.dstate);
5290 					return;
5291 				}
5292 			}
5293 		}
5294 
5295 		/*
5296 		 * If our peer is stopping processing descriptors then
5297 		 * we check to make sure it has processed all the descriptors
5298 		 * we have updated. If not then we send it a new message
5299 		 * to prompt it to restart.
5300 		 */
5301 		if (dring_pkt->dring_process_state == VIO_DP_STOPPED) {
5302 			DTRACE_PROBE(stop_process_recv);
5303 			D2(vswp, "%s(%lld): got stopping msg : %d : %d",
5304 				__func__, ldcp->ldc_id, dring_pkt->start_idx,
5305 				dring_pkt->end_idx);
5306 
5307 			/*
5308 			 * Check next descriptor in public section of ring.
5309 			 * If its marked as READY then we need to prompt our
5310 			 * peer to start processing the ring again.
5311 			 */
5312 			i = (end + 1) % len;
5313 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
5314 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
5315 
5316 			/*
5317 			 * Hold the restart lock across all of this to
5318 			 * make sure that its not possible for us to
5319 			 * decide that a msg needs to be sent in the future
5320 			 * but the sending code having already checked is
5321 			 * about to exit.
5322 			 */
5323 			mutex_enter(&dp->restart_lock);
5324 			mutex_enter(&priv_addr->dstate_lock);
5325 			if (pub_addr->hdr.dstate == VIO_DESC_READY) {
5326 
5327 				mutex_exit(&priv_addr->dstate_lock);
5328 
5329 				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
5330 				dring_pkt->tag.vio_sid = ldcp->local_session;
5331 
5332 				mutex_enter(&ldcp->lane_out.seq_lock);
5333 				dring_pkt->seq_num = ldcp->lane_out.seq_num++;
5334 				mutex_exit(&ldcp->lane_out.seq_lock);
5335 
5336 				dring_pkt->start_idx = (end + 1) % len;
5337 				dring_pkt->end_idx = -1;
5338 
5339 				D2(vswp, "%s(%lld) : sending restart msg:"
5340 					" %d : %d", __func__, ldcp->ldc_id,
5341 					dring_pkt->start_idx,
5342 					dring_pkt->end_idx);
5343 
5344 				vsw_send_msg(ldcp, (void *)dring_pkt,
5345 						sizeof (vio_dring_msg_t));
5346 			} else {
5347 				mutex_exit(&priv_addr->dstate_lock);
5348 				dp->restart_reqd = B_TRUE;
5349 			}
5350 			mutex_exit(&dp->restart_lock);
5351 		}
5352 		break;
5353 
5354 	case VIO_SUBTYPE_NACK:
5355 		DWARN(vswp, "%s(%lld): VIO_SUBTYPE_NACK",
5356 						__func__, ldcp->ldc_id);
5357 		/*
5358 		 * Something is badly wrong if we are getting NACK's
5359 		 * for our data pkts. So reset the channel.
5360 		 */
5361 		vsw_restart_handshake(ldcp);
5362 
5363 		break;
5364 
5365 	default:
5366 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
5367 			ldcp->ldc_id, dring_pkt->tag.vio_subtype);
5368 	}
5369 
5370 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
5371 }
5372 
5373 /*
5374  * VIO_PKT_DATA (a.k.a raw data mode )
5375  *
5376  * Note - currently not supported. Do nothing.
5377  */
5378 static void
5379 vsw_process_data_raw_pkt(vsw_ldc_t *ldcp, void *dpkt)
5380 {
5381 	_NOTE(ARGUNUSED(dpkt))
5382 
5383 	D1(NULL, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
5384 
5385 	DERR(NULL, "%s (%lld): currently  not supported",
5386 						__func__, ldcp->ldc_id);
5387 
5388 	D1(NULL, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
5389 }
5390 
/*
 * Turn the in-band descriptor message 'pkt' into a NACK carrying our
 * local session id and send it back on channel 'ldcp'.
 *
 * Wrapped in do { } while (0) so that the macro expands to exactly one
 * statement and can safely be used as the body of an unbraced if/else;
 * callers supply the terminating semicolon. Each argument is evaluated
 * more than once, so it must be free of side effects.
 */
#define	SND_IBND_DESC_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		vsw_send_msg((ldcp), (void *)(pkt), \
		    sizeof (vio_ibnd_desc_t)); \
	_NOTE(CONSTCOND) } while (0)
5395 
5396 /*
5397  * Process an in-band descriptor message (most likely from
5398  * OBP).
5399  */
5400 static void
5401 vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt)
5402 {
5403 	vio_ibnd_desc_t		*ibnd_desc;
5404 	dring_info_t		*dp = NULL;
5405 	vsw_private_desc_t	*priv_addr = NULL;
5406 	vsw_t			*vswp = ldcp->ldc_vswp;
5407 	mblk_t			*mp = NULL;
5408 	size_t			nbytes = 0;
5409 	size_t			off = 0;
5410 	uint64_t		idx = 0;
5411 	uint32_t		num = 1, len, datalen = 0;
5412 	uint64_t		ncookies = 0;
5413 	int			i, rv;
5414 	int			j = 0;
5415 
5416 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
5417 
5418 	ibnd_desc = (vio_ibnd_desc_t *)pkt;
5419 
5420 	switch (ibnd_desc->hdr.tag.vio_subtype) {
5421 	case VIO_SUBTYPE_INFO:
5422 		D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
5423 
5424 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
5425 			return;
5426 
5427 		/*
5428 		 * Data is padded to align on a 8 byte boundary,
5429 		 * nbytes is actual data length, i.e. minus that
5430 		 * padding.
5431 		 */
5432 		datalen = ibnd_desc->nbytes;
5433 
5434 		D2(vswp, "%s(%lld): processing inband desc : "
5435 			": datalen 0x%lx", __func__, ldcp->ldc_id, datalen);
5436 
5437 		ncookies = ibnd_desc->ncookies;
5438 
5439 		/*
5440 		 * allocb(9F) returns an aligned data block. We
5441 		 * need to ensure that we ask ldc for an aligned
5442 		 * number of bytes also.
5443 		 */
5444 		nbytes = datalen;
5445 		if (nbytes & 0x7) {
5446 			off = 8 - (nbytes & 0x7);
5447 			nbytes += off;
5448 		}
5449 
5450 		mp = allocb(datalen, BPRI_MED);
5451 		if (mp == NULL) {
5452 			DERR(vswp, "%s(%lld): allocb failed",
5453 					__func__, ldcp->ldc_id);
5454 			return;
5455 		}
5456 
5457 		rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr,
5458 			0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies,
5459 			LDC_COPY_IN);
5460 
5461 		if (rv != 0) {
5462 			DERR(vswp, "%s(%d): unable to copy in data from "
5463 				"%d cookie(s)", __func__,
5464 				ldcp->ldc_id, ncookies);
5465 			freemsg(mp);
5466 			return;
5467 		} else {
5468 			D2(vswp, "%s(%d): copied in %ld bytes using %d "
5469 				"cookies", __func__, ldcp->ldc_id, nbytes,
5470 				ncookies);
5471 		}
5472 
5473 		/* point to the actual end of data */
5474 		mp->b_wptr = mp->b_rptr + datalen;
5475 
5476 		/*
5477 		 * We ACK back every in-band descriptor message we process
5478 		 */
5479 		ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK;
5480 		ibnd_desc->hdr.tag.vio_sid = ldcp->local_session;
5481 		vsw_send_msg(ldcp, (void *)ibnd_desc,
5482 				sizeof (vio_ibnd_desc_t));
5483 
5484 		/* send the packet to be switched */
5485 		vsw_switch_frame(vswp, mp, VSW_VNETPORT,
5486 					ldcp->ldc_port, NULL);
5487 
5488 		break;
5489 
5490 	case VIO_SUBTYPE_ACK:
5491 		D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
5492 
5493 		/* Verify the ACK is valid */
5494 		idx = ibnd_desc->hdr.desc_handle;
5495 
5496 		if (idx >= VSW_RING_NUM_EL) {
5497 			cmn_err(CE_WARN, "%s: corrupted ACK received "
5498 				"(idx %ld)", __func__, idx);
5499 			return;
5500 		}
5501 
5502 		if ((dp = ldcp->lane_out.dringp) == NULL) {
5503 			DERR(vswp, "%s: no dring found", __func__);
5504 			return;
5505 		}
5506 
5507 		len = dp->num_descriptors;
5508 		/*
5509 		 * If the descriptor we are being ACK'ed for is not the
5510 		 * one we expected, then pkts were lost somwhere, either
5511 		 * when we tried to send a msg, or a previous ACK msg from
5512 		 * our peer. In either case we now reclaim the descriptors
5513 		 * in the range from the last ACK we received up to the
5514 		 * current ACK.
5515 		 */
5516 		if (idx != dp->last_ack_recv) {
5517 			DWARN(vswp, "%s: dropped pkts detected, (%ld, %ld)",
5518 				__func__, dp->last_ack_recv, idx);
5519 			num = idx >= dp->last_ack_recv ?
5520 				idx - dp->last_ack_recv + 1:
5521 				(len - dp->last_ack_recv + 1) + idx;
5522 		}
5523 
5524 		/*
5525 		 * When we sent the in-band message to our peer we
5526 		 * marked the copy in our private ring as READY. We now
5527 		 * check that the descriptor we are being ACK'ed for is in
5528 		 * fact READY, i.e. it is one we have shared with our peer.
5529 		 *
5530 		 * If its not we flag an error, but still reset the descr
5531 		 * back to FREE.
5532 		 */
5533 		for (i = dp->last_ack_recv; j < num; i = (i + 1) % len, j++) {
5534 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
5535 			mutex_enter(&priv_addr->dstate_lock);
5536 			if (priv_addr->dstate != VIO_DESC_READY) {
5537 				DERR(vswp, "%s: (%ld) desc at index %ld not "
5538 					"READY (0x%lx)", __func__,
5539 					ldcp->ldc_id, idx, priv_addr->dstate);
5540 				DERR(vswp, "%s: bound %d: ncookies %ld : "
5541 					"datalen %ld", __func__,
5542 					priv_addr->bound, priv_addr->ncookies,
5543 					priv_addr->datalen);
5544 			}
5545 			D2(vswp, "%s: (%lld) freeing descp at %lld", __func__,
5546 				ldcp->ldc_id, idx);
5547 			/* release resources associated with sent msg */
5548 			bzero(priv_addr->datap, priv_addr->datalen);
5549 			priv_addr->datalen = 0;
5550 			priv_addr->dstate = VIO_DESC_FREE;
5551 			mutex_exit(&priv_addr->dstate_lock);
5552 		}
5553 		/* update to next expected value */
5554 		dp->last_ack_recv = (idx + 1) % dp->num_descriptors;
5555 
5556 		break;
5557 
5558 	case VIO_SUBTYPE_NACK:
5559 		DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
5560 
5561 		/*
5562 		 * We should only get a NACK if our peer doesn't like
5563 		 * something about a message we have sent it. If this
5564 		 * happens we just release the resources associated with
5565 		 * the message. (We are relying on higher layers to decide
5566 		 * whether or not to resend.
5567 		 */
5568 
5569 		/* limit check */
5570 		idx = ibnd_desc->hdr.desc_handle;
5571 
5572 		if (idx >= VSW_RING_NUM_EL) {
5573 			DERR(vswp, "%s: corrupted NACK received (idx %lld)",
5574 				__func__, idx);
5575 			return;
5576 		}
5577 
5578 		if ((dp = ldcp->lane_out.dringp) == NULL) {
5579 			DERR(vswp, "%s: no dring found", __func__);
5580 			return;
5581 		}
5582 
5583 		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
5584 
5585 		/* move to correct location in ring */
5586 		priv_addr += idx;
5587 
5588 		/* release resources associated with sent msg */
5589 		mutex_enter(&priv_addr->dstate_lock);
5590 		bzero(priv_addr->datap, priv_addr->datalen);
5591 		priv_addr->datalen = 0;
5592 		priv_addr->dstate = VIO_DESC_FREE;
5593 		mutex_exit(&priv_addr->dstate_lock);
5594 
5595 		break;
5596 
5597 	default:
5598 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
5599 			ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype);
5600 	}
5601 
5602 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
5603 }
5604 
5605 static void
5606 vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t tag)
5607 {
5608 	_NOTE(ARGUNUSED(epkt))
5609 
5610 	vsw_t		*vswp = ldcp->ldc_vswp;
5611 	uint16_t	env = tag.vio_subtype_env;
5612 
5613 	D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
5614 
5615 	/*
5616 	 * Error vio_subtypes have yet to be defined. So for
5617 	 * the moment we can't do anything.
5618 	 */
5619 	D2(vswp, "%s: (%x) vio_subtype env", __func__, env);
5620 
5621 	D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
5622 }
5623 
5624 /*
5625  * Switch the given ethernet frame when operating in layer 2 mode.
5626  *
5627  * vswp: pointer to the vsw instance
5628  * mp: pointer to chain of ethernet frame(s) to be switched
5629  * caller: identifies the source of this frame as:
5630  * 		1. VSW_VNETPORT - a vsw port (connected to a vnet).
5631  *		2. VSW_PHYSDEV - the physical ethernet device
5632  *		3. VSW_LOCALDEV - vsw configured as a virtual interface
5633  * arg: argument provided by the caller.
5634  *		1. for VNETPORT - pointer to the corresponding vsw_port_t.
5635  *		2. for PHYSDEV - NULL
5636  *		3. for LOCALDEV - pointer to to this vsw_t(self)
5637  */
5638 void
5639 vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
5640 			vsw_port_t *arg, mac_resource_handle_t mrh)
5641 {
5642 	struct ether_header	*ehp;
5643 	vsw_port_t		*port = NULL;
5644 	mblk_t			*bp, *ret_m;
5645 	mblk_t			*nmp = NULL;
5646 	vsw_port_list_t		*plist = &vswp->plist;
5647 
5648 	D1(vswp, "%s: enter (caller %d)", __func__, caller);
5649 
5650 	/*
5651 	 * PERF: rather than breaking up the chain here, scan it
5652 	 * to find all mblks heading to same destination and then
5653 	 * pass that sub-chain to the lower transmit functions.
5654 	 */
5655 
5656 	/* process the chain of packets */
5657 	bp = mp;
5658 	while (bp) {
5659 		mp = bp;
5660 		bp = bp->b_next;
5661 		mp->b_next = mp->b_prev = NULL;
5662 		ehp = (struct ether_header *)mp->b_rptr;
5663 
5664 		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
5665 			__func__, MBLKSIZE(mp), MBLKL(mp));
5666 
5667 		READ_ENTER(&vswp->if_lockrw);
5668 		if (ether_cmp(&ehp->ether_dhost, &vswp->if_addr) == 0) {
5669 			/*
5670 			 * If destination is VSW_LOCALDEV (vsw as an eth
5671 			 * interface) and if the device is up & running,
5672 			 * send the packet up the stack on this host.
5673 			 * If the virtual interface is down, drop the packet.
5674 			 */
5675 			if (caller != VSW_LOCALDEV) {
5676 				if (vswp->if_state & VSW_IF_UP) {
5677 					RW_EXIT(&vswp->if_lockrw);
5678 					mac_rx(vswp->if_mh, mrh, mp);
5679 				} else {
5680 					RW_EXIT(&vswp->if_lockrw);
5681 					/* Interface down, drop pkt */
5682 					freemsg(mp);
5683 				}
5684 			} else {
5685 				RW_EXIT(&vswp->if_lockrw);
5686 				freemsg(mp);
5687 			}
5688 			continue;
5689 		}
5690 		RW_EXIT(&vswp->if_lockrw);
5691 
5692 		READ_ENTER(&plist->lockrw);
5693 		port = vsw_lookup_fdb(vswp, ehp);
5694 		if (port) {
5695 			/*
5696 			 * Mark the port as in-use.
5697 			 */
5698 			mutex_enter(&port->ref_lock);
5699 			port->ref_cnt++;
5700 			mutex_exit(&port->ref_lock);
5701 			RW_EXIT(&plist->lockrw);
5702 
5703 			/*
5704 			 * If plumbed and in promisc mode then copy msg
5705 			 * and send up the stack.
5706 			 */
5707 			READ_ENTER(&vswp->if_lockrw);
5708 			if (VSW_U_P(vswp->if_state)) {
5709 				RW_EXIT(&vswp->if_lockrw);
5710 				nmp = copymsg(mp);
5711 				if (nmp)
5712 					mac_rx(vswp->if_mh, mrh, nmp);
5713 			} else {
5714 				RW_EXIT(&vswp->if_lockrw);
5715 			}
5716 
5717 			/*
5718 			 * If the destination is in FDB, the packet
5719 			 * should be forwarded to the correponding
5720 			 * vsw_port (connected to a vnet device -
5721 			 * VSW_VNETPORT)
5722 			 */
5723 			(void) vsw_portsend(port, mp);
5724 
5725 			/*
5726 			 * Decrement use count in port and check if
5727 			 * should wake delete thread.
5728 			 */
5729 			mutex_enter(&port->ref_lock);
5730 			port->ref_cnt--;
5731 			if (port->ref_cnt == 0)
5732 				cv_signal(&port->ref_cv);
5733 			mutex_exit(&port->ref_lock);
5734 		} else {
5735 			RW_EXIT(&plist->lockrw);
5736 			/*
5737 			 * Destination not in FDB.
5738 			 *
5739 			 * If the destination is broadcast or
5740 			 * multicast forward the packet to all
5741 			 * (VNETPORTs, PHYSDEV, LOCALDEV),
5742 			 * except the caller.
5743 			 */
5744 			if (IS_BROADCAST(ehp)) {
5745 				D3(vswp, "%s: BROADCAST pkt", __func__);
5746 				(void) vsw_forward_all(vswp, mp,
5747 								caller, arg);
5748 			} else if (IS_MULTICAST(ehp)) {
5749 				D3(vswp, "%s: MULTICAST pkt", __func__);
5750 				(void) vsw_forward_grp(vswp, mp,
5751 							caller, arg);
5752 			} else {
5753 				/*
5754 				 * If the destination is unicast, and came
5755 				 * from either a logical network device or
5756 				 * the switch itself when it is plumbed, then
5757 				 * send it out on the physical device and also
5758 				 * up the stack if the logical interface is
5759 				 * in promiscious mode.
5760 				 *
5761 				 * NOTE:  The assumption here is that if we
5762 				 * cannot find the destination in our fdb, its
5763 				 * a unicast address, and came from either a
5764 				 * vnet or down the stack (when plumbed) it
5765 				 * must be destinded for an ethernet device
5766 				 * outside our ldoms.
5767 				 */
5768 				if (caller == VSW_VNETPORT) {
5769 					READ_ENTER(&vswp->if_lockrw);
5770 					if (VSW_U_P(vswp->if_state)) {
5771 						RW_EXIT(&vswp->if_lockrw);
5772 						nmp = copymsg(mp);
5773 						if (nmp)
5774 							mac_rx(vswp->if_mh,
5775 								mrh, nmp);
5776 					} else {
5777 						RW_EXIT(&vswp->if_lockrw);
5778 					}
5779 					if ((ret_m = vsw_tx_msg(vswp, mp))
5780 								!= NULL) {
5781 						DERR(vswp, "%s: drop mblks to "
5782 							"phys dev", __func__);
5783 						freemsg(ret_m);
5784 					}
5785 
5786 				} else if (caller == VSW_PHYSDEV) {
5787 					/*
5788 					 * Pkt seen because card in promisc
5789 					 * mode. Send up stack if plumbed in
5790 					 * promisc mode, else drop it.
5791 					 */
5792 					READ_ENTER(&vswp->if_lockrw);
5793 					if (VSW_U_P(vswp->if_state)) {
5794 						RW_EXIT(&vswp->if_lockrw);
5795 						mac_rx(vswp->if_mh, mrh, mp);
5796 					} else {
5797 						RW_EXIT(&vswp->if_lockrw);
5798 						freemsg(mp);
5799 					}
5800 
5801 				} else if (caller == VSW_LOCALDEV) {
5802 					/*
5803 					 * Pkt came down the stack, send out
5804 					 * over physical device.
5805 					 */
5806 					if ((ret_m = vsw_tx_msg(vswp, mp))
5807 								!= NULL) {
5808 						DERR(vswp, "%s: drop mblks to "
5809 							"phys dev", __func__);
5810 						freemsg(ret_m);
5811 					}
5812 				}
5813 			}
5814 		}
5815 	}
5816 	D1(vswp, "%s: exit\n", __func__);
5817 }
5818 
5819 /*
5820  * Switch ethernet frame when in layer 3 mode (i.e. using IP
5821  * layer to do the routing).
5822  *
5823  * There is a large amount of overlap between this function and
5824  * vsw_switch_l2_frame. At some stage we need to revisit and refactor
5825  * both these functions.
5826  */
5827 void
5828 vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
5829 			vsw_port_t *arg, mac_resource_handle_t mrh)
5830 {
5831 	struct ether_header	*ehp;
5832 	vsw_port_t		*port = NULL;
5833 	mblk_t			*bp = NULL;
5834 	vsw_port_list_t		*plist = &vswp->plist;
5835 
5836 	D1(vswp, "%s: enter (caller %d)", __func__, caller);
5837 
5838 	/*
5839 	 * In layer 3 mode should only ever be switching packets
5840 	 * between IP layer and vnet devices. So make sure thats
5841 	 * who is invoking us.
5842 	 */
5843 	if ((caller != VSW_LOCALDEV) && (caller != VSW_VNETPORT)) {
5844 		DERR(vswp, "%s: unexpected caller (%d)", __func__, caller);
5845 		freemsgchain(mp);
5846 		return;
5847 	}
5848 
5849 	/* process the chain of packets */
5850 	bp = mp;
5851 	while (bp) {
5852 		mp = bp;
5853 		bp = bp->b_next;
5854 		mp->b_next = mp->b_prev = NULL;
5855 		ehp = (struct ether_header *)mp->b_rptr;
5856 
5857 		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
5858 			__func__, MBLKSIZE(mp), MBLKL(mp));
5859 
5860 		READ_ENTER(&plist->lockrw);
5861 		port = vsw_lookup_fdb(vswp, ehp);
5862 		if (port) {
5863 			/*
5864 			 * Mark port as in-use.
5865 			 */
5866 			mutex_enter(&port->ref_lock);
5867 			port->ref_cnt++;
5868 			mutex_exit(&port->ref_lock);
5869 			RW_EXIT(&plist->lockrw);
5870 
5871 			D2(vswp, "%s: sending to target port", __func__);
5872 			(void) vsw_portsend(port, mp);
5873 
5874 			/*
5875 			 * Finished with port so decrement ref count and
5876 			 * check if should wake delete thread.
5877 			 */
5878 			mutex_enter(&port->ref_lock);
5879 			port->ref_cnt--;
5880 			if (port->ref_cnt == 0)
5881 				cv_signal(&port->ref_cv);
5882 			mutex_exit(&port->ref_lock);
5883 		} else {
5884 			RW_EXIT(&plist->lockrw);
5885 			/*
5886 			 * Destination not in FDB
5887 			 *
5888 			 * If the destination is broadcast or
5889 			 * multicast forward the packet to all
5890 			 * (VNETPORTs, PHYSDEV, LOCALDEV),
5891 			 * except the caller.
5892 			 */
5893 			if (IS_BROADCAST(ehp)) {
5894 				D2(vswp, "%s: BROADCAST pkt", __func__);
5895 				(void) vsw_forward_all(vswp, mp,
5896 								caller, arg);
5897 			} else if (IS_MULTICAST(ehp)) {
5898 				D2(vswp, "%s: MULTICAST pkt", __func__);
5899 				(void) vsw_forward_grp(vswp, mp,
5900 							caller, arg);
5901 			} else {
5902 				/*
5903 				 * Unicast pkt from vnet that we don't have
5904 				 * an FDB entry for, so must be destinded for
5905 				 * the outside world. Attempt to send up to the
5906 				 * IP layer to allow it to deal with it.
5907 				 */
5908 				if (caller == VSW_VNETPORT) {
5909 					READ_ENTER(&vswp->if_lockrw);
5910 					if (vswp->if_state & VSW_IF_UP) {
5911 						RW_EXIT(&vswp->if_lockrw);
5912 						D2(vswp, "%s: sending up",
5913 							__func__);
5914 						mac_rx(vswp->if_mh, mrh, mp);
5915 					} else {
5916 						RW_EXIT(&vswp->if_lockrw);
5917 						/* Interface down, drop pkt */
5918 						D2(vswp, "%s I/F down",
5919 								__func__);
5920 						freemsg(mp);
5921 					}
5922 				}
5923 			}
5924 		}
5925 	}
5926 
5927 	D1(vswp, "%s: exit", __func__);
5928 }
5929 
5930 /*
5931  * Forward the ethernet frame to all ports (VNETPORTs, PHYSDEV, LOCALDEV),
5932  * except the caller (port on which frame arrived).
5933  */
5934 static int
5935 vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
5936 {
5937 	vsw_port_list_t	*plist = &vswp->plist;
5938 	vsw_port_t	*portp;
5939 	mblk_t		*nmp = NULL;
5940 	mblk_t		*ret_m = NULL;
5941 	int		skip_port = 0;
5942 
5943 	D1(vswp, "vsw_forward_all: enter\n");
5944 
5945 	/*
5946 	 * Broadcast message from inside ldoms so send to outside
5947 	 * world if in either of layer 2 modes.
5948 	 */
5949 	if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) ||
5950 		(vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) &&
5951 		((caller == VSW_LOCALDEV) || (caller == VSW_VNETPORT))) {
5952 
5953 		nmp = dupmsg(mp);
5954 		if (nmp) {
5955 			if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) {
5956 				DERR(vswp, "%s: dropping pkt(s) "
5957 				"consisting of %ld bytes of data for"
5958 				" physical device", __func__, MBLKL(ret_m));
5959 			freemsg(ret_m);
5960 			}
5961 		}
5962 	}
5963 
5964 	if (caller == VSW_VNETPORT)
5965 		skip_port = 1;
5966 
5967 	/*
5968 	 * Broadcast message from other vnet (layer 2 or 3) or outside
5969 	 * world (layer 2 only), send up stack if plumbed.
5970 	 */
5971 	if ((caller == VSW_PHYSDEV) || (caller == VSW_VNETPORT)) {
5972 		READ_ENTER(&vswp->if_lockrw);
5973 		if (vswp->if_state & VSW_IF_UP) {
5974 			RW_EXIT(&vswp->if_lockrw);
5975 			nmp = copymsg(mp);
5976 			if (nmp)
5977 				mac_rx(vswp->if_mh, NULL, nmp);
5978 		} else {
5979 			RW_EXIT(&vswp->if_lockrw);
5980 		}
5981 	}
5982 
5983 	/* send it to all VNETPORTs */
5984 	READ_ENTER(&plist->lockrw);
5985 	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
5986 		D2(vswp, "vsw_forward_all: port %d", portp->p_instance);
5987 		/*
5988 		 * Caution ! - don't reorder these two checks as arg
5989 		 * will be NULL if the caller is PHYSDEV. skip_port is
5990 		 * only set if caller is VNETPORT.
5991 		 */
5992 		if ((skip_port) && (portp == arg))
5993 			continue;
5994 		else {
5995 			nmp = dupmsg(mp);
5996 			if (nmp) {
5997 				(void) vsw_portsend(portp, nmp);
5998 			} else {
5999 				DERR(vswp, "vsw_forward_all: nmp NULL");
6000 			}
6001 		}
6002 	}
6003 	RW_EXIT(&plist->lockrw);
6004 
6005 	freemsg(mp);
6006 
6007 	D1(vswp, "vsw_forward_all: exit\n");
6008 	return (0);
6009 }
6010 
6011 /*
6012  * Forward pkts to any devices or interfaces which have registered
6013  * an interest in them (i.e. multicast groups).
6014  */
6015 static int
6016 vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
6017 {
6018 	struct ether_header	*ehp = (struct ether_header *)mp->b_rptr;
6019 	mfdb_ent_t		*entp = NULL;
6020 	mfdb_ent_t		*tpp = NULL;
6021 	vsw_port_t 		*port;
6022 	uint64_t		key = 0;
6023 	mblk_t			*nmp = NULL;
6024 	mblk_t			*ret_m = NULL;
6025 	boolean_t		check_if = B_TRUE;
6026 
6027 	/*
6028 	 * Convert address to hash table key
6029 	 */
6030 	KEY_HASH(key, ehp->ether_dhost);
6031 
6032 	D1(vswp, "%s: key 0x%llx", __func__, key);
6033 
6034 	/*
6035 	 * If pkt came from either a vnet or down the stack (if we are
6036 	 * plumbed) and we are in layer 2 mode, then we send the pkt out
6037 	 * over the physical adapter, and then check to see if any other
6038 	 * vnets are interested in it.
6039 	 */
6040 	if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) ||
6041 		(vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) &&
6042 		((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV))) {
6043 		nmp = dupmsg(mp);
6044 		if (nmp) {
6045 			if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) {
6046 				DERR(vswp, "%s: dropping pkt(s) "
6047 					"consisting of %ld bytes of "
6048 					"data for physical device",
6049 					__func__, MBLKL(ret_m));
6050 				freemsg(ret_m);
6051 			}
6052 		}
6053 	}
6054 
6055 	READ_ENTER(&vswp->mfdbrw);
6056 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)key,
6057 				(mod_hash_val_t *)&entp) != 0) {
6058 		D3(vswp, "%s: no table entry found for addr 0x%llx",
6059 								__func__, key);
6060 	} else {
6061 		/*
6062 		 * Send to list of devices associated with this address...
6063 		 */
6064 		for (tpp = entp; tpp != NULL; tpp = tpp->nextp) {
6065 
6066 			/* dont send to ourselves */
6067 			if ((caller == VSW_VNETPORT) &&
6068 				(tpp->d_addr == (void *)arg)) {
6069 				port = (vsw_port_t *)tpp->d_addr;
6070 				D3(vswp, "%s: not sending to ourselves"
6071 					" : port %d", __func__,
6072 					port->p_instance);
6073 				continue;
6074 
6075 			} else if ((caller == VSW_LOCALDEV) &&
6076 				(tpp->d_type == VSW_LOCALDEV)) {
6077 				D3(vswp, "%s: not sending back up stack",
6078 					__func__);
6079 				continue;
6080 			}
6081 
6082 			if (tpp->d_type == VSW_VNETPORT) {
6083 				port = (vsw_port_t *)tpp->d_addr;
6084 				D3(vswp, "%s: sending to port %ld for "
6085 					" addr 0x%llx", __func__,
6086 					port->p_instance, key);
6087 
6088 				nmp = dupmsg(mp);
6089 				if (nmp)
6090 					(void) vsw_portsend(port, nmp);
6091 			} else {
6092 				if (vswp->if_state & VSW_IF_UP) {
6093 					nmp = copymsg(mp);
6094 					if (nmp)
6095 						mac_rx(vswp->if_mh, NULL, nmp);
6096 					check_if = B_FALSE;
6097 					D3(vswp, "%s: sending up stack"
6098 						" for addr 0x%llx", __func__,
6099 						key);
6100 				}
6101 			}
6102 		}
6103 	}
6104 
6105 	RW_EXIT(&vswp->mfdbrw);
6106 
6107 	/*
6108 	 * If the pkt came from either a vnet or from physical device,
6109 	 * and if we havent already sent the pkt up the stack then we
6110 	 * check now if we can/should (i.e. the interface is plumbed
6111 	 * and in promisc mode).
6112 	 */
6113 	if ((check_if) &&
6114 		((caller == VSW_VNETPORT) || (caller == VSW_PHYSDEV))) {
6115 		READ_ENTER(&vswp->if_lockrw);
6116 		if (VSW_U_P(vswp->if_state)) {
6117 			RW_EXIT(&vswp->if_lockrw);
6118 			D3(vswp, "%s: (caller %d) finally sending up stack"
6119 				" for addr 0x%llx", __func__, caller, key);
6120 			nmp = copymsg(mp);
6121 			if (nmp)
6122 				mac_rx(vswp->if_mh, NULL, nmp);
6123 		} else {
6124 			RW_EXIT(&vswp->if_lockrw);
6125 		}
6126 	}
6127 
6128 	freemsg(mp);
6129 
6130 	D1(vswp, "%s: exit", __func__);
6131 
6132 	return (0);
6133 }
6134 
6135 /* transmit the packet over the given port */
6136 static int
6137 vsw_portsend(vsw_port_t *port, mblk_t *mp)
6138 {
6139 	vsw_ldc_list_t 	*ldcl = &port->p_ldclist;
6140 	vsw_ldc_t 	*ldcp;
6141 	int		status = 0;
6142 
6143 
6144 	READ_ENTER(&ldcl->lockrw);
6145 	/*
6146 	 * Note for now, we have a single channel.
6147 	 */
6148 	ldcp = ldcl->head;
6149 	if (ldcp == NULL) {
6150 		DERR(port->p_vswp, "vsw_portsend: no ldc: dropping packet\n");
6151 		freemsg(mp);
6152 		RW_EXIT(&ldcl->lockrw);
6153 		return (1);
6154 	}
6155 
6156 	/*
6157 	 * Send the message out using the appropriate
6158 	 * transmit function which will free mblock when it
6159 	 * is finished with it.
6160 	 */
6161 	mutex_enter(&port->tx_lock);
6162 	if (port->transmit != NULL)
6163 		status = (*port->transmit)(ldcp, mp);
6164 	else {
6165 		freemsg(mp);
6166 	}
6167 	mutex_exit(&port->tx_lock);
6168 
6169 	RW_EXIT(&ldcl->lockrw);
6170 
6171 	return (status);
6172 }
6173 
6174 /*
6175  * Send packet out via descriptor ring to a logical device.
6176  */
6177 static int
6178 vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp)
6179 {
6180 	vio_dring_msg_t		dring_pkt;
6181 	dring_info_t		*dp = NULL;
6182 	vsw_private_desc_t	*priv_desc = NULL;
6183 	vnet_public_desc_t	*pub = NULL;
6184 	vsw_t			*vswp = ldcp->ldc_vswp;
6185 	mblk_t			*bp;
6186 	size_t			n, size;
6187 	caddr_t			bufp;
6188 	int			idx;
6189 	int			status = LDC_TX_SUCCESS;
6190 
6191 	D1(vswp, "%s(%lld): enter\n", __func__, ldcp->ldc_id);
6192 
6193 	/* TODO: make test a macro */
6194 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
6195 		(ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
6196 		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
6197 			"packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
6198 			ldcp->lane_out.lstate);
6199 		freemsg(mp);
6200 		return (LDC_TX_FAILURE);
6201 	}
6202 
6203 	/*
6204 	 * Note - using first ring only, this may change
6205 	 * in the future.
6206 	 */
6207 	if ((dp = ldcp->lane_out.dringp) == NULL) {
6208 		DERR(vswp, "%s(%lld): no dring for outbound lane on"
6209 			" channel %d", __func__, ldcp->ldc_id, ldcp->ldc_id);
6210 		freemsg(mp);
6211 		return (LDC_TX_FAILURE);
6212 	}
6213 
6214 	size = msgsize(mp);
6215 	if (size > (size_t)ETHERMAX) {
6216 		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
6217 		    ldcp->ldc_id, size);
6218 		freemsg(mp);
6219 		return (LDC_TX_FAILURE);
6220 	}
6221 
6222 	/*
6223 	 * Find a free descriptor
6224 	 *
6225 	 * Note: for the moment we are assuming that we will only
6226 	 * have one dring going from the switch to each of its
6227 	 * peers. This may change in the future.
6228 	 */
6229 	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
6230 		D2(vswp, "%s(%lld): no descriptor available for ring "
6231 			"at 0x%llx", __func__, ldcp->ldc_id, dp);
6232 
6233 		/* nothing more we can do */
6234 		status = LDC_TX_NORESOURCES;
6235 		goto vsw_dringsend_free_exit;
6236 	} else {
6237 		D2(vswp, "%s(%lld): free private descriptor found at pos "
6238 			"%ld addr 0x%llx\n", __func__, ldcp->ldc_id, idx,
6239 			priv_desc);
6240 	}
6241 
6242 	/* copy data into the descriptor */
6243 	bufp = priv_desc->datap;
6244 	bufp += VNET_IPALIGN;
6245 	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
6246 		n = MBLKL(bp);
6247 		bcopy(bp->b_rptr, bufp, n);
6248 		bufp += n;
6249 	}
6250 
6251 	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
6252 
6253 	pub = priv_desc->descp;
6254 	pub->nbytes = priv_desc->datalen;
6255 
6256 	mutex_enter(&priv_desc->dstate_lock);
6257 	pub->hdr.dstate = VIO_DESC_READY;
6258 	mutex_exit(&priv_desc->dstate_lock);
6259 
6260 	/*
6261 	 * Determine whether or not we need to send a message to our
6262 	 * peer prompting them to read our newly updated descriptor(s).
6263 	 */
6264 	mutex_enter(&dp->restart_lock);
6265 	if (dp->restart_reqd) {
6266 		dp->restart_reqd = B_FALSE;
6267 		mutex_exit(&dp->restart_lock);
6268 
6269 		/*
6270 		 * Send a vio_dring_msg to peer to prompt them to read
6271 		 * the updated descriptor ring.
6272 		 */
6273 		dring_pkt.tag.vio_msgtype = VIO_TYPE_DATA;
6274 		dring_pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
6275 		dring_pkt.tag.vio_subtype_env = VIO_DRING_DATA;
6276 		dring_pkt.tag.vio_sid = ldcp->local_session;
6277 
6278 		/* Note - for now using first ring */
6279 		dring_pkt.dring_ident = dp->ident;
6280 
6281 		mutex_enter(&ldcp->lane_out.seq_lock);
6282 		dring_pkt.seq_num = ldcp->lane_out.seq_num++;
6283 		mutex_exit(&ldcp->lane_out.seq_lock);
6284 
6285 		/*
6286 		 * If last_ack_recv is -1 then we know we've not
6287 		 * received any ack's yet, so this must be the first
6288 		 * msg sent, so set the start to the begining of the ring.
6289 		 */
6290 		mutex_enter(&dp->dlock);
6291 		if (dp->last_ack_recv == -1) {
6292 			dring_pkt.start_idx = 0;
6293 		} else {
6294 			dring_pkt.start_idx = (dp->last_ack_recv + 1) %
6295 						dp->num_descriptors;
6296 		}
6297 		dring_pkt.end_idx = -1;
6298 		mutex_exit(&dp->dlock);
6299 
6300 		D3(vswp, "%s(%lld): dring 0x%llx : ident 0x%llx\n", __func__,
6301 			ldcp->ldc_id, dp, dring_pkt.dring_ident);
6302 		D3(vswp, "%s(%lld): start %lld : end %lld : seq %lld\n",
6303 			__func__, ldcp->ldc_id, dring_pkt.start_idx,
6304 			dring_pkt.end_idx, dring_pkt.seq_num);
6305 
6306 		vsw_send_msg(ldcp, (void *)&dring_pkt,
6307 						sizeof (vio_dring_msg_t));
6308 	} else {
6309 		mutex_exit(&dp->restart_lock);
6310 		D2(vswp, "%s(%lld): updating descp %d", __func__,
6311 			ldcp->ldc_id, idx);
6312 	}
6313 
6314 vsw_dringsend_free_exit:
6315 
6316 	/* free the message block */
6317 	freemsg(mp);
6318 
6319 	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
6320 	return (status);
6321 }
6322 
6323 /*
6324  * Send an in-band descriptor message over ldc.
6325  */
6326 static int
6327 vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp)
6328 {
6329 	vsw_t			*vswp = ldcp->ldc_vswp;
6330 	vio_ibnd_desc_t		ibnd_msg;
6331 	vsw_private_desc_t	*priv_desc = NULL;
6332 	dring_info_t		*dp = NULL;
6333 	size_t			n, size = 0;
6334 	caddr_t			bufp;
6335 	mblk_t			*bp;
6336 	int			idx, i;
6337 	int			status = LDC_TX_SUCCESS;
6338 	static int		warn_msg = 1;
6339 
6340 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
6341 
6342 	ASSERT(mp != NULL);
6343 
6344 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
6345 		(ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
6346 		DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt",
6347 			__func__, ldcp->ldc_id, ldcp->ldc_status,
6348 			ldcp->lane_out.lstate);
6349 		freemsg(mp);
6350 		return (LDC_TX_FAILURE);
6351 	}
6352 
6353 	/*
6354 	 * only expect single dring to exist, which we use
6355 	 * as an internal buffer, rather than a transfer channel.
6356 	 */
6357 	if ((dp = ldcp->lane_out.dringp) == NULL) {
6358 		DERR(vswp, "%s(%lld): no dring for outbound lane",
6359 			__func__, ldcp->ldc_id);
6360 		DERR(vswp, "%s(%lld) status(%d) state (0x%llx)",
6361 			__func__, ldcp->ldc_id, ldcp->ldc_status,
6362 			ldcp->lane_out.lstate);
6363 		freemsg(mp);
6364 		return (LDC_TX_FAILURE);
6365 	}
6366 
6367 	size = msgsize(mp);
6368 	if (size > (size_t)ETHERMAX) {
6369 		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
6370 		    ldcp->ldc_id, size);
6371 		freemsg(mp);
6372 		return (LDC_TX_FAILURE);
6373 	}
6374 
6375 	/*
6376 	 * Find a free descriptor in our buffer ring
6377 	 */
6378 	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
6379 		if (warn_msg) {
6380 			DERR(vswp, "%s(%lld): no descriptor available for ring "
6381 			"at 0x%llx", __func__, ldcp->ldc_id, dp);
6382 			warn_msg = 0;
6383 		}
6384 
6385 		/* nothing more we can do */
6386 		status = LDC_TX_NORESOURCES;
6387 		goto vsw_descrsend_free_exit;
6388 	} else {
6389 		D2(vswp, "%s(%lld): free private descriptor found at pos "
6390 			"%ld addr 0x%x\n", __func__, ldcp->ldc_id, idx,
6391 			priv_desc);
6392 		warn_msg = 1;
6393 	}
6394 
6395 	/* copy data into the descriptor */
6396 	bufp = priv_desc->datap;
6397 	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
6398 		n = MBLKL(bp);
6399 		bcopy(bp->b_rptr, bufp, n);
6400 		bufp += n;
6401 	}
6402 
6403 	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
6404 
6405 	/* create and send the in-band descp msg */
6406 	ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA;
6407 	ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO;
6408 	ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA;
6409 	ibnd_msg.hdr.tag.vio_sid = ldcp->local_session;
6410 
6411 	mutex_enter(&ldcp->lane_out.seq_lock);
6412 	ibnd_msg.hdr.seq_num = ldcp->lane_out.seq_num++;
6413 	mutex_exit(&ldcp->lane_out.seq_lock);
6414 
6415 	/*
6416 	 * Copy the mem cookies describing the data from the
6417 	 * private region of the descriptor ring into the inband
6418 	 * descriptor.
6419 	 */
6420 	for (i = 0; i < priv_desc->ncookies; i++) {
6421 		bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i],
6422 			sizeof (ldc_mem_cookie_t));
6423 	}
6424 
6425 	ibnd_msg.hdr.desc_handle = idx;
6426 	ibnd_msg.ncookies = priv_desc->ncookies;
6427 	ibnd_msg.nbytes = size;
6428 
6429 	vsw_send_msg(ldcp, (void *)&ibnd_msg, sizeof (vio_ibnd_desc_t));
6430 
6431 vsw_descrsend_free_exit:
6432 
6433 	/* free the allocated message blocks */
6434 	freemsg(mp);
6435 
6436 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
6437 	return (status);
6438 }
6439 
6440 static void
6441 vsw_send_ver(vsw_ldc_t *ldcp)
6442 {
6443 	vsw_t		*vswp = ldcp->ldc_vswp;
6444 	lane_t		*lp = &ldcp->lane_out;
6445 	vio_ver_msg_t	ver_msg;
6446 
6447 	D1(vswp, "%s enter", __func__);
6448 
6449 	ver_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
6450 	ver_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
6451 	ver_msg.tag.vio_subtype_env = VIO_VER_INFO;
6452 	ver_msg.tag.vio_sid = ldcp->local_session;
6453 
6454 	ver_msg.ver_major = vsw_versions[0].ver_major;
6455 	ver_msg.ver_minor = vsw_versions[0].ver_minor;
6456 	ver_msg.dev_class = VDEV_NETWORK_SWITCH;
6457 
6458 	lp->lstate |= VSW_VER_INFO_SENT;
6459 	lp->ver_major = ver_msg.ver_major;
6460 	lp->ver_minor = ver_msg.ver_minor;
6461 
6462 	DUMP_TAG(ver_msg.tag);
6463 
6464 	vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t));
6465 
6466 	D1(vswp, "%s (%d): exit", __func__, ldcp->ldc_id);
6467 }
6468 
6469 static void
6470 vsw_send_attr(vsw_ldc_t *ldcp)
6471 {
6472 	vsw_t			*vswp = ldcp->ldc_vswp;
6473 	lane_t			*lp = &ldcp->lane_out;
6474 	vnet_attr_msg_t		attr_msg;
6475 
6476 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
6477 
6478 	/*
6479 	 * Subtype is set to INFO by default
6480 	 */
6481 	attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
6482 	attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
6483 	attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO;
6484 	attr_msg.tag.vio_sid = ldcp->local_session;
6485 
6486 	/* payload copied from default settings for lane */
6487 	attr_msg.mtu = lp->mtu;
6488 	attr_msg.addr_type = lp->addr_type;
6489 	attr_msg.xfer_mode = lp->xfer_mode;
6490 	attr_msg.ack_freq = lp->xfer_mode;
6491 
6492 	READ_ENTER(&vswp->if_lockrw);
6493 	bcopy(&(vswp->if_addr), &(attr_msg.addr), ETHERADDRL);
6494 	RW_EXIT(&vswp->if_lockrw);
6495 
6496 	ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT;
6497 
6498 	DUMP_TAG(attr_msg.tag);
6499 
6500 	vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t));
6501 
6502 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
6503 }
6504 
6505 /*
6506  * Create dring info msg (which also results in the creation of
6507  * a dring).
6508  */
6509 static vio_dring_reg_msg_t *
6510 vsw_create_dring_info_pkt(vsw_ldc_t *ldcp)
6511 {
6512 	vio_dring_reg_msg_t	*mp;
6513 	dring_info_t		*dp;
6514 	vsw_t			*vswp = ldcp->ldc_vswp;
6515 
6516 	D1(vswp, "vsw_create_dring_info_pkt enter\n");
6517 
6518 	/*
6519 	 * If we can't create a dring, obviously no point sending
6520 	 * a message.
6521 	 */
6522 	if ((dp = vsw_create_dring(ldcp)) == NULL)
6523 		return (NULL);
6524 
6525 	mp = kmem_zalloc(sizeof (vio_dring_reg_msg_t), KM_SLEEP);
6526 
6527 	mp->tag.vio_msgtype = VIO_TYPE_CTRL;
6528 	mp->tag.vio_subtype = VIO_SUBTYPE_INFO;
6529 	mp->tag.vio_subtype_env = VIO_DRING_REG;
6530 	mp->tag.vio_sid = ldcp->local_session;
6531 
6532 	/* payload */
6533 	mp->num_descriptors = dp->num_descriptors;
6534 	mp->descriptor_size = dp->descriptor_size;
6535 	mp->options = dp->options;
6536 	mp->ncookies = dp->ncookies;
6537 	bcopy(&dp->cookie[0], &mp->cookie[0], sizeof (ldc_mem_cookie_t));
6538 
6539 	mp->dring_ident = 0;
6540 
6541 	D1(vswp, "vsw_create_dring_info_pkt exit\n");
6542 
6543 	return (mp);
6544 }
6545 
6546 static void
6547 vsw_send_dring_info(vsw_ldc_t *ldcp)
6548 {
6549 	vio_dring_reg_msg_t	*dring_msg;
6550 	vsw_t			*vswp = ldcp->ldc_vswp;
6551 
6552 	D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id);
6553 
6554 	dring_msg = vsw_create_dring_info_pkt(ldcp);
6555 	if (dring_msg == NULL) {
6556 		cmn_err(CE_WARN, "vsw_send_dring_info: error creating msg");
6557 		return;
6558 	}
6559 
6560 	ldcp->lane_out.lstate |= VSW_DRING_INFO_SENT;
6561 
6562 	DUMP_TAG_PTR((vio_msg_tag_t *)dring_msg);
6563 
6564 	vsw_send_msg(ldcp, dring_msg,
6565 		sizeof (vio_dring_reg_msg_t));
6566 
6567 	kmem_free(dring_msg, sizeof (vio_dring_reg_msg_t));
6568 
6569 	D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id);
6570 }
6571 
6572 static void
6573 vsw_send_rdx(vsw_ldc_t *ldcp)
6574 {
6575 	vsw_t		*vswp = ldcp->ldc_vswp;
6576 	vio_rdx_msg_t	rdx_msg;
6577 
6578 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
6579 
6580 	rdx_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
6581 	rdx_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
6582 	rdx_msg.tag.vio_subtype_env = VIO_RDX;
6583 	rdx_msg.tag.vio_sid = ldcp->local_session;
6584 
6585 	ldcp->lane_out.lstate |= VSW_RDX_INFO_SENT;
6586 
6587 	DUMP_TAG(rdx_msg.tag);
6588 
6589 	vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t));
6590 
6591 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
6592 }
6593 
6594 /*
6595  * Generic routine to send message out over ldc channel.
6596  */
6597 static void
6598 vsw_send_msg(vsw_ldc_t *ldcp, void *msgp, int size)
6599 {
6600 	int		rv;
6601 	size_t		msglen = size;
6602 	vio_msg_tag_t	*tag = (vio_msg_tag_t *)msgp;
6603 	vsw_t		*vswp = ldcp->ldc_vswp;
6604 
6605 	D1(vswp, "vsw_send_msg (%lld) enter : sending %d bytes",
6606 			ldcp->ldc_id, size);
6607 
6608 	D2(vswp, "send_msg: type 0x%llx", tag->vio_msgtype);
6609 	D2(vswp, "send_msg: stype 0x%llx", tag->vio_subtype);
6610 	D2(vswp, "send_msg: senv 0x%llx", tag->vio_subtype_env);
6611 
6612 	mutex_enter(&ldcp->ldc_txlock);
6613 	do {
6614 		msglen = size;
6615 		rv = ldc_write(ldcp->ldc_handle, (caddr_t)msgp, &msglen);
6616 	} while (rv == EWOULDBLOCK && --vsw_wretries > 0);
6617 
6618 	mutex_exit(&ldcp->ldc_txlock);
6619 
6620 	if ((rv != 0) || (msglen != size)) {
6621 		DERR(vswp, "vsw_send_msg:ldc_write failed: chan(%lld) "
6622 			"rv(%d) size (%d) msglen(%d)\n", ldcp->ldc_id,
6623 			rv, size, msglen);
6624 	}
6625 
6626 	D1(vswp, "vsw_send_msg (%lld) exit : sent %d bytes",
6627 			ldcp->ldc_id, msglen);
6628 }
6629 
6630 /*
6631  * Add an entry into FDB, for the given mac address and port_id.
6632  * Returns 0 on success, 1 on failure.
6633  *
6634  * Lock protecting FDB must be held by calling process.
6635  */
6636 static int
6637 vsw_add_fdb(vsw_t *vswp, vsw_port_t *port)
6638 {
6639 	uint64_t	addr = 0;
6640 
6641 	D1(vswp, "%s: enter", __func__);
6642 
6643 	KEY_HASH(addr, port->p_macaddr);
6644 
6645 	D2(vswp, "%s: key = 0x%llx", __func__, addr);
6646 
6647 	/*
6648 	 * Note: duplicate keys will be rejected by mod_hash.
6649 	 */
6650 	if (mod_hash_insert(vswp->fdb, (mod_hash_key_t)addr,
6651 				(mod_hash_val_t)port) != 0) {
6652 		DERR(vswp, "%s: unable to add entry into fdb.", __func__);
6653 		return (1);
6654 	}
6655 
6656 	D1(vswp, "%s: exit", __func__);
6657 	return (0);
6658 }
6659 
6660 /*
6661  * Remove an entry from FDB.
6662  * Returns 0 on success, 1 on failure.
6663  */
6664 static int
6665 vsw_del_fdb(vsw_t *vswp, vsw_port_t *port)
6666 {
6667 	uint64_t	addr = 0;
6668 
6669 	D1(vswp, "%s: enter", __func__);
6670 
6671 	KEY_HASH(addr, port->p_macaddr);
6672 
6673 	D2(vswp, "%s: key = 0x%llx", __func__, addr);
6674 
6675 	(void) mod_hash_destroy(vswp->fdb, (mod_hash_val_t)addr);
6676 
6677 	D1(vswp, "%s: enter", __func__);
6678 
6679 	return (0);
6680 }
6681 
6682 /*
6683  * Search fdb for a given mac address.
6684  * Returns pointer to the entry if found, else returns NULL.
6685  */
6686 static vsw_port_t *
6687 vsw_lookup_fdb(vsw_t *vswp, struct ether_header *ehp)
6688 {
6689 	uint64_t	key = 0;
6690 	vsw_port_t	*port = NULL;
6691 
6692 	D1(vswp, "%s: enter", __func__);
6693 
6694 	KEY_HASH(key, ehp->ether_dhost);
6695 
6696 	D2(vswp, "%s: key = 0x%llx", __func__, key);
6697 
6698 	if (mod_hash_find(vswp->fdb, (mod_hash_key_t)key,
6699 				(mod_hash_val_t *)&port) != 0) {
6700 		return (NULL);
6701 	}
6702 
6703 	D1(vswp, "%s: exit", __func__);
6704 
6705 	return (port);
6706 }
6707 
6708 /*
6709  * Add or remove multicast address(es).
6710  *
6711  * Returns 0 on success, 1 on failure.
6712  */
6713 static int
6714 vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port)
6715 {
6716 	mcst_addr_t		*mcst_p = NULL;
6717 	vsw_t			*vswp = port->p_vswp;
6718 	uint64_t		addr = 0x0;
6719 	int			i, ret;
6720 
6721 	D1(vswp, "%s: enter", __func__);
6722 
6723 	D2(vswp, "%s: %d addresses", __func__, mcst_pkt->count);
6724 
6725 	if (vswp->mh == NULL)
6726 		return (1);
6727 
6728 	for (i = 0; i < mcst_pkt->count; i++) {
6729 		/*
6730 		 * Convert address into form that can be used
6731 		 * as hash table key.
6732 		 */
6733 		KEY_HASH(addr, mcst_pkt->mca[i]);
6734 
6735 		/*
6736 		 * Add or delete the specified address/port combination.
6737 		 */
6738 		if (mcst_pkt->set == 0x1) {
6739 			D3(vswp, "%s: adding multicast address 0x%llx for "
6740 				"port %ld", __func__, addr, port->p_instance);
6741 			if (vsw_add_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
6742 				/*
6743 				 * Update the list of multicast
6744 				 * addresses contained within the
6745 				 * port structure to include this new
6746 				 * one.
6747 				 */
6748 				mcst_p = kmem_alloc(sizeof (mcst_addr_t),
6749 								KM_NOSLEEP);
6750 				if (mcst_p == NULL) {
6751 					DERR(vswp, "%s: unable to alloc mem",
6752 						__func__);
6753 					return (1);
6754 				}
6755 
6756 				mcst_p->nextp = NULL;
6757 				mcst_p->addr = addr;
6758 
6759 				mutex_enter(&port->mca_lock);
6760 				mcst_p->nextp = port->mcap;
6761 				port->mcap = mcst_p;
6762 				mutex_exit(&port->mca_lock);
6763 
6764 				/*
6765 				 * Program the address into HW. If the addr
6766 				 * has already been programmed then the MAC
6767 				 * just increments a ref counter (which is
6768 				 * used when the address is being deleted)
6769 				 */
6770 				ret = mac_multicst_add(vswp->mh,
6771 						(uchar_t *)&mcst_pkt->mca[i]);
6772 				if (ret) {
6773 					cmn_err(CE_WARN, "!unable to add "
6774 						"multicast address");
6775 					(void) vsw_del_mcst(vswp, VSW_VNETPORT,
6776 						addr, port);
6777 					vsw_del_addr(VSW_VNETPORT, port, addr);
6778 					return (ret);
6779 				}
6780 
6781 			} else {
6782 				DERR(vswp, "%s: error adding multicast "
6783 					"address 0x%llx for port %ld",
6784 					__func__, addr, port->p_instance);
6785 				return (1);
6786 			}
6787 		} else {
6788 			/*
6789 			 * Delete an entry from the multicast hash
6790 			 * table and update the address list
6791 			 * appropriately.
6792 			 */
6793 			if (vsw_del_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
6794 				D3(vswp, "%s: deleting multicast address "
6795 					"0x%llx for port %ld", __func__, addr,
6796 					port->p_instance);
6797 
6798 				vsw_del_addr(VSW_VNETPORT, port, addr);
6799 
6800 				/*
6801 				 * Remove the address from HW. The address
6802 				 * will actually only be removed once the ref
6803 				 * count within the MAC layer has dropped to
6804 				 * zero. I.e. we can safely call this fn even
6805 				 * if other ports are interested in this
6806 				 * address.
6807 				 */
6808 				(void) mac_multicst_remove(vswp->mh,
6809 						(uchar_t *)&mcst_pkt->mca[i]);
6810 
6811 			} else {
6812 				DERR(vswp, "%s: error deleting multicast "
6813 					"addr 0x%llx for port %ld",
6814 					__func__, addr, port->p_instance);
6815 				return (1);
6816 			}
6817 		}
6818 	}
6819 	D1(vswp, "%s: exit", __func__);
6820 	return (0);
6821 }
6822 
6823 /*
6824  * Add a new multicast entry.
6825  *
6826  * Search hash table based on address. If match found then
6827  * update associated val (which is chain of ports), otherwise
6828  * create new key/val (addr/port) pair and insert into table.
6829  */
6830 static int
6831 vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
6832 {
6833 	int		dup = 0;
6834 	int		rv = 0;
6835 	mfdb_ent_t	*ment = NULL;
6836 	mfdb_ent_t	*tmp_ent = NULL;
6837 	mfdb_ent_t	*new_ent = NULL;
6838 	void		*tgt = NULL;
6839 
6840 	if (devtype == VSW_VNETPORT) {
6841 		/*
6842 		 * Being invoked from a vnet.
6843 		 */
6844 		ASSERT(arg != NULL);
6845 		tgt = arg;
6846 		D2(NULL, "%s: port %d : address 0x%llx", __func__,
6847 			((vsw_port_t *)arg)->p_instance, addr);
6848 	} else {
6849 		/*
6850 		 * We are being invoked via the m_multicst mac entry
6851 		 * point.
6852 		 */
6853 		D2(NULL, "%s: address 0x%llx", __func__, addr);
6854 		tgt = (void *)vswp;
6855 	}
6856 
6857 	WRITE_ENTER(&vswp->mfdbrw);
6858 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
6859 				(mod_hash_val_t *)&ment) != 0) {
6860 
6861 		/* address not currently in table */
6862 		ment = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
6863 		ment->d_addr = (void *)tgt;
6864 		ment->d_type = devtype;
6865 		ment->nextp = NULL;
6866 
6867 		if (mod_hash_insert(vswp->mfdb, (mod_hash_key_t)addr,
6868 			(mod_hash_val_t)ment) != 0) {
6869 			DERR(vswp, "%s: hash table insertion failed", __func__);
6870 			kmem_free(ment, sizeof (mfdb_ent_t));
6871 			rv = 1;
6872 		} else {
6873 			D2(vswp, "%s: added initial entry for 0x%llx to "
6874 				"table", __func__, addr);
6875 		}
6876 	} else {
6877 		/*
6878 		 * Address in table. Check to see if specified port
6879 		 * is already associated with the address. If not add
6880 		 * it now.
6881 		 */
6882 		tmp_ent = ment;
6883 		while (tmp_ent != NULL) {
6884 			if (tmp_ent->d_addr == (void *)tgt) {
6885 				if (devtype == VSW_VNETPORT) {
6886 					DERR(vswp, "%s: duplicate port entry "
6887 						"found for portid %ld and key "
6888 						"0x%llx", __func__,
6889 						((vsw_port_t *)arg)->p_instance,
6890 						addr);
6891 				} else {
6892 					DERR(vswp, "%s: duplicate entry found"
6893 						"for key 0x%llx",
6894 						__func__, addr);
6895 				}
6896 				rv = 1;
6897 				dup = 1;
6898 				break;
6899 			}
6900 			tmp_ent = tmp_ent->nextp;
6901 		}
6902 
6903 		/*
6904 		 * Port not on list so add it to end now.
6905 		 */
6906 		if (0 == dup) {
6907 			D2(vswp, "%s: added entry for 0x%llx to table",
6908 				__func__, addr);
6909 			new_ent = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
6910 			new_ent->d_addr = (void *)tgt;
6911 			new_ent->d_type = devtype;
6912 			new_ent->nextp = NULL;
6913 
6914 			tmp_ent = ment;
6915 			while (tmp_ent->nextp != NULL)
6916 				tmp_ent = tmp_ent->nextp;
6917 
6918 			tmp_ent->nextp = new_ent;
6919 		}
6920 	}
6921 
6922 	RW_EXIT(&vswp->mfdbrw);
6923 	return (rv);
6924 }
6925 
6926 /*
6927  * Remove a multicast entry from the hashtable.
6928  *
6929  * Search hash table based on address. If match found, scan
6930  * list of ports associated with address. If specified port
6931  * found remove it from list.
6932  */
6933 static int
6934 vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
6935 {
6936 	mfdb_ent_t	*ment = NULL;
6937 	mfdb_ent_t	*curr_p, *prev_p;
6938 	void		*tgt = NULL;
6939 
6940 	D1(vswp, "%s: enter", __func__);
6941 
6942 	if (devtype == VSW_VNETPORT) {
6943 		tgt = (vsw_port_t *)arg;
6944 		D2(vswp, "%s: removing port %d from mFDB for address"
6945 			" 0x%llx", __func__, ((vsw_port_t *)tgt)->p_instance,
6946 			addr);
6947 	} else {
6948 		D2(vswp, "%s: removing entry", __func__);
6949 		tgt = (void *)vswp;
6950 	}
6951 
6952 	WRITE_ENTER(&vswp->mfdbrw);
6953 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
6954 				(mod_hash_val_t *)&ment) != 0) {
6955 		D2(vswp, "%s: address 0x%llx not in table", __func__, addr);
6956 		RW_EXIT(&vswp->mfdbrw);
6957 		return (1);
6958 	}
6959 
6960 	prev_p = curr_p = ment;
6961 
6962 	while (curr_p != NULL) {
6963 		if (curr_p->d_addr == (void *)tgt) {
6964 			if (devtype == VSW_VNETPORT) {
6965 				D2(vswp, "%s: port %d found", __func__,
6966 					((vsw_port_t *)tgt)->p_instance);
6967 			} else {
6968 				D2(vswp, "%s: instance found", __func__);
6969 			}
6970 
6971 			if (prev_p == curr_p) {
6972 				/*
6973 				 * head of list, if no other element is in
6974 				 * list then destroy this entry, otherwise
6975 				 * just replace it with updated value.
6976 				 */
6977 				ment = curr_p->nextp;
6978 				kmem_free(curr_p, sizeof (mfdb_ent_t));
6979 				if (ment == NULL) {
6980 					(void) mod_hash_destroy(vswp->mfdb,
6981 							(mod_hash_val_t)addr);
6982 				} else {
6983 					(void) mod_hash_replace(vswp->mfdb,
6984 							(mod_hash_key_t)addr,
6985 							(mod_hash_val_t)ment);
6986 				}
6987 			} else {
6988 				/*
6989 				 * Not head of list, no need to do
6990 				 * replacement, just adjust list pointers.
6991 				 */
6992 				prev_p->nextp = curr_p->nextp;
6993 				kmem_free(curr_p, sizeof (mfdb_ent_t));
6994 			}
6995 			break;
6996 		}
6997 
6998 		prev_p = curr_p;
6999 		curr_p = curr_p->nextp;
7000 	}
7001 
7002 	RW_EXIT(&vswp->mfdbrw);
7003 
7004 	D1(vswp, "%s: exit", __func__);
7005 
7006 	return (0);
7007 }
7008 
7009 /*
7010  * Port is being deleted, but has registered an interest in one
7011  * or more multicast groups. Using the list of addresses maintained
7012  * within the port structure find the appropriate entry in the hash
7013  * table and remove this port from the list of interested ports.
7014  */
7015 static void
7016 vsw_del_mcst_port(vsw_port_t *port)
7017 {
7018 	mcst_addr_t	*mcst_p = NULL;
7019 	vsw_t		*vswp = port->p_vswp;
7020 
7021 	D1(vswp, "%s: enter", __func__);
7022 
7023 	mutex_enter(&port->mca_lock);
7024 	while (port->mcap != NULL) {
7025 		(void) vsw_del_mcst(vswp, VSW_VNETPORT,
7026 					port->mcap->addr, port);
7027 
7028 		mcst_p = port->mcap->nextp;
7029 		kmem_free(port->mcap, sizeof (mcst_addr_t));
7030 		port->mcap = mcst_p;
7031 	}
7032 	mutex_exit(&port->mca_lock);
7033 
7034 	D1(vswp, "%s: exit", __func__);
7035 }
7036 
7037 /*
7038  * This vsw instance is detaching, but has registered an interest in one
7039  * or more multicast groups. Using the list of addresses maintained
7040  * within the vsw structure find the appropriate entry in the hash
7041  * table and remove this instance from the list of interested ports.
7042  */
7043 static void
7044 vsw_del_mcst_vsw(vsw_t *vswp)
7045 {
7046 	mcst_addr_t	*next_p = NULL;
7047 
7048 	D1(vswp, "%s: enter", __func__);
7049 
7050 	mutex_enter(&vswp->mca_lock);
7051 
7052 	while (vswp->mcap != NULL) {
7053 		DERR(vswp, "%s: deleting addr 0x%llx",
7054 			__func__, vswp->mcap->addr);
7055 		(void) vsw_del_mcst(vswp, VSW_LOCALDEV,
7056 				vswp->mcap->addr, NULL);
7057 
7058 		next_p = vswp->mcap->nextp;
7059 		kmem_free(vswp->mcap, sizeof (mcst_addr_t));
7060 		vswp->mcap = next_p;
7061 	}
7062 
7063 	vswp->mcap = NULL;
7064 	mutex_exit(&vswp->mca_lock);
7065 
7066 	D1(vswp, "%s: exit", __func__);
7067 }
7068 
7069 
7070 /*
7071  * Remove the specified address from the list of address maintained
7072  * in this port node.
7073  */
7074 static void
7075 vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr)
7076 {
7077 	vsw_t		*vswp = NULL;
7078 	vsw_port_t	*port = NULL;
7079 	mcst_addr_t	*prev_p = NULL;
7080 	mcst_addr_t	*curr_p = NULL;
7081 
7082 	D1(NULL, "%s: enter : devtype %d : addr 0x%llx",
7083 		__func__, devtype, addr);
7084 
7085 	if (devtype == VSW_VNETPORT) {
7086 		port = (vsw_port_t *)arg;
7087 		mutex_enter(&port->mca_lock);
7088 		prev_p = curr_p = port->mcap;
7089 	} else {
7090 		vswp = (vsw_t *)arg;
7091 		mutex_enter(&vswp->mca_lock);
7092 		prev_p = curr_p = vswp->mcap;
7093 	}
7094 
7095 	while (curr_p != NULL) {
7096 		if (curr_p->addr == addr) {
7097 			D2(NULL, "%s: address found", __func__);
7098 			/* match found */
7099 			if (prev_p == curr_p) {
7100 				/* list head */
7101 				if (devtype == VSW_VNETPORT)
7102 					port->mcap = curr_p->nextp;
7103 				else
7104 					vswp->mcap = curr_p->nextp;
7105 			} else {
7106 				prev_p->nextp = curr_p->nextp;
7107 			}
7108 			kmem_free(curr_p, sizeof (mcst_addr_t));
7109 			break;
7110 		} else {
7111 			prev_p = curr_p;
7112 			curr_p = curr_p->nextp;
7113 		}
7114 	}
7115 
7116 	if (devtype == VSW_VNETPORT)
7117 		mutex_exit(&port->mca_lock);
7118 	else
7119 		mutex_exit(&vswp->mca_lock);
7120 
7121 	D1(NULL, "%s: exit", __func__);
7122 }
7123 
7124 /*
7125  * Creates a descriptor ring (dring) and links it into the
7126  * link of outbound drings for this channel.
7127  *
7128  * Returns NULL if creation failed.
7129  */
7130 static dring_info_t *
7131 vsw_create_dring(vsw_ldc_t *ldcp)
7132 {
7133 	vsw_private_desc_t	*priv_addr = NULL;
7134 	vsw_t			*vswp = ldcp->ldc_vswp;
7135 	ldc_mem_info_t		minfo;
7136 	dring_info_t		*dp, *tp;
7137 	int			i;
7138 
7139 	dp = (dring_info_t *)kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
7140 
7141 	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
7142 
7143 	/* create public section of ring */
7144 	if ((ldc_mem_dring_create(VSW_RING_NUM_EL,
7145 			VSW_PUB_SIZE, &dp->handle)) != 0) {
7146 
7147 		DERR(vswp, "vsw_create_dring(%lld): ldc dring create "
7148 			"failed", ldcp->ldc_id);
7149 		goto create_fail_exit;
7150 	}
7151 
7152 	ASSERT(dp->handle != NULL);
7153 
7154 	/*
7155 	 * Get the base address of the public section of the ring.
7156 	 */
7157 	if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
7158 		DERR(vswp, "vsw_create_dring(%lld): dring info failed\n",
7159 			ldcp->ldc_id);
7160 		goto dring_fail_exit;
7161 	} else {
7162 		ASSERT(minfo.vaddr != 0);
7163 		dp->pub_addr = minfo.vaddr;
7164 	}
7165 
7166 	dp->num_descriptors = VSW_RING_NUM_EL;
7167 	dp->descriptor_size = VSW_PUB_SIZE;
7168 	dp->options = VIO_TX_DRING;
7169 	dp->ncookies = 1;	/* guaranteed by ldc */
7170 
7171 	/*
7172 	 * create private portion of ring
7173 	 */
7174 	dp->priv_addr = (vsw_private_desc_t *)kmem_zalloc(
7175 		(sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL), KM_SLEEP);
7176 
7177 	if (vsw_setup_ring(ldcp, dp)) {
7178 		DERR(vswp, "%s: unable to setup ring", __func__);
7179 		goto dring_fail_exit;
7180 	}
7181 
7182 	/* haven't used any descriptors yet */
7183 	dp->end_idx = 0;
7184 	dp->last_ack_recv = -1;
7185 
7186 	/* bind dring to the channel */
7187 	if ((ldc_mem_dring_bind(ldcp->ldc_handle, dp->handle,
7188 		LDC_SHADOW_MAP, LDC_MEM_RW,
7189 		&dp->cookie[0], &dp->ncookies)) != 0) {
7190 		DERR(vswp, "vsw_create_dring: unable to bind to channel "
7191 			"%lld", ldcp->ldc_id);
7192 		goto dring_fail_exit;
7193 	}
7194 
7195 	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
7196 	dp->restart_reqd = B_TRUE;
7197 
7198 	/*
7199 	 * Only ever create rings for outgoing lane. Link it onto
7200 	 * end of list.
7201 	 */
7202 	if (ldcp->lane_out.dringp == NULL) {
7203 		D2(vswp, "vsw_create_dring: adding first outbound ring");
7204 		ldcp->lane_out.dringp = dp;
7205 	} else {
7206 		tp = ldcp->lane_out.dringp;
7207 		while (tp->next != NULL)
7208 			tp = tp->next;
7209 
7210 		tp->next = dp;
7211 	}
7212 
7213 	return (dp);
7214 
7215 dring_fail_exit:
7216 	(void) ldc_mem_dring_destroy(dp->handle);
7217 
7218 create_fail_exit:
7219 	if (dp->priv_addr != NULL) {
7220 		priv_addr = dp->priv_addr;
7221 		for (i = 0; i < VSW_RING_NUM_EL; i++) {
7222 			if (priv_addr->memhandle != NULL)
7223 				(void) ldc_mem_free_handle(
7224 						priv_addr->memhandle);
7225 			priv_addr++;
7226 		}
7227 		kmem_free(dp->priv_addr,
7228 			(sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL));
7229 	}
7230 	mutex_destroy(&dp->dlock);
7231 
7232 	kmem_free(dp, sizeof (dring_info_t));
7233 	return (NULL);
7234 }
7235 
7236 /*
7237  * Create a ring consisting of just a private portion and link
7238  * it into the list of rings for the outbound lane.
7239  *
7240  * These type of rings are used primarily for temporary data
7241  * storage (i.e. as data buffers).
7242  */
7243 void
7244 vsw_create_privring(vsw_ldc_t *ldcp)
7245 {
7246 	dring_info_t		*dp, *tp;
7247 	vsw_t			*vswp = ldcp->ldc_vswp;
7248 
7249 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
7250 
7251 	dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
7252 
7253 	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
7254 
7255 	/* no public section */
7256 	dp->pub_addr = NULL;
7257 
7258 	dp->priv_addr = kmem_zalloc((sizeof (vsw_private_desc_t) *
7259 					VSW_RING_NUM_EL), KM_SLEEP);
7260 
7261 	dp->num_descriptors = VSW_RING_NUM_EL;
7262 
7263 	if (vsw_setup_ring(ldcp, dp)) {
7264 		DERR(vswp, "%s: setup of ring failed", __func__);
7265 		kmem_free(dp->priv_addr,
7266 			(sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL));
7267 		mutex_destroy(&dp->dlock);
7268 		kmem_free(dp, sizeof (dring_info_t));
7269 		return;
7270 	}
7271 
7272 	/* haven't used any descriptors yet */
7273 	dp->end_idx = 0;
7274 
7275 	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
7276 	dp->restart_reqd = B_TRUE;
7277 
7278 	/*
7279 	 * Only ever create rings for outgoing lane. Link it onto
7280 	 * end of list.
7281 	 */
7282 	if (ldcp->lane_out.dringp == NULL) {
7283 		D2(vswp, "%s: adding first outbound privring", __func__);
7284 		ldcp->lane_out.dringp = dp;
7285 	} else {
7286 		tp = ldcp->lane_out.dringp;
7287 		while (tp->next != NULL)
7288 			tp = tp->next;
7289 
7290 		tp->next = dp;
7291 	}
7292 
7293 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
7294 }
7295 
7296 /*
7297  * Setup the descriptors in the dring. Returns 0 on success, 1 on
7298  * failure.
7299  */
7300 int
7301 vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp)
7302 {
7303 	vnet_public_desc_t	*pub_addr = NULL;
7304 	vsw_private_desc_t	*priv_addr = NULL;
7305 	vsw_t			*vswp = ldcp->ldc_vswp;
7306 	uint64_t		*tmpp;
7307 	uint64_t		offset = 0;
7308 	uint32_t		ncookies = 0;
7309 	static char		*name = "vsw_setup_ring";
7310 	int			i, j, nc, rv;
7311 
7312 	priv_addr = dp->priv_addr;
7313 	pub_addr = dp->pub_addr;
7314 
7315 	/* public section may be null but private should never be */
7316 	ASSERT(priv_addr != NULL);
7317 
7318 	/*
7319 	 * Allocate the region of memory which will be used to hold
7320 	 * the data the descriptors will refer to.
7321 	 */
7322 	dp->data_sz = (VSW_RING_NUM_EL * VSW_RING_EL_DATA_SZ);
7323 	dp->data_addr = kmem_alloc(dp->data_sz, KM_SLEEP);
7324 
7325 	D2(vswp, "%s: allocated %lld bytes at 0x%llx\n", name,
7326 		dp->data_sz, dp->data_addr);
7327 
7328 	tmpp = (uint64_t *)dp->data_addr;
7329 	offset = VSW_RING_EL_DATA_SZ / sizeof (tmpp);
7330 
7331 	/*
7332 	 * Initialise some of the private and public (if they exist)
7333 	 * descriptor fields.
7334 	 */
7335 	for (i = 0; i < VSW_RING_NUM_EL; i++) {
7336 		mutex_init(&priv_addr->dstate_lock, NULL, MUTEX_DRIVER, NULL);
7337 
7338 		if ((ldc_mem_alloc_handle(ldcp->ldc_handle,
7339 			&priv_addr->memhandle)) != 0) {
7340 			DERR(vswp, "%s: alloc mem handle failed", name);
7341 			goto setup_ring_cleanup;
7342 		}
7343 
7344 		priv_addr->datap = (void *)tmpp;
7345 
7346 		rv = ldc_mem_bind_handle(priv_addr->memhandle,
7347 			(caddr_t)priv_addr->datap, VSW_RING_EL_DATA_SZ,
7348 			LDC_SHADOW_MAP, LDC_MEM_R|LDC_MEM_W,
7349 			&(priv_addr->memcookie[0]), &ncookies);
7350 		if (rv != 0) {
7351 			DERR(vswp, "%s(%lld): ldc_mem_bind_handle failed "
7352 				"(rv %d)", name, ldcp->ldc_id, rv);
7353 			goto setup_ring_cleanup;
7354 		}
7355 		priv_addr->bound = 1;
7356 
7357 		D2(vswp, "%s: %d: memcookie 0 : addr 0x%llx : size 0x%llx",
7358 			name, i, priv_addr->memcookie[0].addr,
7359 			priv_addr->memcookie[0].size);
7360 
7361 		if (ncookies >= (uint32_t)(VSW_MAX_COOKIES + 1)) {
7362 			DERR(vswp, "%s(%lld) ldc_mem_bind_handle returned "
7363 				"invalid num of cookies (%d) for size 0x%llx",
7364 				name, ldcp->ldc_id, ncookies,
7365 				VSW_RING_EL_DATA_SZ);
7366 
7367 			goto setup_ring_cleanup;
7368 		} else {
7369 			for (j = 1; j < ncookies; j++) {
7370 				rv = ldc_mem_nextcookie(priv_addr->memhandle,
7371 					&(priv_addr->memcookie[j]));
7372 				if (rv != 0) {
7373 					DERR(vswp, "%s: ldc_mem_nextcookie "
7374 						"failed rv (%d)", name, rv);
7375 					goto setup_ring_cleanup;
7376 				}
7377 				D3(vswp, "%s: memcookie %d : addr 0x%llx : "
7378 					"size 0x%llx", name, j,
7379 					priv_addr->memcookie[j].addr,
7380 					priv_addr->memcookie[j].size);
7381 			}
7382 
7383 		}
7384 		priv_addr->ncookies = ncookies;
7385 		priv_addr->dstate = VIO_DESC_FREE;
7386 
7387 		if (pub_addr != NULL) {
7388 
7389 			/* link pub and private sides */
7390 			priv_addr->descp = pub_addr;
7391 
7392 			pub_addr->ncookies = priv_addr->ncookies;
7393 
7394 			for (nc = 0; nc < pub_addr->ncookies; nc++) {
7395 				bcopy(&priv_addr->memcookie[nc],
7396 					&pub_addr->memcookie[nc],
7397 					sizeof (ldc_mem_cookie_t));
7398 			}
7399 
7400 			pub_addr->hdr.dstate = VIO_DESC_FREE;
7401 			pub_addr++;
7402 		}
7403 
7404 		/*
7405 		 * move to next element in the dring and the next
7406 		 * position in the data buffer.
7407 		 */
7408 		priv_addr++;
7409 		tmpp += offset;
7410 	}
7411 
7412 	return (0);
7413 
7414 setup_ring_cleanup:
7415 	priv_addr = dp->priv_addr;
7416 
7417 	for (j = 0; j < i; j++) {
7418 		(void) ldc_mem_unbind_handle(priv_addr->memhandle);
7419 		(void) ldc_mem_free_handle(priv_addr->memhandle);
7420 
7421 		mutex_destroy(&priv_addr->dstate_lock);
7422 
7423 		priv_addr++;
7424 	}
7425 	kmem_free(dp->data_addr, dp->data_sz);
7426 
7427 	return (1);
7428 }
7429 
7430 /*
7431  * Searches the private section of a ring for a free descriptor,
7432  * starting at the location of the last free descriptor found
7433  * previously.
7434  *
7435  * Returns 0 if free descriptor is available, and updates state
7436  * of private descriptor to VIO_DESC_READY,  otherwise returns 1.
7437  *
7438  * FUTURE: might need to return contiguous range of descriptors
7439  * as dring info msg assumes all will be contiguous.
7440  */
7441 static int
7442 vsw_dring_find_free_desc(dring_info_t *dringp,
7443 		vsw_private_desc_t **priv_p, int *idx)
7444 {
7445 	vsw_private_desc_t	*addr = NULL;
7446 	int			num = VSW_RING_NUM_EL;
7447 	int			ret = 1;
7448 
7449 	D1(NULL, "%s enter\n", __func__);
7450 
7451 	ASSERT(dringp->priv_addr != NULL);
7452 
7453 	D2(NULL, "%s: searching ring, dringp 0x%llx : start pos %lld",
7454 			__func__, dringp, dringp->end_idx);
7455 
7456 	addr = (vsw_private_desc_t *)dringp->priv_addr + dringp->end_idx;
7457 
7458 	mutex_enter(&addr->dstate_lock);
7459 	if (addr->dstate == VIO_DESC_FREE) {
7460 		addr->dstate = VIO_DESC_READY;
7461 		*priv_p = addr;
7462 		*idx = dringp->end_idx;
7463 		dringp->end_idx = (dringp->end_idx + 1) % num;
7464 		ret = 0;
7465 
7466 	}
7467 	mutex_exit(&addr->dstate_lock);
7468 
7469 	/* ring full */
7470 	if (ret == 1) {
7471 		D2(NULL, "%s: no desp free: started at %d", __func__,
7472 			dringp->end_idx);
7473 	}
7474 
7475 	D1(NULL, "%s: exit\n", __func__);
7476 
7477 	return (ret);
7478 }
7479 
7480 /*
7481  * Map from a dring identifier to the ring itself. Returns
7482  * pointer to ring or NULL if no match found.
7483  */
7484 static dring_info_t *
7485 vsw_ident2dring(lane_t *lane, uint64_t ident)
7486 {
7487 	dring_info_t	*dp = NULL;
7488 
7489 	if ((dp = lane->dringp) == NULL) {
7490 		return (NULL);
7491 	} else {
7492 		if (dp->ident == ident)
7493 			return (dp);
7494 
7495 		while (dp != NULL) {
7496 			if (dp->ident == ident)
7497 				break;
7498 			dp = dp->next;
7499 		}
7500 	}
7501 
7502 	return (dp);
7503 }
7504 
7505 /*
7506  * Set the default lane attributes. These are copied into
7507  * the attr msg we send to our peer. If they are not acceptable
7508  * then (currently) the handshake ends.
7509  */
7510 static void
7511 vsw_set_lane_attr(vsw_t *vswp, lane_t *lp)
7512 {
7513 	bzero(lp, sizeof (lane_t));
7514 
7515 	READ_ENTER(&vswp->if_lockrw);
7516 	ether_copy(&(vswp->if_addr), &(lp->addr));
7517 	RW_EXIT(&vswp->if_lockrw);
7518 
7519 	lp->mtu = VSW_MTU;
7520 	lp->addr_type = ADDR_TYPE_MAC;
7521 	lp->xfer_mode = VIO_DRING_MODE;
7522 	lp->ack_freq = 0;	/* for shared mode */
7523 
7524 	mutex_enter(&lp->seq_lock);
7525 	lp->seq_num = VNET_ISS;
7526 	mutex_exit(&lp->seq_lock);
7527 }
7528 
7529 /*
7530  * Verify that the attributes are acceptable.
7531  *
7532  * FUTURE: If some attributes are not acceptable, change them
7533  * our desired values.
7534  */
7535 static int
7536 vsw_check_attr(vnet_attr_msg_t *pkt, vsw_port_t *port)
7537 {
7538 	int	ret = 0;
7539 
7540 	D1(NULL, "vsw_check_attr enter\n");
7541 
7542 	/*
7543 	 * Note we currently only support in-band descriptors
7544 	 * and descriptor rings, not packet based transfer (VIO_PKT_MODE)
7545 	 */
7546 	if ((pkt->xfer_mode != VIO_DESC_MODE) &&
7547 			(pkt->xfer_mode != VIO_DRING_MODE)) {
7548 		D2(NULL, "vsw_check_attr: unknown mode %x\n",
7549 			pkt->xfer_mode);
7550 		ret = 1;
7551 	}
7552 
7553 	/* Only support MAC addresses at moment. */
7554 	if ((pkt->addr_type != ADDR_TYPE_MAC) || (pkt->addr == 0)) {
7555 		D2(NULL, "vsw_check_attr: invalid addr_type %x, "
7556 			"or address 0x%llx\n", pkt->addr_type,
7557 			pkt->addr);
7558 		ret = 1;
7559 	}
7560 
7561 	/*
7562 	 * MAC address supplied by device should match that stored
7563 	 * in the vsw-port OBP node. Need to decide what to do if they
7564 	 * don't match, for the moment just warn but don't fail.
7565 	 */
7566 	if (bcmp(&pkt->addr, &port->p_macaddr, ETHERADDRL) != 0) {
7567 		DERR(NULL, "vsw_check_attr: device supplied address "
7568 			"0x%llx doesn't match node address 0x%llx\n",
7569 			pkt->addr, port->p_macaddr);
7570 	}
7571 
7572 	/*
7573 	 * Ack freq only makes sense in pkt mode, in shared
7574 	 * mode the ring descriptors say whether or not to
7575 	 * send back an ACK.
7576 	 */
7577 	if ((pkt->xfer_mode == VIO_DRING_MODE) &&
7578 				(pkt->ack_freq > 0)) {
7579 		D2(NULL, "vsw_check_attr: non zero ack freq "
7580 			" in SHM mode\n");
7581 		ret = 1;
7582 	}
7583 
7584 	/*
7585 	 * Note: for the moment we only support ETHER
7586 	 * frames. This may change in the future.
7587 	 */
7588 	if ((pkt->mtu > VSW_MTU) || (pkt->mtu <= 0)) {
7589 		D2(NULL, "vsw_check_attr: invalid MTU (0x%llx)\n",
7590 			pkt->mtu);
7591 		ret = 1;
7592 	}
7593 
7594 	D1(NULL, "vsw_check_attr exit\n");
7595 
7596 	return (ret);
7597 }
7598 
7599 /*
7600  * Returns 1 if there is a problem, 0 otherwise.
7601  */
7602 static int
7603 vsw_check_dring_info(vio_dring_reg_msg_t *pkt)
7604 {
7605 	_NOTE(ARGUNUSED(pkt))
7606 
7607 	int	ret = 0;
7608 
7609 	D1(NULL, "vsw_check_dring_info enter\n");
7610 
7611 	if ((pkt->num_descriptors == 0) ||
7612 		(pkt->descriptor_size == 0) ||
7613 		(pkt->ncookies != 1)) {
7614 		DERR(NULL, "vsw_check_dring_info: invalid dring msg");
7615 		ret = 1;
7616 	}
7617 
7618 	D1(NULL, "vsw_check_dring_info exit\n");
7619 
7620 	return (ret);
7621 }
7622 
7623 /*
7624  * Returns 1 if two memory cookies match. Otherwise returns 0.
7625  */
7626 static int
7627 vsw_mem_cookie_match(ldc_mem_cookie_t *m1, ldc_mem_cookie_t *m2)
7628 {
7629 	if ((m1->addr != m2->addr) ||
7630 		(m2->size != m2->size)) {
7631 		return (0);
7632 	} else {
7633 		return (1);
7634 	}
7635 }
7636 
7637 /*
7638  * Returns 1 if ring described in reg message matches that
7639  * described by dring_info structure. Otherwise returns 0.
7640  */
7641 static int
7642 vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg)
7643 {
7644 	if ((msg->descriptor_size != dp->descriptor_size) ||
7645 		(msg->num_descriptors != dp->num_descriptors) ||
7646 		(msg->ncookies != dp->ncookies) ||
7647 		!(vsw_mem_cookie_match(&msg->cookie[0], &dp->cookie[0]))) {
7648 		return (0);
7649 	} else {
7650 		return (1);
7651 	}
7652 
7653 }
7654 
7655 static caddr_t
7656 vsw_print_ethaddr(uint8_t *a, char *ebuf)
7657 {
7658 	(void) sprintf(ebuf, "%x:%x:%x:%x:%x:%x",
7659 	    a[0], a[1], a[2], a[3], a[4], a[5]);
7660 	return (ebuf);
7661 }
7662 
7663 /*
7664  * Reset and free all the resources associated with
7665  * the channel.
7666  */
7667 static void
7668 vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir)
7669 {
7670 	dring_info_t		*dp, *dpp;
7671 	lane_t			*lp = NULL;
7672 	int			rv = 0;
7673 
7674 	ASSERT(ldcp != NULL);
7675 
7676 	D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id);
7677 
7678 	if (dir == INBOUND) {
7679 		D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane"
7680 			" of channel %lld", __func__, ldcp->ldc_id);
7681 		lp = &ldcp->lane_in;
7682 	} else {
7683 		D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane"
7684 			" of channel %lld", __func__, ldcp->ldc_id);
7685 		lp = &ldcp->lane_out;
7686 	}
7687 
7688 	lp->lstate = VSW_LANE_INACTIV;
7689 	mutex_enter(&lp->seq_lock);
7690 	lp->seq_num = VNET_ISS;
7691 	mutex_exit(&lp->seq_lock);
7692 	if (lp->dringp) {
7693 		if (dir == INBOUND) {
7694 			dp = lp->dringp;
7695 			while (dp != NULL) {
7696 				dpp = dp->next;
7697 				if (dp->handle != NULL)
7698 					(void) ldc_mem_dring_unmap(dp->handle);
7699 				kmem_free(dp, sizeof (dring_info_t));
7700 				dp = dpp;
7701 			}
7702 		} else {
7703 			/*
7704 			 * unbind, destroy exported dring, free dring struct
7705 			 */
7706 			dp = lp->dringp;
7707 			rv = vsw_free_ring(dp);
7708 		}
7709 		if (rv == 0) {
7710 			lp->dringp = NULL;
7711 		}
7712 	}
7713 
7714 	D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id);
7715 }
7716 
7717 /*
7718  * Free ring and all associated resources.
7719  */
7720 static int
7721 vsw_free_ring(dring_info_t *dp)
7722 {
7723 	vsw_private_desc_t	*paddr = NULL;
7724 	dring_info_t		*dpp;
7725 	int			i, rv = 1;
7726 
7727 	while (dp != NULL) {
7728 		mutex_enter(&dp->dlock);
7729 		dpp = dp->next;
7730 		if (dp->priv_addr != NULL) {
7731 			/*
7732 			 * First unbind and free the memory handles
7733 			 * stored in each descriptor within the ring.
7734 			 */
7735 			for (i = 0; i < VSW_RING_NUM_EL; i++) {
7736 				paddr = (vsw_private_desc_t *)
7737 						dp->priv_addr + i;
7738 				if (paddr->memhandle != NULL) {
7739 					if (paddr->bound == 1) {
7740 						rv = ldc_mem_unbind_handle(
7741 							paddr->memhandle);
7742 
7743 						if (rv != 0) {
7744 							DERR(NULL, "error "
7745 							"unbinding handle for "
7746 							"ring 0x%llx at pos %d",
7747 							dp, i);
7748 							mutex_exit(&dp->dlock);
7749 							return (rv);
7750 						}
7751 						paddr->bound = 0;
7752 					}
7753 
7754 					rv = ldc_mem_free_handle(
7755 							paddr->memhandle);
7756 					if (rv != 0) {
7757 						DERR(NULL, "error freeing "
7758 							"handle for ring "
7759 							"0x%llx at pos %d",
7760 							dp, i);
7761 						mutex_exit(&dp->dlock);
7762 						return (rv);
7763 					}
7764 					paddr->memhandle = NULL;
7765 				}
7766 				mutex_destroy(&paddr->dstate_lock);
7767 			}
7768 			kmem_free(dp->priv_addr, (sizeof (vsw_private_desc_t)
7769 					* VSW_RING_NUM_EL));
7770 		}
7771 
7772 		/*
7773 		 * Now unbind and destroy the ring itself.
7774 		 */
7775 		if (dp->handle != NULL) {
7776 			(void) ldc_mem_dring_unbind(dp->handle);
7777 			(void) ldc_mem_dring_destroy(dp->handle);
7778 		}
7779 
7780 		if (dp->data_addr != NULL) {
7781 			kmem_free(dp->data_addr, dp->data_sz);
7782 		}
7783 
7784 		mutex_exit(&dp->dlock);
7785 		mutex_destroy(&dp->dlock);
7786 		mutex_destroy(&dp->restart_lock);
7787 		kmem_free(dp, sizeof (dring_info_t));
7788 
7789 		dp = dpp;
7790 	}
7791 	return (0);
7792 }
7793 
7794 /*
7795  * Debugging routines
7796  */
7797 static void
7798 display_state(void)
7799 {
7800 	vsw_t		*vswp;
7801 	vsw_port_list_t	*plist;
7802 	vsw_port_t 	*port;
7803 	vsw_ldc_list_t	*ldcl;
7804 	vsw_ldc_t 	*ldcp;
7805 
7806 	cmn_err(CE_NOTE, "***** system state *****");
7807 
7808 	for (vswp = vsw_head; vswp; vswp = vswp->next) {
7809 		plist = &vswp->plist;
7810 		READ_ENTER(&plist->lockrw);
7811 		cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n",
7812 			vswp->instance, plist->num_ports);
7813 
7814 		for (port = plist->head; port != NULL; port = port->p_next) {
7815 			ldcl = &port->p_ldclist;
7816 			cmn_err(CE_CONT, "port %d : %d ldcs attached\n",
7817 				port->p_instance, ldcl->num_ldcs);
7818 			READ_ENTER(&ldcl->lockrw);
7819 			ldcp = ldcl->head;
7820 			for (; ldcp != NULL; ldcp = ldcp->ldc_next) {
7821 				cmn_err(CE_CONT, "chan %lu : dev %d : "
7822 					"status %d : phase %u\n",
7823 					ldcp->ldc_id, ldcp->dev_class,
7824 					ldcp->ldc_status, ldcp->hphase);
7825 				cmn_err(CE_CONT, "chan %lu : lsession %lu : "
7826 					"psession %lu\n",
7827 					ldcp->ldc_id,
7828 					ldcp->local_session,
7829 					ldcp->peer_session);
7830 
7831 				cmn_err(CE_CONT, "Inbound lane:\n");
7832 				display_lane(&ldcp->lane_in);
7833 				cmn_err(CE_CONT, "Outbound lane:\n");
7834 				display_lane(&ldcp->lane_out);
7835 			}
7836 			RW_EXIT(&ldcl->lockrw);
7837 		}
7838 		RW_EXIT(&plist->lockrw);
7839 	}
7840 	cmn_err(CE_NOTE, "***** system state *****");
7841 }
7842 
7843 static void
7844 display_lane(lane_t *lp)
7845 {
7846 	dring_info_t	*drp;
7847 
7848 	cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n",
7849 		lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu);
7850 	cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n",
7851 		lp->addr_type, lp->addr, lp->xfer_mode);
7852 	cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp);
7853 
7854 	cmn_err(CE_CONT, "Dring info:\n");
7855 	for (drp = lp->dringp; drp != NULL; drp = drp->next) {
7856 		cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n",
7857 			drp->num_descriptors, drp->descriptor_size);
7858 		cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->handle);
7859 		cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n",
7860 			(uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr);
7861 		cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n",
7862 			drp->ident, drp->end_idx);
7863 		display_ring(drp);
7864 	}
7865 }
7866 
7867 static void
7868 display_ring(dring_info_t *dringp)
7869 {
7870 	uint64_t		i;
7871 	uint64_t		priv_count = 0;
7872 	uint64_t		pub_count = 0;
7873 	vnet_public_desc_t	*pub_addr = NULL;
7874 	vsw_private_desc_t	*priv_addr = NULL;
7875 
7876 	for (i = 0; i < VSW_RING_NUM_EL; i++) {
7877 		if (dringp->pub_addr != NULL) {
7878 			pub_addr = (vnet_public_desc_t *)dringp->pub_addr + i;
7879 
7880 			if (pub_addr->hdr.dstate == VIO_DESC_FREE)
7881 				pub_count++;
7882 		}
7883 
7884 		if (dringp->priv_addr != NULL) {
7885 			priv_addr =
7886 				(vsw_private_desc_t *)dringp->priv_addr + i;
7887 
7888 			if (priv_addr->dstate == VIO_DESC_FREE)
7889 				priv_count++;
7890 		}
7891 	}
7892 	cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n",
7893 			i, priv_count, pub_count);
7894 }
7895 
7896 static void
7897 dump_flags(uint64_t state)
7898 {
7899 	int	i;
7900 
7901 	typedef struct flag_name {
7902 		int	flag_val;
7903 		char	*flag_name;
7904 	} flag_name_t;
7905 
7906 	flag_name_t	flags[] = {
7907 		VSW_VER_INFO_SENT, "VSW_VER_INFO_SENT",
7908 		VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV",
7909 		VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV",
7910 		VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT",
7911 		VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV",
7912 		VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT",
7913 		VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT",
7914 		VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV",
7915 		VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT",
7916 		VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV",
7917 		VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT",
7918 		VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV",
7919 		VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT",
7920 		VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV",
7921 		VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT",
7922 		VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV",
7923 		VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT",
7924 		VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV",
7925 		VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT",
7926 		VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV",
7927 		VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT",
7928 		VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV",
7929 		VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT",
7930 		VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV",
7931 		VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT",
7932 		VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV",
7933 		VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT",
7934 		VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV",
7935 		VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT",
7936 		VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV",
7937 		VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE"};
7938 
7939 	DERR(NULL, "DUMP_FLAGS: %llx\n", state);
7940 	for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) {
7941 		if (state & flags[i].flag_val)
7942 			DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name);
7943 	}
7944 }
7945