xref: /illumos-gate/usr/src/uts/sun4v/io/vsw.c (revision 0c44d0008f52b6a42b9c01d3b344661217520a68)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/errno.h>
31 #include <sys/debug.h>
32 #include <sys/time.h>
33 #include <sys/sysmacros.h>
34 #include <sys/systm.h>
35 #include <sys/user.h>
36 #include <sys/stropts.h>
37 #include <sys/stream.h>
38 #include <sys/strlog.h>
39 #include <sys/strsubr.h>
40 #include <sys/cmn_err.h>
41 #include <sys/cpu.h>
42 #include <sys/kmem.h>
43 #include <sys/conf.h>
44 #include <sys/ddi.h>
45 #include <sys/sunddi.h>
46 #include <sys/ksynch.h>
47 #include <sys/stat.h>
48 #include <sys/kstat.h>
49 #include <sys/vtrace.h>
50 #include <sys/strsun.h>
51 #include <sys/dlpi.h>
52 #include <sys/ethernet.h>
53 #include <net/if.h>
54 #include <sys/varargs.h>
55 #include <sys/machsystm.h>
56 #include <sys/modctl.h>
57 #include <sys/modhash.h>
58 #include <sys/mac.h>
59 #include <sys/taskq.h>
60 #include <sys/note.h>
61 #include <sys/mach_descrip.h>
62 #include <sys/mac.h>
63 #include <sys/mdeg.h>
64 #include <sys/ldc.h>
65 #include <sys/vsw_fdb.h>
66 #include <sys/vsw.h>
67 #include <sys/vio_mailbox.h>
68 #include <sys/vnet_mailbox.h>
69 #include <sys/vnet_common.h>
70 
71 /*
72  * Function prototypes.
73  */
74 static	int vsw_attach(dev_info_t *, ddi_attach_cmd_t);
75 static	int vsw_detach(dev_info_t *, ddi_detach_cmd_t);
76 static	int vsw_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
77 static	void vsw_get_md_properties(vsw_t *vswp);
78 static	int vsw_setup_layer2(vsw_t *);
79 static	int vsw_setup_layer3(vsw_t *);
80 
81 /* MAC layer routines */
82 static	int vsw_mac_attach(vsw_t *vswp);
83 static	void vsw_mac_detach(vsw_t *vswp);
84 static void vsw_notify_cb(void *, mac_notify_type_t);
85 static void vsw_rx_cb(void *, mac_resource_handle_t, mblk_t *);
86 static mblk_t *vsw_tx_msg(vsw_t *, mblk_t *);
87 static int vsw_mac_register(vsw_t *);
88 static int vsw_mac_unregister(vsw_t *);
89 static uint64_t vsw_m_stat(void *arg, enum mac_stat);
90 static void vsw_m_stop(void *arg);
91 static int vsw_m_start(void *arg);
92 static int vsw_m_unicst(void *arg, const uint8_t *);
93 static int vsw_m_multicst(void *arg, boolean_t, const uint8_t *);
94 static int vsw_m_promisc(void *arg, boolean_t);
95 static mblk_t *vsw_m_tx(void *arg, mblk_t *);
96 static void vsw_m_resources(void *arg);
97 static void vsw_m_ioctl(void *arg, queue_t *q, mblk_t *mp);
98 
99 /* MDEG routines */
100 static	void vsw_mdeg_register(vsw_t *vswp);
101 static	void vsw_mdeg_unregister(vsw_t *vswp);
102 static	int vsw_mdeg_cb(void *cb_argp, mdeg_result_t *);
103 
104 /* Port add/deletion routines */
105 static	int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
106 static	int vsw_port_attach(vsw_t *vswp, int p_instance,
107 	uint64_t *ldcids, int nids, struct ether_addr *macaddr);
108 static	int vsw_detach_ports(vsw_t *vswp);
109 static	int vsw_port_detach(vsw_t *vswp, int p_instance);
110 static	int vsw_port_delete(vsw_port_t *port);
111 static	int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id);
112 static	int vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id);
113 static	int vsw_init_ldcs(vsw_port_t *port);
114 static	int vsw_uninit_ldcs(vsw_port_t *port);
115 static	int vsw_ldc_init(vsw_ldc_t *ldcp);
116 static	int vsw_ldc_uninit(vsw_ldc_t *ldcp);
117 static	int vsw_drain_ldcs(vsw_port_t *port);
118 static	int vsw_drain_port_taskq(vsw_port_t *port);
119 static	void vsw_marker_task(void *);
120 static	vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);
121 static	int vsw_plist_del_node(vsw_t *, vsw_port_t *port);
122 
123 /* Interrupt routines */
124 static	uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg);
125 
126 /* Handshake routines */
127 static	void vsw_restart_handshake(vsw_ldc_t *);
128 static	int vsw_check_flag(vsw_ldc_t *, int, uint64_t);
129 static	void vsw_next_milestone(vsw_ldc_t *);
130 static	int vsw_supported_version(vio_ver_msg_t *);
131 
132 /* Data processing routines */
133 static void vsw_process_pkt(void *);
134 static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t);
135 static void vsw_process_ctrl_pkt(void *);
136 static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *);
137 static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *);
138 static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *);
139 static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *);
140 static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *);
141 static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *);
142 static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t);
143 static void vsw_process_data_dring_pkt(vsw_ldc_t *, void *);
144 static void vsw_process_data_raw_pkt(vsw_ldc_t *, void *);
145 static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *);
146 static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t);
147 
148 /* Switching/data transmit routines */
149 static	void vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
150 	    vsw_port_t *port, mac_resource_handle_t);
151 static	void vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
152 	    vsw_port_t *port, mac_resource_handle_t);
153 static	int vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller,
154 	    vsw_port_t *port);
155 static	int vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller,
156 	    vsw_port_t *port);
157 static	int vsw_portsend(vsw_port_t *, mblk_t *);
158 static	int vsw_dringsend(vsw_ldc_t *, mblk_t *);
159 static	int vsw_descrsend(vsw_ldc_t *, mblk_t *);
160 
161 /* Packet creation routines */
162 static void vsw_send_ver(vsw_ldc_t *);
163 static void vsw_send_attr(vsw_ldc_t *);
164 static vio_dring_reg_msg_t *vsw_create_dring_info_pkt(vsw_ldc_t *);
165 static void vsw_send_dring_info(vsw_ldc_t *);
166 static void vsw_send_rdx(vsw_ldc_t *);
167 
168 static void vsw_send_msg(vsw_ldc_t *, void *, int);
169 
170 /* Forwarding database (FDB) routines */
171 static	int vsw_add_fdb(vsw_t *vswp, vsw_port_t *port);
172 static	int vsw_del_fdb(vsw_t *vswp, vsw_port_t *port);
173 static	vsw_port_t *vsw_lookup_fdb(vsw_t *vswp, struct ether_header *);
174 static	int vsw_add_rem_mcst(vnet_mcast_msg_t *, vsw_port_t *);
175 static	int vsw_add_mcst(vsw_t *, uint8_t, uint64_t, void *);
176 static	int vsw_del_mcst(vsw_t *, uint8_t, uint64_t, void *);
177 static	void vsw_del_addr(uint8_t, void *, uint64_t);
178 static	void vsw_del_mcst_port(vsw_port_t *);
179 static	void vsw_del_mcst_vsw(vsw_t *);
180 
181 /* Dring routines */
182 static dring_info_t *vsw_create_dring(vsw_ldc_t *);
183 static void vsw_create_privring(vsw_ldc_t *);
184 static int vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp);
185 static int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **,
186     int *);
187 static void vsw_dring_priv2pub(vsw_private_desc_t *);
188 static dring_info_t *vsw_ident2dring(lane_t *, uint64_t);
189 
190 static void vsw_set_lane_attr(vsw_t *, lane_t *);
191 static int vsw_check_attr(vnet_attr_msg_t *, vsw_port_t *);
192 static int vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg);
193 static int vsw_mem_cookie_match(ldc_mem_cookie_t *, ldc_mem_cookie_t *);
194 static int vsw_check_dring_info(vio_dring_reg_msg_t *);
195 
196 /* Misc support routines */
197 static	caddr_t vsw_print_ethaddr(uint8_t *addr, char *ebuf);
198 
199 static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t);
200 static int vsw_free_ring(dring_info_t *);
201 
202 /* Debugging routines */
203 static void dump_flags(uint64_t);
204 static void display_state(void);
205 static void display_lane(lane_t *);
206 static void display_ring(dring_info_t *);
207 
208 int	vsw_num_handshakes = 3;		/* # of handshake attempts */
209 int	vsw_wretries = 100;		/* # of write attempts */
210 
211 /*
212  * mode specific frame switching function
213  */
214 void		(*vsw_switch_frame)(vsw_t *, mblk_t *, int, vsw_port_t *,
215 			mac_resource_handle_t);
216 
217 static	struct	cb_ops	vsw_cb_ops = {
218 	nulldev,			/* cb_open */
219 	nulldev,			/* cb_close */
220 	nodev,				/* cb_strategy */
221 	nodev,				/* cb_print */
222 	nodev,				/* cb_dump */
223 	nodev,				/* cb_read */
224 	nodev,				/* cb_write */
225 	nodev,				/* cb_ioctl */
226 	nodev,				/* cb_devmap */
227 	nodev,				/* cb_mmap */
228 	nodev,				/* cb_segmap */
229 	nochpoll,			/* cb_chpoll */
230 	ddi_prop_op,			/* cb_prop_op */
231 	NULL,				/* cb_stream */
232 	D_MP,				/* cb_flag */
233 	CB_REV,				/* rev */
234 	nodev,				/* int (*cb_aread)() */
235 	nodev				/* int (*cb_awrite)() */
236 };
237 
238 static	struct	dev_ops	vsw_ops = {
239 	DEVO_REV,		/* devo_rev */
240 	0,			/* devo_refcnt */
241 	vsw_getinfo,		/* devo_getinfo */
242 	nulldev,		/* devo_identify */
243 	nulldev,		/* devo_probe */
244 	vsw_attach,		/* devo_attach */
245 	vsw_detach,		/* devo_detach */
246 	nodev,			/* devo_reset */
247 	&vsw_cb_ops,		/* devo_cb_ops */
248 	(struct bus_ops *)NULL,	/* devo_bus_ops */
249 	ddi_power		/* devo_power */
250 };
251 
252 extern	struct	mod_ops	mod_driverops;
253 static struct modldrv vswmodldrv = {
254 	&mod_driverops,
255 	"sun4v Virtual Switch Driver %I%",
256 	&vsw_ops,
257 };
258 
259 #define	LDC_ENTER_LOCK(ldcp)	\
260 				mutex_enter(&((ldcp)->ldc_cblock));\
261 				mutex_enter(&((ldcp)->ldc_txlock));
262 #define	LDC_EXIT_LOCK(ldcp)	\
263 				mutex_exit(&((ldcp)->ldc_txlock));\
264 				mutex_exit(&((ldcp)->ldc_cblock));
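/*
 * (Editorial note) These macros fix the per-channel lock ordering:
 * the callback lock (ldc_cblock) is always taken before the transmit
 * lock (ldc_txlock) and the two are dropped in the reverse order,
 * which avoids deadlock between the callback and transmit paths.
 */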
265 
266 /* Driver soft state ptr  */
267 static void	*vsw_state;
268 
269 /*
270  * Linked list of "vsw_t" structures - one per instance.
271  */
272 vsw_t		*vsw_head = NULL;
273 krwlock_t	vsw_rw;
274 
275 /*
276  * Property names
277  */
278 static char vdev_propname[] = "virtual-device";
279 static char vsw_propname[] = "virtual-network-switch";
280 static char physdev_propname[] = "vsw-phys-dev";
281 static char smode_propname[] = "vsw-switch-mode";
282 static char macaddr_propname[] = "local-mac-address";
283 static char remaddr_propname[] = "remote-mac-address";
284 static char ldcids_propname[] = "ldc-ids";
285 static char chan_propname[] = "channel-endpoint";
286 static char id_propname[] = "id";
287 static char reg_propname[] = "reg";
288 
289 /* supported versions */
290 static	ver_sup_t	vsw_versions[] = { {1, 0} };
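/*
 * (Editorial note) Only major/minor version 1.0 is listed; a peer's
 * version message is presumably validated against this table by
 * vsw_supported_version(), declared above, during the handshake.
 */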
291 
292 /*
293  * Matching criteria passed to the MDEG to register interest
294  * in changes to 'virtual-device-port' nodes identified by their
295  * 'id' property.
296  */
297 static md_prop_match_t vport_prop_match[] = {
298 	{ MDET_PROP_VAL,    "id"   },
299 	{ MDET_LIST_END,    NULL    }
300 };
301 
302 static mdeg_node_match_t vport_match = { "virtual-device-port",
303 						vport_prop_match };
304 
305 /*
306  * Specification of an MD node passed to the MDEG to filter any
307  * 'vport' nodes that do not belong to the specified node. This
308  * template is copied for each vsw instance and filled in with
309  * the appropriate 'cfg-handle' value before being passed to the MDEG.
310  */
311 static mdeg_prop_spec_t vsw_prop_template[] = {
312 	{ MDET_PROP_STR,    "name",		vsw_propname },
313 	{ MDET_PROP_VAL,    "cfg-handle",	NULL	},
314 	{ MDET_LIST_END,    NULL,		NULL	}
315 };
316 
317 #define	VSW_SET_MDEG_PROP_INST(specp, val)	(specp)[1].ps_val = (val);
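/*
 * (Editorial note) Index [1] of the copied template is the
 * "cfg-handle" entry above, so this macro plugs the per-instance
 * value into that slot before the spec is passed to the MDEG.
 */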
318 
319 /*
320  * Print debug messages - set to 0x1f to enable all msgs
321  * or 0x0 to turn all off.
322  */
323 int vswdbg = 0x0;
324 
325 /*
326  * debug levels:
327  * 0x01:	Function entry/exit tracing
328  * 0x02:	Internal function messages
329  * 0x04:	Verbose internal messages
330  * 0x08:	Warning messages
331  * 0x10:	Error messages
332  */
333 
334 static void
335 vswdebug(vsw_t *vswp, const char *fmt, ...)
336 {
337 	char buf[512];
338 	va_list ap;
339 
340 	va_start(ap, fmt);
341 	(void) vsprintf(buf, fmt, ap);
342 	va_end(ap);
343 
344 	if (vswp == NULL)
345 		cmn_err(CE_CONT, "%s\n", buf);
346 	else
347 		cmn_err(CE_CONT, "vsw%d: %s\n", vswp->instance, buf);
348 }
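/*
 * (Editorial note) vsprintf() performs no bounds checking on 'buf',
 * so callers must keep formatted messages under 512 bytes;
 * vsnprintf(buf, sizeof (buf), fmt, ap) would be the safer call here.
 */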
349 
350 /*
351  * For the moment the state dump routines have their own
352  * private flag.
353  */
354 #define	DUMP_STATE	0
355 
356 #if DUMP_STATE
357 
358 #define	DUMP_TAG(tag) \
359 {			\
360 	D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype); \
361 	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype);	\
362 	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env);	\
363 }
364 
365 #define	DUMP_TAG_PTR(tag) \
366 {			\
367 	D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype); \
368 	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype);	\
369 	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env);	\
370 }
371 
372 #define	DUMP_FLAGS(flags) dump_flags(flags);
373 #define	DISPLAY_STATE()	display_state()
374 
375 #else
376 
377 #define	DUMP_TAG(tag)
378 #define	DUMP_TAG_PTR(tag)
379 #define	DUMP_FLAGS(state)
380 #define	DISPLAY_STATE()
381 
382 #endif	/* DUMP_STATE */
383 
384 #ifdef DEBUG
385 
386 #define	D1		\
387 if (vswdbg & 0x01)	\
388 	vswdebug
389 
390 #define	D2		\
391 if (vswdbg & 0x02)	\
392 	vswdebug
393 
394 #define	D3		\
395 if (vswdbg & 0x04)	\
396 	vswdebug
397 
398 #define	DWARN		\
399 if (vswdbg & 0x08)	\
400 	vswdebug
401 
402 #define	DERR		\
403 if (vswdbg & 0x10)	\
404 	vswdebug
405 
406 #else
407 
408 #define	DERR		if (0)	vswdebug
409 #define	DWARN		if (0)	vswdebug
410 #define	D1		if (0)	vswdebug
411 #define	D2		if (0)	vswdebug
412 #define	D3		if (0)	vswdebug
413 
414 #endif	/* DEBUG */
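/*
 * Example (illustrative): with vswdbg set to 0x18 only DWARN (0x08)
 * and DERR (0x10) messages are emitted, e.g.
 *
 *	DERR(vswp, "%s: failed to attach port", __func__);
 *
 * expands to "if (vswdbg & 0x10) vswdebug(...)" in DEBUG kernels and
 * to unreachable code ("if (0)") otherwise.
 */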
415 
416 static struct modlinkage modlinkage = {
417 	MODREV_1,
418 	&vswmodldrv,
419 	NULL
420 };
421 
422 int
423 _init(void)
424 {
425 	int status;
426 
427 	rw_init(&vsw_rw, NULL, RW_DRIVER, NULL);
428 
429 	status = ddi_soft_state_init(&vsw_state, sizeof (vsw_t), 1);
430 	if (status != 0) {
431 		return (status);
432 	}
433 
434 	mac_init_ops(&vsw_ops, "vsw");
435 	status = mod_install(&modlinkage);
436 	if (status != 0) {
437 		ddi_soft_state_fini(&vsw_state);
438 	}
439 	return (status);
440 }
441 
442 int
443 _fini(void)
444 {
445 	int status;
446 
447 	status = mod_remove(&modlinkage);
448 	if (status != 0)
449 		return (status);
450 	mac_fini_ops(&vsw_ops);
451 	ddi_soft_state_fini(&vsw_state);
452 
453 	rw_destroy(&vsw_rw);
454 
455 	return (status);
456 }
457 
458 int
459 _info(struct modinfo *modinfop)
460 {
461 	return (mod_info(&modlinkage, modinfop));
462 }
463 
464 static int
465 vsw_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
466 {
467 	vsw_t		*vswp;
468 	int		smode, instance, i;
469 	char		hashname[MAXNAMELEN];
470 	char		qname[TASKQ_NAMELEN];
471 	int		rv = 1;
472 	enum		{ PROG_init = 0x0, PROG_if_lock = 0x1,
473 				PROG_fdb = 0x2, PROG_mfdb = 0x4,
474 				PROG_report_dev = 0x8, PROG_plist = 0x10,
475 				PROG_taskq = 0x20}
476 			progress;
477 
478 	progress = PROG_init;
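	/*
	 * 'progress' records how far attach has got so that the
	 * failure path (vsw_attach_fail below) unwinds only those
	 * resources which were actually allocated.
	 */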
479 
480 	switch (cmd) {
481 	case DDI_ATTACH:
482 		break;
483 	case DDI_RESUME:
484 		/* nothing to do for this non-device */
485 		return (DDI_SUCCESS);
486 	case DDI_PM_RESUME:
487 	default:
488 		return (DDI_FAILURE);
489 	}
490 
491 	instance = ddi_get_instance(dip);
492 	if (ddi_soft_state_zalloc(vsw_state, instance) != DDI_SUCCESS) {
493 		DERR(NULL, "vsw%d: ddi_soft_state_zalloc failed", instance);
494 		return (DDI_FAILURE);
495 	}
496 	vswp = ddi_get_soft_state(vsw_state, instance);
497 
498 	if (vswp == NULL) {
499 		DERR(NULL, "vsw%d: ddi_get_soft_state failed", instance);
500 		goto vsw_attach_fail;
501 	}
502 
503 	vswp->dip = dip;
504 	vswp->instance = instance;
505 	ddi_set_driver_private(dip, (caddr_t)vswp);
506 
507 	rw_init(&vswp->if_lockrw, NULL, RW_DRIVER, NULL);
508 
509 	progress |= PROG_if_lock;
510 
511 	/*
512 	 * User specifies (via MD) an array of switching modes in
513 	 * decreasing order of preference. Default mode is always
514 	 * layer 2 (mac switching), so init array with that value.
515 	 */
516 	vswp->smode_idx = 0;
517 	for (i = 0; i < NUM_SMODES; i++)
518 		vswp->smode[i] = VSW_LAYER2;
519 
520 	/*
521 	 * Get the various properties such as physical device name
522 	 * (vsw-phys-dev), switch mode etc from the MD.
523 	 */
524 	vsw_get_md_properties(vswp);
525 
526 	/* setup the unicast forwarding database  */
527 	(void) snprintf(hashname, MAXNAMELEN, "vsw_unicst_table-%d",
528 							vswp->instance);
529 	D2(vswp, "creating unicast hash table (%s)...", hashname);
530 	vswp->fdb = mod_hash_create_ptrhash(hashname, VSW_NCHAINS,
531 		mod_hash_null_valdtor, sizeof (void *));
532 
533 	progress |= PROG_fdb;
534 
535 	/* setup the multicast forwarding database */
536 	(void) snprintf(hashname, MAXNAMELEN, "vsw_mcst_table-%d",
537 							vswp->instance);
538 	D2(vswp, "creating multicast hash table (%s)...", hashname);
539 	rw_init(&vswp->mfdbrw, NULL, RW_DRIVER, NULL);
540 	vswp->mfdb = mod_hash_create_ptrhash(hashname, VSW_NCHAINS,
541 			mod_hash_null_valdtor, sizeof (void *));
542 
543 	progress |= PROG_mfdb;
544 
545 	/*
546 	 * create lock protecting list of multicast addresses
547 	 * which could come via m_multicst() entry point when plumbed.
548 	 */
549 	mutex_init(&vswp->mca_lock, NULL, MUTEX_DRIVER, NULL);
550 	vswp->mcap = NULL;
551 
552 	ddi_report_dev(vswp->dip);
553 
554 	progress |= PROG_report_dev;
555 
556 	WRITE_ENTER(&vsw_rw);
557 	vswp->next = vsw_head;
558 	vsw_head = vswp;
559 	RW_EXIT(&vsw_rw);
560 
561 	/* setup the port list */
562 	rw_init(&vswp->plist.lockrw, NULL, RW_DRIVER, NULL);
563 	vswp->plist.head = NULL;
564 
565 	progress |= PROG_plist;
566 
567 	/*
568 	 * Create the taskq which will process all the VIO
569 	 * control messages.
570 	 */
571 	(void) snprintf(qname, TASKQ_NAMELEN, "vsw_taskq%d", vswp->instance);
572 	if ((vswp->taskq_p = ddi_taskq_create(vswp->dip, qname, 1,
573 					TASKQ_DEFAULTPRI, 0)) == NULL) {
574 		cmn_err(CE_WARN, "Unable to create task queue");
575 		goto vsw_attach_fail;
576 	}
577 
578 	progress |= PROG_taskq;
579 
580 	/* select best switching mode */
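	/*
	 * Try each mode in order of preference; rv was initialised to
	 * 1 at declaration, so if no mode can be set up it is still 1
	 * when the loop completes and the attach fails below.
	 */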
581 	for (i = 0; i < NUM_SMODES; i++) {
582 		smode = vswp->smode[i];
583 		switch (smode) {
584 		case VSW_LAYER2:
585 			rv = vsw_setup_layer2(vswp);
586 			break;
587 
588 		case VSW_LAYER2_PROMISC:
589 			rv = vsw_setup_layer2(vswp);
590 			break;
591 
592 		case VSW_LAYER3:
593 			rv = vsw_setup_layer3(vswp);
594 			break;
595 
596 		default:
597 			DERR(vswp, "unknown switch mode");
598 			break;
599 		}
600 
601 		if (rv == 0) {
602 			vswp->smode_idx = i;
603 			break;
604 		}
605 	}
606 
607 	if (rv == 1) {
608 		cmn_err(CE_WARN, "Unable to setup switching mode");
609 		goto vsw_attach_fail;
610 	}
611 
612 	D2(vswp, "Operating in mode %d", vswp->smode[vswp->smode_idx]);
613 
614 	/*
615 	 * Register with the MAC layer as a network device so
616 	 * we can be plumbed if desired.
617 	 *
618 	 * Do this in both layer 2 and layer 3 mode.
619 	 */
620 	vswp->if_state &= ~VSW_IF_UP;
621 	vswp->if_macp = NULL;
622 	vswp->if_mrh = NULL;
623 	if (vswp->mdprops & VSW_MD_MACADDR) {
624 		if (vsw_mac_register(vswp) != 0) {
625 			cmn_err(CE_WARN, "Unable to register as provider "
626 				" with MAC layer, continuing with attach");
627 		}
628 	}
629 
630 	/*
631 	 * Now we have everything setup, register for MD change
632 	 * events.
633 	 */
634 	vsw_mdeg_register(vswp);
635 
636 	return (DDI_SUCCESS);
637 
638 vsw_attach_fail:
639 	DERR(NULL, "vsw_attach: failed");
640 
641 	if (progress & PROG_taskq)
642 		ddi_taskq_destroy(vswp->taskq_p);
643 
644 	if (progress & PROG_plist)
645 		rw_destroy(&vswp->plist.lockrw);
646 
647 	if (progress & PROG_report_dev) {
648 		ddi_remove_minor_node(dip, NULL);
649 		mutex_destroy(&vswp->mca_lock);
650 	}
651 
652 	if (progress & PROG_mfdb) {
653 		mod_hash_destroy_hash(vswp->mfdb);
654 		vswp->mfdb = NULL;
655 		rw_destroy(&vswp->mfdbrw);
656 	}
657 
658 	if (progress & PROG_fdb) {
659 		mod_hash_destroy_hash(vswp->fdb);
660 		vswp->fdb = NULL;
661 	}
662 
663 	if (progress & PROG_if_lock)
664 		rw_destroy(&vswp->if_lockrw);
665 
666 	ddi_soft_state_free(vsw_state, instance);
667 	return (DDI_FAILURE);
668 }
669 
670 static int
671 vsw_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
672 {
673 	vsw_t	**vswpp, *vswp;
674 	int 	instance;
675 
676 	instance = ddi_get_instance(dip);
677 	vswp = ddi_get_soft_state(vsw_state, instance);
678 
679 	if (vswp == NULL) {
680 		return (DDI_FAILURE);
681 	}
682 
683 	switch (cmd) {
684 	case DDI_DETACH:
685 		break;
686 	case DDI_SUSPEND:
687 	case DDI_PM_SUSPEND:
688 	default:
689 		return (DDI_FAILURE);
690 	}
691 
692 	D2(vswp, "detaching instance %d", instance);
693 
694 	if (vswp->mdprops & VSW_MD_MACADDR) {
695 		if (vsw_mac_unregister(vswp) != 0) {
696 			cmn_err(CE_WARN, "Unable to detach from MAC layer");
697 			return (DDI_FAILURE);
698 		}
699 	}
700 	rw_destroy(&vswp->if_lockrw);
701 
702 	vsw_mdeg_unregister(vswp);
703 
704 	if ((vswp->smode[vswp->smode_idx] == VSW_LAYER2) ||
705 		(vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) {
706 		vsw_mac_detach(vswp);
707 	}
708 
709 	if (vsw_detach_ports(vswp) != 0) {
710 		cmn_err(CE_WARN, "Unable to detach ports");
711 		return (DDI_FAILURE);
712 	}
713 
714 	/*
715 	 * Remove this instance from any entries it may be on in
716 	 * the hash table by using the list of addresses maintained
717 	 * in the vsw_t structure.
718 	 */
719 	vsw_del_mcst_vsw(vswp);
720 
721 	vswp->mcap = NULL;
722 	mutex_destroy(&vswp->mca_lock);
723 
724 	/*
725 	 * By now any pending tasks have finished and the underlying
726 	 * ldc's have been destroyed, so it's safe to delete the control
727 	 * message taskq.
728 	 */
729 	if (vswp->taskq_p != NULL)
730 		ddi_taskq_destroy(vswp->taskq_p);
731 
732 	/*
733 	 * At this stage all the data pointers in the hash table
734 	 * should be NULL, as all the ports have been removed and will
735 	 * have deleted themselves from the port lists which the data
736 	 * pointers point to. Hence we can destroy the table using the
737 	 * default destructors.
738 	 */
739 	D2(vswp, "vsw_detach: destroying hash tables..");
740 	mod_hash_destroy_hash(vswp->fdb);
741 	vswp->fdb = NULL;
742 
743 	WRITE_ENTER(&vswp->mfdbrw);
744 	mod_hash_destroy_hash(vswp->mfdb);
745 	vswp->mfdb = NULL;
746 	RW_EXIT(&vswp->mfdbrw);
747 	rw_destroy(&vswp->mfdbrw);
748 
749 	ddi_remove_minor_node(dip, NULL);
750 
751 	rw_destroy(&vswp->plist.lockrw);
752 	WRITE_ENTER(&vsw_rw);
753 	for (vswpp = &vsw_head; *vswpp; vswpp = &(*vswpp)->next) {
754 		if (*vswpp == vswp) {
755 			*vswpp = vswp->next;
756 			break;
757 		}
758 	}
759 	RW_EXIT(&vsw_rw);
760 	ddi_soft_state_free(vsw_state, instance);
761 
762 	return (DDI_SUCCESS);
763 }
764 
765 static int
766 vsw_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
767 {
768 	_NOTE(ARGUNUSED(dip))
769 
770 	vsw_t	*vswp = NULL;
771 	dev_t	dev = (dev_t)arg;
772 	int	instance;
773 
774 	instance = getminor(dev);
775 
776 	switch (infocmd) {
777 	case DDI_INFO_DEVT2DEVINFO:
778 		if ((vswp = ddi_get_soft_state(vsw_state, instance)) == NULL) {
779 			*result = NULL;
780 			return (DDI_FAILURE);
781 		}
782 		*result = vswp->dip;
783 		return (DDI_SUCCESS);
784 
785 	case DDI_INFO_DEVT2INSTANCE:
786 		*result = (void *)(uintptr_t)instance;
787 		return (DDI_SUCCESS);
788 
789 	default:
790 		*result = NULL;
791 		return (DDI_FAILURE);
792 	}
793 }
794 
795 /*
796  * Get the properties from our MD node.
797  */
798 static void
799 vsw_get_md_properties(vsw_t *vswp)
800 {
801 	md_t		*mdp = NULL;
802 	int		num_nodes = 0;
803 	int		len = 0, listsz = 0;
804 	int		num_vdev = 0;
805 	int		i, idx;
806 	boolean_t	found_node = B_FALSE;
807 	char		*smode = NULL;
808 	char		*curr_mode = NULL;
809 	char		*physname = NULL;
810 	char		*node_name = NULL;
811 	char		*dev;
812 	uint64_t 	macaddr = 0;
813 	uint64_t	md_inst, obp_inst;
814 	mde_cookie_t	*listp = NULL;
815 	mde_cookie_t	rootnode;
816 
817 	D1(vswp, "%s: enter", __func__);
818 
819 	/*
820 	 * Further down we compare the obp 'reg' property to the
821 	 * 'cfg-handle' property in the vsw MD node to determine
822 	 * if the node refers to this particular instance. So if
823 	 * we can't read the obp value then there is no point
824 	 * in proceeding further.
825 	 */
826 	if (ddi_prop_exists(DDI_DEV_T_ANY, vswp->dip,
827 			DDI_PROP_DONTPASS, reg_propname) != 1) {
828 		cmn_err(CE_WARN, "Unable to read %s property "
829 			"from OBP device node", reg_propname);
830 		return;
831 	}
832 
833 	obp_inst = ddi_prop_get_int(DDI_DEV_T_ANY, vswp->dip,
834 		DDI_PROP_DONTPASS, reg_propname, 0);
835 
836 	D2(vswp, "%s: obp_inst 0x%llx", __func__, obp_inst);
837 
838 	if ((mdp = md_get_handle()) == NULL) {
839 		DERR(vswp, "%s: unable to init MD", __func__);
840 		return;
841 	}
842 
843 	if ((num_nodes = md_node_count(mdp)) <= 0) {
844 		DERR(vswp, "%s: invalid number of nodes found %d",
845 			__func__, num_nodes);
846 		(void) md_fini_handle(mdp);
847 		return;
848 	}
849 
850 	D2(vswp, "%s: %d nodes in total in MD", __func__, num_nodes);
851 
852 	/* allocate enough space for node list */
853 	listsz = num_nodes * sizeof (mde_cookie_t);
854 	listp = kmem_zalloc(listsz, KM_SLEEP);
855 
856 	rootnode = md_root_node(mdp);
857 
858 	/* Get the list of virtual devices */
859 	num_vdev = md_scan_dag(mdp, rootnode,
860 		md_find_name(mdp, vdev_propname),
861 		md_find_name(mdp, "fwd"), listp);
862 
863 	if (num_vdev <= 0) {
864 		DERR(vswp, "%s: didn't find any virtual-device nodes in MD",
865 			__func__);
866 		goto md_prop_exit;
867 	}
868 
869 	D2(vswp, "%s: %d virtual-device nodes found", __func__, num_vdev);
870 
871 	/* Look for the virtual switch nodes in the list */
872 	for (idx = 0; idx < num_vdev; idx++) {
873 		if (md_get_prop_str(mdp, listp[idx],
874 				"name", &node_name) != 0) {
875 			DERR(vswp, "%s: unable to get node name", __func__);
876 			continue;
877 
878 		}
879 
880 		if (strcmp(node_name, vsw_propname) == 0) {
881 			/* Virtual switch node */
882 			if (md_get_prop_val(mdp, listp[idx],
883 				"cfg-handle", &md_inst) != 0) {
884 				DERR(vswp, "%s: unable to get cfg-handle from"
885 					" node %d", __func__, idx);
886 				goto md_prop_exit;
887 			} else if (md_inst == obp_inst) {
888 				D2(vswp, "%s: found matching node (%d)"
889 					" 0x%llx == 0x%llx", __func__, idx,
890 					md_inst, obp_inst);
891 				found_node = B_TRUE;
892 				break;
893 			}
894 		}
895 	}
896 
897 	if (!found_node) {
898 		DWARN(vswp, "%s: couldn't find correct vsw node", __func__);
899 		goto md_prop_exit;
900 	}
901 
902 	/*
903 	 * Now, having found the correct node, get the various properties.
904 	 */
905 
906 	if (md_get_prop_data(mdp, listp[idx], physdev_propname,
907 				(uint8_t **)(&physname), &len) != 0) {
908 		cmn_err(CE_WARN, "%s: unable to get name(s) of physical "
909 			"device(s) from MD", __func__);
910 	} else if ((strlen(physname) + 1) > LIFNAMSIZ) {
911 		cmn_err(CE_WARN, "%s is too long a device name", physname);
912 	} else {
913 		(void) strncpy(vswp->physname, physname, strlen(physname) + 1);
914 		vswp->mdprops |= VSW_MD_PHYSNAME;
915 		D2(vswp, "%s: using first device specified (%s)",
916 			__func__, vswp->physname);
917 	}
918 
919 
920 #ifdef DEBUG
921 	/*
922 	 * As a temporary measure to aid testing we check to see if there
923 	 * is a vsw.conf file present. If there is we use the value of the
924 	 * vsw_physname property in the file as the name of the physical
925 	 * device, overriding the value from the MD.
926 	 *
927 	 * There may be multiple devices listed, but for the moment
928 	 * we just use the first one.
929 	 */
930 	if (ddi_prop_lookup_string(DDI_DEV_T_ANY, vswp->dip, 0,
931 		"vsw_physname", &dev) == DDI_PROP_SUCCESS) {
932 		if ((strlen(dev) + 1) > LIFNAMSIZ) {
933 			cmn_err(CE_WARN, "%s is too long a device name", dev);
934 		} else {
935 			cmn_err(CE_NOTE, "%s: using device name (%s) from "
936 				"config file", __func__, dev);
937 
938 			(void) strncpy(vswp->physname, dev, strlen(dev) + 1);
939 			vswp->mdprops |= VSW_MD_PHYSNAME;
940 		}
941 
942 		ddi_prop_free(dev);
943 
944 	}
945 #endif
946 
947 	/* local mac address */
948 	if (md_get_prop_val(mdp, listp[idx],
949 			macaddr_propname, &macaddr) != 0) {
950 		cmn_err(CE_WARN, "%s: unable to get local MAC address",
951 								__func__);
952 	} else {
953 		READ_ENTER(&vswp->if_lockrw);
954 		for (i = ETHERADDRL - 1; i >= 0; i--) {
955 			vswp->if_addr.ether_addr_octet[i] = macaddr & 0xFF;
956 			macaddr >>= 8;
957 		}
958 		RW_EXIT(&vswp->if_lockrw);
959 		vswp->mdprops |= VSW_MD_MACADDR;
960 	}
961 
962 	/*
963 	 * Get the switch-mode property. The modes are listed in
964 	 * decreasing order of preference, i.e. the preferred mode is
965 	 * the first item in the list.
966 	 */
967 	len = 0;
968 	if (md_get_prop_data(mdp, listp[idx], smode_propname,
969 				(uint8_t **)(&smode), &len) != 0) {
970 		/*
971 		 * Unable to get switch-mode property, so just use
972 		 * default values which vswp->smode[] array has already
973 		 * been pre-populated with, namely layer2.
974 		 */
975 		cmn_err(CE_WARN, "%s: unable to get switch mode property, "
976 			"defaulting to layer 2 mode", __func__);
977 	} else {
978 		i = 0;
979 		curr_mode = smode;
980 		/*
981 		 * Modes of operation:
982 		 * 'switched'	 - layer 2 switching, underlying HW in
983 		 *			non-promiscuous mode.
984 		 * 'promiscuous' - layer 2 switching, underlying HW in
985 		 *			promiscuous mode.
986 		 * 'routed'	 - layer 3 (i.e. IP) routing, underlying HW
987 		 *			in non-promiscuous mode.
988 		 */
989 		while ((curr_mode < (smode + len)) && (i < NUM_SMODES)) {
990 			D2(vswp, "%s: curr_mode = [%s]", __func__, curr_mode);
991 			if (strcmp(curr_mode, "switched") == 0)
992 				vswp->smode[i] = VSW_LAYER2;
993 			else if (strcmp(curr_mode, "promiscuous") == 0)
994 				vswp->smode[i] = VSW_LAYER2_PROMISC;
995 			else if (strcmp(curr_mode, "routed") == 0)
996 				vswp->smode[i] = VSW_LAYER3;
997 			else {
998 				DERR(vswp, "%s: unknown mode %s",
999 					__func__, curr_mode);
1000 				/* default to layer 2 */
1001 				vswp->smode[i] = VSW_LAYER2;
1002 			}
1003 			curr_mode += strlen(curr_mode) + 1;
1004 			i++;
1005 		}
1006 
1007 		vswp->mdprops |= VSW_MD_SMODE;
1008 	}
1009 
1010 md_prop_exit:
1011 	(void) md_fini_handle(mdp);
1012 
1013 	kmem_free(listp, listsz);
1014 
1015 	D1(vswp, "%s: exit", __func__);
1016 }
1017 
1018 static int
1019 vsw_setup_layer2(vsw_t *vswp)
1020 {
1021 	int		rv = 0;
1022 
1023 	D1(vswp, "%s: enter", __func__);
1024 
1025 	vsw_switch_frame = vsw_switch_l2_frame;
1026 
1027 	/*
1028 	 * Attempt to link into the MAC layer so we can get
1029 	 * and send packets out over the physical adapter.
1030 	 */
1031 	if (vswp->mdprops & VSW_MD_PHYSNAME) {
1032 		if (vsw_mac_attach(vswp) != 0) {
1033 			/*
1034 			 * Registration with the MAC layer has failed,
1035 			 * so return 1 so that we can fall back to the
1036 			 * next preferred switching method.
1037 			 */
1038 			cmn_err(CE_WARN, "!unable to join as MAC layer "
1039 				"client, continuing with attach");
1040 			rv = 1;
1041 		}
1042 	} else {
1043 		/* No physical device name found in MD */
1044 		DERR(vswp, "%s: no physical device name specified", __func__);
1045 		rv = 1;
1046 	}
1047 
1048 	D1(vswp, "%s: exit", __func__);
1049 
1050 	return (rv);
1051 }
1052 
1053 static int
1054 vsw_setup_layer3(vsw_t *vswp)
1055 {
1056 	D1(vswp, "%s: enter", __func__);
1057 
1058 	D2(vswp, "%s: operating in layer 3 mode", __func__);
1059 	vsw_switch_frame = vsw_switch_l3_frame;
1060 
1061 	D1(vswp, "%s: exit", __func__);
1062 
1063 	return (0);
1064 }
1065 
1066 /*
1067  * Link into the MAC layer to gain access to the services provided by
1068  * the underlying physical device driver (which should also have
1069  * registered with the MAC layer).
1070  *
1071  * Only when in layer 2 mode.
1072  */
1073 static int
1074 vsw_mac_attach(vsw_t *vswp)
1075 {
1076 	D1(vswp, "vsw_mac_attach: enter");
1077 
1078 	vswp->mh = NULL;
1079 	vswp->mrh = NULL;
1080 	vswp->mnh = NULL;
1081 
1082 	ASSERT(vswp->mdprops & VSW_MD_PHYSNAME);
1083 
1084 	if ((mac_open(vswp->physname, 0, &vswp->mh)) != 0) {
1085 		cmn_err(CE_WARN, "mac_open %s failed", vswp->physname);
1086 		goto mac_fail_exit;
1087 	}
1088 
1089 	D2(vswp, "vsw_mac_attach: using device %s", vswp->physname);
1090 
1091 	/* register for changes in the interface */
1092 	vswp->mnh = mac_notify_add(vswp->mh, vsw_notify_cb, (void *)vswp);
1093 
1094 	/* register our rx callback function */
1095 	vswp->mrh = mac_rx_add(vswp->mh, vsw_rx_cb, (void *)vswp);
1096 
1097 	/* get the MAC tx fn */
1098 	vswp->txinfo = mac_tx_get(vswp->mh);
1099 
1100 	/* start the interface */
1101 	if (mac_start(vswp->mh) != 0) {
1102 		cmn_err(CE_WARN, "could not start mac interface");
1103 		goto mac_fail_exit;
1104 	}
1105 
1106 	/* get and store original promisc setting */
1107 	vswp->init_promisc = mac_promisc_get(vswp->mh, MAC_DEVPROMISC);
1108 
1109 	/*
1110 	 * FUTURE: When we have the ability to set multiple unicast
1111 	 * mac addresses then we won't have to set the device into
1112 	 * promisc mode, but for the moment it's the only way we
1113 	 * can see pkts that the logical domains we are serving are
1114 	 * interested in.
1115 	 */
1116 	if ((vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC) &&
1117 					(vswp->init_promisc == B_FALSE)) {
1118 		DERR(vswp, "vsw_mac_attach: enabling promisc mode..");
1119 
1120 		if (mac_promisc_set(vswp->mh, B_TRUE, MAC_DEVPROMISC) != 0) {
1121 			DERR(vswp, "vsw_mac_attach: unable to set device"
1122 				" into promiscuous mode");
1123 			goto mac_fail_exit;
1124 		}
1125 	}
1126 
1127 	D1(vswp, "vsw_mac_attach: exit");
1128 	return (0);
1129 
1130 mac_fail_exit:
1131 	if (vswp->mh != NULL) {
1132 		mac_promisc_set(vswp->mh, vswp->init_promisc, MAC_DEVPROMISC);
1133 		if (vswp->mrh != NULL)
1134 			mac_rx_remove(vswp->mh, vswp->mrh);
1135 
1136 		if (vswp->mnh != NULL)
1137 			mac_notify_remove(vswp->mh, vswp->mnh);
1138 
1139 		mac_close(vswp->mh);
1140 	}
1141 
1142 	vswp->mrh = NULL;
1143 	vswp->mnh = NULL;
1144 	vswp->mh = NULL;
1145 	vswp->txinfo = NULL;
1146 
1147 	D1(vswp, "vsw_mac_attach: fail exit");
1148 	return (1);
1149 }
1150 
1151 static void
1152 vsw_mac_detach(vsw_t *vswp)
1153 {
1154 	D1(vswp, "vsw_mac_detach: enter");
1155 
1156 	if (vswp->mh != NULL) {
1157 		/* restore promisc to original setting */
1158 		mac_promisc_set(vswp->mh, vswp->init_promisc, MAC_DEVPROMISC);
1159 		if (vswp->mrh != NULL)
1160 			mac_rx_remove(vswp->mh, vswp->mrh);
1161 
1162 		if (vswp->mnh != NULL)
1163 			mac_notify_remove(vswp->mh, vswp->mnh);
1164 
1165 		mac_close(vswp->mh);
1166 	}
1167 
1168 	vswp->mrh = NULL;
1169 	vswp->mnh = NULL;
1170 	vswp->mh = NULL;
1171 	vswp->txinfo = NULL;
1172 
1173 	D1(vswp, "vsw_mac_detach: exit");
1174 }
1175 
1176 /*
1177  * Get notified of changes to the interface.
1178  *
1179  * For the moment we brute force the interface back
1180  * into promisc mode if it is unset (e.g. by snoop).
1181  * When we have the ability to set multiple mac addresses,
1182  * we will need to see if this is necessary.
1183  */
1184 static void
1185 vsw_notify_cb(void *arg, mac_notify_type_t type)
1186 {
1187 	vsw_t		*vswp = (vsw_t *)arg;
1188 
1189 	switch (type) {
1190 	case MAC_NOTE_PROMISC:
1191 		vswp->txinfo = mac_tx_get(vswp->mh);
1192 		if (mac_promisc_get(vswp->mh, MAC_DEVPROMISC) == B_TRUE) {
1193 			D2(vswp, "%s: still in PROMISC mode", __func__);
1194 		} else {
1195 			D2(vswp, "%s: now in NON-PROMISC mode", __func__);
1196 			D2(vswp, "...re-enabling");
1197 			mac_promisc_set(vswp->mh, B_TRUE, MAC_DEVPROMISC);
1198 		}
1199 		break;
1200 	default:
1201 		break;
1202 	}
1203 }
1204 
1205 /*
1206  * receive callback routine. Invoked by MAC layer when there
1207  * are pkts being passed up from physical device.
1208  *
1209  * PERF: It may be more efficient when the card is in promisc
1210  * mode to check the dest address of the pkts here (against
1211  * the FDB) rather than checking later. Needs to be investigated.
1212  */
1213 static void
1214 vsw_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
1215 {
1216 	_NOTE(ARGUNUSED(mrh))
1217 
1218 	vsw_t		*vswp = (vsw_t *)arg;
1219 
1220 	ASSERT(vswp != NULL);
1221 
1222 	D1(vswp, "vsw_rx_cb: enter");
1223 
1224 	/* switch the chain of packets received */
1225 	vsw_switch_frame(vswp, mp, VSW_PHYSDEV, NULL, NULL);
1226 
1227 	D1(vswp, "vsw_rx_cb: exit");
1228 }
1229 
1230 /*
1231  * Send a message out over the physical device via the MAC layer.
1232  *
1233  * Returns any mblks that it was unable to transmit.
1234  */
1235 static mblk_t *
1236 vsw_tx_msg(vsw_t *vswp, mblk_t *mp)
1237 {
1238 	const mac_txinfo_t	*mtp;
1239 	mblk_t			*nextp;
1240 
1241 	if (vswp->mh == NULL) {
1242 		DERR(vswp, "vsw_tx_msg: dropping pkts: no tx routine avail");
1243 		return (mp);
1244 	} else {
1245 		for (;;) {
1246 			nextp = mp->b_next;
1247 			mp->b_next = NULL;
1248 
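			/*
			 * Hand the chain to the MAC layer one mblk at
			 * a time; if the tx routine cannot accept a
			 * pkt it hands it back, in which case the
			 * untransmitted remainder of the chain is
			 * re-linked and returned to the caller.
			 */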
1249 			mtp = vswp->txinfo;
1250 			if ((mp = mtp->mt_fn(mtp->mt_arg, mp)) != NULL) {
1251 				mp->b_next = nextp;
1252 				break;
1253 			}
1254 
1255 			if ((mp = nextp) == NULL)
1256 				break;
1257 
1258 		}
1259 
1260 	}
1261 
1262 	return (mp);
1263 }
1264 
1265 /*
1266  * Register with the MAC layer as a network device, so we
1267  * can be plumbed if necessary.
1268  */
1269 static int
1270 vsw_mac_register(vsw_t *vswp)
1271 {
1272 	mac_t		*macp = NULL;
1273 	mac_info_t	*mip = NULL;
1274 	int		rv = 0;
1275 
1276 	D1(vswp, "%s: enter", __func__);
1277 
1278 	macp = kmem_zalloc(sizeof (mac_t), KM_SLEEP);
1279 
1280 	/*
1281 	 * Setup the m_info fields.
1282 	 */
1283 	mip = &(macp->m_info);
1284 	mip->mi_media = DL_ETHER;
1285 	mip->mi_sdu_min = 0;
1286 	mip->mi_sdu_max = ETHERMTU;
1287 	mip->mi_cksum = 0;
1288 	mip->mi_poll = DL_CAPAB_POLL;
1289 
1290 	mip->mi_addr_length = ETHERADDRL;
1291 	bcopy(&etherbroadcastaddr, mip->mi_brdcst_addr, ETHERADDRL);
1292 
1293 	READ_ENTER(&vswp->if_lockrw);
1294 	bcopy(&vswp->if_addr, mip->mi_unicst_addr, ETHERADDRL);
1295 	RW_EXIT(&vswp->if_lockrw);
1296 
1297 	MAC_STAT_MIB(mip->mi_stat);
1298 	MAC_STAT_ETHER(mip->mi_stat);
1299 
1300 	/* entry points */
1301 	macp->m_stat = vsw_m_stat;
1302 	macp->m_stop = vsw_m_stop;
1303 	macp->m_start = vsw_m_start;
1304 	macp->m_unicst = vsw_m_unicst;
1305 	macp->m_multicst = vsw_m_multicst;
1306 	macp->m_promisc = vsw_m_promisc;
1307 	macp->m_tx = vsw_m_tx;
1308 	macp->m_resources = vsw_m_resources;
1309 	macp->m_ioctl = vsw_m_ioctl;
1310 
1311 	macp->m_port = 0;
1312 	macp->m_dip = vswp->dip;
1313 	macp->m_ident = MAC_IDENT;
1314 	macp->m_driver = vswp;
1315 
1316 	vswp->if_macp = macp;
1317 
1318 	/* register */
1319 	rv = mac_register(macp);
1320 
1321 	D1(vswp, "%s: exit", __func__);
1322 
1323 	return (rv);
1324 }
1325 
1326 static int
1327 vsw_mac_unregister(vsw_t *vswp)
1328 {
1329 	int		rv = 0;
1330 
1331 	D1(vswp, "%s: enter", __func__);
1332 
1333 	WRITE_ENTER(&vswp->if_lockrw);
1334 
1335 	if (vswp->if_macp != NULL) {
1336 		rv = mac_unregister(vswp->if_macp);
1337 		if (rv != 0) {
1338 			DWARN(vswp, "%s: unable to unregister from MAC "
1339 				"framework", __func__);
1340 
1341 			RW_EXIT(&vswp->if_lockrw);
1342 			D1(vswp, "%s: fail exit", __func__);
1343 			return (rv);
1344 		}
1345 
1346 		/* mark i/f as down and promisc off */
1347 		vswp->if_state &= ~VSW_IF_UP;
1348 
1349 		kmem_free(vswp->if_macp, sizeof (mac_t));
1350 		vswp->if_macp = NULL;
1351 	}
1352 	RW_EXIT(&vswp->if_lockrw);
1353 
1354 	D1(vswp, "%s: exit", __func__);
1355 
1356 	return (rv);
1357 }
1358 
1359 static uint64_t
1360 vsw_m_stat(void *arg, enum mac_stat stat)
1361 {
1362 	vsw_t			*vswp = (vsw_t *)arg;
1363 	const mac_info_t	*mip;
1364 
1365 	D1(vswp, "%s: enter", __func__);
1366 
1367 	if (vswp->mh != NULL)
1368 		mip = mac_info(vswp->mh);
1369 	else
1370 		return (0);
1371 
1372 	if (!mip->mi_stat[stat])
1373 		return (0);
1374 
1375 	/* return stats from underlying device */
1376 	return (mac_stat_get(vswp->mh, stat));
1377 
1378 }
1379 
1380 static void
1381 vsw_m_stop(void *arg)
1382 {
1383 	vsw_t		*vswp = (vsw_t *)arg;
1384 
1385 	D1(vswp, "%s: enter", __func__);
1386 
1387 	WRITE_ENTER(&vswp->if_lockrw);
1388 	vswp->if_state &= ~VSW_IF_UP;
1389 	RW_EXIT(&vswp->if_lockrw);
1390 
1391 	D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state);
1392 }
1393 
1394 static int
1395 vsw_m_start(void *arg)
1396 {
1397 	vsw_t		*vswp = (vsw_t *)arg;
1398 
1399 	D1(vswp, "%s: enter", __func__);
1400 
1401 	WRITE_ENTER(&vswp->if_lockrw);
1402 	vswp->if_state |= VSW_IF_UP;
1403 	RW_EXIT(&vswp->if_lockrw);
1404 
1405 	D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state);
1406 	return (0);
1407 }
1408 
1409 /*
1410  * Change the local interface address.
1411  */
1412 static int
1413 vsw_m_unicst(void *arg, const uint8_t *macaddr)
1414 {
1415 	vsw_t		*vswp = (vsw_t *)arg;
1416 
1417 	D1(vswp, "%s: enter", __func__);
1418 
1419 	WRITE_ENTER(&vswp->if_lockrw);
1420 	ether_copy(macaddr, &vswp->if_addr);
1421 	RW_EXIT(&vswp->if_lockrw);
1422 
1423 	D1(vswp, "%s: exit", __func__);
1424 
1425 	return (0);
1426 }
1427 
1428 static int
1429 vsw_m_multicst(void *arg, boolean_t add, const uint8_t *mca)
1430 {
1431 	vsw_t		*vswp = (vsw_t *)arg;
1432 	mcst_addr_t	*mcst_p = NULL;
1433 	uint64_t	addr = 0x0;
1434 	int		i;
1435 
1436 	D1(vswp, "%s: enter", __func__);
1437 
1438 	/*
1439 	 * Convert address into form that can be used
1440 	 * as hash table key.
1441 	 */
1442 	for (i = 0; i < ETHERADDRL; i++) {
1443 		addr = (addr << 8) | mca[i];
1444 	}
1445 
1446 	D2(vswp, "%s: addr = 0x%llx", __func__, addr);
1447 
1448 	if (add) {
1449 		D2(vswp, "%s: adding multicast", __func__);
1450 		if (vsw_add_mcst(vswp, VSW_LOCALDEV, addr, NULL) == 0) {
1451 			/*
1452 			 * Update the list of multicast addresses
1453 			 * contained within the vsw_t structure to
1454 			 * include this new one.
1455 			 */
1456 			mcst_p = kmem_zalloc(sizeof (mcst_addr_t), KM_NOSLEEP);
1457 			if (mcst_p == NULL) {
1458 				DERR(vswp, "%s unable to alloc mem", __func__);
1459 				return (1);
1460 			}
1461 			mcst_p->addr = addr;
1462 
1463 			mutex_enter(&vswp->mca_lock);
1464 			mcst_p->nextp = vswp->mcap;
1465 			vswp->mcap = mcst_p;
1466 			mutex_exit(&vswp->mca_lock);
1467 
1468 			/*
1469 			 * Call into the underlying driver to program the
1470 			 * address into HW.
1471 			 *
1472 			 * Note:
1473 			 * Can safely ignore the return value as the card
1474 			 * will for the moment always be in promisc mode.
1475 			 * When we can program multiple MAC addresses into the
1476 			 * HW then we will need to care about the return
1477 			 * value here.
1478 			 */
1479 			if (vswp->mh != NULL)
1480 				(void) mac_multicst_add(vswp->mh, mca);
1481 		}
1482 	} else {
1483 		D2(vswp, "%s: removing multicast", __func__);
1484 		/*
1485 		 * Remove the address from the hash table..
1486 		 */
1487 		if (vsw_del_mcst(vswp, VSW_LOCALDEV, addr, NULL) == 0) {
1488 
1489 			/*
1490 			 * ..and then from the list maintained in the
1491 			 * vsw_t structure.
1492 			 */
1493 			vsw_del_addr(VSW_LOCALDEV, vswp, addr);
1494 
1495 			if (vswp->mh != NULL)
1496 				(void) mac_multicst_remove(vswp->mh, mca);
1497 		}
1498 	}
1499 
1500 	D1(vswp, "%s: exit", __func__);
1501 
1502 	return (0);
1503 }
1504 
1505 static int
1506 vsw_m_promisc(void *arg, boolean_t on)
1507 {
1508 	vsw_t		*vswp = (vsw_t *)arg;
1509 
1510 	D1(vswp, "%s: enter", __func__);
1511 
1512 	WRITE_ENTER(&vswp->if_lockrw);
1513 	if (on)
1514 		vswp->if_state |= VSW_IF_PROMISC;
1515 	else
1516 		vswp->if_state &= ~VSW_IF_PROMISC;
1517 	RW_EXIT(&vswp->if_lockrw);
1518 
1519 	D1(vswp, "%s: exit", __func__);
1520 
1521 	return (0);
1522 }
1523 
1524 static mblk_t *
1525 vsw_m_tx(void *arg, mblk_t *mp)
1526 {
1527 	vsw_t		*vswp = (vsw_t *)arg;
1528 
1529 	D1(vswp, "%s: enter", __func__);
1530 
1531 	vsw_switch_frame(vswp, mp, VSW_LOCALDEV, NULL, NULL);
1532 
1533 	D1(vswp, "%s: exit", __func__);
1534 
1535 	return (NULL);
1536 }
1537 
1538 static void
1539 vsw_m_resources(void *arg)
1540 {
1541 	vsw_t		*vswp = (vsw_t *)arg;
1542 	mac_rx_fifo_t	mrf;
1543 
1544 	D1(vswp, "%s: enter", __func__);
1545 
1546 	mrf.mrf_type = MAC_RX_FIFO;
1547 	mrf.mrf_blank = NULL;
1548 	mrf.mrf_arg = (void *)vswp;
1549 	mrf.mrf_normal_blank_time = 0;
1550 	mrf.mrf_normal_pkt_count = 0;
1551 
1552 	WRITE_ENTER(&vswp->if_lockrw);
1553 	vswp->if_mrh = mac_resource_add(vswp->if_macp, (mac_resource_t *)&mrf);
1554 	RW_EXIT(&vswp->if_lockrw);
1555 
1556 	D1(vswp, "%s: exit", __func__);
1557 }
1558 
1559 static void
1560 vsw_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
1561 {
1562 	vsw_t		*vswp = (vsw_t *)arg;
1563 
1564 	D1(vswp, "%s: enter", __func__);
1565 
1566 	miocnak(q, mp, 0, ENOTSUP);
1567 
1568 	D1(vswp, "%s: exit", __func__);
1569 }
1570 
1571 /*
1572  * Register for machine description (MD) updates.
1573  */
1574 static void
1575 vsw_mdeg_register(vsw_t *vswp)
1576 {
1577 	mdeg_prop_spec_t	*pspecp;
1578 	mdeg_node_spec_t	*inst_specp;
1579 	mdeg_handle_t		mdeg_hdl;
1580 	size_t			templatesz;
1581 	int			inst, rv;
1582 
1583 	D1(vswp, "%s: enter", __func__);
1584 
1585 	inst = ddi_prop_get_int(DDI_DEV_T_ANY, vswp->dip,
1586 		DDI_PROP_DONTPASS, reg_propname, -1);
1587 	if (inst == -1) {
1588 		DERR(vswp, "%s: unable to get %s property",
1589 						__func__, reg_propname);
1590 		return;
1591 	}
1592 
1593 	D2(vswp, "%s: instance %d registering with mdeg", __func__, inst);
1594 
1595 	/*
1596 	 * Allocate and initialize a per-instance copy
1597 	 * of the global property spec array that will
1598 	 * uniquely identify this vsw instance.
1599 	 */
1600 	templatesz = sizeof (vsw_prop_template);
1601 	pspecp = kmem_zalloc(templatesz, KM_SLEEP);
1602 
1603 	bcopy(vsw_prop_template, pspecp, templatesz);
1604 
1605 	VSW_SET_MDEG_PROP_INST(pspecp, inst);
1606 
1607 	/* initialize the complete prop spec structure */
1608 	inst_specp = kmem_zalloc(sizeof (mdeg_node_spec_t), KM_SLEEP);
1609 	inst_specp->namep = "virtual-device";
1610 	inst_specp->specp = pspecp;
1611 
1612 	/* perform the registration */
1613 	rv = mdeg_register(inst_specp, &vport_match, vsw_mdeg_cb,
1614 	    (void *)vswp, &mdeg_hdl);
1615 
1616 	if (rv != MDEG_SUCCESS) {
1617 		DERR(vswp, "%s: mdeg_register failed (%d)\n", __func__, rv);
1618 		kmem_free(inst_specp, sizeof (mdeg_node_spec_t));
1619 		kmem_free(pspecp, templatesz);
1620 		return;
1621 	}
1622 
1623 	/* save off data that will be needed later */
1624 	vswp->inst_spec = inst_specp;
1625 	vswp->mdeg_hdl = mdeg_hdl;
1626 
1627 	D1(vswp, "%s: exit", __func__);
1628 }
1629 
1630 static void
1631 vsw_mdeg_unregister(vsw_t *vswp)
1632 {
1633 	D1(vswp, "vsw_mdeg_unregister: enter");
1634 
1635 	(void) mdeg_unregister(vswp->mdeg_hdl);
1636 
1637 	if (vswp->inst_spec->specp != NULL) {
1638 		(void) kmem_free(vswp->inst_spec->specp,
1639 			sizeof (vsw_prop_template));
1640 		vswp->inst_spec->specp = NULL;
1641 	}
1642 
1643 	if (vswp->inst_spec != NULL) {
1644 		(void) kmem_free(vswp->inst_spec,
1645 			sizeof (mdeg_node_spec_t));
1646 		vswp->inst_spec = NULL;
1647 	}
1648 
1649 	D1(vswp, "vsw_mdeg_unregister: exit");
1650 }
1651 
1652 static int
1653 vsw_mdeg_cb(void *cb_argp, mdeg_result_t *resp)
1654 {
1655 	vsw_t		*vswp;
1656 	int		idx;
1657 	md_t		*mdp;
1658 	mde_cookie_t	node;
1659 	uint64_t	inst;
1660 
1661 	if (resp == NULL)
1662 		return (MDEG_FAILURE);
1663 
1664 	vswp = (vsw_t *)cb_argp;
1665 
1666 	D1(vswp, "%s: added %d : removed %d : matched %d",
1667 		__func__, resp->added.nelem, resp->removed.nelem,
1668 		resp->match_prev.nelem);
1669 
1670 	/* process added ports */
1671 	for (idx = 0; idx < resp->added.nelem; idx++) {
1672 		mdp = resp->added.mdp;
1673 		node = resp->added.mdep[idx];
1674 
1675 		D2(vswp, "%s: adding node(%d) 0x%lx", __func__, idx, node);
1676 
1677 		if (vsw_port_add(vswp, mdp, &node) != 0) {
1678 			cmn_err(CE_WARN, "Unable to add new port (0x%lx)",
1679 					node);
1680 		}
1681 	}
1682 
1683 	/* process removed ports */
1684 	for (idx = 0; idx < resp->removed.nelem; idx++) {
1685 		mdp = resp->removed.mdp;
1686 		node = resp->removed.mdep[idx];
1687 
1688 		if (md_get_prop_val(mdp, node, id_propname, &inst)) {
1689 			DERR(vswp, "%s: prop(%s) not found port(%d)",
1690 				__func__, id_propname, idx);
1691 			continue;
1692 		}
1693 
1694 		D2(vswp, "%s: removing node(%d) 0x%lx", __func__, idx, node);
1695 
1696 		if (vsw_port_detach(vswp, inst) != 0) {
1697 			cmn_err(CE_WARN, "Unable to remove port %ld", inst);
1698 		}
1699 	}
1700 
1701 	/*
1702 	 * Currently no support for updating already active ports.
1703 	 * So, ignore the match_curr and match_prev arrays for now.
1704 	 */
1705 
1706 	D1(vswp, "%s: exit", __func__);
1707 
1708 	return (MDEG_SUCCESS);
1709 }
1710 
1711 /*
1712  * Add a new port to the system.
1713  *
1714  * Returns 0 on success, 1 on failure.
1715  */
1716 int
1717 vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node)
1718 {
1719 	uint64_t		ldc_id;
1720 	uint8_t			*addrp;
1721 	int			i, addrsz;
1722 	int			num_nodes = 0, nchan = 0;
1723 	int			listsz = 0;
1724 	mde_cookie_t		*listp = NULL;
1725 	struct ether_addr	ea;
1726 	uint64_t		macaddr;
1727 	uint64_t		inst = 0;
1728 	vsw_port_t		*port;
1729 
1730 	if (md_get_prop_val(mdp, *node, id_propname, &inst)) {
1731 		DWARN(vswp, "%s: prop(%s) not found", __func__,
1732 			id_propname);
1733 		return (1);
1734 	}
1735 
1736 	/*
1737 	 * Find the channel endpoint node(s) (which should be under this
1738 	 * port node) which contain the channel id(s).
1739 	 */
1740 	if ((num_nodes = md_node_count(mdp)) <= 0) {
1741 		DERR(vswp, "%s: invalid number of nodes found (%d)",
1742 			__func__, num_nodes);
1743 		return (1);
1744 	}
1745 
1746 	/* allocate enough space for node list */
1747 	listsz = num_nodes * sizeof (mde_cookie_t);
1748 	listp = kmem_zalloc(listsz, KM_SLEEP);
1749 
1750 	nchan = md_scan_dag(mdp, *node,
1751 		md_find_name(mdp, chan_propname),
1752 		md_find_name(mdp, "fwd"), listp);
1753 
1754 	if (nchan <= 0) {
1755 		DWARN(vswp, "%s: no %s nodes found", __func__, chan_propname);
1756 		kmem_free(listp, listsz);
1757 		return (1);
1758 	}
1759 
1760 	D2(vswp, "%s: %d %s nodes found", __func__, nchan, chan_propname);
1761 
1762 	/* use property from first node found */
1763 	if (md_get_prop_val(mdp, listp[0], id_propname, &ldc_id)) {
1764 		DWARN(vswp, "%s: prop(%s) not found\n", __func__,
1765 			id_propname);
1766 		kmem_free(listp, listsz);
1767 		return (1);
1768 	}
1769 
1770 	/* don't need list any more */
1771 	kmem_free(listp, listsz);
1772 
1773 	D2(vswp, "%s: ldc_id 0x%llx", __func__, ldc_id);
1774 
1775 	/* read mac-address property */
1776 	if (md_get_prop_data(mdp, *node, remaddr_propname,
1777 					&addrp, &addrsz)) {
1778 		DWARN(vswp, "%s: prop(%s) not found",
1779 				__func__, remaddr_propname);
1780 		return (1);
1781 	}
1782 
1783 	if (addrsz < ETHERADDRL) {
1784 		DWARN(vswp, "%s: invalid address size", __func__);
1785 		return (1);
1786 	}
1787 
1788 	macaddr = *((uint64_t *)addrp);
1789 	D2(vswp, "%s: remote mac address 0x%llx", __func__, macaddr);
1790 
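	/*
	 * The MD supplies the MAC address as a 64-bit value whose
	 * low-order byte is the final octet, so unpack it into the
	 * ether_addr structure from the tail backwards.
	 */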
1791 	for (i = ETHERADDRL - 1; i >= 0; i--) {
1792 		ea.ether_addr_octet[i] = macaddr & 0xFF;
1793 		macaddr >>= 8;
1794 	}
1795 
1796 	if (vsw_port_attach(vswp, (int)inst, &ldc_id, 1, &ea) != 0) {
1797 		DERR(vswp, "%s: failed to attach port", __func__);
1798 		return (1);
1799 	}
1800 
1801 	port = vsw_lookup_port(vswp, (int)inst);
1802 
1803 	/* just successfully created the port, so it should exist */
1804 	ASSERT(port != NULL);
1805 
1806 	return (0);
1807 }
1808 
1809 /*
1810  * Attach the specified port.
1811  *
1812  * Returns 0 on success, 1 on failure.
1813  */
1814 static int
1815 vsw_port_attach(vsw_t *vswp, int p_instance, uint64_t *ldcids, int nids,
1816 struct ether_addr *macaddr)
1817 {
1818 	vsw_port_list_t		*plist = &vswp->plist;
1819 	vsw_port_t		*port, **prev_port;
1820 	int			i;
1821 
1822 	D1(vswp, "%s: enter : port %d", __func__, p_instance);
1823 
1824 	/* port already exists? */
1825 	READ_ENTER(&plist->lockrw);
1826 	for (port = plist->head; port != NULL; port = port->p_next) {
1827 		if (port->p_instance == p_instance) {
1828 			DWARN(vswp, "%s: port instance %d already attached",
1829 				__func__, p_instance);
1830 			RW_EXIT(&plist->lockrw);
1831 			return (1);
1832 		}
1833 	}
1834 	RW_EXIT(&plist->lockrw);
1835 
1836 	port = kmem_zalloc(sizeof (vsw_port_t), KM_SLEEP);
1837 	port->p_vswp = vswp;
1838 	port->p_instance = p_instance;
1839 	port->p_ldclist.num_ldcs = 0;
1840 	port->p_ldclist.head = NULL;
1841 
1842 	rw_init(&port->p_ldclist.lockrw, NULL, RW_DRIVER, NULL);
1843 
1844 	mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL);
1845 	mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL);
1846 
1847 	mutex_init(&port->ref_lock, NULL, MUTEX_DRIVER, NULL);
1848 	cv_init(&port->ref_cv, NULL, CV_DRIVER, NULL);
1849 
1850 	mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL);
1851 	cv_init(&port->state_cv, NULL, CV_DRIVER, NULL);
1852 	port->state = VSW_PORT_INIT;
1853 
1854 	if (nids > VSW_PORT_MAX_LDCS) {
1855 		D2(vswp, "%s: using first of %d ldc ids",
1856 			__func__, nids);
1857 		nids = VSW_PORT_MAX_LDCS;
1858 	}
1859 
1860 	D2(vswp, "%s: %d nids", __func__, nids);
1861 	for (i = 0; i < nids; i++) {
1862 		D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[i]);
1863 		if (vsw_ldc_attach(port, (uint64_t)ldcids[i]) != 0) {
1864 			DERR(vswp, "%s: ldc_attach failed", __func__);
1865 
1866 			rw_destroy(&port->p_ldclist.lockrw);
1867 
1868 			cv_destroy(&port->ref_cv);
1869 			mutex_destroy(&port->ref_lock);
1870 
1871 			cv_destroy(&port->state_cv);
1872 			mutex_destroy(&port->state_lock);
1873 
1874 			mutex_destroy(&port->tx_lock);
1875 			mutex_destroy(&port->mca_lock);
1876 			kmem_free(port, sizeof (vsw_port_t));
1877 			return (1);
1878 		}
1879 	}
1880 
1881 	ether_copy(macaddr, &port->p_macaddr);
1882 
1883 	WRITE_ENTER(&plist->lockrw);
1884 
1885 	/* create the fdb entry for this port/mac address */
1886 	(void) vsw_add_fdb(vswp, port);
1887 
1888 	/* link it into the list of ports for this vsw instance */
1889 	prev_port = (vsw_port_t **)(&plist->head);
1890 	port->p_next = *prev_port;
1891 	*prev_port = port;
1892 	plist->num_ports++;
1893 	RW_EXIT(&plist->lockrw);
1894 
1895 	/*
1896 	 * Initialise the port and any ldc's under it.
1897 	 */
1898 	(void) vsw_init_ldcs(port);
1899 
1900 	D1(vswp, "%s: exit", __func__);
1901 	return (0);
1902 }
1903 
1904 /*
1905  * Detach the specified port.
1906  *
1907  * Returns 0 on success, 1 on failure.
1908  */
1909 static int
1910 vsw_port_detach(vsw_t *vswp, int p_instance)
1911 {
1912 	vsw_port_t	*port = NULL;
1913 	vsw_port_list_t	*plist = &vswp->plist;
1914 
1915 	D1(vswp, "%s: enter: port id %d", __func__, p_instance);
1916 
1917 	WRITE_ENTER(&plist->lockrw);
1918 
1919 	if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) {
1920 		RW_EXIT(&plist->lockrw);
1921 		return (1);
1922 	}
1923 
1924 	if (vsw_plist_del_node(vswp, port)) {
1925 		RW_EXIT(&plist->lockrw);
1926 		return (1);
1927 	}
1928 
1929 	/* Remove the fdb entry for this port/mac address */
1930 	(void) vsw_del_fdb(vswp, port);
1931 
1932 	/* Remove any multicast addresses.. */
1933 	vsw_del_mcst_port(port);
1934 
1935 	/*
1936 	 * No longer need to hold lock on port list now that we
1937 	 * have unlinked the target port from the list.
1938 	 */
1939 	RW_EXIT(&plist->lockrw);
1940 
1941 	if (vsw_port_delete(port)) {
1942 		return (1);
1943 	}
1944 
1945 	D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance);
1946 	return (0);
1947 }
1948 
1949 /*
1950  * Detach all active ports.
1951  *
1952  * Returns 0 on success, 1 on failure.
1953  */
1954 static int
1955 vsw_detach_ports(vsw_t *vswp)
1956 {
1957 	vsw_port_list_t 	*plist = &vswp->plist;
1958 	vsw_port_t		*port = NULL;
1959 
1960 	D1(vswp, "%s: enter", __func__);
1961 
1962 	WRITE_ENTER(&plist->lockrw);
1963 
1964 	while ((port = plist->head) != NULL) {
1965 		if (vsw_plist_del_node(vswp, port)) {
1966 			DERR(vswp, "%s: Error deleting port %d"
1967 				" from port list", __func__,
1968 				port->p_instance);
1969 			RW_EXIT(&plist->lockrw);
1970 			return (1);
1971 		}
1972 
1973 		/* Remove the fdb entry for this port/mac address */
1974 		(void) vsw_del_fdb(vswp, port);
1975 
1976 		/* Remove any multicast addresses.. */
1977 		vsw_del_mcst_port(port);
1978 
1979 		/*
1980 		 * No longer need to hold the lock on the port list
1981 		 * now that we have unlinked the target port from the
1982 		 * list.
1983 		 */
1984 		RW_EXIT(&plist->lockrw);
1985 		if (vsw_port_delete(port)) {
1986 			DERR(vswp, "%s: Error deleting port %d",
1987 				__func__, port->p_instance);
1988 			return (1);
1989 		}
1990 		WRITE_ENTER(&plist->lockrw);
1991 	}
1992 	RW_EXIT(&plist->lockrw);
1993 
1994 	D1(vswp, "%s: exit", __func__);
1995 
1996 	return (0);
1997 }
1998 
1999 /*
2000  * Delete the specified port.
2001  *
2002  * Returns 0 on success, 1 on failure.
2003  */
2004 static int
2005 vsw_port_delete(vsw_port_t *port)
2006 {
2007 	vsw_ldc_list_t 		*ldcl;
2008 	vsw_t			*vswp = port->p_vswp;
2009 
2010 	D1(vswp, "%s: enter : port id %d", __func__, port->p_instance);
2011 
2012 	(void) vsw_uninit_ldcs(port);
2013 
2014 	/*
2015 	 * Wait for any pending ctrl msg tasks which reference this
2016 	 * port to finish.
2017 	 */
2018 	if (vsw_drain_port_taskq(port))
2019 		return (1);
2020 
2021 	/*
2022 	 * Wait for port reference count to hit zero.
2023 	 */
2024 	mutex_enter(&port->ref_lock);
2025 	while (port->ref_cnt != 0)
2026 		cv_wait(&port->ref_cv, &port->ref_lock);
2027 	mutex_exit(&port->ref_lock);
2028 
2029 	/*
2030 	 * Wait for any active callbacks to finish
2031 	 */
2032 	if (vsw_drain_ldcs(port))
2033 		return (1);
2034 
2035 	ldcl = &port->p_ldclist;
2036 	WRITE_ENTER(&ldcl->lockrw);
2037 	while (ldcl->num_ldcs > 0) {
		if (vsw_ldc_detach(port, ldcl->head->ldc_id) != 0) {
2039 			cmn_err(CE_WARN, "unable to detach ldc %ld",
2040 					ldcl->head->ldc_id);
2041 			RW_EXIT(&ldcl->lockrw);
2042 			return (1);
2043 		}
2044 	}
2045 	RW_EXIT(&ldcl->lockrw);
2046 
2047 	rw_destroy(&port->p_ldclist.lockrw);
2048 
2049 	mutex_destroy(&port->mca_lock);
2050 	mutex_destroy(&port->tx_lock);
2051 	cv_destroy(&port->ref_cv);
2052 	mutex_destroy(&port->ref_lock);
2053 
2054 	cv_destroy(&port->state_cv);
2055 	mutex_destroy(&port->state_lock);
2056 
2057 	kmem_free(port, sizeof (vsw_port_t));
2058 
2059 	D1(vswp, "%s: exit", __func__);
2060 
2061 	return (0);
2062 }
2063 
2064 /*
2065  * Attach a logical domain channel (ldc) under a specified port.
2066  *
2067  * Returns 0 on success, 1 on failure.
2068  */
2069 static int
2070 vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id)
2071 {
2072 	vsw_t 		*vswp = port->p_vswp;
2073 	vsw_ldc_list_t *ldcl = &port->p_ldclist;
2074 	vsw_ldc_t 	*ldcp = NULL;
2075 	ldc_attr_t 	attr;
2076 	ldc_status_t	istatus;
2077 	int 		status = DDI_FAILURE;
2078 
2079 	D1(vswp, "%s: enter", __func__);
2080 
2081 	ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP);
2082 	if (ldcp == NULL) {
2083 		DERR(vswp, "%s: kmem_zalloc failed", __func__);
2084 		return (1);
2085 	}
2086 	ldcp->ldc_id = ldc_id;
2087 
2088 	mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL);
2089 	mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL);
2090 	mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL);
2091 	cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL);
2092 
2093 	/* required for handshake with peer */
2094 	ldcp->local_session = (uint64_t)ddi_get_lbolt();
2095 	ldcp->peer_session = 0;
2096 	ldcp->session_status = 0;
2097 
2098 	mutex_init(&ldcp->hss_lock, NULL, MUTEX_DRIVER, NULL);
2099 	ldcp->hss_id = 1;	/* Initial handshake session id */
2100 
2101 	/* only set for outbound lane, inbound set by peer */
2102 	vsw_set_lane_attr(vswp, &ldcp->lane_out);
2103 
2104 	attr.devclass = LDC_DEV_NT_SVC;
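	/*
	 * Channel attributes: we are the service endpoint of an
	 * unreliable-mode network transport channel, using the
	 * driver's configured queue length (VSW_LDC_QLEN).
	 */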
2105 	attr.instance = ddi_get_instance(vswp->dip);
2106 	attr.mode = LDC_MODE_UNRELIABLE;
2107 	attr.qlen = VSW_LDC_QLEN;
2108 	status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle);
2109 	if (status != 0) {
2110 		DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)",
2111 		    __func__, ldc_id, status);
2112 		mutex_destroy(&ldcp->ldc_txlock);
2113 		mutex_destroy(&ldcp->ldc_cblock);
2114 		cv_destroy(&ldcp->drain_cv);
2115 		mutex_destroy(&ldcp->drain_cv_lock);
2116 		mutex_destroy(&ldcp->hss_lock);
2117 		kmem_free(ldcp, sizeof (vsw_ldc_t));
2118 		return (1);
2119 	}
2120 
2121 	status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp);
2122 	if (status != 0) {
2123 		DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)",
2124 		    __func__, ldc_id, status);
2125 		mutex_destroy(&ldcp->ldc_txlock);
2126 		mutex_destroy(&ldcp->ldc_cblock);
2127 		cv_destroy(&ldcp->drain_cv);
2128 		mutex_destroy(&ldcp->drain_cv_lock);
2129 		mutex_destroy(&ldcp->hss_lock);
2130 		(void) ldc_fini(ldcp->ldc_handle);
2131 		kmem_free(ldcp, sizeof (vsw_ldc_t));
2132 		return (1);
2133 	}
2134 
	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
		DERR(vswp, "%s: ldc_status failed", __func__);
		(void) ldc_unreg_callback(ldcp->ldc_handle);
		(void) ldc_fini(ldcp->ldc_handle);
		mutex_destroy(&ldcp->ldc_txlock);
		mutex_destroy(&ldcp->ldc_cblock);
		cv_destroy(&ldcp->drain_cv);
		mutex_destroy(&ldcp->drain_cv_lock);
		mutex_destroy(&ldcp->hss_lock);
		kmem_free(ldcp, sizeof (vsw_ldc_t));
		return (1);
	}
2140 
2141 	ldcp->ldc_status = istatus;
2142 	ldcp->ldc_port = port;
2143 	ldcp->ldc_vswp = vswp;
2144 
2145 	/* link it into the list of channels for this port */
2146 	WRITE_ENTER(&ldcl->lockrw);
2147 	ldcp->ldc_next = ldcl->head;
2148 	ldcl->head = ldcp;
2149 	ldcl->num_ldcs++;
2150 	RW_EXIT(&ldcl->lockrw);
2151 
2152 	D1(vswp, "%s: exit", __func__);
2153 	return (0);
2154 }
2155 
2156 /*
2157  * Detach a logical domain channel (ldc) belonging to a
2158  * particular port.
2159  *
2160  * Returns 0 on success, 1 on failure.
2161  */
2162 static int
2163 vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id)
2164 {
2165 	vsw_t 		*vswp = port->p_vswp;
	vsw_ldc_t 	*ldcp, **prev_ldcp;
2167 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
2168 	int 		rv;
2169 
	prev_ldcp = &ldcl->head;
	for (; (ldcp = *prev_ldcp) != NULL; prev_ldcp = &ldcp->ldc_next) {
2172 		if (ldcp->ldc_id == ldc_id) {
2173 			break;
2174 		}
2175 	}
2176 
2177 	/* specified ldc id not found */
2178 	if (ldcp == NULL) {
2179 		DERR(vswp, "%s: ldcp = NULL", __func__);
2180 		return (1);
2181 	}
2182 
2183 	D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id);
2184 
2185 	/*
2186 	 * Before we can close the channel we must release any mapped
2187 	 * resources (e.g. drings).
2188 	 */
2189 	vsw_free_lane_resources(ldcp, INBOUND);
2190 	vsw_free_lane_resources(ldcp, OUTBOUND);
2191 
2192 	/*
	 * If the close fails we are in serious trouble, as we won't
	 * be able to delete the parent port.
2195 	 */
2196 	if ((rv = ldc_close(ldcp->ldc_handle)) != 0) {
2197 		DERR(vswp, "%s: error %d closing channel %lld",
2198 			__func__, rv, ldcp->ldc_id);
2199 		return (1);
2200 	}
2201 
2202 	(void) ldc_fini(ldcp->ldc_handle);
2203 
2204 	ldcp->ldc_status = LDC_INIT;
2205 	ldcp->ldc_handle = NULL;
2206 	ldcp->ldc_vswp = NULL;
2207 	mutex_destroy(&ldcp->ldc_txlock);
2208 	mutex_destroy(&ldcp->ldc_cblock);
2209 	cv_destroy(&ldcp->drain_cv);
2210 	mutex_destroy(&ldcp->drain_cv_lock);
2211 	mutex_destroy(&ldcp->hss_lock);
2212 
	/* unlink it from the list */
	*prev_ldcp = ldcp->ldc_next;
2215 	ldcl->num_ldcs--;
2216 	kmem_free(ldcp, sizeof (vsw_ldc_t));
2217 
2218 	return (0);
2219 }
2220 
2221 /*
2222  * Open and attempt to bring up the channel. Note that channel
2223  * can only be brought up if peer has also opened channel.
2224  *
 * Returns 0 if it can open and bring up the channel, otherwise
 * returns 1.
2227  */
2228 static int
2229 vsw_ldc_init(vsw_ldc_t *ldcp)
2230 {
2231 	vsw_t 		*vswp = ldcp->ldc_vswp;
2232 	ldc_status_t	istatus = 0;
2233 	int		rv;
2234 
2235 	D1(vswp, "%s: enter", __func__);
2236 
2237 	LDC_ENTER_LOCK(ldcp);
2238 
2239 	/* don't start at 0 in case clients don't like that */
2240 	ldcp->next_ident = 1;
2241 
2242 	rv = ldc_open(ldcp->ldc_handle);
2243 	if (rv != 0) {
2244 		DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)",
2245 		    __func__, ldcp->ldc_id, rv);
2246 		LDC_EXIT_LOCK(ldcp);
2247 		return (1);
2248 	}
2249 
2250 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
2251 		DERR(vswp, "%s: unable to get status", __func__);
2252 		LDC_EXIT_LOCK(ldcp);
2253 		return (1);
2254 
2255 	} else if (istatus != LDC_OPEN && istatus != LDC_READY) {
2256 		DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY",
2257 		    __func__, ldcp->ldc_id, istatus);
2258 		LDC_EXIT_LOCK(ldcp);
2259 		return (1);
2260 	}
2261 
2262 	ldcp->ldc_status = istatus;
2263 	rv = ldc_up(ldcp->ldc_handle);
2264 	if (rv != 0) {
2265 		/*
2266 		 * Not a fatal error for ldc_up() to fail, as peer
2267 		 * end point may simply not be ready yet.
2268 		 */
2269 		D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__,
2270 			ldcp->ldc_id, rv);
2271 		LDC_EXIT_LOCK(ldcp);
2272 		return (1);
2273 	}
2274 
2275 	/*
2276 	 * ldc_up() call is non-blocking so need to explicitly
2277 	 * check channel status to see if in fact the channel
2278 	 * is UP.
2279 	 */
2280 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
2281 		DERR(vswp, "%s: unable to get status", __func__);
2282 		LDC_EXIT_LOCK(ldcp);
2283 		return (1);
2284 
2285 	} else if (istatus != LDC_UP) {
2286 		DERR(vswp, "%s: id(%lld) status(%d) is not UP",
2287 		    __func__, ldcp->ldc_id, istatus);
2288 	} else {
2289 		ldcp->ldc_status = istatus;
2290 	}
2291 
2292 	LDC_EXIT_LOCK(ldcp);
2293 
2294 	D1(vswp, "%s: exit", __func__);
2295 	return (0);
2296 }
2297 
2298 /* disable callbacks on the channel */
2299 static int
2300 vsw_ldc_uninit(vsw_ldc_t *ldcp)
2301 {
2302 	vsw_t	*vswp = ldcp->ldc_vswp;
2303 	int	rv;
2304 
2305 	D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id);
2306 
2307 	LDC_ENTER_LOCK(ldcp);
2308 
2309 	rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE);
2310 	if (rv != 0) {
2311 		DERR(vswp, "vsw_ldc_uninit(%lld): error disabling "
2312 			"interrupts (rv = %d)\n", ldcp->ldc_id, rv);
2313 		LDC_EXIT_LOCK(ldcp);
2314 		return (1);
2315 	}
2316 
2317 	ldcp->ldc_status = LDC_INIT;
2318 
2319 	LDC_EXIT_LOCK(ldcp);
2320 
2321 	D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id);
2322 
2323 	return (0);
2324 }
2325 
2326 static int
2327 vsw_init_ldcs(vsw_port_t *port)
2328 {
2329 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
2330 	vsw_ldc_t	*ldcp;
2331 
2332 	READ_ENTER(&ldcl->lockrw);
2333 	ldcp =  ldcl->head;
2334 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
2335 		(void) vsw_ldc_init(ldcp);
2336 	}
2337 	RW_EXIT(&ldcl->lockrw);
2338 
2339 	return (0);
2340 }
2341 
2342 static int
2343 vsw_uninit_ldcs(vsw_port_t *port)
2344 {
2345 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
2346 	vsw_ldc_t	*ldcp;
2347 
2348 	D1(NULL, "vsw_uninit_ldcs: enter\n");
2349 
2350 	READ_ENTER(&ldcl->lockrw);
2351 	ldcp =  ldcl->head;
2352 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
2353 		(void) vsw_ldc_uninit(ldcp);
2354 	}
2355 	RW_EXIT(&ldcl->lockrw);
2356 
2357 	D1(NULL, "vsw_uninit_ldcs: exit\n");
2358 
2359 	return (0);
2360 }
2361 
2362 /*
2363  * Wait until the callback(s) associated with the ldcs under the specified
2364  * port have completed.
2365  *
2366  * Prior to this function being invoked each channel under this port
2367  * should have been quiesced via ldc_set_cb_mode(DISABLE).
2368  *
 * A short explanation of what we are doing below:
2370  *
 * The simplest approach would be to have a reference counter in
 * the ldc structure which is incremented/decremented by the callbacks as
 * they use the channel. The drain function could then simply disable any
 * further callbacks and do a cv_wait for the ref count to hit zero.
 * Unfortunately there is a tiny window here - before the callback is able
 * to get the lock on the channel it is interrupted and this function gets
 * to execute. It sees that the ref count is zero and believes it is free
 * to delete the associated data structures.
2379  *
2380  * We get around this by taking advantage of the fact that before the ldc
2381  * framework invokes a callback it sets a flag to indicate that there is a
 * callback active (or about to become active). If we attempt to
 * unregister a callback while this active flag is set then the unregister
 * will fail with EWOULDBLOCK.
2385  *
2386  * If the unregister fails we do a cv_timedwait. We will either be signaled
2387  * by the callback as it is exiting (note we have to wait a short period to
2388  * allow the callback to return fully to the ldc framework and it to clear
2389  * the active flag), or by the timer expiring. In either case we again attempt
 * the unregister. We repeat this until we can successfully unregister the
2391  * callback.
2392  *
2393  * The reason we use a cv_timedwait rather than a simple cv_wait is to catch
2394  * the case where the callback has finished but the ldc framework has not yet
2395  * cleared the active flag. In this case we would never get a cv_signal.
2396  */
2397 static int
2398 vsw_drain_ldcs(vsw_port_t *port)
2399 {
2400 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
2401 	vsw_ldc_t	*ldcp;
2402 	vsw_t		*vswp = port->p_vswp;
2403 
2404 	D1(vswp, "%s: enter", __func__);
2405 
2406 	READ_ENTER(&ldcl->lockrw);
2407 
2408 	ldcp = ldcl->head;
2409 
2410 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
2411 		/*
2412 		 * If we can unregister the channel callback then we
2413 		 * know that there is no callback either running or
2414 		 * scheduled to run for this channel so move on to next
2415 		 * channel in the list.
2416 		 */
2417 		mutex_enter(&ldcp->drain_cv_lock);
2418 
2419 		/* prompt active callbacks to quit */
2420 		ldcp->drain_state = VSW_LDC_DRAINING;
2421 
2422 		if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) {
2423 			D2(vswp, "%s: unreg callback for chan %ld", __func__,
2424 				ldcp->ldc_id);
2425 			mutex_exit(&ldcp->drain_cv_lock);
2426 			continue;
2427 		} else {
2428 			/*
			 * If we end up here we know that either 1) a callback
			 * is currently executing, 2) one is about to start
			 * (i.e. the ldc framework has set the active flag but
			 * has not actually invoked the callback yet), or 3)
			 * one has finished and has returned to the ldc
			 * framework but the framework has not yet cleared
			 * the active bit.
2436 			 *
2437 			 * Wait for it to finish.
2438 			 */
2439 			while (ldc_unreg_callback(ldcp->ldc_handle)
2440 								== EWOULDBLOCK)
2441 				(void) cv_timedwait(&ldcp->drain_cv,
2442 					&ldcp->drain_cv_lock, lbolt + hz);
2443 
2444 			mutex_exit(&ldcp->drain_cv_lock);
2445 			D2(vswp, "%s: unreg callback for chan %ld after "
2446 				"timeout", __func__, ldcp->ldc_id);
2447 		}
2448 	}
2449 	RW_EXIT(&ldcl->lockrw);
2450 
2451 	D1(vswp, "%s: exit", __func__);
2452 	return (0);
2453 }
2454 
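/*
 * Illustrative sketch (not part of the driver): the retry loop used
 * above, reduced to its generic form. The handle h and ex_unreg()
 * are hypothetical stand-ins for ldcp->ldc_handle and
 * ldc_unreg_callback(), which returns EWOULDBLOCK for as long as the
 * framework considers a callback active:
 *
 *	mutex_enter(&drain_lock);
 *	drain_state = DRAINING;			(prompt callbacks to quit)
 *	while (ex_unreg(h) == EWOULDBLOCK)
 *		(void) cv_timedwait(&drain_cv, &drain_lock,
 *		    lbolt + hz);		(signal, or 1 sec timeout)
 *	mutex_exit(&drain_lock);
 */
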
2455 /*
2456  * Wait until all tasks which reference this port have completed.
2457  *
2458  * Prior to this function being invoked each channel under this port
2459  * should have been quiesced via ldc_set_cb_mode(DISABLE).
2460  */
2461 static int
2462 vsw_drain_port_taskq(vsw_port_t *port)
2463 {
2464 	vsw_t		*vswp = port->p_vswp;
2465 
2466 	D1(vswp, "%s: enter", __func__);
2467 
2468 	/*
2469 	 * Mark the port as in the process of being detached, and
2470 	 * dispatch a marker task to the queue so we know when all
2471 	 * relevant tasks have completed.
2472 	 */
2473 	mutex_enter(&port->state_lock);
2474 	port->state = VSW_PORT_DETACHING;
2475 
2476 	if ((vswp->taskq_p == NULL) ||
2477 		(ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task,
2478 			port, DDI_NOSLEEP) != DDI_SUCCESS)) {
2479 		DERR(vswp, "%s: unable to dispatch marker task",
2480 			__func__);
2481 		mutex_exit(&port->state_lock);
2482 		return (1);
2483 	}
2484 
2485 	/*
2486 	 * Wait for the marker task to finish.
2487 	 */
2488 	while (port->state != VSW_PORT_DETACHABLE)
2489 		cv_wait(&port->state_cv, &port->state_lock);
2490 
2491 	mutex_exit(&port->state_lock);
2492 
2493 	D1(vswp, "%s: exit", __func__);
2494 
2495 	return (0);
2496 }
2497 
2498 static void
2499 vsw_marker_task(void *arg)
2500 {
2501 	vsw_port_t	*port = arg;
2502 	vsw_t		*vswp = port->p_vswp;
2503 
2504 	D1(vswp, "%s: enter", __func__);
2505 
2506 	mutex_enter(&port->state_lock);
2507 
2508 	/*
2509 	 * No further tasks should be dispatched which reference
2510 	 * this port so ok to mark it as safe to detach.
2511 	 */
2512 	port->state = VSW_PORT_DETACHABLE;
2513 
2514 	cv_signal(&port->state_cv);
2515 
2516 	mutex_exit(&port->state_lock);
2517 
2518 	D1(vswp, "%s: exit", __func__);
2519 }
2520 
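/*
 * Illustrative sketch (not part of the driver): the marker-task
 * technique used above, in its generic form, assuming the taskq runs
 * its tasks serially in dispatch order. Once the marker has executed,
 * every task dispatched before it (i.e. every task which could still
 * reference the port) must also have completed:
 *
 *	mutex_enter(&state_lock);
 *	state = DETACHING;		(no new tasks will be dispatched)
 *	(void) ddi_taskq_dispatch(tq, marker_func, port, DDI_NOSLEEP);
 *	while (state != DETACHABLE)	(marker_func sets DETACHABLE)
 *		cv_wait(&state_cv, &state_lock);
 *	mutex_exit(&state_lock);
 */
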
2521 static vsw_port_t *
2522 vsw_lookup_port(vsw_t *vswp, int p_instance)
2523 {
2524 	vsw_port_list_t *plist = &vswp->plist;
2525 	vsw_port_t	*port;
2526 
2527 	for (port = plist->head; port != NULL; port = port->p_next) {
2528 		if (port->p_instance == p_instance) {
2529 			D2(vswp, "vsw_lookup_port: found p_instance\n");
2530 			return (port);
2531 		}
2532 	}
2533 
2534 	return (NULL);
2535 }
2536 
2537 /*
2538  * Search for and remove the specified port from the port
2539  * list. Returns 0 if able to locate and remove port, otherwise
2540  * returns 1.
2541  */
2542 static int
2543 vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port)
2544 {
2545 	vsw_port_list_t *plist = &vswp->plist;
2546 	vsw_port_t	*curr_p, *prev_p;
2547 
2548 	if (plist->head == NULL)
2549 		return (1);
2550 
2551 	curr_p = prev_p = plist->head;
2552 
2553 	while (curr_p != NULL) {
2554 		if (curr_p == port) {
2555 			if (prev_p == curr_p) {
2556 				plist->head = curr_p->p_next;
2557 			} else {
2558 				prev_p->p_next = curr_p->p_next;
2559 			}
2560 			plist->num_ports--;
2561 			break;
2562 		} else {
2563 			prev_p = curr_p;
2564 			curr_p = curr_p->p_next;
2565 		}
2566 	}
2567 	return (0);
2568 }
2569 
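/*
 * Note (illustrative only): the prev/curr walk above is equivalent
 * to the pointer-to-pointer idiom used in vsw_add_port() and
 * vsw_ldc_detach(), which avoids special-casing the list head:
 *
 *	vsw_port_t **pp;
 *	for (pp = &plist->head; *pp != NULL; pp = &(*pp)->p_next) {
 *		if (*pp == port) {
 *			*pp = port->p_next;
 *			plist->num_ports--;
 *			break;
 *		}
 *	}
 */
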
2570 /*
2571  * Interrupt handler for ldc messages.
2572  */
2573 static uint_t
2574 vsw_ldc_cb(uint64_t event, caddr_t arg)
2575 {
2576 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
2577 	vsw_t 		*vswp = ldcp->ldc_vswp;
2578 	ldc_status_t	lstatus;
2579 	int		rv;
2580 
2581 	D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
2582 
2583 	mutex_enter(&ldcp->ldc_cblock);
2584 
2585 	if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) {
2586 		mutex_exit(&ldcp->ldc_cblock);
2587 		return (LDC_SUCCESS);
2588 	}
2589 
2590 	if (event & LDC_EVT_UP) {
2591 		/*
2592 		 * Channel has come up, get the state and then start
2593 		 * the handshake.
2594 		 */
2595 		rv = ldc_status(ldcp->ldc_handle, &lstatus);
2596 		if (rv != 0) {
2597 			cmn_err(CE_WARN, "Unable to read channel state");
2598 		}
2599 		ldcp->ldc_status = lstatus;
2600 
2601 		D2(vswp, "%s: id(%ld) event(%llx) UP:  status(%ld)",
2602 			__func__, ldcp->ldc_id, event, ldcp->ldc_status);
2603 
2604 		vsw_restart_handshake(ldcp);
2605 
2606 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
2607 	}
2608 
2609 	if (event & LDC_EVT_READ) {
2610 		/*
2611 		 * Data available for reading.
2612 		 */
		D2(vswp, "%s: id(%ld) event(%llx) data READ",
				__func__, ldcp->ldc_id, event);
2615 
2616 		vsw_process_pkt(ldcp);
2617 
2618 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
2619 
2620 		goto vsw_cb_exit;
2621 	}
2622 
2623 	if (event & LDC_EVT_RESET) {
2624 		rv = ldc_status(ldcp->ldc_handle, &lstatus);
2625 		if (rv != 0) {
2626 			cmn_err(CE_WARN, "Unable to read channel state");
2627 		} else {
2628 			ldcp->ldc_status = lstatus;
2629 		}
2630 		D2(vswp, "%s: id(%ld) event(%llx) RESET:  status (%ld)",
2631 			__func__, ldcp->ldc_id, event, ldcp->ldc_status);
2632 	}
2633 
2634 	if (event & LDC_EVT_DOWN) {
2635 		rv = ldc_status(ldcp->ldc_handle, &lstatus);
2636 		if (rv != 0) {
2637 			cmn_err(CE_WARN, "Unable to read channel state");
2638 		} else {
2639 			ldcp->ldc_status = lstatus;
2640 		}
2641 
2642 		D2(vswp, "%s: id(%ld) event(%llx) DOWN:  status (%ld)",
2643 			__func__, ldcp->ldc_id, event, ldcp->ldc_status);
2644 
2645 	}
2646 
2647 	/*
2648 	 * Catch either LDC_EVT_WRITE which we don't support or any
2649 	 * unknown event.
2650 	 */
2651 	if (event & ~(LDC_EVT_UP | LDC_EVT_RESET
2652 					| LDC_EVT_DOWN | LDC_EVT_READ)) {
2653 
2654 		DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)",
2655 			__func__, ldcp->ldc_id, event, ldcp->ldc_status);
2656 	}
2657 
2658 vsw_cb_exit:
2659 	mutex_exit(&ldcp->ldc_cblock);
2660 
2661 	/*
2662 	 * Let the drain function know we are finishing if it
2663 	 * is waiting.
2664 	 */
2665 	mutex_enter(&ldcp->drain_cv_lock);
2666 	if (ldcp->drain_state == VSW_LDC_DRAINING)
2667 		cv_signal(&ldcp->drain_cv);
2668 	mutex_exit(&ldcp->drain_cv_lock);
2669 
2670 	return (LDC_SUCCESS);
2671 }
2672 
2673 /*
2674  * (Re)start a handshake with our peer by sending them
2675  * our version info.
2676  */
2677 static void
2678 vsw_restart_handshake(vsw_ldc_t *ldcp)
2679 {
2680 	vsw_t		*vswp = ldcp->ldc_vswp;
2681 	vsw_port_t	*port;
2682 	vsw_ldc_list_t	*ldcl;
2683 
2684 	D1(vswp, "vsw_restart_handshake: enter");
2685 
2686 	port = ldcp->ldc_port;
2687 	ldcl = &port->p_ldclist;
2688 
2689 	WRITE_ENTER(&ldcl->lockrw);
2690 
2691 	D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__,
2692 		ldcp->lane_in.lstate, ldcp->lane_out.lstate);
2693 
2694 	vsw_free_lane_resources(ldcp, INBOUND);
2695 	vsw_free_lane_resources(ldcp, OUTBOUND);
2696 	RW_EXIT(&ldcl->lockrw);
2697 
2698 	ldcp->lane_in.lstate = 0;
2699 	ldcp->lane_out.lstate = 0;
2700 
2701 	/*
	 * Remove the fdb entry for the parent port, and remove the
	 * port from any multicast groups it may have registered
	 * with. The client must resend its multicast add command
	 * after the handshake completes.
2705 	 */
2706 	(void) vsw_del_fdb(vswp, port);
2707 
2708 	vsw_del_mcst_port(port);
2709 
2710 	ldcp->hphase = VSW_MILESTONE0;
2711 
2712 	ldcp->peer_session = 0;
2713 	ldcp->session_status = 0;
2714 
2715 	/*
	 * We now increment the handshake session id. This allows
	 * us to identify and discard any tasks which are still pending
2718 	 * on the taskq and refer to the handshake session we are about
2719 	 * to restart. These stale messages no longer have any real
2720 	 * meaning.
2721 	 */
2722 	mutex_enter(&ldcp->hss_lock);
2723 	ldcp->hss_id++;
2724 	mutex_exit(&ldcp->hss_lock);
2725 
2726 	if (ldcp->hcnt++ > vsw_num_handshakes) {
2727 		cmn_err(CE_WARN, "exceeded number of permitted "
2728 			"handshake attempts (%d) on channel %ld",
2729 			ldcp->hcnt, ldcp->ldc_id);
2730 		return;
2731 	}
2732 
2733 	vsw_send_ver(ldcp);
2734 
2735 	D1(vswp, "vsw_restart_handshake: exit");
2736 }
2737 
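/*
 * Illustrative sketch (not part of the driver): the hss_id bump above
 * implements a generation-count scheme. Each dispatched task snapshots
 * the current id, so tasks queued before a restart are recognised and
 * dropped by the consumer (see the stale pkt check in
 * vsw_process_ctrl_pkt()):
 *
 *	producer:	ctaskp->hss_id = ldcp->hss_id;
 *	restart:	ldcp->hss_id++;		(under hss_lock)
 *	consumer:	if (ctaskp->hss_id < ldcp->hss_id)
 *				discard the task;
 */
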
2738 /*
 * Returns 0 if it was legal for the event signified by flag to have
 * occurred at the time it did. Otherwise returns 1.
2741  */
2742 int
2743 vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag)
2744 {
2745 	vsw_t		*vswp = ldcp->ldc_vswp;
2746 	uint64_t	state;
2747 	uint64_t	phase;
2748 
2749 	if (dir == INBOUND)
2750 		state = ldcp->lane_in.lstate;
2751 	else
2752 		state = ldcp->lane_out.lstate;
2753 
2754 	phase = ldcp->hphase;
2755 
2756 	switch (flag) {
2757 	case VSW_VER_INFO_RECV:
2758 		if (phase > VSW_MILESTONE0) {
2759 			DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV"
2760 				" when in state %d\n", ldcp->ldc_id, phase);
2761 			vsw_restart_handshake(ldcp);
2762 			return (1);
2763 		}
2764 		break;
2765 
2766 	case VSW_VER_ACK_RECV:
2767 	case VSW_VER_NACK_RECV:
2768 		if (!(state & VSW_VER_INFO_SENT)) {
2769 			DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK"
2770 				" or VER_NACK when in state %d\n",
2771 				ldcp->ldc_id, phase);
2772 			vsw_restart_handshake(ldcp);
2773 			return (1);
2774 		} else
2775 			state &= ~VSW_VER_INFO_SENT;
2776 		break;
2777 
2778 	case VSW_ATTR_INFO_RECV:
2779 		if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) {
2780 			DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV"
2781 				" when in state %d\n", ldcp->ldc_id, phase);
2782 			vsw_restart_handshake(ldcp);
2783 			return (1);
2784 		}
2785 		break;
2786 
2787 	case VSW_ATTR_ACK_RECV:
2788 	case VSW_ATTR_NACK_RECV:
2789 		if (!(state & VSW_ATTR_INFO_SENT)) {
2790 			DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK"
2791 				" or ATTR_NACK when in state %d\n",
2792 				ldcp->ldc_id, phase);
2793 			vsw_restart_handshake(ldcp);
2794 			return (1);
2795 		} else
2796 			state &= ~VSW_ATTR_INFO_SENT;
2797 		break;
2798 
2799 	case VSW_DRING_INFO_RECV:
2800 		if (phase < VSW_MILESTONE1) {
2801 			DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV"
2802 				" when in state %d\n", ldcp->ldc_id, phase);
2803 			vsw_restart_handshake(ldcp);
2804 			return (1);
2805 		}
2806 		break;
2807 
2808 	case VSW_DRING_ACK_RECV:
2809 	case VSW_DRING_NACK_RECV:
2810 		if (!(state & VSW_DRING_INFO_SENT)) {
2811 			DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK"
2812 				" or DRING_NACK when in state %d\n",
2813 				ldcp->ldc_id, phase);
2814 			vsw_restart_handshake(ldcp);
2815 			return (1);
2816 		} else
2817 			state &= ~VSW_DRING_INFO_SENT;
2818 		break;
2819 
2820 	case VSW_RDX_INFO_RECV:
2821 		if (phase < VSW_MILESTONE3) {
2822 			DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV"
2823 				" when in state %d\n", ldcp->ldc_id, phase);
2824 			vsw_restart_handshake(ldcp);
2825 			return (1);
2826 		}
2827 		break;
2828 
2829 	case VSW_RDX_ACK_RECV:
2830 	case VSW_RDX_NACK_RECV:
2831 		if (!(state & VSW_RDX_INFO_SENT)) {
2832 			DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK"
2833 				" or RDX_NACK when in state %d\n",
2834 				ldcp->ldc_id, phase);
2835 			vsw_restart_handshake(ldcp);
2836 			return (1);
2837 		} else
2838 			state &= ~VSW_RDX_INFO_SENT;
2839 		break;
2840 
2841 	case VSW_MCST_INFO_RECV:
2842 		if (phase < VSW_MILESTONE3) {
2843 			DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV"
2844 				" when in state %d\n", ldcp->ldc_id, phase);
2845 			vsw_restart_handshake(ldcp);
2846 			return (1);
2847 		}
2848 		break;
2849 
2850 	default:
2851 		DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)",
2852 				ldcp->ldc_id, flag);
2853 		return (1);
2854 	}
2855 
2856 	if (dir == INBOUND)
2857 		ldcp->lane_in.lstate = state;
2858 	else
2859 		ldcp->lane_out.lstate = state;
2860 
2861 	D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id);
2862 
2863 	return (0);
2864 }
2865 
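/*
 * Move the handshake on to the next milestone where possible.
 * The phases, in order, are:
 *
 *	VSW_MILESTONE0 - version info exchanged
 *	VSW_MILESTONE1 - attribute info exchanged
 *	VSW_MILESTONE2 - dring info exchanged (dring mode only)
 *	VSW_MILESTONE3 - RDX exchanged in both directions
 *	VSW_MILESTONE4 - handshake complete, lane marked active
 */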
2866 void
2867 vsw_next_milestone(vsw_ldc_t *ldcp)
2868 {
2869 	vsw_t		*vswp = ldcp->ldc_vswp;
2870 
2871 	D1(vswp, "%s (chan %lld): enter (phase %ld)", __func__,
2872 		ldcp->ldc_id, ldcp->hphase);
2873 
2874 	DUMP_FLAGS(ldcp->lane_in.lstate);
2875 	DUMP_FLAGS(ldcp->lane_out.lstate);
2876 
2877 	switch (ldcp->hphase) {
2878 
2879 	case VSW_MILESTONE0:
2880 		/*
2881 		 * If we haven't started to handshake with our peer,
2882 		 * start to do so now.
2883 		 */
2884 		if (ldcp->lane_out.lstate == 0) {
2885 			D2(vswp, "%s: (chan %lld) starting handshake "
2886 				"with peer", __func__, ldcp->ldc_id);
2887 			vsw_restart_handshake(ldcp);
2888 		}
2889 
2890 		/*
2891 		 * Only way to pass this milestone is to have successfully
2892 		 * negotiated version info.
2893 		 */
2894 		if ((ldcp->lane_in.lstate & VSW_VER_ACK_SENT) &&
2895 			(ldcp->lane_out.lstate & VSW_VER_ACK_RECV)) {
2896 
2897 			D2(vswp, "%s: (chan %lld) leaving milestone 0",
2898 				__func__, ldcp->ldc_id);
2899 
2900 			/*
2901 			 * Next milestone is passed when attribute
2902 			 * information has been successfully exchanged.
2903 			 */
2904 			ldcp->hphase = VSW_MILESTONE1;
2905 			vsw_send_attr(ldcp);
2906 
2907 		}
2908 		break;
2909 
2910 	case VSW_MILESTONE1:
2911 		/*
2912 		 * Only way to pass this milestone is to have successfully
2913 		 * negotiated attribute information.
2914 		 */
2915 		if (ldcp->lane_in.lstate & VSW_ATTR_ACK_SENT) {
2916 
2917 			ldcp->hphase = VSW_MILESTONE2;
2918 
2919 			/*
2920 			 * If the peer device has said it wishes to
2921 			 * use descriptor rings then we send it our ring
2922 			 * info, otherwise we just set up a private ring
			 * which uses an internal buffer.
2924 			 */
2925 			if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE)
2926 				vsw_send_dring_info(ldcp);
2927 		}
2928 		break;
2929 
2931 	case VSW_MILESTONE2:
2932 		/*
2933 		 * If peer has indicated in its attribute message that
2934 		 * it wishes to use descriptor rings then the only way
2935 		 * to pass this milestone is for us to have received
2936 		 * valid dring info.
2937 		 *
2938 		 * If peer is not using descriptor rings then just fall
2939 		 * through.
2940 		 */
2941 		if ((ldcp->lane_in.xfer_mode == VIO_DRING_MODE) &&
2942 			(!(ldcp->lane_in.lstate & VSW_DRING_ACK_SENT)))
2943 			break;
2944 
2945 		D2(vswp, "%s: (chan %lld) leaving milestone 2",
2946 				__func__, ldcp->ldc_id);
2947 
2948 		ldcp->hphase = VSW_MILESTONE3;
2949 		vsw_send_rdx(ldcp);
2950 		break;
2951 
2952 	case VSW_MILESTONE3:
2953 		/*
		 * Pass this milestone when all parameters have been
2955 		 * successfully exchanged and RDX sent in both directions.
2956 		 *
2957 		 * Mark outbound lane as available to transmit data.
2958 		 */
2959 		if ((ldcp->lane_in.lstate & VSW_RDX_ACK_SENT) &&
2960 			(ldcp->lane_out.lstate & VSW_RDX_ACK_RECV)) {
2961 
2962 			D2(vswp, "%s: (chan %lld) leaving milestone 3",
2963 				__func__, ldcp->ldc_id);
2964 			D2(vswp, "%s: ** handshake complete **", __func__);
2965 			ldcp->lane_out.lstate |= VSW_LANE_ACTIVE;
2966 			ldcp->hphase = VSW_MILESTONE4;
2967 			ldcp->hcnt = 0;
2968 			DISPLAY_STATE();
2969 		}
2970 		break;
2971 
2972 	case VSW_MILESTONE4:
2973 		D2(vswp, "%s: (chan %lld) in milestone 4", __func__,
2974 							ldcp->ldc_id);
2975 		break;
2976 
2977 	default:
2978 		DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__,
2979 			ldcp->ldc_id, ldcp->hphase);
2980 	}
2981 
2982 	D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id,
2983 		ldcp->hphase);
2984 }
2985 
2986 /*
2987  * Check if major version is supported.
2988  *
2989  * Returns 0 if finds supported major number, and if necessary
2990  * adjusts the minor field.
2991  *
 * Returns 1 if it can't match the major number exactly. Sets major/minor
 * to the next lowest supported values, or to zero if no other values are
 * possible.
2994  */
2995 static int
2996 vsw_supported_version(vio_ver_msg_t *vp)
2997 {
2998 	int	i;
2999 
3000 	D1(NULL, "vsw_supported_version: enter");
3001 
3002 	for (i = 0; i < VSW_NUM_VER; i++) {
3003 		if (vsw_versions[i].ver_major == vp->ver_major) {
3004 			/*
			 * Matching major version found. Update
3006 			 * minor number if necessary.
3007 			 */
3008 			if (vp->ver_minor > vsw_versions[i].ver_minor) {
3009 				D2(NULL, "%s: adjusting minor value"
3010 					" from %d to %d", __func__,
3011 					vp->ver_minor,
3012 					vsw_versions[i].ver_minor);
3013 				vp->ver_minor = vsw_versions[i].ver_minor;
3014 			}
3015 
3016 			return (0);
3017 		}
3018 
		if (vsw_versions[i].ver_major < vp->ver_major) {
			D2(NULL, "%s: adjusting version"
				" from %d:%d to %d:%d", __func__,
				vp->ver_major, vp->ver_minor,
				vsw_versions[i].ver_major,
				vsw_versions[i].ver_minor);
			vp->ver_major = vsw_versions[i].ver_major;
			vp->ver_minor = vsw_versions[i].ver_minor;
			return (1);
		}
3029 	}
3030 
3031 	/* No match was possible, zero out fields */
3032 	vp->ver_major = 0;
3033 	vp->ver_minor = 0;
3034 
3035 	D1(NULL, "vsw_supported_version: exit");
3036 
3037 	return (1);
3038 }
3039 
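/*
 * Worked example (illustrative, assuming vsw_versions[] held the
 * single entry {1, 0}): a peer proposing 1.5 would have its minor
 * adjusted to 0 and get a return of 0 (accepted); a peer proposing
 * 2.0 would be reset to 1.0 with a return of 1 (counter-proposal);
 * a peer proposing 0.9 matches nothing, so both fields are zeroed
 * and 1 is returned.
 */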
3040 /*
3041  * Main routine for processing messages received over LDC.
3042  */
3043 static void
3044 vsw_process_pkt(void *arg)
3045 {
3046 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
3047 	vsw_t 		*vswp = ldcp->ldc_vswp;
3048 	size_t		msglen;
3049 	vio_msg_tag_t	tag;
3050 	def_msg_t	dmsg;
3051 	int 		rv = 0;
3052 
3053 	D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
3054 
3055 	/*
3056 	 * If channel is up read messages until channel is empty.
3057 	 */
3058 	do {
3059 		msglen = sizeof (dmsg);
3060 		rv = ldc_read(ldcp->ldc_handle, (caddr_t)&dmsg, &msglen);
3061 
3062 		if (rv != 0) {
3063 			DERR(vswp, "%s :ldc_read err id(%lld) rv(%d) "
3064 				"len(%d)\n", __func__, ldcp->ldc_id,
3065 							rv, msglen);
3066 			break;
3067 		}
3068 
3069 		if (msglen == 0) {
3070 			D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__,
			    ldcp->ldc_id);
3072 			break;
3073 		}
3074 
3075 		D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__,
3076 		    ldcp->ldc_id, msglen);
3077 
3078 		/*
3079 		 * Figure out what sort of packet we have gotten by
3080 		 * examining the msg tag, and then switch it appropriately.
3081 		 */
3082 		bcopy(&dmsg, &tag, sizeof (vio_msg_tag_t));
3083 
3084 		switch (tag.vio_msgtype) {
3085 		case VIO_TYPE_CTRL:
3086 			vsw_dispatch_ctrl_task(ldcp, &dmsg, tag);
3087 			break;
3088 		case VIO_TYPE_DATA:
3089 			vsw_process_data_pkt(ldcp, &dmsg, tag);
3090 			break;
3091 		case VIO_TYPE_ERR:
3092 			vsw_process_err_pkt(ldcp, &dmsg, tag);
3093 			break;
3094 		default:
			DERR(vswp, "%s: Unknown tag(%lx) id(%lx)\n",
				__func__, tag.vio_msgtype, ldcp->ldc_id);
3097 			break;
3098 		}
3099 	} while (msglen);
3100 
3101 	D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id);
3102 }
3103 
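/*
 * Illustrative note (not part of the driver): every VIO message
 * starts with a vio_msg_tag_t, so the loop above can read into a
 * buffer sized for the largest message (def_msg_t), peek at the
 * leading tag, and only then decide how to interpret the body:
 *
 *	bcopy(&dmsg, &tag, sizeof (vio_msg_tag_t));
 *	switch (tag.vio_msgtype) {
 *	case VIO_TYPE_CTRL: ...
 *	case VIO_TYPE_DATA: ...
 *	case VIO_TYPE_ERR:  ...
 *	}
 */
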
3104 /*
3105  * Dispatch a task to process a VIO control message.
3106  */
3107 static void
3108 vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t tag)
3109 {
3110 	vsw_ctrl_task_t		*ctaskp = NULL;
3111 	vsw_port_t		*port = ldcp->ldc_port;
3112 	vsw_t			*vswp = port->p_vswp;
3113 
3114 	D1(vswp, "%s: enter", __func__);
3115 
3116 	/*
3117 	 * We need to handle RDX ACK messages in-band as once they
3118 	 * are exchanged it is possible that we will get an
3119 	 * immediate (legitimate) data packet.
3120 	 */
3121 	if ((tag.vio_subtype_env == VIO_RDX) &&
3122 		(tag.vio_subtype == VIO_SUBTYPE_ACK)) {
3123 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_ACK_RECV))
3124 			return;
3125 
3126 		ldcp->lane_out.lstate |= VSW_RDX_ACK_RECV;
3127 		vsw_next_milestone(ldcp);
3128 		D2(vswp, "%s (%ld) handling RDX_ACK in place", __func__,
3129 			ldcp->ldc_id);
3130 		return;
3131 	}
3132 
3133 	ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP);
3134 
3135 	if (ctaskp == NULL) {
3136 		DERR(vswp, "%s: unable to alloc space for ctrl"
3137 			" msg", __func__);
3138 		vsw_restart_handshake(ldcp);
3139 		return;
3140 	}
3141 
3142 	ctaskp->ldcp = ldcp;
3143 	bcopy((def_msg_t *)cpkt, &ctaskp->pktp, sizeof (def_msg_t));
3144 	mutex_enter(&ldcp->hss_lock);
3145 	ctaskp->hss_id = ldcp->hss_id;
3146 	mutex_exit(&ldcp->hss_lock);
3147 
3148 	/*
3149 	 * Dispatch task to processing taskq if port is not in
3150 	 * the process of being detached.
3151 	 */
3152 	mutex_enter(&port->state_lock);
3153 	if (port->state == VSW_PORT_INIT) {
3154 		if ((vswp->taskq_p == NULL) ||
3155 			(ddi_taskq_dispatch(vswp->taskq_p,
3156 			vsw_process_ctrl_pkt, ctaskp, DDI_NOSLEEP)
3157 							!= DDI_SUCCESS)) {
3158 			DERR(vswp, "%s: unable to dispatch task to taskq",
3159 				__func__);
3160 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
3161 			mutex_exit(&port->state_lock);
3162 			vsw_restart_handshake(ldcp);
3163 			return;
3164 		}
	} else {
		DWARN(vswp, "%s: port %d detaching, not dispatching "
			"task", __func__, port->p_instance);
		kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
	}
3169 
3170 	mutex_exit(&port->state_lock);
3171 
3172 	D2(vswp, "%s: dispatched task to taskq for chan %d", __func__,
3173 			ldcp->ldc_id);
3174 	D1(vswp, "%s: exit", __func__);
3175 }
3176 
3177 /*
3178  * Process a VIO ctrl message. Invoked from taskq.
3179  */
3180 static void
3181 vsw_process_ctrl_pkt(void *arg)
3182 {
3183 	vsw_ctrl_task_t	*ctaskp = (vsw_ctrl_task_t *)arg;
3184 	vsw_ldc_t	*ldcp = ctaskp->ldcp;
3185 	vsw_t 		*vswp = ldcp->ldc_vswp;
3186 	vio_msg_tag_t	tag;
3187 	uint16_t	env;
3188 
3189 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3190 
3191 	bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t));
3192 	env = tag.vio_subtype_env;
3193 
3194 	/* stale pkt check */
3195 	mutex_enter(&ldcp->hss_lock);
	if (ctaskp->hss_id < ldcp->hss_id) {
		DWARN(vswp, "%s: discarding stale packet belonging to"
			" earlier (%ld) handshake session", __func__,
			ctaskp->hss_id);
		mutex_exit(&ldcp->hss_lock);
		kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
		return;
	}
3203 	mutex_exit(&ldcp->hss_lock);
3204 
3205 	/* session id check */
3206 	if (ldcp->session_status & VSW_PEER_SESSION) {
3207 		if (ldcp->peer_session != tag.vio_sid) {
3208 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
3209 				__func__, ldcp->ldc_id, tag.vio_sid);
3210 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
3211 			vsw_restart_handshake(ldcp);
3212 			return;
3213 		}
3214 	}
3215 
3216 	/*
3217 	 * Switch on vio_subtype envelope, then let lower routines
3218 	 * decide if its an INFO, ACK or NACK packet.
3219 	 */
3220 	switch (env) {
3221 	case VIO_VER_INFO:
3222 		vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp);
3223 		break;
3224 	case VIO_DRING_REG:
3225 		vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp);
3226 		break;
3227 	case VIO_DRING_UNREG:
3228 		vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp);
3229 		break;
3230 	case VIO_ATTR_INFO:
3231 		vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp);
3232 		break;
3233 	case VNET_MCAST_INFO:
3234 		vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp);
3235 		break;
3236 	case VIO_RDX:
3237 		vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp);
3238 		break;
3239 	default:
3240 		DERR(vswp, "%s : unknown vio_subtype_env (%x)\n",
3241 							__func__, env);
3242 	}
3243 
3244 	kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
3245 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3246 }
3247 
3248 /*
3249  * Version negotiation. We can end up here either because our peer
3250  * has responded to a handshake message we have sent it, or our peer
 * has initiated a handshake with us. If it's the former then it can only
 * be an ACK or NACK; if it's the latter it can only be an INFO.
 *
 * If it's an ACK we move to the next stage of the handshake, namely
 * attribute exchange. If it's a NACK we see if we can specify another
 * version; if we can't, we stop.
 *
 * If it is an INFO we reset all params associated with communication
 * in that direction over this channel (remember the connection is
 * essentially 2 independent simplex channels).
3261  */
3262 void
3263 vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt)
3264 {
3265 	vio_ver_msg_t	*ver_pkt;
3266 	vsw_t 		*vswp = ldcp->ldc_vswp;
3267 
3268 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3269 
3270 	/*
3271 	 * We know this is a ctrl/version packet so
3272 	 * cast it into the correct structure.
3273 	 */
3274 	ver_pkt = (vio_ver_msg_t *)pkt;
3275 
3276 	switch (ver_pkt->tag.vio_subtype) {
3277 	case VIO_SUBTYPE_INFO:
3278 		D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n");
3279 
3280 		/*
3281 		 * Record the session id, which we will use from now
3282 		 * until we see another VER_INFO msg. Even then the
		 * session id in most cases will be unchanged, except
		 * if the channel was reset.
3285 		 */
3286 		if ((ldcp->session_status & VSW_PEER_SESSION) &&
3287 			(ldcp->peer_session != ver_pkt->tag.vio_sid)) {
3288 			DERR(vswp, "%s: updating session id for chan %lld "
3289 				"from %llx to %llx", __func__, ldcp->ldc_id,
3290 				ldcp->peer_session, ver_pkt->tag.vio_sid);
3291 		}
3292 
3293 		ldcp->peer_session = ver_pkt->tag.vio_sid;
3294 		ldcp->session_status |= VSW_PEER_SESSION;
3295 
		/* Is this message legal at this time? */
3297 		if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV))
3298 			return;
3299 
3300 		/*
3301 		 * First check the device class. Currently only expect
3302 		 * to be talking to a network device. In the future may
3303 		 * also talk to another switch.
3304 		 */
3305 		if (ver_pkt->dev_class != VDEV_NETWORK) {
3306 			DERR(vswp, "%s: illegal device class %d", __func__,
3307 				ver_pkt->dev_class);
3308 
3309 			ver_pkt->tag.vio_sid = ldcp->local_session;
3310 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
3311 
3312 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
3313 
3314 			vsw_send_msg(ldcp, (void *)ver_pkt,
3315 					sizeof (vio_ver_msg_t));
3316 
3317 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
3318 			vsw_next_milestone(ldcp);
3319 			return;
3320 		} else {
3321 			ldcp->dev_class = ver_pkt->dev_class;
3322 		}
3323 
3324 		/*
3325 		 * Now check the version.
3326 		 */
3327 		if (vsw_supported_version(ver_pkt) == 0) {
3328 			/*
3329 			 * Support this major version and possibly
3330 			 * adjusted minor version.
3331 			 */
3332 
3333 			D2(vswp, "%s: accepted ver %d:%d", __func__,
3334 				ver_pkt->ver_major, ver_pkt->ver_minor);
3335 
3336 			/* Store accepted values */
3337 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
3338 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
3339 
3340 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3341 
3342 			ldcp->lane_in.lstate |= VSW_VER_ACK_SENT;
3343 		} else {
3344 			/*
3345 			 * NACK back with the next lower major/minor
			 * pairing we support (if we don't support any more
			 * versions then they will be set to zero).
3348 			 */
3349 
3350 			D2(vswp, "%s: replying with ver %d:%d", __func__,
3351 				ver_pkt->ver_major, ver_pkt->ver_minor);
3352 
3353 			/* Store updated values */
3354 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
3355 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
3356 
3357 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
3358 
3359 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
3360 		}
3361 
3362 		DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
3363 		ver_pkt->tag.vio_sid = ldcp->local_session;
3364 		vsw_send_msg(ldcp, (void *)ver_pkt, sizeof (vio_ver_msg_t));
3365 
3366 		vsw_next_milestone(ldcp);
3367 		break;
3368 
3369 	case VIO_SUBTYPE_ACK:
3370 		D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__);
3371 
3372 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV))
3373 			return;
3374 
		/* Store the accepted values for our outbound lane */
		ldcp->lane_out.ver_major = ver_pkt->ver_major;
		ldcp->lane_out.ver_minor = ver_pkt->ver_minor;

3380 		ldcp->lane_out.lstate |= VSW_VER_ACK_RECV;
3381 		vsw_next_milestone(ldcp);
3382 
3383 		break;
3384 
3385 	case VIO_SUBTYPE_NACK:
3386 		D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__);
3387 
3388 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV))
3389 			return;
3390 
3391 		/*
3392 		 * If our peer sent us a NACK with the ver fields set to
3393 		 * zero then there is nothing more we can do. Otherwise see
3394 		 * if we support either the version suggested, or a lesser
3395 		 * one.
3396 		 */
3397 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
3398 			DERR(vswp, "%s: peer unable to negotiate any "
3399 				"further.", __func__);
3400 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
3401 			vsw_next_milestone(ldcp);
3402 			return;
3403 		}
3404 
3405 		/*
3406 		 * Check to see if we support this major version or
3407 		 * a lower one. If we don't then maj/min will be set
3408 		 * to zero.
3409 		 */
3410 		(void) vsw_supported_version(ver_pkt);
3411 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
3412 			/* Nothing more we can do */
3413 			DERR(vswp, "%s: version negotiation failed.\n",
3414 								__func__);
3415 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
3416 			vsw_next_milestone(ldcp);
3417 		} else {
3418 			/* found a supported major version */
3419 			ldcp->lane_out.ver_major = ver_pkt->ver_major;
3420 			ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
3421 
3422 			D2(vswp, "%s: resending with updated values (%x, %x)",
3423 				__func__, ver_pkt->ver_major,
3424 				ver_pkt->ver_minor);
3425 
3426 			ldcp->lane_out.lstate |= VSW_VER_INFO_SENT;
3427 			ver_pkt->tag.vio_sid = ldcp->local_session;
3428 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
3429 
3430 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
3431 
3432 			vsw_send_msg(ldcp, (void *)ver_pkt,
3433 					sizeof (vio_ver_msg_t));
3434 
3435 			vsw_next_milestone(ldcp);
3436 
3437 		}
3438 		break;
3439 
3440 	default:
3441 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
3442 			ver_pkt->tag.vio_subtype);
3443 	}
3444 
3445 	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
3446 }
3447 
3448 /*
3449  * Process an attribute packet. We can end up here either because our peer
3450  * has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or our
 * peer has sent us an attribute INFO message.
 *
 * If it's an ACK we then move to the next stage of the handshake, which
 * is to send our descriptor ring info to our peer. If it's a NACK then
 * there is nothing more we can (currently) do.
3456  *
3457  * If we get a valid/acceptable INFO packet (and we have already negotiated
3458  * a version) we ACK back and set channel state to ATTR_RECV, otherwise we
3459  * NACK back and reset channel state to INACTIV.
3460  *
3461  * FUTURE: in time we will probably negotiate over attributes, but for
3462  * the moment unacceptable attributes are regarded as a fatal error.
3463  *
3464  */
3465 void
3466 vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt)
3467 {
3468 	vnet_attr_msg_t		*attr_pkt;
3469 	vsw_t			*vswp = ldcp->ldc_vswp;
3470 	vsw_port_t		*port = ldcp->ldc_port;
3471 	uint64_t		macaddr = 0;
3472 	int			i;
3473 
3474 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
3475 
3476 	/*
3477 	 * We know this is a ctrl/attr packet so
3478 	 * cast it into the correct structure.
3479 	 */
3480 	attr_pkt = (vnet_attr_msg_t *)pkt;
3481 
3482 	switch (attr_pkt->tag.vio_subtype) {
3483 	case VIO_SUBTYPE_INFO:
3484 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3485 
3486 		if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV))
3487 			return;
3488 
3489 		/*
3490 		 * If the attributes are unacceptable then we NACK back.
3491 		 */
3492 		if (vsw_check_attr(attr_pkt, ldcp->ldc_port)) {
3493 
3494 			DERR(vswp, "%s (chan %d): invalid attributes",
3495 				__func__, ldcp->ldc_id);
3496 
3497 			vsw_free_lane_resources(ldcp, INBOUND);
3498 
3499 			attr_pkt->tag.vio_sid = ldcp->local_session;
3500 			attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
3501 
3502 			DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
3503 			ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT;
3504 			vsw_send_msg(ldcp, (void *)attr_pkt,
3505 					sizeof (vnet_attr_msg_t));
3506 
3507 			vsw_next_milestone(ldcp);
3508 			return;
3509 		}
3510 
3511 		/*
3512 		 * Otherwise store attributes for this lane and update
3513 		 * lane state.
3514 		 */
3515 		ldcp->lane_in.mtu = attr_pkt->mtu;
3516 		ldcp->lane_in.addr = attr_pkt->addr;
3517 		ldcp->lane_in.addr_type = attr_pkt->addr_type;
3518 		ldcp->lane_in.xfer_mode = attr_pkt->xfer_mode;
3519 		ldcp->lane_in.ack_freq = attr_pkt->ack_freq;
3520 
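		/*
		 * Unpack the peer's MAC address, sent as a 64-bit
		 * integer, into the port's ethernet address structure:
		 * the least significant byte becomes the last octet
		 * (e.g. 0x0003ba123456 becomes 00:03:ba:12:34:56).
		 */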
3521 		macaddr = ldcp->lane_in.addr;
3522 		for (i = ETHERADDRL - 1; i >= 0; i--) {
3523 			port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF;
3524 			macaddr >>= 8;
3525 		}
3526 
3527 		/* create the fdb entry for this port/mac address */
3528 		(void) vsw_add_fdb(vswp, port);
3529 
		/* set up device specific xmit routines */
3531 		mutex_enter(&port->tx_lock);
3532 		if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE) {
3533 			D2(vswp, "%s: mode = VIO_DRING_MODE", __func__);
3534 			port->transmit = vsw_dringsend;
3535 		} else if (ldcp->lane_in.xfer_mode == VIO_DESC_MODE) {
3536 			D2(vswp, "%s: mode = VIO_DESC_MODE", __func__);
3537 			vsw_create_privring(ldcp);
3538 			port->transmit = vsw_descrsend;
3539 		}
3540 		mutex_exit(&port->tx_lock);
3541 
3542 		attr_pkt->tag.vio_sid = ldcp->local_session;
3543 		attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3544 
3545 		DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
3546 
3547 		ldcp->lane_in.lstate |= VSW_ATTR_ACK_SENT;
3548 
3549 		vsw_send_msg(ldcp, (void *)attr_pkt,
3550 					sizeof (vnet_attr_msg_t));
3551 
3552 		vsw_next_milestone(ldcp);
3553 		break;
3554 
3555 	case VIO_SUBTYPE_ACK:
3556 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3557 
3558 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV))
3559 			return;
3560 
3561 		ldcp->lane_out.lstate |= VSW_ATTR_ACK_RECV;
3562 		vsw_next_milestone(ldcp);
3563 		break;
3564 
3565 	case VIO_SUBTYPE_NACK:
3566 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3567 
3568 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV))
3569 			return;
3570 
3571 		ldcp->lane_out.lstate |= VSW_ATTR_NACK_RECV;
3572 		vsw_next_milestone(ldcp);
3573 		break;
3574 
3575 	default:
3576 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
3577 			attr_pkt->tag.vio_subtype);
3578 	}
3579 
3580 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
3581 }
3582 
3583 /*
3584  * Process a dring info packet. We can end up here either because our peer
3585  * has ACK/NACK'ed back to an earlier DRING msg we had sent it, or our
3586  * peer has sent us a dring INFO message.
3587  *
3588  * If we get a valid/acceptable INFO packet (and we have already negotiated
3589  * a version) we ACK back and update the lane state, otherwise we NACK back.
3590  *
 * FUTURE: nothing to stop client from sending us info on multiple drings,
 * but for the moment we will just use the first one we are given.
3593  *
3594  */
3595 void
3596 vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt)
3597 {
3598 	vio_dring_reg_msg_t	*dring_pkt;
3599 	vsw_t			*vswp = ldcp->ldc_vswp;
3600 	ldc_mem_info_t		minfo;
3601 	dring_info_t		*dp, *dbp;
3602 	int			dring_found = 0;
3603 
3604 	/*
3605 	 * We know this is a ctrl/dring packet so
3606 	 * cast it into the correct structure.
3607 	 */
3608 	dring_pkt = (vio_dring_reg_msg_t *)pkt;
3609 
3610 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
3611 
3612 	switch (dring_pkt->tag.vio_subtype) {
3613 	case VIO_SUBTYPE_INFO:
3614 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3615 
3616 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
3617 			return;
3618 
3619 		/*
3620 		 * If the dring params are unacceptable then we NACK back.
3621 		 */
3622 		if (vsw_check_dring_info(dring_pkt)) {
3623 
3624 			DERR(vswp, "%s (%lld): invalid dring info",
3625 				__func__, ldcp->ldc_id);
3626 
3627 			vsw_free_lane_resources(ldcp, INBOUND);
3628 
3629 			dring_pkt->tag.vio_sid = ldcp->local_session;
3630 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
3631 
3632 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
3633 
3634 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
3635 
3636 			vsw_send_msg(ldcp, (void *)dring_pkt,
3637 					sizeof (vio_dring_reg_msg_t));
3638 
3639 			vsw_next_milestone(ldcp);
3640 			return;
3641 		}
3642 
3643 		/*
3644 		 * Otherwise, attempt to map in the dring using the
3645 		 * cookie. If that succeeds we send back a unique dring
3646 		 * identifier that the sending side will use in future
3647 		 * to refer to this descriptor ring.
3648 		 */
3649 		dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
3650 
3651 		dp->num_descriptors = dring_pkt->num_descriptors;
3652 		dp->descriptor_size = dring_pkt->descriptor_size;
3653 		dp->options = dring_pkt->options;
3654 		dp->ncookies = dring_pkt->ncookies;
3655 
3656 		/*
3657 		 * Note: should only get one cookie. Enforced in
3658 		 * the ldc layer.
3659 		 */
3660 		bcopy(&dring_pkt->cookie[0], &dp->cookie[0],
3661 			sizeof (ldc_mem_cookie_t));
3662 
3663 		D2(vswp, "%s: num_desc %ld : desc_size %ld", __func__,
3664 			dp->num_descriptors, dp->descriptor_size);
3665 		D2(vswp, "%s: options 0x%lx: ncookies %ld", __func__,
3666 			dp->options, dp->ncookies);
3667 
3668 		if ((ldc_mem_dring_map(ldcp->ldc_handle, &dp->cookie[0],
3669 			dp->ncookies, dp->num_descriptors,
3670 			dp->descriptor_size, LDC_SHADOW_MAP,
3671 			&(dp->handle))) != 0) {
3672 
3673 			DERR(vswp, "%s: dring_map failed\n", __func__);
3674 
3675 			kmem_free(dp, sizeof (dring_info_t));
3676 			vsw_free_lane_resources(ldcp, INBOUND);
3677 
3678 			dring_pkt->tag.vio_sid = ldcp->local_session;
3679 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
3680 
3681 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
3682 
3683 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
3684 			vsw_send_msg(ldcp, (void *)dring_pkt,
3685 				sizeof (vio_dring_reg_msg_t));
3686 
3687 			vsw_next_milestone(ldcp);
3688 			return;
3689 		}
3690 
3691 		if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
3692 
3693 			DERR(vswp, "%s: dring_addr failed\n", __func__);
3694 
3695 			kmem_free(dp, sizeof (dring_info_t));
3696 			vsw_free_lane_resources(ldcp, INBOUND);
3697 
3698 			dring_pkt->tag.vio_sid = ldcp->local_session;
3699 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
3700 
3701 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
3702 
3703 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
3704 			vsw_send_msg(ldcp, (void *)dring_pkt,
3705 				sizeof (vio_dring_reg_msg_t));
3706 
3707 			vsw_next_milestone(ldcp);
3708 			return;
3709 		} else {
3710 			/* store the address of the pub part of ring */
3711 			dp->pub_addr = minfo.vaddr;
3712 		}
3713 
3714 		/* no private section as we are importing */
3715 		dp->priv_addr = NULL;
3716 
3717 		/*
		 * Using a simple monotonically increasing int for the
		 * ident at the moment.
3720 		 */
3721 		dp->ident = ldcp->next_ident;
3722 		ldcp->next_ident++;
3723 
3724 		dp->end_idx = 0;
3725 		dp->next = NULL;
3726 
3727 		/*
3728 		 * Link it onto the end of the list of drings
3729 		 * for this lane.
3730 		 */
3731 		if (ldcp->lane_in.dringp == NULL) {
3732 			D2(vswp, "%s: adding first INBOUND dring", __func__);
3733 			ldcp->lane_in.dringp = dp;
3734 		} else {
3735 			dbp = ldcp->lane_in.dringp;
3736 
3737 			while (dbp->next != NULL)
3738 				dbp = dbp->next;
3739 
3740 			dbp->next = dp;
3741 		}
3742 
3743 		/* acknowledge it */
3744 		dring_pkt->tag.vio_sid = ldcp->local_session;
3745 		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3746 		dring_pkt->dring_ident = dp->ident;
3747 
3748 		vsw_send_msg(ldcp, (void *)dring_pkt,
3749 				sizeof (vio_dring_reg_msg_t));
3750 
3751 		ldcp->lane_in.lstate |= VSW_DRING_ACK_SENT;
3752 		vsw_next_milestone(ldcp);
3753 		break;
3754 
3755 	case VIO_SUBTYPE_ACK:
3756 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3757 
3758 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV))
3759 			return;
3760 
3761 		/*
3762 		 * Peer is acknowledging our dring info and will have
3763 		 * sent us a dring identifier which we will use to
3764 		 * refer to this ring w.r.t. our peer.
3765 		 */
3766 		dp = ldcp->lane_out.dringp;
3767 		if (dp != NULL) {
3768 			/*
3769 			 * Find the ring this ident should be associated
3770 			 * with.
3771 			 */
			while (dp != NULL) {
				if (vsw_dring_match(dp, dring_pkt)) {
					dring_found = 1;
					break;
				}
				dp = dp->next;
			}
3782 
3783 			if (dring_found == 0) {
3784 				DERR(NULL, "%s: unrecognised ring cookie",
3785 					__func__);
3786 				vsw_restart_handshake(ldcp);
3787 				return;
3788 			}
3789 
3790 		} else {
3791 			DERR(vswp, "%s: DRING ACK received but no drings "
3792 				"allocated", __func__);
3793 			vsw_restart_handshake(ldcp);
3794 			return;
3795 		}
3796 
3797 		/* store ident */
3798 		dp->ident = dring_pkt->dring_ident;
3799 		ldcp->lane_out.lstate |= VSW_DRING_ACK_RECV;
3800 		vsw_next_milestone(ldcp);
3801 		break;
3802 
3803 	case VIO_SUBTYPE_NACK:
3804 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3805 
3806 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV))
3807 			return;
3808 
3809 		ldcp->lane_out.lstate |= VSW_DRING_NACK_RECV;
3810 		vsw_next_milestone(ldcp);
3811 		break;
3812 
3813 	default:
3814 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3815 			dring_pkt->tag.vio_subtype);
3816 	}
3817 
3818 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
3819 }
3820 
3821 /*
3822  * Process a request from peer to unregister a dring.
3823  *
3824  * For the moment we just restart the handshake if our
3825  * peer endpoint attempts to unregister a dring.
3826  */
3827 void
3828 vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt)
3829 {
3830 	vsw_t			*vswp = ldcp->ldc_vswp;
3831 	vio_dring_unreg_msg_t	*dring_pkt;
3832 
3833 	/*
3834 	 * We know this is a ctrl/dring packet so
3835 	 * cast it into the correct structure.
3836 	 */
3837 	dring_pkt = (vio_dring_unreg_msg_t *)pkt;
3838 
3839 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3840 
3841 	switch (dring_pkt->tag.vio_subtype) {
3842 	case VIO_SUBTYPE_INFO:
3843 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3844 
3845 		DWARN(vswp, "%s: restarting handshake..", __func__);
3846 		vsw_restart_handshake(ldcp);
3847 		break;
3848 
3849 	case VIO_SUBTYPE_ACK:
3850 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3851 
3852 		DWARN(vswp, "%s: restarting handshake..", __func__);
3853 		vsw_restart_handshake(ldcp);
3854 		break;
3855 
3856 	case VIO_SUBTYPE_NACK:
3857 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3858 
3859 		DWARN(vswp, "%s: restarting handshake..", __func__);
3860 		vsw_restart_handshake(ldcp);
3861 		break;
3862 
3863 	default:
3864 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3865 			dring_pkt->tag.vio_subtype);
3866 		vsw_restart_handshake(ldcp);
3867 	}
3868 
3869 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3870 }
3871 
3872 #define	SND_MCST_NACK(ldcp, pkt) \
3873 	do { pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \
3874 	pkt->tag.vio_sid = ldcp->local_session; \
3875 	vsw_send_msg(ldcp, (void *)pkt, sizeof (vnet_mcast_msg_t)); } while (0)
3876 
3877 /*
3878  * Process a multicast request from a vnet.
3879  *
3880  * Vnets specify a multicast address that they are interested in. This
3881  * address is used as a key into the hash table which forms the multicast
3882  * forwarding database (mFDB).
3883  *
3884  * The table keys are the multicast addresses, while the table entries
3885  * are pointers to lists of ports which wish to receive packets for the
3886  * specified multicast address.
3887  *
3888  * When a multicast packet is being switched we use the address as a key
3889  * into the hash table, and then walk the appropriate port list forwarding
3890  * the pkt to each port in turn.
3891  *
3892  * If a vnet is no longer interested in a particular multicast grouping
3893  * we simply find the correct location in the hash table and then delete
3894  * the relevant port from the port list.
3895  *
3896  * To deal with the case where a port is being deleted without first
3897  * removing itself from the lists in the hash table, we maintain a list
3898  * of multicast addresses the port has registered an interest in, within
3899  * the port structure itself. We then simply walk that list of addresses
3900  * using them as keys into the hash table and remove the port from the
3901  * appropriate lists.
3902  */
3903 static void
3904 vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt)
3905 {
3906 	vnet_mcast_msg_t	*mcst_pkt;
3907 	vsw_port_t		*port = ldcp->ldc_port;
3908 	vsw_t			*vswp = ldcp->ldc_vswp;
3909 	int			i;
3910 
3911 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3912 
3913 	/*
3914 	 * We know this is a ctrl/mcast packet so
3915 	 * cast it into the correct structure.
3916 	 */
3917 	mcst_pkt = (vnet_mcast_msg_t *)pkt;
3918 
3919 	switch (mcst_pkt->tag.vio_subtype) {
3920 	case VIO_SUBTYPE_INFO:
3921 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3922 
3923 		/*
3924 		 * Check if in correct state to receive a multicast
3925 		 * message (i.e. handshake complete). If not reset
3926 		 * the handshake.
3927 		 */
3928 		if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV))
3929 			return;
3930 
3931 		/*
3932 		 * Before attempting to add or remove the addresses, check
3933 		 * that they are valid multicast addresses.
3934 		 * If not, then NACK back.
3935 		 */
3936 		for (i = 0; i < mcst_pkt->count; i++) {
3937 			if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) {
3938 				DERR(vswp, "%s: invalid multicast address",
3939 								__func__);
3940 				SND_MCST_NACK(ldcp, mcst_pkt);
3941 				return;
3942 			}
3943 		}
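		/*
		 * Example (annotation, not part of the original source):
		 * the check above tests the IEEE 802 group bit, the least
		 * significant bit of the first address octet.  A group
		 * address such as 01:00:5e:00:00:01 passes, while a
		 * unicast address such as 00:14:4f:00:00:01 is NACKed.
		 */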
3944 
3945 		/*
3946 		 * Now add/remove the addresses. If this fails we
3947 		 * NACK back.
3948 		 */
3949 		if (vsw_add_rem_mcst(mcst_pkt, port) != 0) {
3950 			SND_MCST_NACK(ldcp, mcst_pkt);
3951 			return;
3952 		}
3953 
3954 		mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3955 		mcst_pkt->tag.vio_sid = ldcp->local_session;
3956 
3957 		DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt);
3958 
3959 		vsw_send_msg(ldcp, (void *)mcst_pkt,
3960 					sizeof (vnet_mcast_msg_t));
3961 		break;
3962 
3963 	case VIO_SUBTYPE_ACK:
3964 		DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3965 
3966 		/*
3967 		 * We shouldn't ever get a multicast ACK message as
3968 		 * at the moment we never request multicast addresses
3969 		 * to be set on some other device. This may change in
3970 		 * the future if we have cascading switches.
3971 		 */
3972 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV))
3973 			return;
3974 
3975 		/* Do nothing */
3976 		break;
3977 
3978 	case VIO_SUBTYPE_NACK:
3979 		DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3980 
3981 		/*
3982 		 * We shouldn't get a multicast NACK packet for the
3983 		 * same reasons that we shouldn't get an ACK packet.
3984 		 */
3985 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV))
3986 			return;
3987 
3988 				/* Do nothing */
3989 		/* Do nothing */
3990 
3991 	default:
3992 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
3993 			mcst_pkt->tag.vio_subtype);
3994 	}
3995 
3996 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3997 }
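/*
 * Illustrative sketch (annotation, not part of the original source):
 * given the mFDB layout described above, forwarding a multicast frame
 * reduces to one hash lookup followed by a walk of the port chain.
 * The helper name below is hypothetical; the types and fields used
 * (mfdb_ent_t, nextp, d_type, d_addr) are the ones vsw_forward_grp()
 * uses later in this file.
 */
static void
vsw_mfdb_forward_sketch(vsw_t *vswp, uint64_t key, mblk_t *mp)
{
	mfdb_ent_t	*entp = NULL;
	mfdb_ent_t	*tpp;
	mblk_t		*nmp;

	READ_ENTER(&vswp->mfdbrw);
	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)key,
	    (mod_hash_val_t *)&entp) == 0) {
		/* send a copy to each vnet port on the chain */
		for (tpp = entp; tpp != NULL; tpp = tpp->nextp) {
			if (tpp->d_type != VSW_VNETPORT)
				continue;
			if ((nmp = dupmsg(mp)) != NULL)
				(void) vsw_portsend(
				    (vsw_port_t *)tpp->d_addr, nmp);
		}
	}
	RW_EXIT(&vswp->mfdbrw);
	freemsg(mp);
}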
3998 
3999 static void
4000 vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt)
4001 {
4002 	vio_rdx_msg_t	*rdx_pkt;
4003 	vsw_t		*vswp = ldcp->ldc_vswp;
4004 
4005 	/*
4006 	 * We know this is a ctrl/rdx packet so
4007 	 * cast it into the correct structure.
4008 	 */
4009 	rdx_pkt = (vio_rdx_msg_t *)pkt;
4010 
4011 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
4012 
4013 	switch (rdx_pkt->tag.vio_subtype) {
4014 	case VIO_SUBTYPE_INFO:
4015 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
4016 
4017 		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_INFO_RECV))
4018 			return;
4019 
4020 		rdx_pkt->tag.vio_sid = ldcp->local_session;
4021 		rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
4022 
4023 		DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt);
4024 
4025 		ldcp->lane_in.lstate |= VSW_RDX_ACK_SENT;
4026 
4027 		vsw_send_msg(ldcp, (void *)rdx_pkt,
4028 				sizeof (vio_rdx_msg_t));
4029 
4030 		vsw_next_milestone(ldcp);
4031 		break;
4032 
4033 	case VIO_SUBTYPE_ACK:
4034 		/*
4035 		 * Should be handled in-band by callback handler.
4036 		 */
4037 		DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__);
4038 		vsw_restart_handshake(ldcp);
4039 		break;
4040 
4041 	case VIO_SUBTYPE_NACK:
4042 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
4043 
4044 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_NACK_RECV))
4045 			return;
4046 
4047 		ldcp->lane_out.lstate |= VSW_RDX_NACK_RECV;
4048 		vsw_next_milestone(ldcp);
4049 		break;
4050 
4051 	default:
4052 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
4053 			rdx_pkt->tag.vio_subtype);
4054 	}
4055 
4056 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4057 }
4058 
4059 static void
4060 vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t tag)
4061 {
4062 	uint16_t	env = tag.vio_subtype_env;
4063 	vsw_t		*vswp = ldcp->ldc_vswp;
4064 
4065 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4066 
4067 	/* session id check */
4068 	if (ldcp->session_status & VSW_PEER_SESSION) {
4069 		if (ldcp->peer_session != tag.vio_sid) {
4070 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
4071 				__func__, ldcp->ldc_id, tag.vio_sid);
4072 			vsw_restart_handshake(ldcp);
4073 			return;
4074 		}
4075 	}
4076 
4077 	/*
4078 	 * It is an error for us to be getting data packets
4079 	 * before the handshake has completed.
4080 	 */
4081 	if (ldcp->hphase != VSW_MILESTONE4) {
4082 		DERR(vswp, "%s: got data packet before handshake complete "
4083 			"hphase %d (%x: %x)", __func__, ldcp->hphase,
4084 			ldcp->lane_in.lstate, ldcp->lane_out.lstate);
4085 		DUMP_FLAGS(ldcp->lane_in.lstate);
4086 		DUMP_FLAGS(ldcp->lane_out.lstate);
4087 		vsw_restart_handshake(ldcp);
4088 		return;
4089 	}
4090 
4091 	/*
4092 	 * Switch on vio_subtype envelope, then let lower routines
4093 	 * decide if it's an INFO, ACK or NACK packet.
4094 	 */
4095 	if (env == VIO_DRING_DATA) {
4096 		vsw_process_data_dring_pkt(ldcp, dpkt);
4097 	} else if (env == VIO_PKT_DATA) {
4098 		vsw_process_data_raw_pkt(ldcp, dpkt);
4099 	} else if (env == VIO_DESC_DATA) {
4100 		vsw_process_data_ibnd_pkt(ldcp, dpkt);
4101 	} else {
4102 		DERR(vswp, "%s : unknown vio_subtype_env (%x)\n",
4103 							__func__, env);
4104 	}
4105 
4106 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4107 }
4108 
4109 #define	SND_DRING_NACK(ldcp, pkt) \
4110 	do { pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \
4111 	pkt->tag.vio_sid = ldcp->local_session; \
4112 	vsw_send_msg(ldcp, (void *)pkt, sizeof (vio_dring_msg_t)); } while (0)
4113 
4114 static void
4115 vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt)
4116 {
4117 	vio_dring_msg_t		*dring_pkt;
4118 	vnet_public_desc_t	*pub_addr = NULL;
4119 	vsw_private_desc_t	*priv_addr = NULL;
4120 	dring_info_t		*dp = NULL;
4121 	vsw_t			*vswp = ldcp->ldc_vswp;
4122 	mblk_t			*mp = NULL;
4123 	mblk_t			*bp = NULL;
4124 	mblk_t			*bpt = NULL;
4125 	size_t			nbytes = 0;
4126 	size_t			off = 0;
4127 	uint64_t		ncookies = 0;
4128 	uint64_t		chain = 0;
4129 	uint64_t		j, len, num;
4130 	uint32_t		start, end, datalen;
4131 	int			i, last_sync, rv;
4132 	boolean_t		ack_needed = B_FALSE;
4133 	boolean_t		sync_needed = B_TRUE;
4134 
4135 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4136 
4137 	/*
4138 	 * We know this is a data/dring packet so
4139 	 * cast it into the correct structure.
4140 	 */
4141 	dring_pkt = (vio_dring_msg_t *)dpkt;
4142 
4143 	/*
4144 	 * Switch on the vio_subtype. If it's INFO then we need to
4145 	 * process the data. If it's an ACK we need to make sure
4146 	 * it makes sense (i.e. did we send an earlier data/info),
4147 	 * and if it's a NACK then we may attempt a retry.
4148 	 */
4149 	switch (dring_pkt->tag.vio_subtype) {
4150 	case VIO_SUBTYPE_INFO:
4151 		D2(vswp, "%s(%lld): VIO_SUBTYPE_INFO", __func__, ldcp->ldc_id);
4152 
4153 		if ((dp = vsw_ident2dring(&ldcp->lane_in,
4154 				dring_pkt->dring_ident)) == NULL) {
4155 
4156 			DERR(vswp, "%s(%lld): unable to find dring from "
4157 				"ident 0x%llx", __func__, ldcp->ldc_id,
4158 				dring_pkt->dring_ident);
4159 
4160 			SND_DRING_NACK(ldcp, dring_pkt);
4161 			return;
4162 		}
4163 
4164 		start = end = 0;
4165 		start = dring_pkt->start_idx;
4166 		end = dring_pkt->end_idx;
4167 
4168 		D3(vswp, "%s(%lld): start index %ld : end %ld\n",
4169 			__func__, ldcp->ldc_id, start, end);
4170 
4171 		/* basic sanity check */
4172 		len = dp->num_descriptors;
4173 		if (start >= len || end >= len) {
4174 			DERR(vswp, "%s(%lld): index range (%lld, %lld)"
4175 				" outside ring length %lld", __func__,
4176 				ldcp->ldc_id, start, end, len);
4177 
4178 			SND_DRING_NACK(ldcp, dring_pkt);
4179 			return;
4180 		}
4181 
4182 		/* sync data */
4183 		if ((rv = ldc_mem_dring_acquire(dp->handle,
4184 						start, end)) != 0) {
4185 			DERR(vswp, "%s(%lld): unable to acquire dring : err %d",
4186 				__func__, ldcp->ldc_id, rv);
4187 			return;
4188 		}
4189 
4190 		pub_addr = (vnet_public_desc_t *)dp->pub_addr;
4191 
4192 		j = num = 0;
4193 
4194 		/* calculate # descriptors, taking wrap-around into account */
4195 		num = end >= start ? end - start + 1 : (len - start + 1) + end;
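		/*
		 * Worked example (annotation): with len = 8, start = 6 and
		 * end = 1 the range wraps, giving (8 - 6 + 1) + 1 = 4
		 * descriptors, namely 6, 7, 0 and 1.
		 */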
4196 
4197 		last_sync = start;
4198 
4199 		for (i = start; j < num; i = (i + 1) % len, j++) {
4200 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
4201 
4202 			/*
4203 			 * Data is padded to align on an 8 byte boundary,
4204 			 * datalen is actual data length, i.e. minus that
4205 			 * padding.
4206 			 */
4207 			datalen = pub_addr->nbytes;
4208 
4209 			/*
4210 			 * Does peer wish us to ACK when we have finished
4211 			 * with this descriptor ?
4212 			 */
4213 			if (pub_addr->hdr.ack)
4214 				ack_needed = B_TRUE;
4215 
4216 			D2(vswp, "%s(%lld): processing desc %lld at pos"
4217 				" 0x%llx : dstate 0x%lx : datalen 0x%lx",
4218 				__func__, ldcp->ldc_id, i, pub_addr,
4219 				pub_addr->hdr.dstate, datalen);
4220 
4221 			/*
4222 			 * XXXX : Is it a fatal error to be told to
4223 			 * process a packet when the READY bit is not
4224 			 * set ?
4225 			 */
4226 			if (pub_addr->hdr.dstate != VIO_DESC_READY) {
4227 				DERR(vswp, "%s(%d): descriptor %lld at pos "
4228 				" 0x%llx not READY (0x%lx)", __func__,
4229 				ldcp->ldc_id, i, pub_addr,
4230 				pub_addr->hdr.dstate);
4231 
4232 				SND_DRING_NACK(ldcp, dring_pkt);
4233 				(void) ldc_mem_dring_release(dp->handle,
4234 					start, end);
4235 				return;
4236 			}
4237 
4238 			/*
4239 			 * Mark that we are starting to process descriptor.
4240 			 */
4241 			pub_addr->hdr.dstate = VIO_DESC_ACCEPTED;
4242 
4243 			/*
4244 			 * allocb(9F) returns an aligned data block. We
4245 			 * need to ensure that we ask ldc for an aligned
4246 			 * number of bytes also.
4247 			 */
4248 			nbytes = datalen;
4249 			if (nbytes & 0x7) {
4250 				off = 8 - (nbytes & 0x7);
4251 				nbytes += off;
4252 			}
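			/*
			 * Example (annotation): datalen = 61 gives
			 * off = 8 - (61 & 0x7) = 3, so nbytes = 64.  The
			 * receive buffer below is sized to this rounded-up
			 * length so the aligned copy cannot overrun it.
			 */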
4253 			mp = allocb(nbytes, BPRI_MED);
4254 			if (mp == NULL) {
4255 				DERR(vswp, "%s(%lld): allocb failed",
4256 					__func__, ldcp->ldc_id);
4257 				(void) ldc_mem_dring_release(dp->handle,
4258 					start, end);
4259 				return;
4260 			}
4261 
4262 			ncookies = pub_addr->ncookies;
4263 			rv = ldc_mem_copy(ldcp->ldc_handle,
4264 				(caddr_t)mp->b_rptr, 0, &nbytes,
4265 				pub_addr->memcookie, ncookies,
4266 				LDC_COPY_IN);
4267 
4268 			if (rv != 0) {
4269 				DERR(vswp, "%s(%d): unable to copy in "
4270 					"data from %d cookies", __func__,
4271 					ldcp->ldc_id, ncookies);
4272 				freemsg(mp);
4273 				(void) ldc_mem_dring_release(dp->handle,
4274 					start, end);
4275 				return;
4276 			} else {
4277 				D2(vswp, "%s(%d): copied in %ld bytes"
4278 					" using %d cookies", __func__,
4279 					ldcp->ldc_id, nbytes, ncookies);
4280 			}
4281 
4282 			/* point to the actual end of data */
4283 			mp->b_wptr = mp->b_rptr + datalen;
4284 
4285 			/* build a chain of received packets */
4286 			if (bp == NULL) {
4287 				/* first pkt */
4288 				bp = mp;
4289 				bp->b_next = bp->b_prev = NULL;
4290 				bpt = bp;
4291 				chain = 1;
4292 			} else {
4293 				mp->b_next = NULL;
4294 				mp->b_prev = bpt;
4295 				bpt->b_next = mp;
4296 				bpt = mp;
4297 				chain++;
4298 			}
4299 
4300 			/* mark we are finished with this descriptor */
4301 			pub_addr->hdr.dstate = VIO_DESC_DONE;
4302 
4303 			/*
4304 			 * Send an ACK back to peer if requested, and sync
4305 			 * the rings up to this point so the remote side sees
4306 			 * the descriptor flag in a consistent state.
4307 			 */
4308 			if (ack_needed) {
4309 				if ((rv = ldc_mem_dring_release(
4310 					dp->handle, last_sync, i)) != 0) {
4311 					DERR(vswp, "%s(%lld): unable to sync"
4312 						" from %d to %d", __func__,
4313 						ldcp->ldc_id, last_sync, i);
4314 				}
4315 
4316 				ack_needed = B_FALSE;
4317 
4318 				if (i == end)
4319 					sync_needed = B_FALSE;
4320 				else
4321 					sync_needed = B_TRUE;
4322 
4323 				last_sync = (i + 1) % len;
4324 
4325 				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
4326 				dring_pkt->tag.vio_sid = ldcp->local_session;
4327 				vsw_send_msg(ldcp, (void *)dring_pkt,
4328 					sizeof (vio_dring_msg_t));
4329 			}
4330 		}
4331 
4332 		if (sync_needed) {
4333 			if ((rv = ldc_mem_dring_release(dp->handle,
4334 					last_sync, end)) != 0) {
4335 				DERR(vswp, "%s(%lld): unable to sync"
4336 					" from %d to %d", __func__,
4337 					ldcp->ldc_id, last_sync, end);
4338 			}
4339 		}
4340 
4341 		/* send the chain of packets to be switched */
4342 		D3(vswp, "%s(%lld): switching chain of %d msgs", __func__,
4343 			ldcp->ldc_id, chain);
4344 		vsw_switch_frame(vswp, bp, VSW_VNETPORT,
4345 					ldcp->ldc_port, NULL);
4346 
4347 		break;
4348 
4349 	case VIO_SUBTYPE_ACK:
4350 		D2(vswp, "%s(%lld): VIO_SUBTYPE_ACK", __func__, ldcp->ldc_id);
4351 		/*
4352 		 * Verify that the relevant descriptors are all
4353 		 * marked as DONE
4354 		 */
4355 		if ((dp = vsw_ident2dring(&ldcp->lane_out,
4356 			dring_pkt->dring_ident)) == NULL) {
4357 			DERR(vswp, "%s: unknown ident in ACK", __func__);
4358 			return;
4359 		}
4360 
4361 		pub_addr = (vnet_public_desc_t *)dp->pub_addr;
4362 		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
4363 
4364 		start = end = 0;
4365 		start = dring_pkt->start_idx;
4366 		end = dring_pkt->end_idx;
4367 		len = dp->num_descriptors;
4368 
4369 
4370 		j = num = 0;
4371 		/* calculate # descriptors, taking wrap-around into account */
4372 		num = end >= start ? end - start + 1 : (len - start + 1) + end;
4373 
4374 		D2(vswp, "%s(%lld): start index %ld : end %ld : num %ld\n",
4375 			__func__, ldcp->ldc_id, start, end, num);
4376 
4377 		for (i = start; j < num; i = (i + 1) % len, j++) {
4378 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
4379 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
4380 
4381 			if (pub_addr->hdr.dstate != VIO_DESC_DONE) {
4382 				DERR(vswp, "%s: descriptor %lld at pos "
4383 					" 0x%llx not DONE (0x%lx)\n", __func__,
4384 					i, pub_addr, pub_addr->hdr.dstate);
4385 				return;
4386 			} else {
4387 				/* clear all the fields */
4388 				bzero(priv_addr->datap, priv_addr->datalen);
4389 				priv_addr->datalen = 0;
4390 
4391 				pub_addr->hdr.dstate = VIO_DESC_FREE;
4392 				pub_addr->hdr.ack = 0;
4393 				priv_addr->dstate = VIO_DESC_FREE;
4394 
4395 				D3(vswp, "clearing descp %d : pub state "
4396 					"0x%llx : priv state 0x%llx", i,
4397 					pub_addr->hdr.dstate,
4398 					priv_addr->dstate);
4399 			}
4400 		}
4401 
4402 		break;
4403 
4404 	case VIO_SUBTYPE_NACK:
4405 		DWARN(vswp, "%s(%lld): VIO_SUBTYPE_NACK",
4406 						__func__, ldcp->ldc_id);
4407 		/*
4408 		 * Something is badly wrong if we are getting NACK's
4409 		 * for our data pkts. So reset the channel.
4410 		 */
4411 		vsw_restart_handshake(ldcp);
4412 
4413 		break;
4414 
4415 	default:
4416 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
4417 			ldcp->ldc_id, dring_pkt->tag.vio_subtype);
4418 	}
4419 
4420 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
4421 }
4422 
4423 /*
4424  * VIO_PKT_DATA (a.k.a. raw data mode)
4425  *
4426  * Note - currently not supported. Do nothing.
4427  */
4428 static void
4429 vsw_process_data_raw_pkt(vsw_ldc_t *ldcp, void *dpkt)
4430 {
4431 	_NOTE(ARGUNUSED(dpkt))
4432 
4433 	D1(NULL, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
4434 
4435 	DERR(NULL, "%s (%lld): currently not supported",
4436 						__func__, ldcp->ldc_id);
4437 
4438 	D1(NULL, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
4439 }
4440 
4441 #define	SND_IBND_DESC_NACK(ldcp, pkt) \
4442 	do { pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \
4443 	pkt->tag.vio_sid = ldcp->local_session; \
4444 	vsw_send_msg(ldcp, (void *)pkt, sizeof (vio_ibnd_desc_t)); } while (0)
4445 
4446 /*
4447  * Process an in-band descriptor message (most likely from
4448  * OBP).
4449  */
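/*
 * Annotation (not part of the original source): in this in-band
 * ("descriptor") mode the memory cookies describing the data travel in
 * the message itself rather than through a shared descriptor ring.  The
 * hdr.desc_handle field carries the index of the matching entry in the
 * sender's private ring, which is why the ACK and NACK handlers below
 * range-check it against VSW_RING_NUM_EL before using it.
 */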
4450 static void
4451 vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt)
4452 {
4453 	vio_ibnd_desc_t		*ibnd_desc;
4454 	dring_info_t		*dp = NULL;
4455 	vsw_private_desc_t	*priv_addr = NULL;
4456 	vsw_t			*vswp = ldcp->ldc_vswp;
4457 	mblk_t			*mp = NULL;
4458 	size_t			nbytes = 0;
4459 	size_t			off = 0;
4460 	uint64_t		idx = 0;
4461 	uint32_t		datalen = 0;
4462 	uint64_t		ncookies = 0;
4463 	int			rv;
4464 
4465 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4466 
4467 	ibnd_desc = (vio_ibnd_desc_t *)pkt;
4468 
4469 	switch (ibnd_desc->hdr.tag.vio_subtype) {
4470 	case VIO_SUBTYPE_INFO:
4471 		D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
4472 
4473 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
4474 			return;
4475 
4476 		/*
4477 		 * Data is padded to align on an 8 byte boundary; the
4478 		 * message's nbytes field is the actual data length,
4479 		 * i.e. minus that padding.
4480 		 */
4481 		datalen = ibnd_desc->nbytes;
4482 
4483 		D2(vswp, "%s(%lld): processing inband desc :"
4484 			" datalen 0x%lx", __func__, ldcp->ldc_id, datalen);
4485 
4486 		ncookies = ibnd_desc->ncookies;
4487 
4488 		/*
4489 		 * allocb(9F) returns an aligned data block. We
4490 		 * need to ensure that we ask ldc for an aligned
4491 		 * number of bytes also.
4492 		 */
4493 		nbytes = datalen;
4494 		if (nbytes & 0x7) {
4495 			off = 8 - (nbytes & 0x7);
4496 			nbytes += off;
4497 		}
4498 
4499 		mp = allocb(nbytes, BPRI_MED);
4500 		if (mp == NULL) {
4501 			DERR(vswp, "%s(%lld): allocb failed",
4502 					__func__, ldcp->ldc_id);
4503 			return;
4504 		}
4505 
4506 		rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr,
4507 			0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies,
4508 			LDC_COPY_IN);
4509 
4510 		if (rv != 0) {
4511 			DERR(vswp, "%s(%d): unable to copy in data from "
4512 				"%d cookie(s)", __func__,
4513 				ldcp->ldc_id, ncookies);
4514 			freemsg(mp);
4515 			return;
4516 		} else {
4517 			D2(vswp, "%s(%d): copied in %ld bytes using %d "
4518 				"cookies", __func__, ldcp->ldc_id, nbytes,
4519 				ncookies);
4520 		}
4521 
4522 		/* point to the actual end of data */
4523 		mp->b_wptr = mp->b_rptr + datalen;
4524 
4525 		/*
4526 		 * We ACK back every in-band descriptor message we process
4527 		 */
4528 		ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK;
4529 		ibnd_desc->hdr.tag.vio_sid = ldcp->local_session;
4530 		vsw_send_msg(ldcp, (void *)ibnd_desc,
4531 				sizeof (vio_ibnd_desc_t));
4532 
4533 		/* send the packet to be switched */
4534 		vsw_switch_frame(vswp, mp, VSW_VNETPORT,
4535 					ldcp->ldc_port, NULL);
4536 
4537 		break;
4538 
4539 	case VIO_SUBTYPE_ACK:
4540 		D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
4541 
4542 		/* Verify the ACK is valid */
4543 		idx = ibnd_desc->hdr.desc_handle;
4544 
4545 		if (idx >= VSW_RING_NUM_EL) {
4546 			cmn_err(CE_WARN, "%s: corrupted ACK received "
4547 				"(idx %ld)", __func__, idx);
4548 			return;
4549 		}
4550 
4551 		if ((dp = ldcp->lane_out.dringp) == NULL) {
4552 			DERR(vswp, "%s: no dring found", __func__);
4553 			return;
4554 		}
4555 
4556 		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
4557 
4558 		/* move to correct location in ring */
4559 		priv_addr += idx;
4560 
4561 		/*
4562 		 * When we sent the in-band message to our peer we
4563 		 * marked the copy in our private ring as READY. We now
4564 		 * check that the descriptor we are being ACK'ed for is in
4565 		 * fact READY, i.e. it is one we have shared with our peer.
4566 		 */
4567 		if (priv_addr->dstate != VIO_DESC_READY) {
4568 			cmn_err(CE_WARN, "%s: (%ld) desc at index %ld not "
4569 				"READY (0x%lx)", __func__, ldcp->ldc_id, idx,
4570 				priv_addr->dstate);
4571 			cmn_err(CE_CONT, "%s: bound %d: ncookies %ld\n",
4572 				__func__, priv_addr->bound,
4573 				priv_addr->ncookies);
4574 			cmn_err(CE_CONT, "datalen %ld\n", priv_addr->datalen);
4575 			return;
4576 		} else {
4577 			D2(vswp, "%s: (%lld) freeing descp at %lld", __func__,
4578 				ldcp->ldc_id, idx);
4579 
4580 			/* release resources associated with sent msg */
4581 			bzero(priv_addr->datap, priv_addr->datalen);
4582 			priv_addr->datalen = 0;
4583 			priv_addr->dstate = VIO_DESC_FREE;
4584 		}
4585 		break;
4586 
4587 	case VIO_SUBTYPE_NACK:
4588 		DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
4589 
4590 		/*
4591 		 * We should only get a NACK if our peer doesn't like
4592 		 * something about a message we have sent it. If this
4593 		 * happens we just release the resources associated with
4594 		 * the message. (We are relying on higher layers to decide
4595 		 * whether or not to resend.)
4596 		 */
4597 
4598 		/* limit check */
4599 		idx = ibnd_desc->hdr.desc_handle;
4600 
4601 		if (idx >= VSW_RING_NUM_EL) {
4602 			DERR(vswp, "%s: corrupted NACK received (idx %lld)",
4603 				__func__, idx);
4604 			return;
4605 		}
4606 
4607 		if ((dp = ldcp->lane_out.dringp) == NULL) {
4608 			DERR(vswp, "%s: no dring found", __func__);
4609 			return;
4610 		}
4611 
4612 		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
4613 
4614 		/* move to correct location in ring */
4615 		priv_addr += idx;
4616 
4617 		/* release resources associated with sent msg */
4618 		bzero(priv_addr->datap, priv_addr->datalen);
4619 		priv_addr->datalen = 0;
4620 		priv_addr->dstate = VIO_DESC_FREE;
4621 
4622 		break;
4623 
4624 	default:
4625 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
4626 			ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype);
4627 	}
4628 
4629 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
4630 }
4631 
4632 static void
4633 vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t tag)
4634 {
4635 	_NOTE(ARGUNUSED(epkt))
4636 
4637 	vsw_t		*vswp = ldcp->ldc_vswp;
4638 	uint16_t	env = tag.vio_subtype_env;
4639 
4640 	D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
4641 
4642 	/*
4643 	 * Error vio_subtypes have yet to be defined. So for
4644 	 * the moment we can't do anything.
4645 	 */
4646 	D2(vswp, "%s: (%x) vio_subtype env", __func__, env);
4647 
4648 	D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
4649 }
4650 
4651 /*
4652  * Switch the given ethernet frame when operating in layer 2 mode.
4653  *
4654  * vswp: pointer to the vsw instance
4655  * mp: pointer to chain of ethernet frame(s) to be switched
4656  * caller: identifies the source of this frame as:
4657  * 		1. VSW_VNETPORT - a vsw port (connected to a vnet).
4658  *		2. VSW_PHYSDEV - the physical ethernet device
4659  *		3. VSW_LOCALDEV - vsw configured as a virtual interface
4660  * arg: argument provided by the caller.
4661  *		1. for VNETPORT - pointer to the corresponding vsw_port_t.
4662  *		2. for PHYSDEV - NULL
4663  *		3. for LOCALDEV - pointer to this vsw_t (self)
4664  */
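/*
 * Usage sketch (annotation, not part of the original source): callers
 * go through the common vsw_switch_frame() entry point, which - an
 * assumption based on the vsw_setup_layer2()/vsw_setup_layer3()
 * routines declared at the top of this file - is directed at the
 * layer-appropriate variant.  For example, the vnet-port receive path
 * seen earlier in this file does:
 *
 *	vsw_switch_frame(vswp, bp, VSW_VNETPORT, ldcp->ldc_port, NULL);
 *
 * while the physical-device receive callback would pass VSW_PHYSDEV
 * with a NULL port argument, per the parameter description above.
 */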
4665 void
4666 vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
4667 			vsw_port_t *arg, mac_resource_handle_t mrh)
4668 {
4669 	struct ether_header	*ehp;
4670 	vsw_port_t		*port = NULL;
4671 	mblk_t			*bp, *ret_m;
4672 	mblk_t			*nmp = NULL;
4673 	vsw_port_list_t		*plist = &vswp->plist;
4674 
4675 	D1(vswp, "%s: enter (caller %d)", __func__, caller);
4676 
4677 	/*
4678 	 * PERF: rather than breaking up the chain here, scan it
4679 	 * to find all mblks heading to same destination and then
4680 	 * pass that sub-chain to the lower transmit functions.
4681 	 */
4682 
4683 	/* process the chain of packets */
4684 	bp = mp;
4685 	while (bp) {
4686 		mp = bp;
4687 		bp = bp->b_next;
4688 		mp->b_next = mp->b_prev = NULL;
4689 		ehp = (struct ether_header *)mp->b_rptr;
4690 
4691 		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
4692 			__func__, MBLKSIZE(mp), MBLKL(mp));
4693 
4694 		READ_ENTER(&vswp->if_lockrw);
4695 		if (ether_cmp(&ehp->ether_dhost, &vswp->if_addr) == 0) {
4696 			/*
4697 			 * If destination is VSW_LOCALDEV (vsw as an eth
4698 			 * interface) and if the device is up & running,
4699 			 * send the packet up the stack on this host.
4700 			 * If the virtual interface is down, drop the packet.
4701 			 */
4702 			if (caller != VSW_LOCALDEV) {
4703 				if (vswp->if_state & VSW_IF_UP) {
4704 					RW_EXIT(&vswp->if_lockrw);
4705 					mac_rx(vswp->if_macp, mrh, mp);
4706 				} else {
4707 					RW_EXIT(&vswp->if_lockrw);
4708 					/* Interface down, drop pkt */
4709 					freemsg(mp);
4710 				}
4711 			} else {
4712 				RW_EXIT(&vswp->if_lockrw);
4713 				freemsg(mp);
4714 			}
4715 			continue;
4716 		}
4717 		RW_EXIT(&vswp->if_lockrw);
4718 
4719 		READ_ENTER(&plist->lockrw);
4720 		port = vsw_lookup_fdb(vswp, ehp);
4721 		if (port) {
4722 			/*
4723 			 * Mark the port as in-use.
4724 			 */
4725 			mutex_enter(&port->ref_lock);
4726 			port->ref_cnt++;
4727 			mutex_exit(&port->ref_lock);
4728 			RW_EXIT(&plist->lockrw);
4729 
4730 			/*
4731 			 * If plumbed and in promisc mode then copy msg
4732 			 * and send up the stack.
4733 			 */
4734 			READ_ENTER(&vswp->if_lockrw);
4735 			if (VSW_U_P(vswp->if_state)) {
4736 				RW_EXIT(&vswp->if_lockrw);
4737 				nmp = copymsg(mp);
4738 				if (nmp)
4739 					mac_rx(vswp->if_macp, mrh, nmp);
4740 			} else {
4741 				RW_EXIT(&vswp->if_lockrw);
4742 			}
4743 
4744 			/*
4745 			 * If the destination is in FDB, the packet
4746 			 * should be forwarded to the corresponding
4747 			 * vsw_port (connected to a vnet device -
4748 			 * VSW_VNETPORT)
4749 			 */
4750 			(void) vsw_portsend(port, mp);
4751 
4752 			/*
4753 			 * Decrement use count in port and check if
4754 			 * should wake delete thread.
4755 			 */
4756 			mutex_enter(&port->ref_lock);
4757 			port->ref_cnt--;
4758 			if (port->ref_cnt == 0)
4759 				cv_signal(&port->ref_cv);
4760 			mutex_exit(&port->ref_lock);
4761 		} else {
4762 			RW_EXIT(&plist->lockrw);
4763 			/*
4764 			 * Destination not in FDB.
4765 			 *
4766 			 * If the destination is broadcast or
4767 			 * multicast forward the packet to all
4768 			 * (VNETPORTs, PHYSDEV, LOCALDEV),
4769 			 * except the caller.
4770 			 */
4771 			if (IS_BROADCAST(ehp)) {
4772 				D3(vswp, "%s: BROADCAST pkt", __func__);
4773 				(void) vsw_forward_all(vswp, mp,
4774 								caller, arg);
4775 			} else if (IS_MULTICAST(ehp)) {
4776 				D3(vswp, "%s: MULTICAST pkt", __func__);
4777 				(void) vsw_forward_grp(vswp, mp,
4778 							caller, arg);
4779 			} else {
4780 				/*
4781 				 * If the destination is unicast, and came
4782 				 * from either a logical network device or
4783 				 * the switch itself when it is plumbed, then
4784 				 * send it out on the physical device and also
4785 				 * up the stack if the logical interface is
4786 				 * in promiscuous mode.
4787 				 *
4788 				 * NOTE: The assumption here is that if we
4789 				 * cannot find the destination in our fdb,
4790 				 * and it is a unicast address that came from
4791 				 * either a vnet or down the stack (when
4792 				 * plumbed), it must be destined for an
4793 				 * ethernet device outside our ldoms.
4794 				 */
4795 				if (caller == VSW_VNETPORT) {
4796 					READ_ENTER(&vswp->if_lockrw);
4797 					if (VSW_U_P(vswp->if_state)) {
4798 						RW_EXIT(&vswp->if_lockrw);
4799 						nmp = copymsg(mp);
4800 						if (nmp)
4801 							mac_rx(vswp->if_macp,
4802 								mrh, nmp);
4803 					} else {
4804 						RW_EXIT(&vswp->if_lockrw);
4805 					}
4806 					if ((ret_m = vsw_tx_msg(vswp, mp))
4807 								!= NULL) {
4808 						DERR(vswp, "%s: drop mblks to "
4809 							"phys dev", __func__);
4810 						freemsg(ret_m);
4811 					}
4812 
4813 				} else if (caller == VSW_PHYSDEV) {
4814 					/*
4815 					 * Pkt seen because card in promisc
4816 					 * mode. Send up stack if plumbed in
4817 					 * promisc mode, else drop it.
4818 					 */
4819 					READ_ENTER(&vswp->if_lockrw);
4820 					if (VSW_U_P(vswp->if_state)) {
4821 						RW_EXIT(&vswp->if_lockrw);
4822 						mac_rx(vswp->if_macp, mrh, mp);
4823 					} else {
4824 						RW_EXIT(&vswp->if_lockrw);
4825 						freemsg(mp);
4826 					}
4827 
4828 				} else if (caller == VSW_LOCALDEV) {
4829 					/*
4830 					 * Pkt came down the stack, send out
4831 					 * over physical device.
4832 					 */
4833 					if ((ret_m = vsw_tx_msg(vswp, mp))
4834 								!= NULL) {
4835 						DERR(vswp, "%s: drop mblks to "
4836 							"phys dev", __func__);
4837 						freemsg(ret_m);
4838 					}
4839 				}
4840 			}
4841 		}
4842 	}
4843 	D1(vswp, "%s: exit\n", __func__);
4844 }
4845 
4846 /*
4847  * Switch ethernet frame when in layer 3 mode (i.e. using IP
4848  * layer to do the routing).
4849  *
4850  * There is a large amount of overlap between this function and
4851  * vsw_switch_l2_frame. At some stage we need to revisit and refactor
4852  * both these functions.
4853  */
4854 void
4855 vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
4856 			vsw_port_t *arg, mac_resource_handle_t mrh)
4857 {
4858 	struct ether_header	*ehp;
4859 	vsw_port_t		*port = NULL;
4860 	mblk_t			*bp = NULL;
4861 	vsw_port_list_t		*plist = &vswp->plist;
4862 
4863 	D1(vswp, "%s: enter (caller %d)", __func__, caller);
4864 
4865 	/*
4866 	 * In layer 3 mode we should only ever be switching packets
4867 	 * between the IP layer and vnet devices, so make sure that's
4868 	 * who is invoking us.
4869 	 */
4870 	if ((caller != VSW_LOCALDEV) && (caller != VSW_VNETPORT)) {
4871 		DERR(vswp, "%s: unexpected caller (%d)", __func__, caller);
4872 		freemsgchain(mp);
4873 		return;
4874 	}
4875 
4876 	/* process the chain of packets */
4877 	bp = mp;
4878 	while (bp) {
4879 		mp = bp;
4880 		bp = bp->b_next;
4881 		mp->b_next = mp->b_prev = NULL;
4882 		ehp = (struct ether_header *)mp->b_rptr;
4883 
4884 		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
4885 			__func__, MBLKSIZE(mp), MBLKL(mp));
4886 
4887 		READ_ENTER(&plist->lockrw);
4888 		port = vsw_lookup_fdb(vswp, ehp);
4889 		if (port) {
4890 			/*
4891 			 * Mark port as in-use.
4892 			 */
4893 			mutex_enter(&port->ref_lock);
4894 			port->ref_cnt++;
4895 			mutex_exit(&port->ref_lock);
4896 			RW_EXIT(&plist->lockrw);
4897 
4898 			D2(vswp, "%s: sending to target port", __func__);
4899 			(void) vsw_portsend(port, mp);
4900 
4901 			/*
4902 			 * Finished with port so decrement ref count and
4903 			 * check if should wake delete thread.
4904 			 */
4905 			mutex_enter(&port->ref_lock);
4906 			port->ref_cnt--;
4907 			if (port->ref_cnt == 0)
4908 				cv_signal(&port->ref_cv);
4909 			mutex_exit(&port->ref_lock);
4910 		} else {
4911 			RW_EXIT(&plist->lockrw);
4912 			/*
4913 			 * Destination not in FDB
4914 			 *
4915 			 * If the destination is broadcast or
4916 			 * multicast forward the packet to all
4917 			 * (VNETPORTs, PHYSDEV, LOCALDEV),
4918 			 * except the caller.
4919 			 */
4920 			if (IS_BROADCAST(ehp)) {
4921 				D2(vswp, "%s: BROADCAST pkt", __func__);
4922 				(void) vsw_forward_all(vswp, mp,
4923 								caller, arg);
4924 			} else if (IS_MULTICAST(ehp)) {
4925 				D2(vswp, "%s: MULTICAST pkt", __func__);
4926 				(void) vsw_forward_grp(vswp, mp,
4927 							caller, arg);
4928 			} else {
4929 				/*
4930 				 * Unicast pkt from vnet that we don't have
4931 				 * an FDB entry for, so it must be destined for
4932 				 * the outside world. Attempt to send up to the
4933 				 * IP layer to allow it to deal with it.
4934 				 */
4935 				if (caller == VSW_VNETPORT) {
4936 					READ_ENTER(&vswp->if_lockrw);
4937 					if (vswp->if_state & VSW_IF_UP) {
4938 						RW_EXIT(&vswp->if_lockrw);
4939 						D2(vswp, "%s: sending up",
4940 							__func__);
4941 						mac_rx(vswp->if_macp, mrh, mp);
4942 					} else {
4943 						RW_EXIT(&vswp->if_lockrw);
4944 						/* Interface down, drop pkt */
4945 						D2(vswp, "%s I/F down",
4946 								__func__);
4947 						freemsg(mp);
4948 					}
4949 				}
4950 			}
4951 		}
4952 	}
4953 
4954 	D1(vswp, "%s: exit", __func__);
4955 }
4956 
4957 /*
4958  * Forward the ethernet frame to all ports (VNETPORTs, PHYSDEV, LOCALDEV),
4959  * except the caller (port on which frame arrived).
4960  */
4961 static int
4962 vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
4963 {
4964 	vsw_port_list_t	*plist = &vswp->plist;
4965 	vsw_port_t	*portp;
4966 	mblk_t		*nmp = NULL;
4967 	mblk_t		*ret_m = NULL;
4968 	int		skip_port = 0;
4969 
4970 	D1(vswp, "vsw_forward_all: enter\n");
4971 
4972 	/*
4973 	 * Broadcast message from inside ldoms, so send it to the
4974 	 * outside world if we are in either of the layer 2 modes.
4975 	 */
4976 	if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) ||
4977 		(vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) &&
4978 		((caller == VSW_LOCALDEV) || (caller == VSW_VNETPORT))) {
4979 
4980 		nmp = dupmsg(mp);
4981 		if (nmp) {
4982 			if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) {
4983 				DERR(vswp, "%s: dropping pkt(s) "
4984 				"consisting of %ld bytes of data for"
4985 				" physical device", __func__, MBLKL(ret_m));
4986 				freemsg(ret_m);
4987 			}
4988 		}
4989 	}
4990 
4991 	if (caller == VSW_VNETPORT)
4992 		skip_port = 1;
4993 
4994 	/*
4995 	 * Broadcast message from other vnet (layer 2 or 3) or outside
4996 	 * world (layer 2 only), send up stack if plumbed.
4997 	 */
4998 	if ((caller == VSW_PHYSDEV) || (caller == VSW_VNETPORT)) {
4999 		READ_ENTER(&vswp->if_lockrw);
5000 		if (vswp->if_state & VSW_IF_UP) {
5001 			RW_EXIT(&vswp->if_lockrw);
5002 			nmp = copymsg(mp);
5003 			if (nmp)
5004 				mac_rx(vswp->if_macp, vswp->if_mrh, nmp);
5005 		} else {
5006 			RW_EXIT(&vswp->if_lockrw);
5007 		}
5008 	}
5009 
5010 	/* send it to all VNETPORTs */
5011 	READ_ENTER(&plist->lockrw);
5012 	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
5013 		D2(vswp, "vsw_forward_all: port %d", portp->p_instance);
5014 		/*
5015 		 * Caution! Don't reorder these two checks, as arg
5016 		 * will be NULL if the caller is PHYSDEV. skip_port is
5017 		 * only set if caller is VNETPORT.
5018 		 */
5019 		if ((skip_port) && (portp == arg))
5020 			continue;
5021 		else {
5022 			nmp = dupmsg(mp);
5023 			if (nmp) {
5024 				(void) vsw_portsend(portp, nmp);
5025 			} else {
5026 				DERR(vswp, "vsw_forward_all: nmp NULL");
5027 			}
5028 		}
5029 	}
5030 	RW_EXIT(&plist->lockrw);
5031 
5032 	freemsg(mp);
5033 
5034 	D1(vswp, "vsw_forward_all: exit\n");
5035 	return (0);
5036 }
5037 
5038 /*
5039  * Forward pkts to any devices or interfaces which have registered
5040  * an interest in them (i.e. multicast groups).
5041  */
5042 static int
5043 vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
5044 {
5045 	struct ether_header	*ehp = (struct ether_header *)mp->b_rptr;
5046 	mfdb_ent_t		*entp = NULL;
5047 	mfdb_ent_t		*tpp = NULL;
5048 	vsw_port_t 		*port;
5049 	uint64_t		key = 0;
5050 	mblk_t			*nmp = NULL;
5051 	mblk_t			*ret_m = NULL;
5052 	boolean_t		check_if = B_TRUE;
5053 
5054 	/*
5055 	 * Convert address to hash table key
5056 	 */
5057 	KEY_HASH(key, ehp->ether_dhost);
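	/*
	 * Sketch (annotation): KEY_HASH is defined in the vsw headers,
	 * which are not shown here.  The assumed intent is simply to
	 * pack the six octets of the ethernet address into a single
	 * uint64_t key, roughly:
	 *
	 *	key = 0;
	 *	for (i = 0; i < ETHERADDRL; i++)
	 *		key = (key << 8) |
	 *		    ehp->ether_dhost.ether_addr_octet[i];
	 */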
5058 
5059 	D1(vswp, "%s: key 0x%llx", __func__, key);
5060 
5061 	/*
5062 	 * If pkt came from either a vnet or down the stack (if we are
5063 	 * plumbed) and we are in layer 2 mode, then we send the pkt out
5064 	 * over the physical adapter, and then check to see if any other
5065 	 * vnets are interested in it.
5066 	 */
5067 	if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) ||
5068 		(vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) &&
5069 		((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV))) {
5070 		nmp = dupmsg(mp);
5071 		if (nmp) {
5072 			if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) {
5073 				DERR(vswp, "%s: dropping pkt(s) "
5074 					"consisting of %ld bytes of "
5075 					"data for physical device",
5076 					__func__, MBLKL(ret_m));
5077 				freemsg(ret_m);
5078 			}
5079 		}
5080 	}
5081 
5082 	READ_ENTER(&vswp->mfdbrw);
5083 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)key,
5084 				(mod_hash_val_t *)&entp) != 0) {
5085 		D3(vswp, "%s: no table entry found for addr 0x%llx",
5086 								__func__, key);
5087 	} else {
5088 		/*
5089 		 * Send to list of devices associated with this address...
5090 		 */
5091 		for (tpp = entp; tpp != NULL; tpp = tpp->nextp) {
5092 
5093 			/* don't send to ourselves */
5094 			if ((caller == VSW_VNETPORT) &&
5095 				(tpp->d_addr == (void *)arg)) {
5096 				port = (vsw_port_t *)tpp->d_addr;
5097 				D3(vswp, "%s: not sending to ourselves"
5098 					" : port %d", __func__,
5099 					port->p_instance);
5100 				continue;
5101 
5102 			} else if ((caller == VSW_LOCALDEV) &&
5103 				(tpp->d_type == VSW_LOCALDEV)) {
5104 				D3(vswp, "%s: not sending back up stack",
5105 					__func__);
5106 				continue;
5107 			}
5108 
5109 			if (tpp->d_type == VSW_VNETPORT) {
5110 				port = (vsw_port_t *)tpp->d_addr;
5111 				D3(vswp, "%s: sending to port %ld for "
5112 					" addr 0x%llx", __func__,
5113 					port->p_instance, key);
5114 
5115 				nmp = dupmsg(mp);
5116 				if (nmp)
5117 					(void) vsw_portsend(port, nmp);
5118 			} else {
5119 				if (vswp->if_state & VSW_IF_UP) {
5120 					nmp = copymsg(mp);
5121 					if (nmp)
5122 						mac_rx(vswp->if_macp,
5123 							vswp->if_mrh, nmp);
5124 					check_if = B_FALSE;
5125 					D3(vswp, "%s: sending up stack"
5126 						" for addr 0x%llx", __func__,
5127 						key);
5128 				}
5129 			}
5130 		}
5131 	}
5132 
5133 	RW_EXIT(&vswp->mfdbrw);
5134 
5135 	/*
5136 	 * If the pkt came from either a vnet or from physical device,
5137 	 * and if we haven't already sent the pkt up the stack then we
5138 	 * check now if we can/should (i.e. the interface is plumbed
5139 	 * and in promisc mode).
5140 	 */
5141 	if ((check_if) &&
5142 		((caller == VSW_VNETPORT) || (caller == VSW_PHYSDEV))) {
5143 		READ_ENTER(&vswp->if_lockrw);
5144 		if (VSW_U_P(vswp->if_state)) {
5145 			RW_EXIT(&vswp->if_lockrw);
5146 			D3(vswp, "%s: (caller %d) finally sending up stack"
5147 				" for addr 0x%llx", __func__, caller, key);
5148 			nmp = copymsg(mp);
5149 			if (nmp)
5150 				mac_rx(vswp->if_macp, vswp->if_mrh, nmp);
5151 		} else {
5152 			RW_EXIT(&vswp->if_lockrw);
5153 		}
5154 	}
5155 
5156 	freemsg(mp);
5157 
5158 	D1(vswp, "%s: exit", __func__);
5159 
5160 	return (0);
5161 }
5162 
5163 /* transmit the packet over the given port */
5164 static int
5165 vsw_portsend(vsw_port_t *port, mblk_t *mp)
5166 {
5167 	vsw_ldc_list_t 	*ldcl = &port->p_ldclist;
5168 	vsw_ldc_t 	*ldcp;
5169 	int		status = 0;
5170 
5171 
5172 	READ_ENTER(&ldcl->lockrw);
5173 	/*
5174 	 * Note: for now we have a single channel.
5175 	 */
5176 	ldcp = ldcl->head;
5177 	if (ldcp == NULL) {
5178 		DERR(port->p_vswp, "vsw_portsend: no ldc: dropping packet\n");
5179 		freemsg(mp);
5180 		RW_EXIT(&ldcl->lockrw);
5181 		return (1);
5182 	}
5183 
5184 	/*
5185 	 * Send the message out using the appropriate
5186 	 * transmit function, which will free the mblk when it
5187 	 * is finished with it.
5188 	 */
5189 	mutex_enter(&port->tx_lock);
5190 	if (port->transmit != NULL)
5191 		status = (*port->transmit)(ldcp, mp);
5192 	else {
5193 		freemsg(mp);
5194 	}
5195 	mutex_exit(&port->tx_lock);
5196 
5197 	RW_EXIT(&ldcl->lockrw);
5198 
5199 	return (status);
5200 }
5201 
5202 /*
5203  * Send packet out via descriptor ring to a logical device.
5204  */
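/*
 * Annotation (not part of the original source): the transmit path below
 * is, in outline:
 *
 *	1. check that the lane is active and the channel is up;
 *	2. find a free private descriptor and copy the frame into it;
 *	3. mark it VIO_DESC_READY and mirror the relevant fields to the
 *	   public ring (vsw_dring_priv2pub());
 *	4. send a VIO_DRING_DATA message naming the descriptor range so
 *	   the peer knows to consume it.
 */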
5205 static int
5206 vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp)
5207 {
5208 	vio_dring_msg_t		dring_pkt;
5209 	dring_info_t		*dp = NULL;
5210 	vsw_private_desc_t	*priv_desc = NULL;
5211 	vsw_t			*vswp = ldcp->ldc_vswp;
5212 	mblk_t			*bp;
5213 	size_t			n, size;
5214 	caddr_t			bufp;
5215 	int			idx;
5216 	int			status = LDC_TX_SUCCESS;
5217 
5218 	D1(vswp, "%s(%lld): enter\n", __func__, ldcp->ldc_id);
5219 
5220 	/* TODO: make test a macro */
5221 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
5222 		(ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
5223 		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
5224 			"packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
5225 			ldcp->lane_out.lstate);
5226 		freemsg(mp);
5227 		return (LDC_TX_FAILURE);
5228 	}
5229 
5230 	/*
5231 	 * Note - using first ring only, this may change
5232 	 * in the future.
5233 	 */
5234 	if ((dp = ldcp->lane_out.dringp) == NULL) {
5235 		DERR(vswp, "%s(%lld): no dring for outbound lane on"
5236 			" channel %d", __func__, ldcp->ldc_id, ldcp->ldc_id);
5237 		freemsg(mp);
5238 		return (LDC_TX_FAILURE);
5239 	}
5240 
5241 	mutex_enter(&dp->dlock);
5242 
5243 	size = msgsize(mp);
5244 	if (size > (size_t)ETHERMAX) {
5245 		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
5246 		    ldcp->ldc_id, size);
5247 		status = LDC_TX_FAILURE;
5248 		goto vsw_dringsend_free_exit;
5249 	}
5250 
5251 	/*
5252 	 * Find a free descriptor
5253 	 *
5254 	 * Note: for the moment we are assuming that we will only
5255 	 * have one dring going from the switch to each of its
5256 	 * peers. This may change in the future.
5257 	 */
5258 	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
5259 		DERR(vswp, "%s(%lld): no descriptor available for ring "
5260 			"at 0x%llx", __func__, ldcp->ldc_id, dp);
5261 
5262 		/* nothing more we can do */
5263 		status = LDC_TX_NORESOURCES;
5264 		goto vsw_dringsend_free_exit;
5265 	} else {
5266 		D2(vswp, "%s(%lld): free private descriptor found at pos "
5267 			"%ld addr 0x%llx\n", __func__, ldcp->ldc_id, idx,
5268 			priv_desc);
5269 	}
5270 
5271 	/* copy data into the descriptor */
5272 	bufp = priv_desc->datap;
5273 	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
5274 		n = MBLKL(bp);
5275 		bcopy(bp->b_rptr, bufp, n);
5276 		bufp += n;
5277 	}
5278 
5279 	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
5280 	priv_desc->dstate = VIO_DESC_READY;
5281 
5282 	/*
5283 	 * Copy relevant sections of private descriptor
5284 	 * to public section
5285 	 */
5286 	vsw_dring_priv2pub(priv_desc);
5287 
5288 	/*
5289 	 * Send a vio_dring_msg to peer to prompt them to read
5290 	 * the updated descriptor ring.
5291 	 */
5292 	dring_pkt.tag.vio_msgtype = VIO_TYPE_DATA;
5293 	dring_pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
5294 	dring_pkt.tag.vio_subtype_env = VIO_DRING_DATA;
5295 	dring_pkt.tag.vio_sid = ldcp->local_session;
5296 
5297 	/* Note - for now using first ring */
5298 	dring_pkt.dring_ident = dp->ident;
5299 
5300 	/*
5301 	 * Access to the seq_num is implicitly protected by the
5302 	 * fact that we have only one dring associated with the
5303 	 * lane currently and we hold the associated dring lock.
5304 	 */
5305 	dring_pkt.seq_num = ldcp->lane_out.seq_num++;
5306 
5307 	/* Note - only updating single descrip at time at the moment */
5308 	dring_pkt.start_idx = idx;
5309 	dring_pkt.end_idx = idx;
5310 
5311 	D3(vswp, "%s(%lld): dring 0x%llx : ident 0x%llx\n", __func__,
5312 		ldcp->ldc_id, dp, dring_pkt.dring_ident);
5313 	D3(vswp, "%s(%lld): start %lld : end %lld : seq %lld\n", __func__,
5314 		ldcp->ldc_id, dring_pkt.start_idx, dring_pkt.end_idx,
5315 		dring_pkt.seq_num);
5316 
5317 	vsw_send_msg(ldcp, (void *)&dring_pkt, sizeof (vio_dring_msg_t));
5318 
5319 vsw_dringsend_free_exit:
5320 
5321 	mutex_exit(&dp->dlock);
5322 
5323 	/* free the message block */
5324 	freemsg(mp);
5325 
5326 	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
5327 	return (status);
5328 }
5329 
5330 /*
5331  * Send an in-band descriptor message over ldc.
5332  */
5333 static int
5334 vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp)
5335 {
5336 	vsw_t			*vswp = ldcp->ldc_vswp;
5337 	vio_ibnd_desc_t		ibnd_msg;
5338 	vsw_private_desc_t	*priv_desc = NULL;
5339 	dring_info_t		*dp = NULL;
5340 	size_t			n, size = 0;
5341 	caddr_t			bufp;
5342 	mblk_t			*bp;
5343 	int			idx, i;
5344 	int			status = LDC_TX_SUCCESS;
5345 	static int		warn_msg = 1;
5346 
5347 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
5348 
5349 	ASSERT(mp != NULL);
5350 
5351 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
5352 		(ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
5353 		DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt",
5354 			__func__, ldcp->ldc_id, ldcp->ldc_status,
5355 			ldcp->lane_out.lstate);
5356 		freemsg(mp);
5357 		return (LDC_TX_FAILURE);
5358 	}
5359 
5360 	/*
5361 	 * We only expect a single dring to exist, which we use
5362 	 * as an internal buffer, rather than a transfer channel.
5363 	 */
5364 	if ((dp = ldcp->lane_out.dringp) == NULL) {
5365 		DERR(vswp, "%s(%lld): no dring for outbound lane",
5366 			__func__, ldcp->ldc_id);
5367 		DERR(vswp, "%s(%lld) status(%d) state (0x%llx)",
5368 			__func__, ldcp->ldc_id, ldcp->ldc_status,
5369 			ldcp->lane_out.lstate);
5370 		freemsg(mp);
5371 		return (LDC_TX_FAILURE);
5372 	}
5373 
5374 	mutex_enter(&dp->dlock);
5375 
5376 	size = msgsize(mp);
5377 	if (size > (size_t)ETHERMAX) {
5378 		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
5379 		    ldcp->ldc_id, size);
5380 		status = LDC_TX_FAILURE;
5381 		goto vsw_descrsend_free_exit;
5382 	}
5383 
5384 	/*
5385 	 * Find a free descriptor in our buffer ring
5386 	 */
5387 	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
5388 		if (warn_msg) {
5389 			DERR(vswp, "%s(%lld): no descriptor available for ring "
5390 			"at 0x%llx", __func__, ldcp->ldc_id, dp);
5391 			warn_msg = 0;
5392 		}
5393 
5394 		/* nothing more we can do */
5395 		status = LDC_TX_NORESOURCES;
5396 		goto vsw_descrsend_free_exit;
5397 	} else {
5398 		D2(vswp, "%s(%lld): free private descriptor found at pos "
5399 			"%ld addr 0x%llx\n", __func__, ldcp->ldc_id, idx,
5400 			priv_desc);
5401 		warn_msg = 1;
5402 	}
5403 
5404 	/* copy data into the descriptor */
5405 	bufp = priv_desc->datap;
5406 	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
5407 		n = MBLKL(bp);
5408 		bcopy(bp->b_rptr, bufp, n);
5409 		bufp += n;
5410 	}
5411 
5412 	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
5413 	priv_desc->dstate = VIO_DESC_READY;
5414 
5415 	/* create and send the in-band descp msg */
5416 	ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA;
5417 	ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO;
5418 	ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA;
5419 	ibnd_msg.hdr.tag.vio_sid = ldcp->local_session;
5420 
5421 	/*
5422 	 * Access to the seq_num is implicitly protected by the
5423 	 * fact that we have only one dring associated with the
5424 	 * lane currently and we hold the associated dring lock.
5425 	 */
5426 	ibnd_msg.hdr.seq_num = ldcp->lane_out.seq_num++;
5427 
5428 	/*
5429 	 * Copy the mem cookies describing the data from the
5430 	 * private region of the descriptor ring into the inband
5431 	 * descriptor.
5432 	 */
5433 	for (i = 0; i < priv_desc->ncookies; i++) {
5434 		bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i],
5435 			sizeof (ldc_mem_cookie_t));
5436 	}
5437 
5438 	ibnd_msg.hdr.desc_handle = idx;
5439 	ibnd_msg.ncookies = priv_desc->ncookies;
5440 	ibnd_msg.nbytes = size;
5441 
5442 	vsw_send_msg(ldcp, (void *)&ibnd_msg, sizeof (vio_ibnd_desc_t));
5443 
5444 vsw_descrsend_free_exit:
5445 
5446 	mutex_exit(&dp->dlock);
5447 
5448 	/* free the allocated message blocks */
5449 	freemsg(mp);
5450 
5451 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
5452 	return (status);
5453 }
5454 
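/*
 * Annotation (not part of the original source): the vsw_send_*()
 * routines below implement the outbound half of the handshake.  The
 * exchange proceeds version -> attributes -> dring registration -> RDX;
 * each send records a *_INFO_SENT flag in the outbound lane state so
 * that vsw_next_milestone() can advance the handshake as the matching
 * ACKs arrive.
 */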
5455 static void
5456 vsw_send_ver(vsw_ldc_t *ldcp)
5457 {
5458 	vsw_t		*vswp = ldcp->ldc_vswp;
5459 	lane_t		*lp = &ldcp->lane_out;
5460 	vio_ver_msg_t	ver_msg;
5461 
5462 	D1(vswp, "%s enter", __func__);
5463 
5464 	ver_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
5465 	ver_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
5466 	ver_msg.tag.vio_subtype_env = VIO_VER_INFO;
5467 	ver_msg.tag.vio_sid = ldcp->local_session;
5468 
5469 	ver_msg.ver_major = vsw_versions[0].ver_major;
5470 	ver_msg.ver_minor = vsw_versions[0].ver_minor;
5471 	ver_msg.dev_class = VDEV_NETWORK_SWITCH;
5472 
5473 	lp->lstate |= VSW_VER_INFO_SENT;
5474 	lp->ver_major = ver_msg.ver_major;
5475 	lp->ver_minor = ver_msg.ver_minor;
5476 
5477 	DUMP_TAG(ver_msg.tag);
5478 
5479 	vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t));
5480 
5481 	D1(vswp, "%s (%d): exit", __func__, ldcp->ldc_id);
5482 }
5483 
5484 static void
5485 vsw_send_attr(vsw_ldc_t *ldcp)
5486 {
5487 	vsw_t			*vswp = ldcp->ldc_vswp;
5488 	lane_t			*lp = &ldcp->lane_out;
5489 	vnet_attr_msg_t		attr_msg;
5490 
5491 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
5492 
5493 	/*
5494 	 * Subtype is set to INFO by default
5495 	 */
5496 	attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
5497 	attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
5498 	attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO;
5499 	attr_msg.tag.vio_sid = ldcp->local_session;
5500 
5501 	/* payload copied from default settings for lane */
5502 	attr_msg.mtu = lp->mtu;
5503 	attr_msg.addr_type = lp->addr_type;
5504 	attr_msg.xfer_mode = lp->xfer_mode;
5505 	attr_msg.ack_freq = lp->ack_freq;
5506 
5507 	READ_ENTER(&vswp->if_lockrw);
5508 	bcopy(&(vswp->if_addr), &(attr_msg.addr), ETHERADDRL);
5509 	RW_EXIT(&vswp->if_lockrw);
5510 
5511 	ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT;
5512 
5513 	DUMP_TAG(attr_msg.tag);
5514 
5515 	vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t));
5516 
5517 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
5518 }
5519 
5520 /*
5521  * Create dring info msg (which also results in the creation of
5522  * a dring).
5523  */
5524 static vio_dring_reg_msg_t *
5525 vsw_create_dring_info_pkt(vsw_ldc_t *ldcp)
5526 {
5527 	vio_dring_reg_msg_t	*mp;
5528 	dring_info_t		*dp;
5529 	vsw_t			*vswp = ldcp->ldc_vswp;
5530 
5531 	D1(vswp, "vsw_create_dring_info_pkt enter\n");
5532 
5533 	/*
5534 	 * If we can't create a dring, obviously no point sending
5535 	 * a message.
5536 	 */
5537 	if ((dp = vsw_create_dring(ldcp)) == NULL)
5538 		return (NULL);
5539 
5540 	mp = kmem_zalloc(sizeof (vio_dring_reg_msg_t), KM_SLEEP);
5541 
5542 	mp->tag.vio_msgtype = VIO_TYPE_CTRL;
5543 	mp->tag.vio_subtype = VIO_SUBTYPE_INFO;
5544 	mp->tag.vio_subtype_env = VIO_DRING_REG;
5545 	mp->tag.vio_sid = ldcp->local_session;
5546 
5547 	/* payload */
5548 	mp->num_descriptors = dp->num_descriptors;
5549 	mp->descriptor_size = dp->descriptor_size;
5550 	mp->options = dp->options;
5551 	mp->ncookies = dp->ncookies;
5552 	bcopy(&dp->cookie[0], &mp->cookie[0], sizeof (ldc_mem_cookie_t));
5553 
5554 	mp->dring_ident = 0;
5555 
5556 	D1(vswp, "vsw_create_dring_info_pkt exit\n");
5557 
5558 	return (mp);
5559 }
5560 
5561 static void
5562 vsw_send_dring_info(vsw_ldc_t *ldcp)
5563 {
5564 	vio_dring_reg_msg_t	*dring_msg;
5565 	vsw_t			*vswp = ldcp->ldc_vswp;
5566 
5567 	D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id);
5568 
5569 	dring_msg = vsw_create_dring_info_pkt(ldcp);
5570 	if (dring_msg == NULL) {
5571 		cmn_err(CE_WARN, "vsw_send_dring_info: error creating msg");
5572 		return;
5573 	}
5574 
5575 	ldcp->lane_out.lstate |= VSW_DRING_INFO_SENT;
5576 
5577 	DUMP_TAG_PTR((vio_msg_tag_t *)dring_msg);
5578 
5579 	vsw_send_msg(ldcp, dring_msg,
5580 		sizeof (vio_dring_reg_msg_t));
5581 
5582 	kmem_free(dring_msg, sizeof (vio_dring_reg_msg_t));
5583 
5584 	D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id);
5585 }
5586 
5587 static void
5588 vsw_send_rdx(vsw_ldc_t *ldcp)
5589 {
5590 	vsw_t		*vswp = ldcp->ldc_vswp;
5591 	vio_rdx_msg_t	rdx_msg;
5592 
5593 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
5594 
5595 	rdx_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
5596 	rdx_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
5597 	rdx_msg.tag.vio_subtype_env = VIO_RDX;
5598 	rdx_msg.tag.vio_sid = ldcp->local_session;
5599 
5600 	ldcp->lane_out.lstate |= VSW_RDX_INFO_SENT;
5601 
5602 	DUMP_TAG(rdx_msg.tag);
5603 
5604 	vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t));
5605 
5606 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
5607 }
5608 
5609 /*
5610  * Generic routine to send message out over ldc channel.
5611  */
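/*
 * Annotation (not part of the original source): as used below,
 * ldc_write() takes the buffer length by reference and passes back the
 * number of bytes actually written through the same pointer, which is
 * why msglen is reset from size on each retry.  EWOULDBLOCK indicates
 * the channel queue was full and the write may be retried.
 */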
5612 static void
5613 vsw_send_msg(vsw_ldc_t *ldcp, void *msgp, int size)
5614 {
5615 	int		rv, retries = vsw_wretries; /* per-call retry budget */
5616 	size_t		msglen = size;
5617 	vio_msg_tag_t	*tag = (vio_msg_tag_t *)msgp;
5618 	vsw_t		*vswp = ldcp->ldc_vswp;
5619 
5620 	D1(vswp, "vsw_send_msg (%lld) enter : sending %d bytes",
5621 			ldcp->ldc_id, size);
5622 
5623 	D2(vswp, "send_msg: type 0x%llx", tag->vio_msgtype);
5624 	D2(vswp, "send_msg: stype 0x%llx", tag->vio_subtype);
5625 	D2(vswp, "send_msg: senv 0x%llx", tag->vio_subtype_env);
5626 
5627 	mutex_enter(&ldcp->ldc_txlock);
5628 	do {
5629 		msglen = size;
5630 		rv = ldc_write(ldcp->ldc_handle, (caddr_t)msgp, &msglen);
5631 	} while (rv == EWOULDBLOCK && --retries > 0);
5632 
5633 	mutex_exit(&ldcp->ldc_txlock);
5634 
5635 	if ((rv != 0) || (msglen != size)) {
5636 		DERR(vswp, "vsw_send_msg:ldc_write failed: chan(%lld) "
5637 			"rv(%d) size (%d) msglen(%d)\n", ldcp->ldc_id,
5638 			rv, size, msglen);
5639 	}
5640 
5641 	D1(vswp, "vsw_send_msg (%lld) exit : sent %d bytes",
5642 			ldcp->ldc_id, msglen);
5643 }
5644 
5645 /*
5646  * Add an entry into FDB, for the given mac address and port_id.
5647  * Returns 0 on success, 1 on failure.
5648  *
5649  * Lock protecting FDB must be held by calling process.
5650  */
5651 static int
5652 vsw_add_fdb(vsw_t *vswp, vsw_port_t *port)
5653 {
5654 	uint64_t	addr = 0;
5655 
5656 	D1(vswp, "%s: enter", __func__);
5657 
5658 	KEY_HASH(addr, port->p_macaddr);
5659 
5660 	D2(vswp, "%s: key = 0x%llx", __func__, addr);
5661 
5662 	/*
5663 	 * Note: duplicate keys will be rejected by mod_hash.
5664 	 */
5665 	if (mod_hash_insert(vswp->fdb, (mod_hash_key_t)addr,
5666 				(mod_hash_val_t)port) != 0) {
5667 		DERR(vswp, "%s: unable to add entry into fdb.", __func__);
5668 		return (1);
5669 	}
5670 
5671 	D1(vswp, "%s: exit", __func__);
5672 	return (0);
5673 }
5674 
5675 /*
5676  * Remove an entry from FDB.
5677  * Returns 0 on success, 1 on failure.
5678  */
5679 static int
5680 vsw_del_fdb(vsw_t *vswp, vsw_port_t *port)
5681 {
5682 	uint64_t	addr = 0;
5683 
5684 	D1(vswp, "%s: enter", __func__);
5685 
5686 	KEY_HASH(addr, port->p_macaddr);
5687 
5688 	D2(vswp, "%s: key = 0x%llx", __func__, addr);
5689 
5690 	(void) mod_hash_destroy(vswp->fdb, (mod_hash_key_t)addr);
5691 
5692 	D1(vswp, "%s: exit", __func__);
5693 
5694 	return (0);
5695 }
5696 
5697 /*
5698  * Search fdb for a given mac address.
5699  * Returns pointer to the entry if found, else returns NULL.
5700  */
5701 static vsw_port_t *
5702 vsw_lookup_fdb(vsw_t *vswp, struct ether_header *ehp)
5703 {
5704 	uint64_t	key = 0;
5705 	vsw_port_t	*port = NULL;
5706 
5707 	D1(vswp, "%s: enter", __func__);
5708 
5709 	KEY_HASH(key, ehp->ether_dhost);
5710 
5711 	D2(vswp, "%s: key = 0x%llx", __func__, key);
5712 
5713 	if (mod_hash_find(vswp->fdb, (mod_hash_key_t)key,
5714 				(mod_hash_val_t *)&port) != 0) {
5715 		return (NULL);
5716 	}
5717 
5718 	D1(vswp, "%s: exit", __func__);
5719 
5720 	return (port);
5721 }
5722 
5723 /*
5724  * Add or remove multicast address(es).
5725  *
5726  * Returns 0 on success, 1 on failure.
5727  */
5728 static int
5729 vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port)
5730 {
5731 	mcst_addr_t		*mcst_p = NULL;
5732 	vsw_t			*vswp = port->p_vswp;
5733 	uint64_t		addr = 0x0;
5734 	int			i;
5735 
5736 	D1(vswp, "%s: enter", __func__);
5737 
5738 	D2(vswp, "%s: %d addresses", __func__, mcst_pkt->count);
5739 
5740 	for (i = 0; i < mcst_pkt->count; i++) {
5741 		/*
5742 		 * Convert address into form that can be used
5743 		 * as hash table key.
5744 		 */
5745 		KEY_HASH(addr, mcst_pkt->mca[i]);
5746 
5747 		/*
5748 		 * Add or delete the specified address/port combination.
5749 		 */
5750 		if (mcst_pkt->set == 0x1) {
5751 			D3(vswp, "%s: adding multicast address 0x%llx for "
5752 				"port %ld", __func__, addr, port->p_instance);
5753 			if (vsw_add_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
5754 				/*
5755 				 * Update the list of multicast
5756 				 * addresses contained within the
5757 				 * port structure to include this new
5758 				 * one.
5759 				 */
5760 				mcst_p = kmem_alloc(sizeof (mcst_addr_t),
5761 								KM_NOSLEEP);
5762 				if (mcst_p == NULL) {
5763 					DERR(vswp, "%s: unable to alloc mem",
5764 						__func__);
					/* undo the hash insertion above */
					(void) vsw_del_mcst(vswp,
						VSW_VNETPORT, addr, port);
5765 					return (1);
5766 				}
5767 
5768 				mcst_p->nextp = NULL;
5769 				mcst_p->addr = addr;
5770 
5771 				mutex_enter(&port->mca_lock);
5772 				mcst_p->nextp = port->mcap;
5773 				port->mcap = mcst_p;
5774 				mutex_exit(&port->mca_lock);
5775 
5776 				/*
5777 				 * Program the address into HW. If the addr
5778 				 * has already been programmed then the MAC
5779 				 * just increments a ref counter (which is
5780 				 * used when the address is being deleted).
5781 				 *
5782 				 * Note:
5783 				 * For the moment we don't care if this
5784 				 * succeeds because the card must be in
5785 				 * promisc mode. When we have the ability
5786 				 * to program multiple unicast addresses
5787 				 * into the card then we will need to check
5788 				 * this return value.
5789 				 */
5790 				if (vswp->mh != NULL)
5791 					(void) mac_multicst_add(vswp->mh,
5792 						(uchar_t *)&mcst_pkt->mca[i]);
5793 
5794 			} else {
5795 				DERR(vswp, "%s: error adding multicast "
5796 					"address 0x%llx for port %ld",
5797 					__func__, addr, port->p_instance);
5798 				return (1);
5799 			}
5800 		} else {
5801 			/*
5802 			 * Delete an entry from the multicast hash
5803 			 * table and update the address list
5804 			 * appropriately.
5805 			 */
5806 			if (vsw_del_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
5807 				D3(vswp, "%s: deleting multicast address "
5808 					"0x%llx for port %ld", __func__, addr,
5809 					port->p_instance);
5810 
5811 				vsw_del_addr(VSW_VNETPORT, port, addr);
5812 
5813 				/*
5814 				 * Remove the address from HW. The address
5815 				 * will actually only be removed once the ref
5816 				 * count within the MAC layer has dropped to
5817 				 * zero. I.e. we can safely call this fn even
5818 				 * if other ports are interested in this
5819 				 * address.
5820 				 */
5821 				if (vswp->mh != NULL)
5822 					(void) mac_multicst_remove(vswp->mh,
5823 						(uchar_t *)&mcst_pkt->mca[i]);
5824 
5825 			} else {
5826 				DERR(vswp, "%s: error deleting multicast "
5827 					"addr 0x%llx for port %ld",
5828 					__func__, addr, port->p_instance);
5829 				return (1);
5830 			}
5831 		}
5832 	}
5833 	D1(vswp, "%s: exit", __func__);
5834 	return (0);
5835 }
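
/*
 * Illustrative note, not part of the original driver: the fields of
 * the multicast request consumed above, as assumed from their use
 * here (see sys/vnet_mailbox.h for the authoritative definition):
 *
 *	set   - 0x1 to add the addresses, otherwise remove them
 *	count - number of valid entries in mca[]
 *	mca[] - the ethernet multicast addresses themselves
 */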
5836 
5837 /*
5838  * Add a new multicast entry.
5839  *
5840  * Search hash table based on address. If match found then
5841  * update associated val (which is chain of ports), otherwise
5842  * create new key/val (addr/port) pair and insert into table.
5843  */
5844 static int
5845 vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
5846 {
5847 	int		dup = 0;
5848 	int		rv = 0;
5849 	mfdb_ent_t	*ment = NULL;
5850 	mfdb_ent_t	*tmp_ent = NULL;
5851 	mfdb_ent_t	*new_ent = NULL;
5852 	void		*tgt = NULL;
5853 
5854 	if (devtype == VSW_VNETPORT) {
5855 		/*
5856 		 * Being invoked from a vnet.
5857 		 */
5858 		ASSERT(arg != NULL);
5859 		tgt = arg;
5860 		D2(NULL, "%s: port %d : address 0x%llx", __func__,
5861 			((vsw_port_t *)arg)->p_instance, addr);
5862 	} else {
5863 		/*
5864 		 * We are being invoked via the m_multicst mac entry
5865 		 * point.
5866 		 */
5867 		D2(NULL, "%s: address 0x%llx", __func__, addr);
5868 		tgt = (void *)vswp;
5869 	}
5870 
5871 	WRITE_ENTER(&vswp->mfdbrw);
5872 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
5873 				(mod_hash_val_t *)&ment) != 0) {
5874 
5875 		/* address not currently in table */
5876 		ment = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
5877 		ment->d_addr = (void *)tgt;
5878 		ment->d_type = devtype;
5879 		ment->nextp = NULL;
5880 
5881 		if (mod_hash_insert(vswp->mfdb, (mod_hash_key_t)addr,
5882 			(mod_hash_val_t)ment) != 0) {
5883 			DERR(vswp, "%s: hash table insertion failed", __func__);
5884 			kmem_free(ment, sizeof (mfdb_ent_t));
5885 			rv = 1;
5886 		} else {
5887 			D2(vswp, "%s: added initial entry for 0x%llx to "
5888 				"table", __func__, addr);
5889 		}
5890 	} else {
5891 		/*
5892 		 * Address in table. Check to see if specified port
5893 		 * is already associated with the address. If not add
5894 		 * it now.
5895 		 */
5896 		tmp_ent = ment;
5897 		while (tmp_ent != NULL) {
5898 			if (tmp_ent->d_addr == (void *)tgt) {
5899 				if (devtype == VSW_VNETPORT) {
5900 					DERR(vswp, "%s: duplicate port entry "
5901 						"found for portid %ld and key "
5902 						"0x%llx", __func__,
5903 						((vsw_port_t *)arg)->p_instance,
5904 						addr);
5905 				} else {
5906 					DERR(vswp, "%s: duplicate entry found "
5907 						"for key 0x%llx",
5908 						__func__, addr);
5909 				}
5910 				rv = 1;
5911 				dup = 1;
5912 				break;
5913 			}
5914 			tmp_ent = tmp_ent->nextp;
5915 		}
5916 
5917 		/*
5918 		 * Port not on list so add it to end now.
5919 		 */
5920 		if (dup == 0) {
5921 			D2(vswp, "%s: added entry for 0x%llx to table",
5922 				__func__, addr);
5923 			new_ent = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
5924 			new_ent->d_addr = (void *)tgt;
5925 			new_ent->d_type = devtype;
5926 			new_ent->nextp = NULL;
5927 
5928 			tmp_ent = ment;
5929 			while (tmp_ent->nextp != NULL)
5930 				tmp_ent = tmp_ent->nextp;
5931 
5932 			tmp_ent->nextp = new_ent;
5933 		}
5934 	}
5935 
5936 	RW_EXIT(&vswp->mfdbrw);
5937 	return (rv);
5938 }
5939 
5940 /*
5941  * Remove a multicast entry from the hashtable.
5942  *
5943  * Search hash table based on address. If match found, scan
5944  * list of ports associated with address. If specified port
5945  * found remove it from list.
5946  */
5947 static int
5948 vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
5949 {
5950 	mfdb_ent_t	*ment = NULL;
5951 	mfdb_ent_t	*curr_p, *prev_p;
5952 	void		*tgt = NULL;
5953 
5954 	D1(vswp, "%s: enter", __func__);
5955 
5956 	if (devtype == VSW_VNETPORT) {
5957 		tgt = (vsw_port_t *)arg;
5958 		D2(vswp, "%s: removing port %d from mFDB for address"
5959 			" 0x%llx", __func__, ((vsw_port_t *)tgt)->p_instance,
5960 			addr);
5961 	} else {
5962 		D2(vswp, "%s: removing entry", __func__);
5963 		tgt = (void *)vswp;
5964 	}
5965 
5966 	WRITE_ENTER(&vswp->mfdbrw);
5967 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
5968 				(mod_hash_val_t *)&ment) != 0) {
5969 		D2(vswp, "%s: address 0x%llx not in table", __func__, addr);
5970 		RW_EXIT(&vswp->mfdbrw);
5971 		return (1);
5972 	}
5973 
5974 	prev_p = curr_p = ment;
5975 
5976 	while (curr_p != NULL) {
5977 		if (curr_p->d_addr == (void *)tgt) {
5978 			if (devtype == VSW_VNETPORT) {
5979 				D2(vswp, "%s: port %d found", __func__,
5980 					((vsw_port_t *)tgt)->p_instance);
5981 			} else {
5982 				D2(vswp, "%s: instance found", __func__);
5983 			}
5984 
5985 			if (prev_p == curr_p) {
5986 				/*
5987 				 * head of list, if no other element is in
5988 				 * list then destroy this entry, otherwise
5989 				 * just replace it with updated value.
5990 				 */
5991 				ment = curr_p->nextp;
5992 				kmem_free(curr_p, sizeof (mfdb_ent_t));
5993 				if (ment == NULL) {
5994 					(void) mod_hash_destroy(vswp->mfdb,
5995 							(mod_hash_key_t)addr);
5996 				} else {
5997 					(void) mod_hash_replace(vswp->mfdb,
5998 							(mod_hash_key_t)addr,
5999 							(mod_hash_val_t)ment);
6000 				}
6001 			} else {
6002 				/*
6003 				 * Not head of list, no need to do
6004 				 * replacement, just adjust list pointers.
6005 				 */
6006 				prev_p->nextp = curr_p->nextp;
6007 				kmem_free(curr_p, sizeof (mfdb_ent_t));
6008 			}
6009 			break;
6010 		}
6011 
6012 		prev_p = curr_p;
6013 		curr_p = curr_p->nextp;
6014 	}
6015 
6016 	RW_EXIT(&vswp->mfdbrw);
6017 
6018 	D1(vswp, "%s: exit", __func__);
6019 
6020 	return (0);
6021 }
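
/*
 * Illustrative sketch, not part of the original driver: the multicast
 * FDB maps a packed address key onto a singly-linked chain of
 * interested devices, e.g.
 *
 *	mfdb[key] -> { port A } -> { port B } -> { vswp } -> NULL
 *
 * vsw_add_mcst() appends to the chain; vsw_del_mcst() unlinks an
 * element, replacing the stored hash value when the head changes and
 * destroying the key once the chain is empty.
 */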
6022 
6023 /*
6024  * Port is being deleted, but has registered an interest in one
6025  * or more multicast groups. Using the list of addresses maintained
6026  * within the port structure find the appropriate entry in the hash
6027  * table and remove this port from the list of interested ports.
6028  */
6029 static void
6030 vsw_del_mcst_port(vsw_port_t *port)
6031 {
6032 	mcst_addr_t	*mcst_p = NULL;
6033 	vsw_t		*vswp = port->p_vswp;
6034 
6035 	D1(vswp, "%s: enter", __func__);
6036 
6037 	mutex_enter(&port->mca_lock);
6038 	while (port->mcap != NULL) {
6039 		(void) vsw_del_mcst(vswp, VSW_VNETPORT,
6040 					port->mcap->addr, port);
6041 
6042 		mcst_p = port->mcap->nextp;
6043 		kmem_free(port->mcap, sizeof (mcst_addr_t));
6044 		port->mcap = mcst_p;
6045 	}
6046 	mutex_exit(&port->mca_lock);
6047 
6048 	D1(vswp, "%s: exit", __func__);
6049 }
6050 
6051 /*
6052  * This vsw instance is detaching, but has registered an interest in one
6053  * or more multicast groups. Using the list of addresses maintained
6054  * within the vsw structure find the appropriate entry in the hash
6055  * table and remove this instance from the list of interested ports.
6056  */
6057 static void
6058 vsw_del_mcst_vsw(vsw_t *vswp)
6059 {
6060 	mcst_addr_t	*next_p = NULL;
6061 
6062 	D1(vswp, "%s: enter", __func__);
6063 
6064 	mutex_enter(&vswp->mca_lock);
6065 
6066 	while (vswp->mcap != NULL) {
6067 		DERR(vswp, "%s: deleting addr 0x%llx",
6068 			__func__, vswp->mcap->addr);
6069 		(void) vsw_del_mcst(vswp, VSW_LOCALDEV,
6070 				vswp->mcap->addr, NULL);
6071 
6072 		next_p = vswp->mcap->nextp;
6073 		kmem_free(vswp->mcap, sizeof (mcst_addr_t));
6074 		vswp->mcap = next_p;
6075 	}
6076 
6078 	mutex_exit(&vswp->mca_lock);
6079 
6080 	D1(vswp, "%s: exit", __func__);
6081 }
6082 
6083 
6084 /*
6085  * Remove the specified address from the list of addresses
6086  * maintained by the given port or vsw instance.
6087  */
6088 static void
6089 vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr)
6090 {
6091 	vsw_t		*vswp = NULL;
6092 	vsw_port_t	*port = NULL;
6093 	mcst_addr_t	*prev_p = NULL;
6094 	mcst_addr_t	*curr_p = NULL;
6095 
6096 	D1(NULL, "%s: enter : devtype %d : addr 0x%llx",
6097 		__func__, devtype, addr);
6098 
6099 	if (devtype == VSW_VNETPORT) {
6100 		port = (vsw_port_t *)arg;
6101 		mutex_enter(&port->mca_lock);
6102 		prev_p = curr_p = port->mcap;
6103 	} else {
6104 		vswp = (vsw_t *)arg;
6105 		mutex_enter(&vswp->mca_lock);
6106 		prev_p = curr_p = vswp->mcap;
6107 	}
6108 
6109 	while (curr_p != NULL) {
6110 		if (curr_p->addr == addr) {
6111 			D2(NULL, "%s: address found", __func__);
6112 			/* match found */
6113 			if (prev_p == curr_p) {
6114 				/* list head */
6115 				if (devtype == VSW_VNETPORT)
6116 					port->mcap = curr_p->nextp;
6117 				else
6118 					vswp->mcap = curr_p->nextp;
6119 			} else {
6120 				prev_p->nextp = curr_p->nextp;
6121 			}
6122 			kmem_free(curr_p, sizeof (mcst_addr_t));
6123 			break;
6124 		} else {
6125 			prev_p = curr_p;
6126 			curr_p = curr_p->nextp;
6127 		}
6128 	}
6129 
6130 	if (devtype == VSW_VNETPORT)
6131 		mutex_exit(&port->mca_lock);
6132 	else
6133 		mutex_exit(&vswp->mca_lock);
6134 
6135 	D1(NULL, "%s: exit", __func__);
6136 }
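
/*
 * Illustrative note, not part of the original driver: both list heads
 * handled above are chains of mcst_addr_t, each protected by its own
 * mca_lock:
 *
 *	port->mcap -> { addr, nextp } -> ... -> NULL	(per port)
 *	vswp->mcap -> { addr, nextp } -> ... -> NULL	(per instance)
 */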
6137 
6138 /*
6139  * Creates a descriptor ring (dring) and links it into the
6140  * list of outbound drings for this channel.
6141  *
6142  * Returns NULL if creation failed.
6143  */
6144 static dring_info_t *
6145 vsw_create_dring(vsw_ldc_t *ldcp)
6146 {
6147 	vsw_private_desc_t	*priv_addr = NULL;
6148 	vsw_t			*vswp = ldcp->ldc_vswp;
6149 	ldc_mem_info_t		minfo;
6150 	dring_info_t		*dp, *tp;
6151 	int			i;
6152 
6153 	dp = (dring_info_t *)kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
6154 
6155 	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
6156 
6157 	/* create public section of ring */
6158 	if ((ldc_mem_dring_create(VSW_RING_NUM_EL,
6159 			VSW_PUB_SIZE, &dp->handle)) != 0) {
6160 
6161 		DERR(vswp, "vsw_create_dring(%lld): ldc dring create "
6162 			"failed", ldcp->ldc_id);
6163 		goto create_fail_exit;
6164 	}
6165 
6166 	ASSERT(dp->handle != NULL);
6167 
6168 	/*
6169 	 * Get the base address of the public section of the ring.
6170 	 */
6171 	if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
6172 		DERR(vswp, "vsw_create_dring(%lld): dring info failed\n",
6173 			ldcp->ldc_id);
6174 		goto dring_fail_exit;
6175 	} else {
6176 		ASSERT(minfo.vaddr != 0);
6177 		dp->pub_addr = minfo.vaddr;
6178 	}
6179 
6180 	dp->num_descriptors = VSW_RING_NUM_EL;
6181 	dp->descriptor_size = VSW_PUB_SIZE;
6182 	dp->options = VIO_TX_DRING;
6183 	dp->ncookies = 1;	/* guaranteed by ldc */
6184 
6185 	/*
6186 	 * create private portion of ring
6187 	 */
6188 	dp->priv_addr = (vsw_private_desc_t *)kmem_zalloc(
6189 		(sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL), KM_SLEEP);
6190 
6191 	if (vsw_setup_ring(ldcp, dp)) {
6192 		DERR(vswp, "%s: unable to setup ring", __func__);
6193 		goto dring_fail_exit;
6194 	}
6195 
6196 	/* haven't used any descriptors yet */
6197 	dp->end_idx = 0;
6198 
6199 	/* bind dring to the channel */
6200 	if ((ldc_mem_dring_bind(ldcp->ldc_handle, dp->handle,
6201 		LDC_SHADOW_MAP, LDC_MEM_RW,
6202 		&dp->cookie[0], &dp->ncookies)) != 0) {
6203 		DERR(vswp, "vsw_create_dring: unable to bind to channel "
6204 			"%lld", ldcp->ldc_id);
6205 		goto dring_fail_exit;
6206 	}
6207 
6208 	/*
6209 	 * Only ever create rings for outgoing lane. Link it onto
6210 	 * end of list.
6211 	 */
6212 	if (ldcp->lane_out.dringp == NULL) {
6213 		D2(vswp, "vsw_create_dring: adding first outbound ring");
6214 		ldcp->lane_out.dringp = dp;
6215 	} else {
6216 		tp = ldcp->lane_out.dringp;
6217 		while (tp->next != NULL)
6218 			tp = tp->next;
6219 
6220 		tp->next = dp;
6221 	}
6222 
6223 	return (dp);
6224 
6225 dring_fail_exit:
6226 	(void) ldc_mem_dring_destroy(dp->handle);
6227 
6228 create_fail_exit:
6229 	if (dp->priv_addr != NULL) {
6230 		priv_addr = dp->priv_addr;
6231 		for (i = 0; i < VSW_RING_NUM_EL; i++) {
6232 			if (priv_addr->memhandle != NULL) {
				if (priv_addr->bound == 1)
					(void) ldc_mem_unbind_handle(
						priv_addr->memhandle);
6233 				(void) ldc_mem_free_handle(
6234 						priv_addr->memhandle);
			}
6235 			priv_addr++;
6236 		}
6237 		kmem_free(dp->priv_addr,
6238 			(sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL));
6239 	}
	if (dp->data_addr != NULL)
		kmem_free(dp->data_addr, dp->data_sz);
6240 	mutex_destroy(&dp->dlock);
6241 
6242 	kmem_free(dp, sizeof (dring_info_t));
6243 	return (NULL);
6244 }
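
/*
 * Illustrative note, not part of the original driver: the LDC calls
 * made while building a dring pair up with the teardown performed in
 * vsw_free_ring() and the cleanup paths above roughly as follows:
 *
 *	ldc_mem_dring_create()	<->  ldc_mem_dring_destroy()
 *	ldc_mem_dring_bind()	<->  ldc_mem_dring_unbind()
 *	ldc_mem_alloc_handle()	<->  ldc_mem_free_handle()
 *	ldc_mem_bind_handle()	<->  ldc_mem_unbind_handle()
 */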
6245 
6246 /*
6247  * Create a ring consisting of just a private portion and link
6248  * it into the list of rings for the outbound lane.
6249  *
6250  * This type of ring is used primarily for temporary data
6251  * storage (i.e. as data buffers).
6252  */
6253 void
6254 vsw_create_privring(vsw_ldc_t *ldcp)
6255 {
6256 	dring_info_t		*dp, *tp;
6257 	vsw_t			*vswp = ldcp->ldc_vswp;
6258 
6259 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
6260 
6261 	dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
6262 
6263 	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
6264 
6265 	/* no public section */
6266 	dp->pub_addr = NULL;
6267 
6268 	dp->priv_addr = kmem_zalloc((sizeof (vsw_private_desc_t) *
6269 					VSW_RING_NUM_EL), KM_SLEEP);
6270 
6271 	if (vsw_setup_ring(ldcp, dp)) {
6272 		DERR(vswp, "%s: setup of ring failed", __func__);
6273 		kmem_free(dp->priv_addr,
6274 			(sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL));
6275 		mutex_destroy(&dp->dlock);
6276 		kmem_free(dp, sizeof (dring_info_t));
6277 		return;
6278 	}
6279 
6280 	/* haven't used any descriptors yet */
6281 	dp->end_idx = 0;
6282 
6283 	/*
6284 	 * Only ever create rings for outgoing lane. Link it onto
6285 	 * end of list.
6286 	 */
6287 	if (ldcp->lane_out.dringp == NULL) {
6288 		D2(vswp, "%s: adding first outbound privring", __func__);
6289 		ldcp->lane_out.dringp = dp;
6290 	} else {
6291 		tp = ldcp->lane_out.dringp;
6292 		while (tp->next != NULL)
6293 			tp = tp->next;
6294 
6295 		tp->next = dp;
6296 	}
6297 
6298 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
6299 }
6300 
6301 /*
6302  * Setup the descriptors in the dring. Returns 0 on success, 1 on
6303  * failure.
6304  */
6305 int
6306 vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp)
6307 {
6308 	vnet_public_desc_t	*pub_addr = NULL;
6309 	vsw_private_desc_t	*priv_addr = NULL;
6310 	vsw_t			*vswp = ldcp->ldc_vswp;
6311 	uint64_t		*tmpp;
6312 	uint64_t		offset = 0;
6313 	uint32_t		ncookies = 0;
6314 	static char		*name = "vsw_setup_ring";
6315 	int			i, j, rv;
6316 
6317 	/* note - public section may be null */
6318 	priv_addr = dp->priv_addr;
6319 	pub_addr = dp->pub_addr;
6320 
6321 	/*
6322 	 * Allocate the region of memory which will be used to hold
6323 	 * the data the descriptors will refer to.
6324 	 */
6325 	dp->data_sz = (VSW_RING_NUM_EL * VSW_RING_EL_DATA_SZ);
6326 	dp->data_addr = kmem_alloc(dp->data_sz, KM_SLEEP);
6327 
6328 	D2(vswp, "%s: allocated %lld bytes at 0x%llx\n", name,
6329 		dp->data_sz, dp->data_addr);
6330 
6331 	tmpp = (uint64_t *)dp->data_addr;
6332 	offset = VSW_RING_EL_DATA_SZ / sizeof (*tmpp);
6333 
6334 	/*
6335 	 * Initialise some of the private and public (if they exist)
6336 	 * descriptor fields.
6337 	 */
6338 	for (i = 0; i < VSW_RING_NUM_EL; i++) {
6339 		if ((ldc_mem_alloc_handle(ldcp->ldc_handle,
6340 			&priv_addr->memhandle)) != 0) {
6341 			DERR(vswp, "%s: alloc mem handle failed", name);
6342 			goto setup_ring_cleanup;
6343 		}
6344 
6345 		priv_addr->datap = (void *)tmpp;
6346 
6347 		rv = ldc_mem_bind_handle(priv_addr->memhandle,
6348 			(caddr_t)priv_addr->datap, VSW_RING_EL_DATA_SZ,
6349 			LDC_SHADOW_MAP, LDC_MEM_R|LDC_MEM_W,
6350 			&(priv_addr->memcookie[0]), &ncookies);
6351 		if (rv != 0) {
6352 			DERR(vswp, "%s(%lld): ldc_mem_bind_handle failed "
6353 				"(rv %d)", name, ldcp->ldc_id, rv);
6354 			goto setup_ring_cleanup;
6355 		}
6356 		priv_addr->bound = 1;
6357 
6358 		D2(vswp, "%s: %d: memcookie 0 : addr 0x%llx : size 0x%llx",
6359 			name, i, priv_addr->memcookie[0].addr,
6360 			priv_addr->memcookie[0].size);
6361 
6362 		if (ncookies > (uint32_t)VSW_MAX_COOKIES) {
6363 			DERR(vswp, "%s(%lld) ldc_mem_bind_handle returned "
6364 				"invalid num of cookies (%d) for size 0x%llx",
6365 				name, ldcp->ldc_id, ncookies,
6366 				VSW_RING_EL_DATA_SZ);
6367 
6368 			goto setup_ring_cleanup;
6369 		} else {
6370 			for (j = 1; j < ncookies; j++) {
6371 				rv = ldc_mem_nextcookie(priv_addr->memhandle,
6372 					&(priv_addr->memcookie[j]));
6373 				if (rv != 0) {
6374 					DERR(vswp, "%s: ldc_mem_nextcookie "
6375 						"failed rv (%d)", name, rv);
6376 					goto setup_ring_cleanup;
6377 				}
6378 				D3(vswp, "%s: memcookie %d : addr 0x%llx : "
6379 					"size 0x%llx", name, j,
6380 					priv_addr->memcookie[j].addr,
6381 					priv_addr->memcookie[j].size);
6382 			}
6383 
6384 		}
6385 		priv_addr->ncookies = ncookies;
6386 		priv_addr->dstate = VIO_DESC_FREE;
6387 
6388 		if (pub_addr != NULL) {
6389 
6390 			/* link pub and private sides */
6391 			priv_addr->descp = pub_addr;
6392 
6393 			pub_addr->hdr.dstate = VIO_DESC_FREE;
6394 			pub_addr++;
6395 		}
6396 
6397 		/*
6398 		 * move to next element in the dring and the next
6399 		 * position in the data buffer.
6400 		 */
6401 		priv_addr++;
6402 		tmpp += offset;
6403 	}
6404 
6405 	return (0);
6406 
6407 setup_ring_cleanup:
6408 	priv_addr = dp->priv_addr;
6409 
6410 	for (i = 0; i < VSW_RING_NUM_EL; i++) {
		/* only tear down what was actually set up */
6411 		if (priv_addr->bound == 1)
			(void) ldc_mem_unbind_handle(priv_addr->memhandle);
6412 		if (priv_addr->memhandle != NULL)
			(void) ldc_mem_free_handle(priv_addr->memhandle);
6413 
6414 		priv_addr++;
6415 	}
6416 	kmem_free(dp->data_addr, dp->data_sz);
	dp->data_addr = NULL;
6417 
6418 	return (1);
6419 }
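
/*
 * Illustrative note, not part of the original driver: the single
 * data_addr region allocated above is carved into VSW_RING_NUM_EL
 * consecutive VSW_RING_EL_DATA_SZ-byte slots, so descriptor i's
 * datap effectively points at
 *
 *	(caddr_t)dp->data_addr + (i * VSW_RING_EL_DATA_SZ)
 *
 * which is what the tmpp/offset arithmetic in the loop computes.
 */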
6420 
6421 /*
6422  * Searches the private section of a ring for a free descriptor,
6423  * starting at the location of the last free descriptor found
6424  * previously.
6425  *
6426  * Returns 0 if free descriptor is available, 1 otherwise.
6427  *
6428  * FUTURE: might need to return contiguous range of descriptors
6429  * as dring info msg assumes all will be contiguous.
6430  */
6431 static int
6432 vsw_dring_find_free_desc(dring_info_t *dringp,
6433 		vsw_private_desc_t **priv_p, int *idx)
6434 {
6435 	vsw_private_desc_t	*addr;
6436 	uint64_t		i;
6437 	uint64_t		j = 0;
6438 	uint64_t		start = dringp->end_idx;
6439 	int			num = VSW_RING_NUM_EL;
6440 	int			ret = 1;
6441 
6442 	D1(NULL, "%s enter\n", __func__);
6443 
6444 	addr = dringp->priv_addr;
6445 
6446 	D2(NULL, "%s: searching ring, dringp 0x%llx : start pos %lld",
6447 			__func__, dringp, start);
6448 
6449 	for (i = start; j < num; i = (i + 1) % num, j++) {
6450 		addr = (vsw_private_desc_t *)dringp->priv_addr + i;
6451 		D2(NULL, "%s: descriptor %lld : dstate 0x%llx\n",
6452 			__func__, i, addr->dstate);
6453 		if (addr->dstate == VIO_DESC_FREE) {
6454 			D2(NULL, "%s: descriptor %lld is available",
6455 								__func__, i);
6456 			*priv_p = addr;
6457 			*idx = i;
6458 			dringp->end_idx = (i + 1) % num;
6459 			ret = 0;
6460 			break;
6461 		}
6462 	}
6463 
6464 	/* ring full */
6465 	if (ret == 1) {
6466 		D2(NULL, "%s: no desc free: started at %lld", __func__, start);
6467 	}
6468 
6469 	D1(NULL, "%s: exit\n", __func__);
6470 
6471 	return (ret);
6472 }
6473 
6474 /*
6475  * Copy relevant fields from the private descriptor into the
6476  * associated public side.
6477  */
6478 static void
6479 vsw_dring_priv2pub(vsw_private_desc_t *priv)
6480 {
6481 	vnet_public_desc_t	*pub;
6482 	int			i;
6483 
6484 	D1(NULL, "vsw_dring_priv2pub enter\n");
6485 
6486 	pub = priv->descp;
6487 
6488 	pub->ncookies = priv->ncookies;
6489 	pub->nbytes = priv->datalen;
6490 
6491 	for (i = 0; i < pub->ncookies; i++) {
6492 		bcopy(&priv->memcookie[i], &pub->memcookie[i],
6493 			sizeof (ldc_mem_cookie_t));
6494 	}
6495 
6496 	pub->hdr.ack = 1;
6497 	pub->hdr.dstate = VIO_DESC_READY;
6498 
6499 	D1(NULL, "vsw_dring_priv2pub exit");
6500 }
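
/*
 * Illustrative sketch, not part of the original driver: typical use
 * of the two routines above on a transmit path. The pkt/len names
 * are hypothetical, and locking of the ring is assumed to be handled
 * by the caller.
 */
#if 0	/* example only */
	vsw_private_desc_t	*priv = NULL;
	int			idx;

	if (vsw_dring_find_free_desc(dp, &priv, &idx) != 0)
		return;				/* ring full */

	bcopy(pkt, priv->datap, len);		/* stage the frame */
	priv->datalen = len;
	vsw_dring_priv2pub(priv);		/* publish to peer */
#endif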
6501 
6502 /*
6503  * Map from a dring identifier to the ring itself. Returns
6504  * pointer to ring or NULL if no match found.
6505  */
6506 static dring_info_t *
6507 vsw_ident2dring(lane_t *lane, uint64_t ident)
6508 {
6509 	dring_info_t	*dp = NULL;
6510 
6511 	for (dp = lane->dringp; dp != NULL; dp = dp->next) {
6512 		if (dp->ident == ident)
6513 			break;
6514 	}
6523 
6524 	return (dp);
6525 }
6526 
6527 /*
6528  * Set the default lane attributes. These are copied into
6529  * the attr msg we send to our peer. If they are not acceptable
6530  * then (currently) the handshake ends.
6531  */
6532 static void
6533 vsw_set_lane_attr(vsw_t *vswp, lane_t *lp)
6534 {
6535 	bzero(lp, sizeof (lane_t));
6536 
6537 	READ_ENTER(&vswp->if_lockrw);
6538 	ether_copy(&(vswp->if_addr), &(lp->addr));
6539 	RW_EXIT(&vswp->if_lockrw);
6540 
6541 	lp->mtu = VSW_MTU;
6542 	lp->addr_type = ADDR_TYPE_MAC;
6543 	lp->xfer_mode = VIO_DRING_MODE;
6544 	lp->ack_freq = 0;	/* for shared mode */
6545 	lp->seq_num = VNET_ISS;
6546 }
6547 
6548 /*
6549  * Verify that the attributes are acceptable.
6550  *
6551  * FUTURE: If some attributes are not acceptable, change them
6552  * our desired values.
6553  */
6554 static int
6555 vsw_check_attr(vnet_attr_msg_t *pkt, vsw_port_t *port)
6556 {
6557 	int	ret = 0;
6558 
6559 	D1(NULL, "vsw_check_attr enter\n");
6560 
6561 	/*
6562 	 * Note we currently only support in-band descriptors
6563 	 * and descriptor rings, not packet based transfer (VIO_PKT_MODE)
6564 	 */
6565 	if ((pkt->xfer_mode != VIO_DESC_MODE) &&
6566 			(pkt->xfer_mode != VIO_DRING_MODE)) {
6567 		D2(NULL, "vsw_check_attr: unknown mode %x\n",
6568 			pkt->xfer_mode);
6569 		ret = 1;
6570 	}
6571 
6572 	/* Only support MAC addresses at moment. */
6573 	if ((pkt->addr_type != ADDR_TYPE_MAC) || (pkt->addr == 0)) {
6574 		D2(NULL, "vsw_check_attr: invalid addr_type %x, "
6575 			"or address 0x%llx\n", pkt->addr_type,
6576 			pkt->addr);
6577 		ret = 1;
6578 	}
6579 
6580 	/*
6581 	 * MAC address supplied by device should match that stored
6582 	 * in the vsw-port OBP node. Need to decide what to do if they
6583 	 * don't match, for the moment just warn but don't fail.
6584 	 */
6585 	if (bcmp(&pkt->addr, &port->p_macaddr, ETHERADDRL) != 0) {
6586 		DERR(NULL, "vsw_check_attr: device supplied address "
6587 			"0x%llx doesn't match node address 0x%llx\n",
6588 			pkt->addr, port->p_macaddr);
6589 	}
6590 
6591 	/*
6592 	 * Ack freq only makes sense in pkt mode, in shared
6593 	 * mode the ring descriptors say whether or not to
6594 	 * send back an ACK.
6595 	 */
6596 	if ((pkt->xfer_mode == VIO_DRING_MODE) &&
6597 				(pkt->ack_freq > 0)) {
6598 		D2(NULL, "vsw_check_attr: non-zero ack freq "
6599 			"in SHM mode\n");
6600 		ret = 1;
6601 	}
6602 
6603 	/*
6604 	 * Note: for the moment we only support ETHER
6605 	 * frames. This may change in the future.
6606 	 */
6607 	if ((pkt->mtu > VSW_MTU) || (pkt->mtu <= 0)) {
6608 		D2(NULL, "vsw_check_attr: invalid MTU (0x%llx)\n",
6609 			pkt->mtu);
6610 		ret = 1;
6611 	}
6612 
6613 	D1(NULL, "vsw_check_attr exit\n");
6614 
6615 	return (ret);
6616 }
6617 
6618 /*
6619  * Returns 1 if there is a problem, 0 otherwise.
6620  */
6621 static int
6622 vsw_check_dring_info(vio_dring_reg_msg_t *pkt)
6623 {
6626 	int	ret = 0;
6627 
6628 	D1(NULL, "vsw_check_dring_info enter\n");
6629 
6630 	if ((pkt->num_descriptors == 0) ||
6631 		(pkt->descriptor_size == 0) ||
6632 		(pkt->ncookies != 1)) {
6633 		DERR(NULL, "vsw_check_dring_info: invalid dring msg");
6634 		ret = 1;
6635 	}
6636 
6637 	D1(NULL, "vsw_check_dring_info exit\n");
6638 
6639 	return (ret);
6640 }
6641 
6642 /*
6643  * Returns 1 if two memory cookies match. Otherwise returns 0.
6644  */
6645 static int
6646 vsw_mem_cookie_match(ldc_mem_cookie_t *m1, ldc_mem_cookie_t *m2)
6647 {
6648 	if ((m1->addr != m2->addr) ||
6649 		(m1->size != m2->size)) {
6650 		return (0);
6651 	} else {
6652 		return (1);
6653 	}
6654 }
6655 
6656 /*
6657  * Returns 1 if ring described in reg message matches that
6658  * described by dring_info structure. Otherwise returns 0.
6659  */
6660 static int
6661 vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg)
6662 {
6663 	if ((msg->descriptor_size != dp->descriptor_size) ||
6664 		(msg->num_descriptors != dp->num_descriptors) ||
6665 		(msg->ncookies != dp->ncookies) ||
6666 		!(vsw_mem_cookie_match(&msg->cookie[0], &dp->cookie[0]))) {
6667 		return (0);
6668 	} else {
6669 		return (1);
6670 	}
6671 
6673 
6674 static caddr_t
6675 vsw_print_ethaddr(uint8_t *a, char *ebuf)
6676 {
6677 	(void) sprintf(ebuf, "%x:%x:%x:%x:%x:%x",
6678 	    a[0], a[1], a[2], a[3], a[4], a[5]);
6679 	return (ebuf);
6680 }
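
/*
 * Illustrative note, not part of the original driver: callers of
 * vsw_print_ethaddr() are assumed to supply a buffer of at least
 * ETHERADDRSTRL bytes, e.g.
 *
 *	char	ebuf[ETHERADDRSTRL];
 *	DERR(vswp, "addr %s", vsw_print_ethaddr(addr, ebuf));
 */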
6681 
6682 /*
6683  * Reset and free all the resources associated with
6684  * the channel.
6685  */
6686 static void
6687 vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir)
6688 {
6689 	dring_info_t		*dp, *dpp;
6690 	lane_t			*lp = NULL;
6691 	int			rv = 0;
6692 
6693 	ASSERT(ldcp != NULL);
6694 
6695 	D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id);
6696 
6697 	if (dir == INBOUND) {
6698 		D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane"
6699 			" of channel %lld", __func__, ldcp->ldc_id);
6700 		lp = &ldcp->lane_in;
6701 	} else {
6702 		D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane"
6703 			" of channel %lld", __func__, ldcp->ldc_id);
6704 		lp = &ldcp->lane_out;
6705 	}
6706 
6707 	lp->lstate = VSW_LANE_INACTIV;
6708 	lp->seq_num = VNET_ISS;
6709 	if (lp->dringp) {
6710 		if (dir == INBOUND) {
6711 			dp = lp->dringp;
6712 			while (dp != NULL) {
6713 				dpp = dp->next;
6714 				if (dp->handle != NULL)
6715 					(void) ldc_mem_dring_unmap(dp->handle);
6716 				kmem_free(dp, sizeof (dring_info_t));
6717 				dp = dpp;
6718 			}
6719 		} else {
6720 			/*
6721 			 * unbind, destroy exported dring, free dring struct
6722 			 */
6723 			dp = lp->dringp;
6724 			rv = vsw_free_ring(dp);
6725 		}
6726 		if (rv == 0) {
6727 			lp->dringp = NULL;
6728 		}
6729 	}
6730 
6731 	D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id);
6732 }
6733 
6734 /*
6735  * Free ring and all associated resources.
6736  */
6737 static int
6738 vsw_free_ring(dring_info_t *dp)
6739 {
6740 	vsw_private_desc_t	*paddr = NULL;
6741 	dring_info_t		*dpp;
6742 	int			i, rv = 1;
6743 
6744 	while (dp != NULL) {
6745 		mutex_enter(&dp->dlock);
6746 		dpp = dp->next;
6747 		if (dp->priv_addr != NULL) {
6748 			/*
6749 			 * First unbind and free the memory handles
6750 			 * stored in each descriptor within the ring.
6751 			 */
6752 			for (i = 0; i < VSW_RING_NUM_EL; i++) {
6753 				paddr = (vsw_private_desc_t *)
6754 						dp->priv_addr + i;
6755 				if (paddr->memhandle != NULL) {
6756 					if (paddr->bound == 1) {
6757 						rv = ldc_mem_unbind_handle(
6758 							paddr->memhandle);
6759 
6760 						if (rv != 0) {
6761 							DERR(NULL, "error "
6762 							"unbinding handle for "
6763 							"ring 0x%llx at pos %d",
6764 							dp, i);
6765 							mutex_exit(&dp->dlock);
6766 							return (rv);
6767 						}
6768 						paddr->bound = 0;
6769 					}
6770 
6771 					rv = ldc_mem_free_handle(
6772 							paddr->memhandle);
6773 					if (rv != 0) {
6774 						DERR(NULL, "error freeing "
6775 							"handle for ring "
6776 							"0x%llx at pos %d",
6777 							dp, i);
6778 						mutex_exit(&dp->dlock);
6779 						return (rv);
6780 					}
6781 					paddr->memhandle = NULL;
6782 				}
6783 			}
6784 			kmem_free(dp->priv_addr, (sizeof (vsw_private_desc_t)
6785 					* VSW_RING_NUM_EL));
6786 		}
6787 
6788 		/*
6789 		 * Now unbind and destroy the ring itself.
6790 		 */
6791 		if (dp->handle != NULL) {
6792 			(void) ldc_mem_dring_unbind(dp->handle);
6793 			(void) ldc_mem_dring_destroy(dp->handle);
6794 		}
6795 
6796 		if (dp->data_addr != NULL) {
6797 			kmem_free(dp->data_addr, dp->data_sz);
6798 		}
6799 
6800 		mutex_exit(&dp->dlock);
6801 		mutex_destroy(&dp->dlock);
6802 		kmem_free(dp, sizeof (dring_info_t));
6803 
6804 		dp = dpp;
6805 	}
6806 	return (0);
6807 }
6808 
6809 /*
6810  * Debugging routines
6811  */
6812 static void
6813 display_state(void)
6814 {
6815 	vsw_t		*vswp;
6816 	vsw_port_list_t	*plist;
6817 	vsw_port_t 	*port;
6818 	vsw_ldc_list_t	*ldcl;
6819 	vsw_ldc_t 	*ldcp;
6820 
6821 	cmn_err(CE_NOTE, "***** system state *****");
6822 
6823 	for (vswp = vsw_head; vswp; vswp = vswp->next) {
6824 		plist = &vswp->plist;
6825 		READ_ENTER(&plist->lockrw);
6826 		cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n",
6827 			vswp->instance, plist->num_ports);
6828 
6829 		for (port = plist->head; port != NULL; port = port->p_next) {
6830 			ldcl = &port->p_ldclist;
6831 			READ_ENTER(&ldcl->lockrw);
6832 			cmn_err(CE_CONT, "port %d : %d ldcs attached\n",
6833 				port->p_instance, ldcl->num_ldcs);
6834 			ldcp = ldcl->head;
6835 			for (; ldcp != NULL; ldcp = ldcp->ldc_next) {
6836 				cmn_err(CE_CONT, "chan %lu : dev %d : "
6837 					"status %d : phase %u\n",
6838 					ldcp->ldc_id, ldcp->dev_class,
6839 					ldcp->ldc_status, ldcp->hphase);
6840 				cmn_err(CE_CONT, "chan %lu : lsession %lu : "
6841 					"psession %lu\n",
6842 					ldcp->ldc_id,
6843 					ldcp->local_session,
6844 					ldcp->peer_session);
6845 
6846 				cmn_err(CE_CONT, "Inbound lane:\n");
6847 				display_lane(&ldcp->lane_in);
6848 				cmn_err(CE_CONT, "Outbound lane:\n");
6849 				display_lane(&ldcp->lane_out);
6850 			}
6851 			RW_EXIT(&ldcl->lockrw);
6852 		}
6853 		RW_EXIT(&plist->lockrw);
6854 	}
6855 	cmn_err(CE_NOTE, "***** system state *****");
6856 }
6857 
6858 static void
6859 display_lane(lane_t *lp)
6860 {
6861 	dring_info_t	*drp;
6862 
6863 	cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n",
6864 		lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu);
6865 	cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n",
6866 		lp->addr_type, lp->addr, lp->xfer_mode);
6867 	cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp);
6868 
6869 	cmn_err(CE_CONT, "Dring info:\n");
6870 	for (drp = lp->dringp; drp != NULL; drp = drp->next) {
6871 		cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n",
6872 			drp->num_descriptors, drp->descriptor_size);
6873 		cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->handle);
6874 		cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n",
6875 			(uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr);
6876 		cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n",
6877 			drp->ident, drp->end_idx);
6878 		display_ring(drp);
6879 	}
6880 }
6881 
6882 static void
6883 display_ring(dring_info_t *dringp)
6884 {
6885 	uint64_t		i;
6886 	uint64_t		priv_count = 0;
6887 	uint64_t		pub_count = 0;
6888 	vnet_public_desc_t	*pub_addr = NULL;
6889 	vsw_private_desc_t	*priv_addr = NULL;
6890 
6891 	for (i = 0; i < VSW_RING_NUM_EL; i++) {
6892 		if (dringp->pub_addr != NULL) {
6893 			pub_addr = (vnet_public_desc_t *)dringp->pub_addr + i;
6894 
6895 			if (pub_addr->hdr.dstate == VIO_DESC_FREE)
6896 				pub_count++;
6897 		}
6898 
6899 		if (dringp->priv_addr != NULL) {
6900 			priv_addr =
6901 				(vsw_private_desc_t *)dringp->priv_addr + i;
6902 
6903 			if (priv_addr->dstate == VIO_DESC_FREE)
6904 				priv_count++;
6905 		}
6906 	}
6907 	cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n",
6908 			i, priv_count, pub_count);
6909 }
6910 
6911 static void
6912 dump_flags(uint64_t state)
6913 {
6914 	int	i;
6915 
6916 	typedef struct flag_name {
6917 		int	flag_val;
6918 		char	*flag_name;
6919 	} flag_name_t;
6920 
6921 	flag_name_t	flags[] = {
6922 		{ VSW_VER_INFO_SENT, "VSW_VER_INFO_SENT" },
6923 		{ VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV" },
6924 		{ VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV" },
6925 		{ VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT" },
6926 		{ VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV" },
6927 		{ VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT" },
6928 		{ VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT" },
6929 		{ VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV" },
6930 		{ VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT" },
6931 		{ VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV" },
6932 		{ VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT" },
6933 		{ VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV" },
6934 		{ VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT" },
6935 		{ VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV" },
6936 		{ VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT" },
6937 		{ VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV" },
6938 		{ VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT" },
6939 		{ VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV" },
6940 		{ VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT" },
6941 		{ VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV" },
6942 		{ VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT" },
6943 		{ VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV" },
6944 		{ VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT" },
6945 		{ VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV" },
6946 		{ VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT" },
6947 		{ VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV" },
6948 		{ VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT" },
6949 		{ VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV" },
6950 		{ VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT" },
6951 		{ VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV" },
6952 		{ VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE" } };
6953 
6954 	DERR(NULL, "DUMP_FLAGS: %llx\n", state);
6955 	for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) {
6956 		if (state & flags[i].flag_val)
6957 			DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name);
6958 	}
6959 }
6960