xref: /titanic_51/usr/src/uts/sun4v/io/vsw.c (revision ba2e4443695ee6a6f420a35cd4fc3d3346d22932)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/errno.h>
31 #include <sys/debug.h>
32 #include <sys/time.h>
33 #include <sys/sysmacros.h>
34 #include <sys/systm.h>
35 #include <sys/user.h>
36 #include <sys/stropts.h>
37 #include <sys/stream.h>
38 #include <sys/strlog.h>
39 #include <sys/strsubr.h>
40 #include <sys/cmn_err.h>
41 #include <sys/cpu.h>
42 #include <sys/kmem.h>
43 #include <sys/conf.h>
44 #include <sys/ddi.h>
45 #include <sys/sunddi.h>
46 #include <sys/ksynch.h>
47 #include <sys/stat.h>
48 #include <sys/kstat.h>
49 #include <sys/vtrace.h>
50 #include <sys/strsun.h>
51 #include <sys/dlpi.h>
52 #include <sys/ethernet.h>
53 #include <net/if.h>
54 #include <sys/varargs.h>
55 #include <sys/machsystm.h>
56 #include <sys/modctl.h>
57 #include <sys/modhash.h>
58 #include <sys/mac.h>
59 #include <sys/mac_ether.h>
60 #include <sys/taskq.h>
61 #include <sys/note.h>
62 #include <sys/mach_descrip.h>
63 #include <sys/mac.h>
64 #include <sys/mdeg.h>
65 #include <sys/ldc.h>
66 #include <sys/vsw_fdb.h>
67 #include <sys/vsw.h>
68 #include <sys/vio_mailbox.h>
69 #include <sys/vnet_mailbox.h>
70 #include <sys/vnet_common.h>
71 
72 /*
73  * Function prototypes.
74  */
75 static	int vsw_attach(dev_info_t *, ddi_attach_cmd_t);
76 static	int vsw_detach(dev_info_t *, ddi_detach_cmd_t);
77 static	int vsw_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
78 static	void vsw_get_md_properties(vsw_t *vswp);
79 static	int vsw_setup_layer2(vsw_t *);
80 static	int vsw_setup_layer3(vsw_t *);
81 
82 /* MAC layer routines */
83 static	int vsw_mac_attach(vsw_t *vswp);
84 static	void vsw_mac_detach(vsw_t *vswp);
85 static void vsw_notify_cb(void *, mac_notify_type_t);
86 static void vsw_rx_cb(void *, mac_resource_handle_t, mblk_t *);
87 static mblk_t *vsw_tx_msg(vsw_t *, mblk_t *);
88 static int vsw_mac_register(vsw_t *);
89 static int vsw_mac_unregister(vsw_t *);
90 static int vsw_m_stat(void *, uint_t, uint64_t *);
91 static void vsw_m_stop(void *arg);
92 static int vsw_m_start(void *arg);
93 static int vsw_m_unicst(void *arg, const uint8_t *);
94 static int vsw_m_multicst(void *arg, boolean_t, const uint8_t *);
95 static int vsw_m_promisc(void *arg, boolean_t);
96 static mblk_t *vsw_m_tx(void *arg, mblk_t *);
97 
98 /* MDEG routines */
99 static	void vsw_mdeg_register(vsw_t *vswp);
100 static	void vsw_mdeg_unregister(vsw_t *vswp);
101 static	int vsw_mdeg_cb(void *cb_argp, mdeg_result_t *);
102 
103 /* Port add/deletion routines */
104 static	int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
105 static	int vsw_port_attach(vsw_t *vswp, int p_instance,
106 	uint64_t *ldcids, int nids, struct ether_addr *macaddr);
107 static	int vsw_detach_ports(vsw_t *vswp);
108 static	int vsw_port_detach(vsw_t *vswp, int p_instance);
109 static	int vsw_port_delete(vsw_port_t *port);
110 static	int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id);
111 static	int vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id);
112 static	int vsw_init_ldcs(vsw_port_t *port);
113 static	int vsw_uninit_ldcs(vsw_port_t *port);
114 static	int vsw_ldc_init(vsw_ldc_t *ldcp);
115 static	int vsw_ldc_uninit(vsw_ldc_t *ldcp);
116 static	int vsw_drain_ldcs(vsw_port_t *port);
117 static	int vsw_drain_port_taskq(vsw_port_t *port);
118 static	void vsw_marker_task(void *);
119 static	vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);
120 static	int vsw_plist_del_node(vsw_t *, vsw_port_t *port);
121 
122 /* Interrupt routines */
123 static	uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg);
124 
125 /* Handshake routines */
126 static	void vsw_restart_handshake(vsw_ldc_t *);
127 static	int vsw_check_flag(vsw_ldc_t *, int, uint64_t);
128 static	void vsw_next_milestone(vsw_ldc_t *);
129 static	int vsw_supported_version(vio_ver_msg_t *);
130 
131 /* Data processing routines */
132 static void vsw_process_pkt(void *);
133 static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t);
134 static void vsw_process_ctrl_pkt(void *);
135 static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *);
136 static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *);
137 static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *);
138 static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *);
139 static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *);
140 static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *);
141 static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t);
142 static void vsw_process_data_dring_pkt(vsw_ldc_t *, void *);
143 static void vsw_process_data_raw_pkt(vsw_ldc_t *, void *);
144 static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *);
145 static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t);
146 
147 /* Switching/data transmit routines */
148 static	void vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
149 	    vsw_port_t *port, mac_resource_handle_t);
150 static	void vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
151 	    vsw_port_t *port, mac_resource_handle_t);
152 static	int vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller,
153 	    vsw_port_t *port);
154 static	int vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller,
155 	    vsw_port_t *port);
156 static	int vsw_portsend(vsw_port_t *, mblk_t *);
157 static	int vsw_dringsend(vsw_ldc_t *, mblk_t *);
158 static	int vsw_descrsend(vsw_ldc_t *, mblk_t *);
159 
160 /* Packet creation routines */
161 static void vsw_send_ver(vsw_ldc_t *);
162 static void vsw_send_attr(vsw_ldc_t *);
163 static vio_dring_reg_msg_t *vsw_create_dring_info_pkt(vsw_ldc_t *);
164 static void vsw_send_dring_info(vsw_ldc_t *);
165 static void vsw_send_rdx(vsw_ldc_t *);
166 
167 static void vsw_send_msg(vsw_ldc_t *, void *, int);
168 
169 /* Forwarding database (FDB) routines */
170 static	int vsw_add_fdb(vsw_t *vswp, vsw_port_t *port);
171 static	int vsw_del_fdb(vsw_t *vswp, vsw_port_t *port);
172 static	vsw_port_t *vsw_lookup_fdb(vsw_t *vswp, struct ether_header *);
173 static	int vsw_add_rem_mcst(vnet_mcast_msg_t *, vsw_port_t *);
174 static	int vsw_add_mcst(vsw_t *, uint8_t, uint64_t, void *);
175 static	int vsw_del_mcst(vsw_t *, uint8_t, uint64_t, void *);
176 static	void vsw_del_addr(uint8_t, void *, uint64_t);
177 static	void vsw_del_mcst_port(vsw_port_t *);
178 static	void vsw_del_mcst_vsw(vsw_t *);
179 
180 /* Dring routines */
181 static dring_info_t *vsw_create_dring(vsw_ldc_t *);
182 static void vsw_create_privring(vsw_ldc_t *);
183 static int vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp);
184 static int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **,
185     int *);
186 static void vsw_dring_priv2pub(vsw_private_desc_t *);
187 static dring_info_t *vsw_ident2dring(lane_t *, uint64_t);
188 
189 static void vsw_set_lane_attr(vsw_t *, lane_t *);
190 static int vsw_check_attr(vnet_attr_msg_t *, vsw_port_t *);
191 static int vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg);
192 static int vsw_mem_cookie_match(ldc_mem_cookie_t *, ldc_mem_cookie_t *);
193 static int vsw_check_dring_info(vio_dring_reg_msg_t *);
194 
195 /* Misc support routines */
196 static	caddr_t vsw_print_ethaddr(uint8_t *addr, char *ebuf);
197 
198 static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t);
199 static int vsw_free_ring(dring_info_t *);
200 
201 /* Debugging routines */
202 static void dump_flags(uint64_t);
203 static void display_state(void);
204 static void display_lane(lane_t *);
205 static void display_ring(dring_info_t *);
206 
/*
 * Global tunables. Their consumers are outside this part of the file;
 * the meanings below are taken from the per-line comments.
 */
int	vsw_num_handshakes = 3;		/* # of handshake attempts */
int	vsw_wretries = 100;		/* # of write attempts */
209 
/*
 * mode specific frame switching function
 *
 * This is a single driver-wide pointer. vsw_setup_layer2() points it at
 * vsw_switch_l2_frame() and vsw_setup_layer3() points it at
 * vsw_switch_l3_frame().
 *
 * NOTE(review): because the pointer is global rather than per-instance,
 * whichever instance ran its setup routine most recently decides the
 * switching function used by every instance - confirm this is intended.
 */
void		(*vsw_switch_frame)(vsw_t *, mblk_t *, int, vsw_port_t *,
			mac_resource_handle_t);
215 
/*
 * Table of vsw_m_*() entry points, initialized positionally.
 *
 * NOTE(review): how this table is handed to the MAC layer is not
 * visible here (presumably via vsw_mac_register()), and the slot
 * meanings are inferred from the routine names - confirm both against
 * the mac_callbacks_t definition. The leading 0 and the three trailing
 * NULLs are slots this driver does not populate.
 */
static	mac_callbacks_t	vsw_m_callbacks = {
	0,
	vsw_m_stat,
	vsw_m_start,
	vsw_m_stop,
	vsw_m_promisc,
	vsw_m_multicst,
	vsw_m_unicst,
	vsw_m_tx,
	NULL,
	NULL,
	NULL
};
229 
/*
 * Character/block entry point table referenced from vsw_ops.
 * open and close are nulldev, property requests go to ddi_prop_op, and
 * every other entry point is nodev (nochpoll for chpoll).
 */
static	struct	cb_ops	vsw_cb_ops = {
	nulldev,			/* cb_open */
	nulldev,			/* cb_close */
	nodev,				/* cb_strategy */
	nodev,				/* cb_print */
	nodev,				/* cb_dump */
	nodev,				/* cb_read */
	nodev,				/* cb_write */
	nodev,				/* cb_ioctl */
	nodev,				/* cb_devmap */
	nodev,				/* cb_mmap */
	nodev,				/* cb_segmap */
	nochpoll,			/* cb_chpoll */
	ddi_prop_op,			/* cb_prop_op */
	NULL,				/* cb_stream */
	D_MP,				/* cb_flag */
	CB_REV,				/* rev */
	nodev,				/* int (*cb_aread)() */
	nodev				/* int (*cb_awrite)() */
};
250 
/*
 * Device operations table. getinfo, attach and detach are the vsw_*()
 * routines defined in this file; identify and probe are nulldev, reset
 * is nodev and power requests are passed to ddi_power. The same table
 * is handed to mac_init_ops()/mac_fini_ops() in _init()/_fini().
 */
static	struct	dev_ops	vsw_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* devo_refcnt */
	vsw_getinfo,		/* devo_getinfo */
	nulldev,		/* devo_identify */
	nulldev,		/* devo_probe */
	vsw_attach,		/* devo_attach */
	vsw_detach,		/* devo_detach */
	nodev,			/* devo_reset */
	&vsw_cb_ops,		/* devo_cb_ops */
	(struct bus_ops *)NULL,	/* devo_bus_ops */
	ddi_power		/* devo_power */
};
264 
/*
 * Driver linkage record referenced from modlinkage: module type
 * (mod_driverops), description string and the dev_ops table.
 * NOTE(review): "%I%" in the description looks like a source-control
 * keyword that is expanded at checkout time - confirm.
 */
extern	struct	mod_ops	mod_driverops;
static struct modldrv vswmodldrv = {
	&mod_driverops,
	"sun4v Virtual Switch Driver %I%",
	&vsw_ops,
};
271 
/*
 * Take / drop both per-channel mutexes. The callback lock (ldc_cblock)
 * is acquired before the transmit lock (ldc_txlock) and they are
 * released in the reverse order.
 *
 * NOTE(review): each macro expands to two separate statements, each
 * already terminated by ';'. Used as the body of an unbraced if/else/
 * loop, only the first statement would be governed by the condition,
 * so always invoke these inside braces.
 */
#define	LDC_ENTER_LOCK(ldcp)	\
				mutex_enter(&((ldcp)->ldc_cblock));\
				mutex_enter(&((ldcp)->ldc_txlock));
#define	LDC_EXIT_LOCK(ldcp)	\
				mutex_exit(&((ldcp)->ldc_txlock));\
				mutex_exit(&((ldcp)->ldc_cblock));
278 
/*
 * Driver soft state ptr. Initialized by ddi_soft_state_init() in
 * _init() and released by ddi_soft_state_fini() in _fini(); one vsw_t
 * is allocated from it per instance in vsw_attach().
 */
static void	*vsw_state;

/*
 * Linked list of "vsw_t" structures - one per instance.
 *
 * vsw_attach() pushes new instances on the head of the list and
 * vsw_detach() unlinks them; both do so holding vsw_rw as writer.
 * vsw_rw itself is initialized in _init() and destroyed in _fini().
 */
vsw_t		*vsw_head = NULL;
krwlock_t	vsw_rw;
287 
/*
 * Property names
 *
 * Names of machine description (MD) node types/properties and of the
 * OBP "reg" property. vsw_get_md_properties() uses vdev_propname,
 * vsw_propname, physdev_propname, smode_propname, macaddr_propname
 * and reg_propname; the remaining names are consumed by code that is
 * not part of this section of the file.
 */
static char vdev_propname[] = "virtual-device";
static char vsw_propname[] = "virtual-network-switch";
static char physdev_propname[] = "vsw-phys-dev";
static char smode_propname[] = "vsw-switch-mode";
static char macaddr_propname[] = "local-mac-address";
static char remaddr_propname[] = "remote-mac-address";
static char ldcids_propname[] = "ldc-ids";
static char chan_propname[] = "channel-endpoint";
static char id_propname[] = "id";
static char reg_propname[] = "reg";
301 
/*
 * supported versions
 * NOTE(review): the single entry {1, 0} is presumably {major, minor};
 * confirm against the ver_sup_t definition.
 */
static	ver_sup_t	vsw_versions[] = { {1, 0} };
304 
/*
 * Matching criteria passed to the MDEG to register interest
 * in changes to 'virtual-device-port' nodes identified by their
 * 'id' property.
 *
 * vport_prop_match is the MDET_LIST_END-terminated property list and
 * vport_match pairs it with the node name to match.
 */
static md_prop_match_t vport_prop_match[] = {
	{ MDET_PROP_VAL,    "id"   },
	{ MDET_LIST_END,    NULL    }
};

static mdeg_node_match_t vport_match = { "virtual-device-port",
						vport_prop_match };
317 
/*
 * Specification of an MD node passed to the MDEG to filter any
 * 'vport' nodes that do not belong to the specified node. This
 * template is copied for each vsw instance and filled in with
 * the appropriate 'cfg-handle' value before being passed to the MDEG.
 */
static mdeg_prop_spec_t vsw_prop_template[] = {
	{ MDET_PROP_STR,    "name",		vsw_propname },
	{ MDET_PROP_VAL,    "cfg-handle",	NULL	},
	{ MDET_LIST_END,    NULL,		NULL	}
};

/*
 * Store 'val' in entry [1] of a copy of the template above, i.e. the
 * "cfg-handle" slot. Note the expansion already ends in ';'.
 */
#define	VSW_SET_MDEG_PROP_INST(specp, val)	(specp)[1].ps_val = (val);
331 
/*
 * Print debug messages - set to 0x1f to enable all msgs
 * or 0x0 to turn all off.
 *
 * The value is a bit mask tested by the D1/D2/D3/DWARN/DERR macros
 * when the driver is built with DEBUG defined.
 */
int vswdbg = 0x0;

/*
 * debug levels:
 * 0x01:	Function entry/exit tracing	(D1)
 * 0x02:	Internal function messages	(D2)
 * 0x04:	Verbose internal messages	(D3)
 * 0x08:	Warning messages		(DWARN)
 * 0x10:	Error messages			(DERR)
 */
346 
347 static void
348 vswdebug(vsw_t *vswp, const char *fmt, ...)
349 {
350 	char buf[512];
351 	va_list ap;
352 
353 	va_start(ap, fmt);
354 	(void) vsprintf(buf, fmt, ap);
355 	va_end(ap);
356 
357 	if (vswp == NULL)
358 		cmn_err(CE_CONT, "%s\n", buf);
359 	else
360 		cmn_err(CE_CONT, "vsw%d: %s\n", vswp->instance, buf);
361 }
362 
/*
 * For the moment the state dump routines have their own
 * private flag.
 *
 * With DUMP_STATE set to 0 (the value here) all of the DUMP_* and
 * DISPLAY_STATE macros below expand to nothing. With a non-zero value
 * the tag macros print the three tag fields through D1, DUMP_FLAGS
 * calls dump_flags() and DISPLAY_STATE calls display_state().
 */
#define	DUMP_STATE	0

#if DUMP_STATE

/* Print the fields of a tag passed by value. */
#define	DUMP_TAG(tag) \
{			\
	D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype); \
	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype);	\
	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env);	\
}

/* Print the fields of a tag passed by pointer. */
#define	DUMP_TAG_PTR(tag) \
{			\
	D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype); \
	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype);	\
	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env);	\
}

/* Note: DUMP_FLAGS already supplies its own trailing ';'. */
#define	DUMP_FLAGS(flags) dump_flags(flags);
#define	DISPLAY_STATE()	display_state()

#else

/* State dumping disabled: every macro compiles away to nothing. */
#define	DUMP_TAG(tag)
#define	DUMP_TAG_PTR(tag)
#define	DUMP_FLAGS(state)
#define	DISPLAY_STATE()

#endif	/* DUMP_STATE */
396 
#ifdef DEBUG

/*
 * Debug print macros, one per bit of vswdbg. Each expands to an
 * unbraced "if (vswdbg & <bit>) vswdebug", so the argument list the
 * caller writes after the macro name completes the vswdebug() call,
 * e.g. D1(vswp, "%s: enter", __func__);
 *
 * Because the expansion is an open "if", using one of these as the
 * body of an unbraced if that has an else makes that else bind to the
 * macro's hidden if. Always put braces around such uses.
 */
#define	D1		\
if (vswdbg & 0x01)	\
	vswdebug

#define	D2		\
if (vswdbg & 0x02)	\
	vswdebug

#define	D3		\
if (vswdbg & 0x04)	\
	vswdebug

#define	DWARN		\
if (vswdbg & 0x08)	\
	vswdebug

#define	DERR		\
if (vswdbg & 0x10)	\
	vswdebug

#else

/*
 * Non-DEBUG build: the condition is the constant 0, so vswdebug() is
 * never called, but the call expression is still seen by the compiler.
 * The dangling-else caveat above applies here as well.
 */
#define	DERR		if (0)	vswdebug
#define	DWARN		if (0)	vswdebug
#define	D1		if (0)	vswdebug
#define	D2		if (0)	vswdebug
#define	D3		if (0)	vswdebug

#endif	/* DEBUG */
428 
/*
 * Module linkage passed to mod_install(), mod_remove() and mod_info()
 * by _init(), _fini() and _info(). It lists the single driver linkage
 * record (vswmodldrv) followed by a NULL terminator.
 */
static struct modlinkage modlinkage = {
	MODREV_1,
	&vswmodldrv,
	NULL
};
434 
435 int
436 _init(void)
437 {
438 	int status;
439 
440 	rw_init(&vsw_rw, NULL, RW_DRIVER, NULL);
441 
442 	status = ddi_soft_state_init(&vsw_state, sizeof (vsw_t), 1);
443 	if (status != 0) {
444 		return (status);
445 	}
446 
447 	mac_init_ops(&vsw_ops, "vsw");
448 	status = mod_install(&modlinkage);
449 	if (status != 0) {
450 		ddi_soft_state_fini(&vsw_state);
451 	}
452 	return (status);
453 }
454 
455 int
456 _fini(void)
457 {
458 	int status;
459 
460 	status = mod_remove(&modlinkage);
461 	if (status != 0)
462 		return (status);
463 	mac_fini_ops(&vsw_ops);
464 	ddi_soft_state_fini(&vsw_state);
465 
466 	rw_destroy(&vsw_rw);
467 
468 	return (status);
469 }
470 
471 int
472 _info(struct modinfo *modinfop)
473 {
474 	return (mod_info(&modlinkage, modinfop));
475 }
476 
477 static int
478 vsw_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
479 {
480 	vsw_t		*vswp;
481 	int		smode, instance, i;
482 	char		hashname[MAXNAMELEN];
483 	char		qname[TASKQ_NAMELEN];
484 	int		rv = 1;
485 	enum		{ PROG_init = 0x0, PROG_if_lock = 0x1,
486 				PROG_fdb = 0x2, PROG_mfdb = 0x4,
487 				PROG_report_dev = 0x8, PROG_plist = 0x10,
488 				PROG_taskq = 0x20}
489 			progress;
490 
491 	progress = PROG_init;
492 
493 	switch (cmd) {
494 	case DDI_ATTACH:
495 		break;
496 	case DDI_RESUME:
497 		/* nothing to do for this non-device */
498 		return (DDI_SUCCESS);
499 	case DDI_PM_RESUME:
500 	default:
501 		return (DDI_FAILURE);
502 	}
503 
504 	instance = ddi_get_instance(dip);
505 	if (ddi_soft_state_zalloc(vsw_state, instance) != DDI_SUCCESS) {
506 		DERR(NULL, "vsw%d: ddi_soft_state_zalloc failed", instance);
507 		return (DDI_FAILURE);
508 	}
509 	vswp = ddi_get_soft_state(vsw_state, instance);
510 
511 	if (vswp == NULL) {
512 		DERR(NULL, "vsw%d: ddi_get_soft_state failed", instance);
513 		goto vsw_attach_fail;
514 	}
515 
516 	vswp->dip = dip;
517 	vswp->instance = instance;
518 	ddi_set_driver_private(dip, (caddr_t)vswp);
519 
520 	rw_init(&vswp->if_lockrw, NULL, RW_DRIVER, NULL);
521 
522 	progress |= PROG_if_lock;
523 
524 	/*
525 	 * User specifies (via MD) an array of switching modes in
526 	 * decreasing order of preference. Default mode is always
527 	 * layer 2 (mac switching), so init array with that value.
528 	 */
529 	vswp->smode_idx = 0;
530 	for (i = 0; i < NUM_SMODES; i++)
531 		vswp->smode[i] = VSW_LAYER2;
532 
533 	/*
534 	 * Get the various properties such as physical device name
535 	 * (vsw-phys-dev), switch mode etc from the MD.
536 	 */
537 	vsw_get_md_properties(vswp);
538 
539 	/* setup the unicast forwarding database  */
540 	(void) snprintf(hashname, MAXNAMELEN, "vsw_unicst_table-%d",
541 							vswp->instance);
542 	D2(vswp, "creating unicast hash table (%s)...", hashname);
543 	vswp->fdb = mod_hash_create_ptrhash(hashname, VSW_NCHAINS,
544 		mod_hash_null_valdtor, sizeof (void *));
545 
546 	progress |= PROG_fdb;
547 
548 	/* setup the multicast fowarding database */
549 	(void) snprintf(hashname, MAXNAMELEN, "vsw_mcst_table-%d",
550 							vswp->instance);
551 	D2(vswp, "creating multicast hash table %s)...", hashname);
552 	rw_init(&vswp->mfdbrw, NULL, RW_DRIVER, NULL);
553 	vswp->mfdb = mod_hash_create_ptrhash(hashname, VSW_NCHAINS,
554 			mod_hash_null_valdtor, sizeof (void *));
555 
556 	progress |= PROG_mfdb;
557 
558 	/*
559 	 * create lock protecting list of multicast addresses
560 	 * which could come via m_multicst() entry point when plumbed.
561 	 */
562 	mutex_init(&vswp->mca_lock, NULL, MUTEX_DRIVER, NULL);
563 	vswp->mcap = NULL;
564 
565 	ddi_report_dev(vswp->dip);
566 
567 	progress |= PROG_report_dev;
568 
569 	WRITE_ENTER(&vsw_rw);
570 	vswp->next = vsw_head;
571 	vsw_head = vswp;
572 	RW_EXIT(&vsw_rw);
573 
574 	/* setup the port list */
575 	rw_init(&vswp->plist.lockrw, NULL, RW_DRIVER, NULL);
576 	vswp->plist.head = NULL;
577 
578 	progress |= PROG_plist;
579 
580 	/*
581 	 * Create the taskq which will process all the VIO
582 	 * control messages.
583 	 */
584 	(void) snprintf(qname, TASKQ_NAMELEN, "vsw_taskq%d", vswp->instance);
585 	if ((vswp->taskq_p = ddi_taskq_create(vswp->dip, qname, 1,
586 					TASKQ_DEFAULTPRI, 0)) == NULL) {
587 		cmn_err(CE_WARN, "Unable to create task queue");
588 		goto vsw_attach_fail;
589 	}
590 
591 	progress |= PROG_taskq;
592 
593 	/* select best switching mode */
594 	for (i = 0; i < NUM_SMODES; i++) {
595 		smode = vswp->smode[i];
596 		switch (smode) {
597 		case VSW_LAYER2:
598 			rv = vsw_setup_layer2(vswp);
599 			break;
600 
601 		case VSW_LAYER2_PROMISC:
602 			rv = vsw_setup_layer2(vswp);
603 			break;
604 
605 		case VSW_LAYER3:
606 			rv = vsw_setup_layer3(vswp);
607 			break;
608 
609 		default:
610 			DERR(vswp, "unknown switch mode");
611 			break;
612 		}
613 
614 		if (rv == 0) {
615 			vswp->smode_idx = i;
616 			break;
617 		}
618 	}
619 
620 	if (rv == 1) {
621 		cmn_err(CE_WARN, "Unable to setup switching mode");
622 		goto vsw_attach_fail;
623 	}
624 
625 	D2(vswp, "Operating in mode %d", vswp->smode[vswp->smode_idx]);
626 
627 	/*
628 	 * Register with the MAC layer as a network device so
629 	 * we can be plumbed if desired.
630 	 *
631 	 * Do this in both layer 2 and layer 3 mode.
632 	 */
633 	vswp->if_state &= ~VSW_IF_UP;
634 	if (vswp->mdprops & VSW_MD_MACADDR) {
635 		if (vsw_mac_register(vswp) != 0) {
636 			cmn_err(CE_WARN, "Unable to register as provider "
637 				" with MAC layer, continuing with attach");
638 		}
639 	}
640 
641 	/*
642 	 * Now we have everything setup, register for MD change
643 	 * events.
644 	 */
645 	vsw_mdeg_register(vswp);
646 
647 	return (DDI_SUCCESS);
648 
649 vsw_attach_fail:
650 	DERR(NULL, "vsw_attach: failed");
651 
652 	if (progress & PROG_taskq)
653 		ddi_taskq_destroy(vswp->taskq_p);
654 
655 	if (progress & PROG_plist)
656 		rw_destroy(&vswp->plist.lockrw);
657 
658 	if (progress & PROG_report_dev) {
659 		ddi_remove_minor_node(dip, NULL);
660 		mutex_destroy(&vswp->mca_lock);
661 	}
662 
663 	if (progress & PROG_mfdb) {
664 		mod_hash_destroy_hash(vswp->mfdb);
665 		vswp->mfdb = NULL;
666 		rw_destroy(&vswp->mfdbrw);
667 	}
668 
669 	if (progress & PROG_fdb) {
670 		mod_hash_destroy_hash(vswp->fdb);
671 		vswp->fdb = NULL;
672 	}
673 
674 	if (progress & PROG_if_lock)
675 		rw_destroy(&vswp->if_lockrw);
676 
677 	ddi_soft_state_free(vsw_state, instance);
678 	return (DDI_FAILURE);
679 }
680 
681 static int
682 vsw_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
683 {
684 	vsw_t	**vswpp, *vswp;
685 	int 	instance;
686 
687 	instance = ddi_get_instance(dip);
688 	vswp = ddi_get_soft_state(vsw_state, instance);
689 
690 	if (vswp == NULL) {
691 		return (DDI_FAILURE);
692 	}
693 
694 	switch (cmd) {
695 	case DDI_DETACH:
696 		break;
697 	case DDI_SUSPEND:
698 	case DDI_PM_SUSPEND:
699 	default:
700 		return (DDI_FAILURE);
701 	}
702 
703 	D2(vswp, "detaching instance %d", instance);
704 
705 	if (vswp->mdprops & VSW_MD_MACADDR) {
706 		if (vsw_mac_unregister(vswp) != 0) {
707 			cmn_err(CE_WARN, "Unable to detach from MAC layer");
708 			return (DDI_FAILURE);
709 		}
710 	}
711 	rw_destroy(&vswp->if_lockrw);
712 
713 	vsw_mdeg_unregister(vswp);
714 
715 	if ((vswp->smode[vswp->smode_idx] == VSW_LAYER2) ||
716 		(vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) {
717 		vsw_mac_detach(vswp);
718 	}
719 
720 	if (vsw_detach_ports(vswp) != 0) {
721 		cmn_err(CE_WARN, "Unable to detach ports");
722 		return (DDI_FAILURE);
723 	}
724 
725 	/*
726 	 * Remove this instance from any entries it may be on in
727 	 * the hash table by using the list of addresses maintained
728 	 * in the vsw_t structure.
729 	 */
730 	vsw_del_mcst_vsw(vswp);
731 
732 	vswp->mcap = NULL;
733 	mutex_destroy(&vswp->mca_lock);
734 
735 	/*
736 	 * By now any pending tasks have finished and the underlying
737 	 * ldc's have been destroyed, so its safe to delete the control
738 	 * message taskq.
739 	 */
740 	if (vswp->taskq_p != NULL)
741 		ddi_taskq_destroy(vswp->taskq_p);
742 
743 	/*
744 	 * At this stage all the data pointers in the hash table
745 	 * should be NULL, as all the ports have been removed and will
746 	 * have deleted themselves from the port lists which the data
747 	 * pointers point to. Hence we can destroy the table using the
748 	 * default destructors.
749 	 */
750 	D2(vswp, "vsw_detach: destroying hash tables..");
751 	mod_hash_destroy_hash(vswp->fdb);
752 	vswp->fdb = NULL;
753 
754 	WRITE_ENTER(&vswp->mfdbrw);
755 	mod_hash_destroy_hash(vswp->mfdb);
756 	vswp->mfdb = NULL;
757 	RW_EXIT(&vswp->mfdbrw);
758 	rw_destroy(&vswp->mfdbrw);
759 
760 	ddi_remove_minor_node(dip, NULL);
761 
762 	rw_destroy(&vswp->plist.lockrw);
763 	WRITE_ENTER(&vsw_rw);
764 	for (vswpp = &vsw_head; *vswpp; vswpp = &(*vswpp)->next) {
765 		if (*vswpp == vswp) {
766 			*vswpp = vswp->next;
767 			break;
768 		}
769 	}
770 	RW_EXIT(&vsw_rw);
771 	ddi_soft_state_free(vsw_state, instance);
772 
773 	return (DDI_SUCCESS);
774 }
775 
776 static int
777 vsw_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
778 {
779 	_NOTE(ARGUNUSED(dip))
780 
781 	vsw_t	*vswp = NULL;
782 	dev_t	dev = (dev_t)arg;
783 	int	instance;
784 
785 	instance = getminor(dev);
786 
787 	switch (infocmd) {
788 	case DDI_INFO_DEVT2DEVINFO:
789 		if ((vswp = ddi_get_soft_state(vsw_state, instance)) == NULL) {
790 			*result = NULL;
791 			return (DDI_FAILURE);
792 		}
793 		*result = vswp->dip;
794 		return (DDI_SUCCESS);
795 
796 	case DDI_INFO_DEVT2INSTANCE:
797 		*result = (void *)(uintptr_t)instance;
798 		return (DDI_SUCCESS);
799 
800 	default:
801 		*result = NULL;
802 		return (DDI_FAILURE);
803 	}
804 }
805 
806 /*
807  * Get the properties from our MD node.
808  */
809 static void
810 vsw_get_md_properties(vsw_t *vswp)
811 {
812 	md_t		*mdp = NULL;
813 	int		num_nodes = 0;
814 	int		len = 0, listsz = 0;
815 	int		num_vdev = 0;
816 	int		i, idx;
817 	boolean_t	found_node = B_FALSE;
818 	char		*smode = NULL;
819 	char		*curr_mode = NULL;
820 	char		*physname = NULL;
821 	char		*node_name = NULL;
822 	char		*dev;
823 	uint64_t 	macaddr = 0;
824 	uint64_t	md_inst, obp_inst;
825 	mde_cookie_t	*listp = NULL;
826 	mde_cookie_t	rootnode;
827 
828 	D1(vswp, "%s: enter", __func__);
829 
830 	/*
831 	 * Further down we compare the obp 'reg' property to the
832 	 * 'cfg-handle' property in the vsw MD node to determine
833 	 * if the node refers to this particular instance. So if
834 	 * we can't read the obp value then there is no point
835 	 * in proceeding further.
836 	 */
837 	if (ddi_prop_exists(DDI_DEV_T_ANY, vswp->dip,
838 			DDI_PROP_DONTPASS, reg_propname) != 1) {
839 		cmn_err(CE_WARN, "Unable to read %s property "
840 			"from OBP device node", reg_propname);
841 		return;
842 	}
843 
844 	obp_inst = ddi_prop_get_int(DDI_DEV_T_ANY, vswp->dip,
845 		DDI_PROP_DONTPASS, reg_propname, 0);
846 
847 	D2(vswp, "%s: obp_inst 0x%llx", __func__, obp_inst);
848 
849 	if ((mdp = md_get_handle()) == NULL) {
850 		DERR(vswp, "%s: unable to init MD", __func__);
851 		return;
852 	}
853 
854 	if ((num_nodes = md_node_count(mdp)) <= 0) {
855 		DERR(vswp, "%s: invalid number of  nodes found %d",
856 			__func__, num_nodes);
857 		(void) md_fini_handle(mdp);
858 		return;
859 	}
860 
861 	D2(vswp, "%s: %d nodes in total in MD", __func__, num_nodes);
862 
863 	/* allocate enough space for node list */
864 	listsz = num_nodes * sizeof (mde_cookie_t);
865 	listp = kmem_zalloc(listsz, KM_SLEEP);
866 
867 	rootnode = md_root_node(mdp);
868 
869 	/* Get the list of virtual devices */
870 	num_vdev = md_scan_dag(mdp, rootnode,
871 		md_find_name(mdp, vdev_propname),
872 		md_find_name(mdp, "fwd"), listp);
873 
874 	if (num_vdev <= 0) {
875 		DERR(vswp, "%s: didn't find any virtual-device nodes in MD",
876 			__func__);
877 		goto md_prop_exit;
878 	}
879 
880 	D2(vswp, "%s: %d virtual-device nodes found", __func__, num_vdev);
881 
882 	/* Look for the virtual switch nodes in the list */
883 	for (idx = 0; idx < num_vdev; idx++) {
884 		if (md_get_prop_str(mdp, listp[idx],
885 				"name", &node_name) != 0) {
886 			DERR(vswp, "%s: unable to get node name", __func__);
887 			continue;
888 
889 		}
890 
891 		if (strcmp(node_name, vsw_propname) == 0) {
892 			/* Virtual switch node */
893 			if (md_get_prop_val(mdp, listp[idx],
894 				"cfg-handle", &md_inst) != 0) {
895 				DERR(vswp, "%s: unable to get cfg-handle from"
896 					" node %d", __func__, idx);
897 				goto md_prop_exit;
898 			} else if (md_inst == obp_inst) {
899 				D2(vswp, "%s: found matching node (%d)"
900 					" 0x%llx == 0x%llx", __func__, idx,
901 					md_inst, obp_inst);
902 				found_node = B_TRUE;
903 				break;
904 			}
905 		}
906 	}
907 
908 	if (!found_node) {
909 		DWARN(vswp, "%s: couldn't find correct vsw node", __func__);
910 		goto md_prop_exit;
911 	}
912 
913 	/*
914 	 * Now, having found the correct node, get the various properties.
915 	 */
916 
917 	if (md_get_prop_data(mdp, listp[idx], physdev_propname,
918 				(uint8_t **)(&physname), &len) != 0) {
919 		cmn_err(CE_WARN, "%s: unable to get name(s) of physical "
920 			"device(s) from MD", __func__);
921 	} else if ((strlen(physname) + 1) > LIFNAMSIZ) {
922 		cmn_err(CE_WARN, "%s is too long a device name", physname);
923 	} else {
924 		(void) strncpy(vswp->physname, physname, strlen(physname) + 1);
925 		vswp->mdprops |= VSW_MD_PHYSNAME;
926 		D2(vswp, "%s: using first device specified (%s)",
927 			__func__, vswp->physname);
928 	}
929 
930 
931 #ifdef DEBUG
932 	/*
933 	 * As a temporary measure to aid testing we check to see if there
934 	 * is a vsw.conf file present. If there is we use the value of the
935 	 * vsw_physname property in the file as the name of the physical
936 	 * device, overriding the value from the MD.
937 	 *
938 	 * There may be multiple devices listed, but for the moment
939 	 * we just use the first one.
940 	 */
941 	if (ddi_prop_lookup_string(DDI_DEV_T_ANY, vswp->dip, 0,
942 		"vsw_physname", &dev) == DDI_PROP_SUCCESS) {
943 		if ((strlen(dev) + 1) > LIFNAMSIZ) {
944 			cmn_err(CE_WARN, "%s is too long a device name", dev);
945 		} else {
946 			cmn_err(CE_NOTE, "%s: using device name (%s) from "
947 				"config file", __func__, dev);
948 
949 			(void) strncpy(vswp->physname, dev, strlen(dev) + 1);
950 			vswp->mdprops |= VSW_MD_PHYSNAME;
951 		}
952 
953 		ddi_prop_free(dev);
954 
955 	}
956 #endif
957 
958 	/* local mac address */
959 	if (md_get_prop_val(mdp, listp[idx],
960 			macaddr_propname, &macaddr) != 0) {
961 		cmn_err(CE_WARN, "%s: unable to get local MAC address",
962 								__func__);
963 	} else {
964 		READ_ENTER(&vswp->if_lockrw);
965 		for (i = ETHERADDRL - 1; i >= 0; i--) {
966 			vswp->if_addr.ether_addr_octet[i] = macaddr & 0xFF;
967 			macaddr >>= 8;
968 		}
969 		RW_EXIT(&vswp->if_lockrw);
970 		vswp->mdprops |= VSW_MD_MACADDR;
971 	}
972 
973 	/*
974 	 * Get the switch-mode property. The modes are listed in
975 	 * decreasing order of preference, i.e. prefered mode is
976 	 * first item in list.
977 	 */
978 	len = 0;
979 	if (md_get_prop_data(mdp, listp[idx], smode_propname,
980 				(uint8_t **)(&smode), &len) != 0) {
981 		/*
982 		 * Unable to get switch-mode property, so just use
983 		 * default values which vswp->smode[] array has already
984 		 * been pre-populated with, namely layer2.
985 		 */
986 		cmn_err(CE_WARN, "%s: unable to get switch mode property, "
987 			"defaulting to layer 2 mode", __func__);
988 	} else {
989 		i = 0;
990 		curr_mode = smode;
991 		/*
992 		 * Modes of operation:
993 		 * 'switched'	 - layer 2 switching, underlying HW in
994 		 *			non-promiscuous mode.
995 		 * 'promiscuous' - layer 2 switching, underlying HW in
996 		 *			promiscuous mode.
997 		 * 'routed'	 - layer 3 (i.e. IP) routing, underlying HW
998 		 *			in non-promiscuous mode.
999 		 */
1000 		while ((curr_mode < (smode + len)) && (i < NUM_SMODES)) {
1001 			D2(vswp, "%s: curr_mode = [%s]", __func__, curr_mode);
1002 			if (strcmp(curr_mode, "switched") == 0)
1003 				vswp->smode[i] = VSW_LAYER2;
1004 			else if (strcmp(curr_mode, "promiscuous") == 0)
1005 				vswp->smode[i] = VSW_LAYER2_PROMISC;
1006 			else if (strcmp(curr_mode, "routed") == 0)
1007 				vswp->smode[i] = VSW_LAYER3;
1008 			else {
1009 				DERR(vswp, "%s: unknown mode %s",
1010 					__func__, curr_mode);
1011 				/* default to layer 2 */
1012 				vswp->smode[i] = VSW_LAYER2;
1013 			}
1014 			curr_mode += strlen(curr_mode) + 1;
1015 			i++;
1016 		}
1017 
1018 		vswp->mdprops |= VSW_MD_SMODE;
1019 	}
1020 
1021 md_prop_exit:
1022 	(void) md_fini_handle(mdp);
1023 
1024 	kmem_free(listp, listsz);
1025 
1026 	D1(vswp, "%s: exit", __func__);
1027 }
1028 
1029 static int
1030 vsw_setup_layer2(vsw_t *vswp)
1031 {
1032 	int		rv = 0;
1033 
1034 	D1(vswp, "%s: enter", __func__);
1035 
1036 	vsw_switch_frame = vsw_switch_l2_frame;
1037 
1038 	/*
1039 	 * Attempt to link into the MAC layer so we can get
1040 	 * and send packets out over the physical adapter.
1041 	 */
1042 	if (vswp->mdprops & VSW_MD_PHYSNAME) {
1043 		if (vsw_mac_attach(vswp) != 0) {
1044 			/*
1045 			 * Registration with the MAC layer has failed,
1046 			 * so return 1 so that can fall back to next
1047 			 * prefered switching method.
1048 			 */
1049 			cmn_err(CE_WARN, "!unable to join as MAC layer "
1050 				"client, continuing with attach");
1051 			rv = 1;
1052 		}
1053 	} else {
1054 		/* No physical device name found in MD */
1055 		DERR(vswp, "%s: no physical device name specified", __func__);
1056 		rv = 1;
1057 	}
1058 
1059 	D1(vswp, "%s: exit", __func__);
1060 
1061 	return (rv);
1062 }
1063 
1064 static int
1065 vsw_setup_layer3(vsw_t *vswp)
1066 {
1067 	D1(vswp, "%s: enter", __func__);
1068 
1069 	D2(vswp, "%s: operating in layer 3 mode", __func__);
1070 	vsw_switch_frame = vsw_switch_l3_frame;
1071 
1072 	D1(vswp, "%s: exit", __func__);
1073 
1074 	return (0);
1075 }
1076 
1077 /*
1078  * Link into the MAC layer to gain access to the services provided by
1079  * the underlying physical device driver (which should also have
1080  * registered with the MAC layer).
1081  *
1082  * Only when in layer 2 mode.
1083  */
1084 static int
1085 vsw_mac_attach(vsw_t *vswp)
1086 {
1087 	char	drv[LIFNAMSIZ];
1088 	uint_t	ddi_instance;
1089 
1090 	D1(vswp, "vsw_mac_attach: enter");
1091 
1092 	vswp->mh = NULL;
1093 	vswp->mrh = NULL;
1094 	vswp->mnh = NULL;
1095 
1096 	ASSERT(vswp->mdprops & VSW_MD_PHYSNAME);
1097 
1098 	if (ddi_parse(vswp->physname, drv, &ddi_instance) != DDI_SUCCESS) {
1099 		cmn_err(CE_WARN, "invalid device name: %s", vswp->physname);
1100 		goto mac_fail_exit;
1101 	}
1102 	if ((mac_open(vswp->physname, ddi_instance, &vswp->mh)) != 0) {
1103 		cmn_err(CE_WARN, "mac_open %s failed", vswp->physname);
1104 		goto mac_fail_exit;
1105 	}
1106 
1107 	D2(vswp, "vsw_mac_attach: using device %s", vswp->physname);
1108 
1109 	/* register for changes in the interface */
1110 	vswp->mnh = mac_notify_add(vswp->mh, vsw_notify_cb, (void *)vswp);
1111 
1112 	/* register our rx callback function */
1113 	vswp->mrh = mac_rx_add(vswp->mh, vsw_rx_cb, (void *)vswp);
1114 
1115 	/* get the MAC tx fn */
1116 	vswp->txinfo = mac_tx_get(vswp->mh);
1117 
1118 	/* start the interface */
1119 	if (mac_start(vswp->mh) != 0) {
1120 		cmn_err(CE_WARN, "could not start mac interface");
1121 		goto mac_fail_exit;
1122 	}
1123 
1124 	/* get and store original promisc setting */
1125 	vswp->init_promisc = mac_promisc_get(vswp->mh, MAC_DEVPROMISC);
1126 
1127 	/*
1128 	 * FUTURE: When we have the ability to set multiple unicast
1129 	 * mac address then we won't have to set the device into
1130 	 * promisc mode, but for the moment its the only way we.
1131 	 * can see pkts that logical domains we are serving are
1132 	 * interested in.
1133 	 */
1134 	if ((vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC) &&
1135 					(vswp->init_promisc == B_FALSE)) {
1136 		DERR(vswp, "vsw_mac_attach: enabling promisc mode..");
1137 
1138 		if (mac_promisc_set(vswp->mh, B_TRUE, MAC_DEVPROMISC) != 0) {
1139 			DERR(vswp, "vsw_mac_attach: unable to set device"
1140 				" into promiscuous mode");
1141 			goto mac_fail_exit;
1142 		}
1143 	}
1144 
1145 	D1(vswp, "vsw_mac_attach: exit");
1146 	return (0);
1147 
1148 mac_fail_exit:
1149 	if (vswp->mh != NULL) {
1150 		mac_promisc_set(vswp->mh, vswp->init_promisc, MAC_DEVPROMISC);
1151 		if (vswp->mrh != NULL)
1152 			mac_rx_remove(vswp->mh, vswp->mrh);
1153 
1154 		if (vswp->mnh != NULL)
1155 			mac_notify_remove(vswp->mh, vswp->mnh);
1156 
1157 		mac_close(vswp->mh);
1158 	}
1159 
1160 	vswp->mrh = NULL;
1161 	vswp->mnh = NULL;
1162 	vswp->mh = NULL;
1163 	vswp->txinfo = NULL;
1164 
1165 	D1(vswp, "vsw_mac_attach: fail exit");
1166 	return (1);
1167 }
1168 
1169 static void
1170 vsw_mac_detach(vsw_t *vswp)
1171 {
1172 	D1(vswp, "vsw_mac_detach: enter");
1173 
1174 	if (vswp->mh != NULL) {
1175 		/* restore promisc to original setting */
1176 		mac_promisc_set(vswp->mh, vswp->init_promisc, MAC_DEVPROMISC);
1177 		if (vswp->mrh != NULL)
1178 			mac_rx_remove(vswp->mh, vswp->mrh);
1179 
1180 		if (vswp->mnh != NULL)
1181 			mac_notify_remove(vswp->mh, vswp->mnh);
1182 
1183 		mac_close(vswp->mh);
1184 	}
1185 
1186 	vswp->mrh = NULL;
1187 	vswp->mnh = NULL;
1188 	vswp->mh = NULL;
1189 	vswp->txinfo = NULL;
1190 
1191 	D1(vswp, "vsw_mac_detach: exit");
1192 }
1193 
1194 /*
1195  * Get notified of changes to the interface.
1196  *
1197  * For the moment we brute force the interface back
1198  * into promisc mode if it is unset (e.g. by snoop).
1199  * When we have the ability to set multiple mac addresses,
1200  * we will need to see if this is necessary.
1201  */
1202 static void
1203 vsw_notify_cb(void *arg, mac_notify_type_t type)
1204 {
1205 	vsw_t		*vswp = (vsw_t *)arg;
1206 
1207 	switch (type) {
1208 	case MAC_NOTE_PROMISC:
1209 		vswp->txinfo = mac_tx_get(vswp->mh);
1210 		if (mac_promisc_get(vswp->mh, MAC_DEVPROMISC) == B_TRUE) {
1211 			D2(vswp, "%s: still in PROMISC mode", __func__);
1212 		} else {
1213 			D2(vswp, "%s: now in NON-PROMISC mode", __func__);
1214 			D2(vswp, "...re-enabling");
1215 			mac_promisc_set(vswp->mh, B_TRUE, MAC_DEVPROMISC);
1216 		}
1217 		break;
1218 	default:
1219 		break;
1220 	}
1221 }
1222 
1223 /*
1224  * receive callback routine. Invoked by MAC layer when there
1225  * are pkts being passed up from physical device.
1226  *
1227  * PERF: It may be more efficient when the card is in promisc
1228  * mode to check the dest address of the pkts here (against
1229  * the FDB) rather than checking later. Needs to be investigated.
1230  */
1231 static void
1232 vsw_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
1233 {
1234 	_NOTE(ARGUNUSED(mrh))
1235 
1236 	vsw_t		*vswp = (vsw_t *)arg;
1237 
1238 	ASSERT(vswp != NULL);
1239 
1240 	D1(vswp, "vsw_rx_cb: enter");
1241 
1242 	/* switch the chain of packets received */
1243 	vsw_switch_frame(vswp, mp, VSW_PHYSDEV, NULL, NULL);
1244 
1245 	D1(vswp, "vsw_rx_cb: exit");
1246 }
1247 
1248 /*
1249  * Send a message out over the physical device via the MAC layer.
1250  *
1251  * Returns any mblks that it was unable to transmit.
1252  */
1253 static mblk_t *
1254 vsw_tx_msg(vsw_t *vswp, mblk_t *mp)
1255 {
1256 	const mac_txinfo_t	*mtp;
1257 	mblk_t			*nextp;
1258 
1259 	if (vswp->mh == NULL) {
1260 		DERR(vswp, "vsw_tx_msg: dropping pkts: no tx routine avail");
1261 		return (mp);
1262 	} else {
1263 		for (;;) {
1264 			nextp = mp->b_next;
1265 			mp->b_next = NULL;
1266 
1267 			mtp = vswp->txinfo;
1268 			if ((mp = mtp->mt_fn(mtp->mt_arg, mp)) != NULL) {
1269 				mp->b_next = nextp;
1270 				break;
1271 			}
1272 
1273 			if ((mp = nextp) == NULL)
1274 				break;
1275 
1276 		}
1277 
1278 	}
1279 
1280 	return (mp);
1281 }
1282 
1283 /*
1284  * Register with the MAC layer as a network device, so we
1285  * can be plumbed if necessary.
1286  */
1287 static int
1288 vsw_mac_register(vsw_t *vswp)
1289 {
1290 	mac_register_t	*macp;
1291 	int		rv;
1292 
1293 	D1(vswp, "%s: enter", __func__);
1294 
1295 	if ((macp = mac_alloc(MAC_VERSION)) == NULL)
1296 		return (EINVAL);
1297 	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1298 	macp->m_driver = vswp;
1299 	macp->m_dip = vswp->dip;
1300 	macp->m_src_addr = (uint8_t *)&vswp->if_addr;
1301 	macp->m_callbacks = &vsw_m_callbacks;
1302 	macp->m_min_sdu = 0;
1303 	macp->m_max_sdu = ETHERMTU;
1304 	rv = mac_register(macp, &vswp->if_mh);
1305 	mac_free(macp);
1306 	if (rv == 0)
1307 		vswp->if_state |= VSW_IF_REG;
1308 
1309 	D1(vswp, "%s: exit", __func__);
1310 
1311 	return (rv);
1312 }
1313 
1314 static int
1315 vsw_mac_unregister(vsw_t *vswp)
1316 {
1317 	int		rv = 0;
1318 
1319 	D1(vswp, "%s: enter", __func__);
1320 
1321 	WRITE_ENTER(&vswp->if_lockrw);
1322 
1323 	if (vswp->if_state & VSW_IF_REG) {
1324 		rv = mac_unregister(vswp->if_mh);
1325 		if (rv != 0) {
1326 			DWARN(vswp, "%s: unable to unregister from MAC "
1327 				"framework", __func__);
1328 
1329 			RW_EXIT(&vswp->if_lockrw);
1330 			D1(vswp, "%s: fail exit", __func__);
1331 			return (rv);
1332 		}
1333 
1334 		/* mark i/f as down and unregistered */
1335 		vswp->if_state &= ~(VSW_IF_UP | VSW_IF_REG);
1336 	}
1337 	RW_EXIT(&vswp->if_lockrw);
1338 
1339 	D1(vswp, "%s: exit", __func__);
1340 
1341 	return (rv);
1342 }
1343 
1344 static int
1345 vsw_m_stat(void *arg, uint_t stat, uint64_t *val)
1346 {
1347 	vsw_t			*vswp = (vsw_t *)arg;
1348 
1349 	D1(vswp, "%s: enter", __func__);
1350 
1351 	if (vswp->mh == NULL)
1352 		return (EINVAL);
1353 
1354 	/* return stats from underlying device */
1355 	*val = mac_stat_get(vswp->mh, stat);
1356 	return (0);
1357 }
1358 
1359 static void
1360 vsw_m_stop(void *arg)
1361 {
1362 	vsw_t		*vswp = (vsw_t *)arg;
1363 
1364 	D1(vswp, "%s: enter", __func__);
1365 
1366 	WRITE_ENTER(&vswp->if_lockrw);
1367 	vswp->if_state &= ~VSW_IF_UP;
1368 	RW_EXIT(&vswp->if_lockrw);
1369 
1370 	D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state);
1371 }
1372 
1373 static int
1374 vsw_m_start(void *arg)
1375 {
1376 	vsw_t		*vswp = (vsw_t *)arg;
1377 
1378 	D1(vswp, "%s: enter", __func__);
1379 
1380 	WRITE_ENTER(&vswp->if_lockrw);
1381 	vswp->if_state |= VSW_IF_UP;
1382 	RW_EXIT(&vswp->if_lockrw);
1383 
1384 	D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state);
1385 	return (0);
1386 }
1387 
1388 /*
1389  * Change the local interface address.
1390  */
1391 static int
1392 vsw_m_unicst(void *arg, const uint8_t *macaddr)
1393 {
1394 	vsw_t		*vswp = (vsw_t *)arg;
1395 
1396 	D1(vswp, "%s: enter", __func__);
1397 
1398 	WRITE_ENTER(&vswp->if_lockrw);
1399 	ether_copy(macaddr, &vswp->if_addr);
1400 	RW_EXIT(&vswp->if_lockrw);
1401 
1402 	D1(vswp, "%s: exit", __func__);
1403 
1404 	return (0);
1405 }
1406 
1407 static int
1408 vsw_m_multicst(void *arg, boolean_t add, const uint8_t *mca)
1409 {
1410 	vsw_t		*vswp = (vsw_t *)arg;
1411 	mcst_addr_t	*mcst_p = NULL;
1412 	uint64_t	addr = 0x0;
1413 	int		i;
1414 
1415 	D1(vswp, "%s: enter", __func__);
1416 
1417 	/*
1418 	 * Convert address into form that can be used
1419 	 * as hash table key.
1420 	 */
1421 	for (i = 0; i < ETHERADDRL; i++) {
1422 		addr = (addr << 8) | mca[i];
1423 	}
1424 
1425 	D2(vswp, "%s: addr = 0x%llx", __func__, addr);
1426 
1427 	if (add) {
1428 		D2(vswp, "%s: adding multicast", __func__);
1429 		if (vsw_add_mcst(vswp, VSW_LOCALDEV, addr, NULL) == 0) {
1430 			/*
1431 			 * Update the list of multicast addresses
1432 			 * contained within the vsw_t structure to
1433 			 * include this new one.
1434 			 */
1435 			mcst_p = kmem_zalloc(sizeof (mcst_addr_t), KM_NOSLEEP);
1436 			if (mcst_p == NULL) {
1437 				DERR(vswp, "%s unable to alloc mem", __func__);
1438 				return (1);
1439 			}
1440 			mcst_p->addr = addr;
1441 
1442 			mutex_enter(&vswp->mca_lock);
1443 			mcst_p->nextp = vswp->mcap;
1444 			vswp->mcap = mcst_p;
1445 			mutex_exit(&vswp->mca_lock);
1446 
1447 			/*
1448 			 * Call into the underlying driver to program the
1449 			 * address into HW.
1450 			 *
1451 			 * Note:
1452 			 * Can safely ignore the return value as the card
1453 			 * will for the moment always be in promisc mode.
1454 			 * When we can program multiple MAC addresses into the
1455 			 * HW then we will need to care about the return
1456 			 * value here.
1457 			 */
1458 			if (vswp->mh != NULL)
1459 				(void) mac_multicst_add(vswp->mh, mca);
1460 		}
1461 	} else {
1462 		D2(vswp, "%s: removing multicast", __func__);
1463 		/*
1464 		 * Remove the address from the hash table..
1465 		 */
1466 		if (vsw_del_mcst(vswp, VSW_LOCALDEV, addr, NULL) == 0) {
1467 
1468 			/*
1469 			 * ..and then from the list maintained in the
1470 			 * vsw_t structure.
1471 			 */
1472 			vsw_del_addr(VSW_LOCALDEV, vswp, addr);
1473 
1474 			if (vswp->mh != NULL)
1475 				(void) mac_multicst_remove(vswp->mh, mca);
1476 		}
1477 	}
1478 
1479 	D1(vswp, "%s: exit", __func__);
1480 
1481 	return (0);
1482 }
1483 
1484 static int
1485 vsw_m_promisc(void *arg, boolean_t on)
1486 {
1487 	vsw_t		*vswp = (vsw_t *)arg;
1488 
1489 	D1(vswp, "%s: enter", __func__);
1490 
1491 	WRITE_ENTER(&vswp->if_lockrw);
1492 	if (on)
1493 		vswp->if_state |= VSW_IF_PROMISC;
1494 	else
1495 		vswp->if_state &= ~VSW_IF_PROMISC;
1496 	RW_EXIT(&vswp->if_lockrw);
1497 
1498 	D1(vswp, "%s: exit", __func__);
1499 
1500 	return (0);
1501 }
1502 
1503 static mblk_t *
1504 vsw_m_tx(void *arg, mblk_t *mp)
1505 {
1506 	vsw_t		*vswp = (vsw_t *)arg;
1507 
1508 	D1(vswp, "%s: enter", __func__);
1509 
1510 	vsw_switch_frame(vswp, mp, VSW_LOCALDEV, NULL, NULL);
1511 
1512 	D1(vswp, "%s: exit", __func__);
1513 
1514 	return (NULL);
1515 }
1516 
1517 /*
1518  * Register for machine description (MD) updates.
1519  */
1520 static void
1521 vsw_mdeg_register(vsw_t *vswp)
1522 {
1523 	mdeg_prop_spec_t	*pspecp;
1524 	mdeg_node_spec_t	*inst_specp;
1525 	mdeg_handle_t		mdeg_hdl;
1526 	size_t			templatesz;
1527 	int			inst, rv;
1528 
1529 	D1(vswp, "%s: enter", __func__);
1530 
1531 	inst = ddi_prop_get_int(DDI_DEV_T_ANY, vswp->dip,
1532 		DDI_PROP_DONTPASS, reg_propname, -1);
1533 	if (inst == -1) {
1534 		DERR(vswp, "%s: unable to get %s property",
1535 						__func__, reg_propname);
1536 		return;
1537 	}
1538 
1539 	D2(vswp, "%s: instance %d registering with mdeg", __func__, inst);
1540 
1541 	/*
1542 	 * Allocate and initialize a per-instance copy
1543 	 * of the global property spec array that will
1544 	 * uniquely identify this vsw instance.
1545 	 */
1546 	templatesz = sizeof (vsw_prop_template);
1547 	pspecp = kmem_zalloc(templatesz, KM_SLEEP);
1548 
1549 	bcopy(vsw_prop_template, pspecp, templatesz);
1550 
1551 	VSW_SET_MDEG_PROP_INST(pspecp, inst);
1552 
1553 	/* initialize the complete prop spec structure */
1554 	inst_specp = kmem_zalloc(sizeof (mdeg_node_spec_t), KM_SLEEP);
1555 	inst_specp->namep = "virtual-device";
1556 	inst_specp->specp = pspecp;
1557 
1558 	/* perform the registration */
1559 	rv = mdeg_register(inst_specp, &vport_match, vsw_mdeg_cb,
1560 	    (void *)vswp, &mdeg_hdl);
1561 
1562 	if (rv != MDEG_SUCCESS) {
1563 		DERR(vswp, "%s: mdeg_register failed (%d)\n", __func__, rv);
1564 		kmem_free(inst_specp, sizeof (mdeg_node_spec_t));
1565 		kmem_free(pspecp, templatesz);
1566 		return;
1567 	}
1568 
1569 	/* save off data that will be needed later */
1570 	vswp->inst_spec = inst_specp;
1571 	vswp->mdeg_hdl = mdeg_hdl;
1572 
1573 	D1(vswp, "%s: exit", __func__);
1574 }
1575 
1576 static void
1577 vsw_mdeg_unregister(vsw_t *vswp)
1578 {
1579 	D1(vswp, "vsw_mdeg_unregister: enter");
1580 
1581 	(void) mdeg_unregister(vswp->mdeg_hdl);
1582 
1583 	if (vswp->inst_spec->specp != NULL) {
1584 		(void) kmem_free(vswp->inst_spec->specp,
1585 			sizeof (vsw_prop_template));
1586 		vswp->inst_spec->specp = NULL;
1587 	}
1588 
1589 	if (vswp->inst_spec != NULL) {
1590 		(void) kmem_free(vswp->inst_spec,
1591 			sizeof (mdeg_node_spec_t));
1592 		vswp->inst_spec = NULL;
1593 	}
1594 
1595 	D1(vswp, "vsw_mdeg_unregister: exit");
1596 }
1597 
1598 static int
1599 vsw_mdeg_cb(void *cb_argp, mdeg_result_t *resp)
1600 {
1601 	vsw_t		*vswp;
1602 	int		idx;
1603 	md_t		*mdp;
1604 	mde_cookie_t	node;
1605 	uint64_t	inst;
1606 
1607 	if (resp == NULL)
1608 		return (MDEG_FAILURE);
1609 
1610 	vswp = (vsw_t *)cb_argp;
1611 
1612 	D1(vswp, "%s: added %d : removed %d : matched %d",
1613 		__func__, resp->added.nelem, resp->removed.nelem,
1614 		resp->match_prev.nelem);
1615 
1616 	/* process added ports */
1617 	for (idx = 0; idx < resp->added.nelem; idx++) {
1618 		mdp = resp->added.mdp;
1619 		node = resp->added.mdep[idx];
1620 
1621 		D2(vswp, "%s: adding node(%d) 0x%lx", __func__, idx, node);
1622 
1623 		if (vsw_port_add(vswp, mdp, &node) != 0) {
1624 			cmn_err(CE_WARN, "Unable to add new port (0x%lx)",
1625 					node);
1626 		}
1627 	}
1628 
1629 	/* process removed ports */
1630 	for (idx = 0; idx < resp->removed.nelem; idx++) {
1631 		mdp = resp->removed.mdp;
1632 		node = resp->removed.mdep[idx];
1633 
1634 		if (md_get_prop_val(mdp, node, id_propname, &inst)) {
1635 			DERR(vswp, "%s: prop(%s) not found port(%d)",
1636 				__func__, id_propname, idx);
1637 			continue;
1638 		}
1639 
1640 		D2(vswp, "%s: removing node(%d) 0x%lx", __func__, idx, node);
1641 
1642 		if (vsw_port_detach(vswp, inst) != 0) {
1643 			cmn_err(CE_WARN, "Unable to remove port %ld", inst);
1644 		}
1645 	}
1646 
1647 	/*
1648 	 * Currently no support for updating already active ports.
1649 	 * So, ignore the match_curr and match_priv arrays for now.
1650 	 */
1651 
1652 	D1(vswp, "%s: exit", __func__);
1653 
1654 	return (MDEG_SUCCESS);
1655 }
1656 
1657 /*
1658  * Add a new port to the system.
1659  *
1660  * Returns 0 on success, 1 on failure.
1661  */
1662 int
1663 vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node)
1664 {
1665 	uint64_t		ldc_id;
1666 	uint8_t			*addrp;
1667 	int			i, addrsz;
1668 	int			num_nodes = 0, nchan = 0;
1669 	int			listsz = 0;
1670 	mde_cookie_t		*listp = NULL;
1671 	struct ether_addr	ea;
1672 	uint64_t		macaddr;
1673 	uint64_t		inst = 0;
1674 	vsw_port_t		*port;
1675 
1676 	if (md_get_prop_val(mdp, *node, id_propname, &inst)) {
1677 		DWARN(vswp, "%s: prop(%s) not found", __func__,
1678 			id_propname);
1679 		return (1);
1680 	}
1681 
1682 	/*
1683 	 * Find the channel endpoint node(s) (which should be under this
1684 	 * port node) which contain the channel id(s).
1685 	 */
1686 	if ((num_nodes = md_node_count(mdp)) <= 0) {
1687 		DERR(vswp, "%s: invalid number of nodes found (%d)",
1688 			__func__, num_nodes);
1689 		return (1);
1690 	}
1691 
1692 	/* allocate enough space for node list */
1693 	listsz = num_nodes * sizeof (mde_cookie_t);
1694 	listp = kmem_zalloc(listsz, KM_SLEEP);
1695 
1696 	nchan = md_scan_dag(mdp, *node,
1697 		md_find_name(mdp, chan_propname),
1698 		md_find_name(mdp, "fwd"), listp);
1699 
1700 	if (nchan <= 0) {
1701 		DWARN(vswp, "%s: no %s nodes found", __func__, chan_propname);
1702 		kmem_free(listp, listsz);
1703 		return (1);
1704 	}
1705 
1706 	D2(vswp, "%s: %d %s nodes found", __func__, nchan, chan_propname);
1707 
1708 	/* use property from first node found */
1709 	if (md_get_prop_val(mdp, listp[0], id_propname, &ldc_id)) {
1710 		DWARN(vswp, "%s: prop(%s) not found\n", __func__,
1711 			id_propname);
1712 		kmem_free(listp, listsz);
1713 		return (1);
1714 	}
1715 
1716 	/* don't need list any more */
1717 	kmem_free(listp, listsz);
1718 
1719 	D2(vswp, "%s: ldc_id 0x%llx", __func__, ldc_id);
1720 
1721 	/* read mac-address property */
1722 	if (md_get_prop_data(mdp, *node, remaddr_propname,
1723 					&addrp, &addrsz)) {
1724 		DWARN(vswp, "%s: prop(%s) not found",
1725 				__func__, remaddr_propname);
1726 		return (1);
1727 	}
1728 
1729 	if (addrsz < ETHERADDRL) {
1730 		DWARN(vswp, "%s: invalid address size", __func__);
1731 		return (1);
1732 	}
1733 
1734 	macaddr = *((uint64_t *)addrp);
1735 	D2(vswp, "%s: remote mac address 0x%llx", __func__, macaddr);
1736 
1737 	for (i = ETHERADDRL - 1; i >= 0; i--) {
1738 		ea.ether_addr_octet[i] = macaddr & 0xFF;
1739 		macaddr >>= 8;
1740 	}
1741 
1742 	if (vsw_port_attach(vswp, (int)inst, &ldc_id, 1, &ea) != 0) {
1743 		DERR(vswp, "%s: failed to attach port", __func__);
1744 		return (1);
1745 	}
1746 
1747 	port = vsw_lookup_port(vswp, (int)inst);
1748 
1749 	/* just successfuly created the port, so it should exist */
1750 	ASSERT(port != NULL);
1751 
1752 	return (0);
1753 }
1754 
1755 /*
1756  * Attach the specified port.
1757  *
1758  * Returns 0 on success, 1 on failure.
1759  */
1760 static int
1761 vsw_port_attach(vsw_t *vswp, int p_instance, uint64_t *ldcids, int nids,
1762 struct ether_addr *macaddr)
1763 {
1764 	vsw_port_list_t		*plist = &vswp->plist;
1765 	vsw_port_t		*port, **prev_port;
1766 	int			i;
1767 
1768 	D1(vswp, "%s: enter : port %d", __func__, p_instance);
1769 
1770 	/* port already exists? */
1771 	READ_ENTER(&plist->lockrw);
1772 	for (port = plist->head; port != NULL; port = port->p_next) {
1773 		if (port->p_instance == p_instance) {
1774 			DWARN(vswp, "%s: port instance %d already attached",
1775 				__func__, p_instance);
1776 			RW_EXIT(&plist->lockrw);
1777 			return (1);
1778 		}
1779 	}
1780 	RW_EXIT(&plist->lockrw);
1781 
1782 	port = kmem_zalloc(sizeof (vsw_port_t), KM_SLEEP);
1783 	port->p_vswp = vswp;
1784 	port->p_instance = p_instance;
1785 	port->p_ldclist.num_ldcs = 0;
1786 	port->p_ldclist.head = NULL;
1787 
1788 	rw_init(&port->p_ldclist.lockrw, NULL, RW_DRIVER, NULL);
1789 
1790 	mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL);
1791 	mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL);
1792 
1793 	mutex_init(&port->ref_lock, NULL, MUTEX_DRIVER, NULL);
1794 	cv_init(&port->ref_cv, NULL, CV_DRIVER, NULL);
1795 
1796 	mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL);
1797 	cv_init(&port->state_cv, NULL, CV_DRIVER, NULL);
1798 	port->state = VSW_PORT_INIT;
1799 
1800 	if (nids > VSW_PORT_MAX_LDCS) {
1801 		D2(vswp, "%s: using first of %d ldc ids",
1802 			__func__, nids);
1803 		nids = VSW_PORT_MAX_LDCS;
1804 	}
1805 
1806 	D2(vswp, "%s: %d nids", __func__, nids);
1807 	for (i = 0; i < nids; i++) {
1808 		D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[i]);
1809 		if (vsw_ldc_attach(port, (uint64_t)ldcids[i]) != 0) {
1810 			DERR(vswp, "%s: ldc_attach failed", __func__);
1811 
1812 			rw_destroy(&port->p_ldclist.lockrw);
1813 
1814 			cv_destroy(&port->ref_cv);
1815 			mutex_destroy(&port->ref_lock);
1816 
1817 			cv_destroy(&port->state_cv);
1818 			mutex_destroy(&port->state_lock);
1819 
1820 			mutex_destroy(&port->tx_lock);
1821 			mutex_destroy(&port->mca_lock);
1822 			kmem_free(port, sizeof (vsw_port_t));
1823 			return (1);
1824 		}
1825 	}
1826 
1827 	ether_copy(macaddr, &port->p_macaddr);
1828 
1829 	WRITE_ENTER(&plist->lockrw);
1830 
1831 	/* create the fdb entry for this port/mac address */
1832 	(void) vsw_add_fdb(vswp, port);
1833 
1834 	/* link it into the list of ports for this vsw instance */
1835 	prev_port = (vsw_port_t **)(&plist->head);
1836 	port->p_next = *prev_port;
1837 	*prev_port = port;
1838 	plist->num_ports++;
1839 	RW_EXIT(&plist->lockrw);
1840 
1841 	/*
1842 	 * Initialise the port and any ldc's under it.
1843 	 */
1844 	(void) vsw_init_ldcs(port);
1845 
1846 	D1(vswp, "%s: exit", __func__);
1847 	return (0);
1848 }
1849 
1850 /*
1851  * Detach the specified port.
1852  *
1853  * Returns 0 on success, 1 on failure.
1854  */
1855 static int
1856 vsw_port_detach(vsw_t *vswp, int p_instance)
1857 {
1858 	vsw_port_t	*port = NULL;
1859 	vsw_port_list_t	*plist = &vswp->plist;
1860 
1861 	D1(vswp, "%s: enter: port id %d", __func__, p_instance);
1862 
1863 	WRITE_ENTER(&plist->lockrw);
1864 
1865 	if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) {
1866 		RW_EXIT(&plist->lockrw);
1867 		return (1);
1868 	}
1869 
1870 	if (vsw_plist_del_node(vswp, port)) {
1871 		RW_EXIT(&plist->lockrw);
1872 		return (1);
1873 	}
1874 
1875 	/* Remove the fdb entry for this port/mac address */
1876 	(void) vsw_del_fdb(vswp, port);
1877 
1878 	/* Remove any multicast addresses.. */
1879 	vsw_del_mcst_port(port);
1880 
1881 	/*
1882 	 * No longer need to hold lock on port list now that we
1883 	 * have unlinked the target port from the list.
1884 	 */
1885 	RW_EXIT(&plist->lockrw);
1886 
1887 	if (vsw_port_delete(port)) {
1888 		return (1);
1889 	}
1890 
1891 	D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance);
1892 	return (0);
1893 }
1894 
1895 /*
1896  * Detach all active ports.
1897  *
1898  * Returns 0 on success, 1 on failure.
1899  */
1900 static int
1901 vsw_detach_ports(vsw_t *vswp)
1902 {
1903 	vsw_port_list_t 	*plist = &vswp->plist;
1904 	vsw_port_t		*port = NULL;
1905 
1906 	D1(vswp, "%s: enter", __func__);
1907 
1908 	WRITE_ENTER(&plist->lockrw);
1909 
1910 	while ((port = plist->head) != NULL) {
1911 		if (vsw_plist_del_node(vswp, port)) {
1912 			DERR(vswp, "%s: Error deleting port %d"
1913 				" from port list", __func__,
1914 				port->p_instance);
1915 			RW_EXIT(&plist->lockrw);
1916 			return (1);
1917 		}
1918 
1919 		/* Remove the fdb entry for this port/mac address */
1920 		(void) vsw_del_fdb(vswp, port);
1921 
1922 		/* Remove any multicast addresses.. */
1923 		vsw_del_mcst_port(port);
1924 
1925 		/*
1926 		 * No longer need to hold the lock on the port list
1927 		 * now that we have unlinked the target port from the
1928 		 * list.
1929 		 */
1930 		RW_EXIT(&plist->lockrw);
1931 		if (vsw_port_delete(port)) {
1932 			DERR(vswp, "%s: Error deleting port %d",
1933 				__func__, port->p_instance);
1934 			return (1);
1935 		}
1936 		WRITE_ENTER(&plist->lockrw);
1937 	}
1938 	RW_EXIT(&plist->lockrw);
1939 
1940 	D1(vswp, "%s: exit", __func__);
1941 
1942 	return (0);
1943 }
1944 
1945 /*
1946  * Delete the specified port.
1947  *
1948  * Returns 0 on success, 1 on failure.
1949  */
1950 static int
1951 vsw_port_delete(vsw_port_t *port)
1952 {
1953 	vsw_ldc_list_t 		*ldcl;
1954 	vsw_t			*vswp = port->p_vswp;
1955 
1956 	D1(vswp, "%s: enter : port id %d", __func__, port->p_instance);
1957 
1958 	(void) vsw_uninit_ldcs(port);
1959 
1960 	/*
1961 	 * Wait for any pending ctrl msg tasks which reference this
1962 	 * port to finish.
1963 	 */
1964 	if (vsw_drain_port_taskq(port))
1965 		return (1);
1966 
1967 	/*
1968 	 * Wait for port reference count to hit zero.
1969 	 */
1970 	mutex_enter(&port->ref_lock);
1971 	while (port->ref_cnt != 0)
1972 		cv_wait(&port->ref_cv, &port->ref_lock);
1973 	mutex_exit(&port->ref_lock);
1974 
1975 	/*
1976 	 * Wait for any active callbacks to finish
1977 	 */
1978 	if (vsw_drain_ldcs(port))
1979 		return (1);
1980 
1981 	ldcl = &port->p_ldclist;
1982 	WRITE_ENTER(&ldcl->lockrw);
1983 	while (ldcl->num_ldcs > 0) {
1984 		if (vsw_ldc_detach(port, ldcl->head->ldc_id) != 0) {;
1985 			cmn_err(CE_WARN, "unable to detach ldc %ld",
1986 					ldcl->head->ldc_id);
1987 			RW_EXIT(&ldcl->lockrw);
1988 			return (1);
1989 		}
1990 	}
1991 	RW_EXIT(&ldcl->lockrw);
1992 
1993 	rw_destroy(&port->p_ldclist.lockrw);
1994 
1995 	mutex_destroy(&port->mca_lock);
1996 	mutex_destroy(&port->tx_lock);
1997 	cv_destroy(&port->ref_cv);
1998 	mutex_destroy(&port->ref_lock);
1999 
2000 	cv_destroy(&port->state_cv);
2001 	mutex_destroy(&port->state_lock);
2002 
2003 	kmem_free(port, sizeof (vsw_port_t));
2004 
2005 	D1(vswp, "%s: exit", __func__);
2006 
2007 	return (0);
2008 }
2009 
2010 /*
2011  * Attach a logical domain channel (ldc) under a specified port.
2012  *
2013  * Returns 0 on success, 1 on failure.
2014  */
2015 static int
2016 vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id)
2017 {
2018 	vsw_t 		*vswp = port->p_vswp;
2019 	vsw_ldc_list_t *ldcl = &port->p_ldclist;
2020 	vsw_ldc_t 	*ldcp = NULL;
2021 	ldc_attr_t 	attr;
2022 	ldc_status_t	istatus;
2023 	int 		status = DDI_FAILURE;
2024 
2025 	D1(vswp, "%s: enter", __func__);
2026 
2027 	ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP);
2028 	if (ldcp == NULL) {
2029 		DERR(vswp, "%s: kmem_zalloc failed", __func__);
2030 		return (1);
2031 	}
2032 	ldcp->ldc_id = ldc_id;
2033 
2034 	mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL);
2035 	mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL);
2036 	mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL);
2037 	cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL);
2038 
2039 	/* required for handshake with peer */
2040 	ldcp->local_session = (uint64_t)ddi_get_lbolt();
2041 	ldcp->peer_session = 0;
2042 	ldcp->session_status = 0;
2043 
2044 	mutex_init(&ldcp->hss_lock, NULL, MUTEX_DRIVER, NULL);
2045 	ldcp->hss_id = 1;	/* Initial handshake session id */
2046 
2047 	/* only set for outbound lane, inbound set by peer */
2048 	vsw_set_lane_attr(vswp, &ldcp->lane_out);
2049 
2050 	attr.devclass = LDC_DEV_NT_SVC;
2051 	attr.instance = ddi_get_instance(vswp->dip);
2052 	attr.mode = LDC_MODE_UNRELIABLE;
2053 	attr.qlen = VSW_LDC_QLEN;
2054 	status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle);
2055 	if (status != 0) {
2056 		DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)",
2057 		    __func__, ldc_id, status);
2058 		mutex_destroy(&ldcp->ldc_txlock);
2059 		mutex_destroy(&ldcp->ldc_cblock);
2060 		cv_destroy(&ldcp->drain_cv);
2061 		mutex_destroy(&ldcp->drain_cv_lock);
2062 		mutex_destroy(&ldcp->hss_lock);
2063 		kmem_free(ldcp, sizeof (vsw_ldc_t));
2064 		return (1);
2065 	}
2066 
2067 	status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp);
2068 	if (status != 0) {
2069 		DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)",
2070 		    __func__, ldc_id, status);
2071 		mutex_destroy(&ldcp->ldc_txlock);
2072 		mutex_destroy(&ldcp->ldc_cblock);
2073 		cv_destroy(&ldcp->drain_cv);
2074 		mutex_destroy(&ldcp->drain_cv_lock);
2075 		mutex_destroy(&ldcp->hss_lock);
2076 		(void) ldc_fini(ldcp->ldc_handle);
2077 		kmem_free(ldcp, sizeof (vsw_ldc_t));
2078 		return (1);
2079 	}
2080 
2081 
2082 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
2083 		DERR(vswp, "%s: ldc_status failed", __func__);
2084 		return (1);
2085 	}
2086 
2087 	ldcp->ldc_status = istatus;
2088 	ldcp->ldc_port = port;
2089 	ldcp->ldc_vswp = vswp;
2090 
2091 	/* link it into the list of channels for this port */
2092 	WRITE_ENTER(&ldcl->lockrw);
2093 	ldcp->ldc_next = ldcl->head;
2094 	ldcl->head = ldcp;
2095 	ldcl->num_ldcs++;
2096 	RW_EXIT(&ldcl->lockrw);
2097 
2098 	D1(vswp, "%s: exit", __func__);
2099 	return (0);
2100 }
2101 
2102 /*
2103  * Detach a logical domain channel (ldc) belonging to a
2104  * particular port.
2105  *
2106  * Returns 0 on success, 1 on failure.
2107  */
2108 static int
2109 vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id)
2110 {
2111 	vsw_t 		*vswp = port->p_vswp;
2112 	vsw_ldc_t 	*ldcp, *prev_ldcp;
2113 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
2114 	int 		rv;
2115 
2116 	prev_ldcp = ldcl->head;
2117 	for (; (ldcp = prev_ldcp) != NULL; prev_ldcp = ldcp->ldc_next) {
2118 		if (ldcp->ldc_id == ldc_id) {
2119 			break;
2120 		}
2121 	}
2122 
2123 	/* specified ldc id not found */
2124 	if (ldcp == NULL) {
2125 		DERR(vswp, "%s: ldcp = NULL", __func__);
2126 		return (1);
2127 	}
2128 
2129 	D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id);
2130 
2131 	/*
2132 	 * Before we can close the channel we must release any mapped
2133 	 * resources (e.g. drings).
2134 	 */
2135 	vsw_free_lane_resources(ldcp, INBOUND);
2136 	vsw_free_lane_resources(ldcp, OUTBOUND);
2137 
2138 	/*
2139 	 * If the close fails we are in serious trouble, as won't
2140 	 * be able to delete the parent port.
2141 	 */
2142 	if ((rv = ldc_close(ldcp->ldc_handle)) != 0) {
2143 		DERR(vswp, "%s: error %d closing channel %lld",
2144 			__func__, rv, ldcp->ldc_id);
2145 		return (1);
2146 	}
2147 
2148 	(void) ldc_fini(ldcp->ldc_handle);
2149 
2150 	ldcp->ldc_status = LDC_INIT;
2151 	ldcp->ldc_handle = NULL;
2152 	ldcp->ldc_vswp = NULL;
2153 	mutex_destroy(&ldcp->ldc_txlock);
2154 	mutex_destroy(&ldcp->ldc_cblock);
2155 	cv_destroy(&ldcp->drain_cv);
2156 	mutex_destroy(&ldcp->drain_cv_lock);
2157 	mutex_destroy(&ldcp->hss_lock);
2158 
2159 	/* unlink it from the list */
2160 	prev_ldcp = ldcp->ldc_next;
2161 	ldcl->num_ldcs--;
2162 	kmem_free(ldcp, sizeof (vsw_ldc_t));
2163 
2164 	return (0);
2165 }
2166 
2167 /*
2168  * Open and attempt to bring up the channel. Note that channel
2169  * can only be brought up if peer has also opened channel.
2170  *
2171  * Returns 0 if can open and bring up channel, otherwise
2172  * returns 1.
2173  */
2174 static int
2175 vsw_ldc_init(vsw_ldc_t *ldcp)
2176 {
2177 	vsw_t 		*vswp = ldcp->ldc_vswp;
2178 	ldc_status_t	istatus = 0;
2179 	int		rv;
2180 
2181 	D1(vswp, "%s: enter", __func__);
2182 
2183 	LDC_ENTER_LOCK(ldcp);
2184 
2185 	/* don't start at 0 in case clients don't like that */
2186 	ldcp->next_ident = 1;
2187 
2188 	rv = ldc_open(ldcp->ldc_handle);
2189 	if (rv != 0) {
2190 		DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)",
2191 		    __func__, ldcp->ldc_id, rv);
2192 		LDC_EXIT_LOCK(ldcp);
2193 		return (1);
2194 	}
2195 
2196 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
2197 		DERR(vswp, "%s: unable to get status", __func__);
2198 		LDC_EXIT_LOCK(ldcp);
2199 		return (1);
2200 
2201 	} else if (istatus != LDC_OPEN && istatus != LDC_READY) {
2202 		DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY",
2203 		    __func__, ldcp->ldc_id, istatus);
2204 		LDC_EXIT_LOCK(ldcp);
2205 		return (1);
2206 	}
2207 
2208 	ldcp->ldc_status = istatus;
2209 	rv = ldc_up(ldcp->ldc_handle);
2210 	if (rv != 0) {
2211 		/*
2212 		 * Not a fatal error for ldc_up() to fail, as peer
2213 		 * end point may simply not be ready yet.
2214 		 */
2215 		D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__,
2216 			ldcp->ldc_id, rv);
2217 		LDC_EXIT_LOCK(ldcp);
2218 		return (1);
2219 	}
2220 
2221 	/*
2222 	 * ldc_up() call is non-blocking so need to explicitly
2223 	 * check channel status to see if in fact the channel
2224 	 * is UP.
2225 	 */
2226 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
2227 		DERR(vswp, "%s: unable to get status", __func__);
2228 		LDC_EXIT_LOCK(ldcp);
2229 		return (1);
2230 
2231 	} else if (istatus != LDC_UP) {
2232 		DERR(vswp, "%s: id(%lld) status(%d) is not UP",
2233 		    __func__, ldcp->ldc_id, istatus);
2234 	} else {
2235 		ldcp->ldc_status = istatus;
2236 	}
2237 
2238 	LDC_EXIT_LOCK(ldcp);
2239 
2240 	D1(vswp, "%s: exit", __func__);
2241 	return (0);
2242 }
2243 
2244 /* disable callbacks on the channel */
2245 static int
2246 vsw_ldc_uninit(vsw_ldc_t *ldcp)
2247 {
2248 	vsw_t	*vswp = ldcp->ldc_vswp;
2249 	int	rv;
2250 
2251 	D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id);
2252 
2253 	LDC_ENTER_LOCK(ldcp);
2254 
2255 	rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE);
2256 	if (rv != 0) {
2257 		DERR(vswp, "vsw_ldc_uninit(%lld): error disabling "
2258 			"interrupts (rv = %d)\n", ldcp->ldc_id, rv);
2259 		LDC_EXIT_LOCK(ldcp);
2260 		return (1);
2261 	}
2262 
2263 	ldcp->ldc_status = LDC_INIT;
2264 
2265 	LDC_EXIT_LOCK(ldcp);
2266 
2267 	D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id);
2268 
2269 	return (0);
2270 }
2271 
2272 static int
2273 vsw_init_ldcs(vsw_port_t *port)
2274 {
2275 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
2276 	vsw_ldc_t	*ldcp;
2277 
2278 	READ_ENTER(&ldcl->lockrw);
2279 	ldcp =  ldcl->head;
2280 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
2281 		(void) vsw_ldc_init(ldcp);
2282 	}
2283 	RW_EXIT(&ldcl->lockrw);
2284 
2285 	return (0);
2286 }
2287 
2288 static int
2289 vsw_uninit_ldcs(vsw_port_t *port)
2290 {
2291 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
2292 	vsw_ldc_t	*ldcp;
2293 
2294 	D1(NULL, "vsw_uninit_ldcs: enter\n");
2295 
2296 	READ_ENTER(&ldcl->lockrw);
2297 	ldcp =  ldcl->head;
2298 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
2299 		(void) vsw_ldc_uninit(ldcp);
2300 	}
2301 	RW_EXIT(&ldcl->lockrw);
2302 
2303 	D1(NULL, "vsw_uninit_ldcs: exit\n");
2304 
2305 	return (0);
2306 }
2307 
2308 /*
2309  * Wait until the callback(s) associated with the ldcs under the specified
2310  * port have completed.
2311  *
2312  * Prior to this function being invoked each channel under this port
2313  * should have been quiesced via ldc_set_cb_mode(DISABLE).
2314  *
2315  * A short explaination of what we are doing below..
2316  *
2317  * The simplest approach would be to have a reference counter in
2318  * the ldc structure which is increment/decremented by the callbacks as
2319  * they use the channel. The drain function could then simply disable any
2320  * further callbacks and do a cv_wait for the ref to hit zero. Unfortunately
2321  * there is a tiny window here - before the callback is able to get the lock
2322  * on the channel it is interrupted and this function gets to execute. It
2323  * sees that the ref count is zero and believes its free to delete the
2324  * associated data structures.
2325  *
2326  * We get around this by taking advantage of the fact that before the ldc
2327  * framework invokes a callback it sets a flag to indicate that there is a
2328  * callback active (or about to become active). If when we attempt to
2329  * unregister a callback when this active flag is set then the unregister
2330  * will fail with EWOULDBLOCK.
2331  *
2332  * If the unregister fails we do a cv_timedwait. We will either be signaled
2333  * by the callback as it is exiting (note we have to wait a short period to
2334  * allow the callback to return fully to the ldc framework and it to clear
2335  * the active flag), or by the timer expiring. In either case we again attempt
2336  * the unregister. We repeat this until we can succesfully unregister the
2337  * callback.
2338  *
2339  * The reason we use a cv_timedwait rather than a simple cv_wait is to catch
2340  * the case where the callback has finished but the ldc framework has not yet
2341  * cleared the active flag. In this case we would never get a cv_signal.
2342  */
2343 static int
2344 vsw_drain_ldcs(vsw_port_t *port)
2345 {
2346 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
2347 	vsw_ldc_t	*ldcp;
2348 	vsw_t		*vswp = port->p_vswp;
2349 
2350 	D1(vswp, "%s: enter", __func__);
2351 
2352 	READ_ENTER(&ldcl->lockrw);
2353 
2354 	ldcp = ldcl->head;
2355 
2356 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
2357 		/*
2358 		 * If we can unregister the channel callback then we
2359 		 * know that there is no callback either running or
2360 		 * scheduled to run for this channel so move on to next
2361 		 * channel in the list.
2362 		 */
2363 		mutex_enter(&ldcp->drain_cv_lock);
2364 
2365 		/* prompt active callbacks to quit */
2366 		ldcp->drain_state = VSW_LDC_DRAINING;
2367 
2368 		if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) {
2369 			D2(vswp, "%s: unreg callback for chan %ld", __func__,
2370 				ldcp->ldc_id);
2371 			mutex_exit(&ldcp->drain_cv_lock);
2372 			continue;
2373 		} else {
2374 			/*
2375 			 * If we end up here we know that either 1) a callback
2376 			 * is currently executing, 2) is about to start (i.e.
2377 			 * the ldc framework has set the active flag but
2378 			 * has not actually invoked the callback yet, or 3)
2379 			 * has finished and has returned to the ldc framework
2380 			 * but the ldc framework has not yet cleared the
2381 			 * active bit.
2382 			 *
2383 			 * Wait for it to finish.
2384 			 */
2385 			while (ldc_unreg_callback(ldcp->ldc_handle)
2386 								== EWOULDBLOCK)
2387 				(void) cv_timedwait(&ldcp->drain_cv,
2388 					&ldcp->drain_cv_lock, lbolt + hz);
2389 
2390 			mutex_exit(&ldcp->drain_cv_lock);
2391 			D2(vswp, "%s: unreg callback for chan %ld after "
2392 				"timeout", __func__, ldcp->ldc_id);
2393 		}
2394 	}
2395 	RW_EXIT(&ldcl->lockrw);
2396 
2397 	D1(vswp, "%s: exit", __func__);
2398 	return (0);
2399 }
2400 
2401 /*
2402  * Wait until all tasks which reference this port have completed.
2403  *
2404  * Prior to this function being invoked each channel under this port
2405  * should have been quiesced via ldc_set_cb_mode(DISABLE).
2406  */
2407 static int
2408 vsw_drain_port_taskq(vsw_port_t *port)
2409 {
2410 	vsw_t		*vswp = port->p_vswp;
2411 
2412 	D1(vswp, "%s: enter", __func__);
2413 
2414 	/*
2415 	 * Mark the port as in the process of being detached, and
2416 	 * dispatch a marker task to the queue so we know when all
2417 	 * relevant tasks have completed.
2418 	 */
2419 	mutex_enter(&port->state_lock);
2420 	port->state = VSW_PORT_DETACHING;
2421 
2422 	if ((vswp->taskq_p == NULL) ||
2423 		(ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task,
2424 			port, DDI_NOSLEEP) != DDI_SUCCESS)) {
2425 		DERR(vswp, "%s: unable to dispatch marker task",
2426 			__func__);
2427 		mutex_exit(&port->state_lock);
2428 		return (1);
2429 	}
2430 
2431 	/*
2432 	 * Wait for the marker task to finish.
2433 	 */
2434 	while (port->state != VSW_PORT_DETACHABLE)
2435 		cv_wait(&port->state_cv, &port->state_lock);
2436 
2437 	mutex_exit(&port->state_lock);
2438 
2439 	D1(vswp, "%s: exit", __func__);
2440 
2441 	return (0);
2442 }
2443 
2444 static void
2445 vsw_marker_task(void *arg)
2446 {
2447 	vsw_port_t	*port = arg;
2448 	vsw_t		*vswp = port->p_vswp;
2449 
2450 	D1(vswp, "%s: enter", __func__);
2451 
2452 	mutex_enter(&port->state_lock);
2453 
2454 	/*
2455 	 * No further tasks should be dispatched which reference
2456 	 * this port so ok to mark it as safe to detach.
2457 	 */
2458 	port->state = VSW_PORT_DETACHABLE;
2459 
2460 	cv_signal(&port->state_cv);
2461 
2462 	mutex_exit(&port->state_lock);
2463 
2464 	D1(vswp, "%s: exit", __func__);
2465 }
2466 
2467 static vsw_port_t *
2468 vsw_lookup_port(vsw_t *vswp, int p_instance)
2469 {
2470 	vsw_port_list_t *plist = &vswp->plist;
2471 	vsw_port_t	*port;
2472 
2473 	for (port = plist->head; port != NULL; port = port->p_next) {
2474 		if (port->p_instance == p_instance) {
2475 			D2(vswp, "vsw_lookup_port: found p_instance\n");
2476 			return (port);
2477 		}
2478 	}
2479 
2480 	return (NULL);
2481 }
2482 
2483 /*
2484  * Search for and remove the specified port from the port
2485  * list. Returns 0 if able to locate and remove port, otherwise
2486  * returns 1.
2487  */
2488 static int
2489 vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port)
2490 {
2491 	vsw_port_list_t *plist = &vswp->plist;
2492 	vsw_port_t	*curr_p, *prev_p;
2493 
2494 	if (plist->head == NULL)
2495 		return (1);
2496 
2497 	curr_p = prev_p = plist->head;
2498 
2499 	while (curr_p != NULL) {
2500 		if (curr_p == port) {
2501 			if (prev_p == curr_p) {
2502 				plist->head = curr_p->p_next;
2503 			} else {
2504 				prev_p->p_next = curr_p->p_next;
2505 			}
2506 			plist->num_ports--;
2507 			break;
2508 		} else {
2509 			prev_p = curr_p;
2510 			curr_p = curr_p->p_next;
2511 		}
2512 	}
2513 	return (0);
2514 }
2515 
2516 /*
2517  * Interrupt handler for ldc messages.
2518  */
2519 static uint_t
2520 vsw_ldc_cb(uint64_t event, caddr_t arg)
2521 {
2522 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
2523 	vsw_t 		*vswp = ldcp->ldc_vswp;
2524 	ldc_status_t	lstatus;
2525 	int		rv;
2526 
2527 	D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
2528 
2529 	mutex_enter(&ldcp->ldc_cblock);
2530 
2531 	if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) {
2532 		mutex_exit(&ldcp->ldc_cblock);
2533 		return (LDC_SUCCESS);
2534 	}
2535 
2536 	if (event & LDC_EVT_UP) {
2537 		/*
2538 		 * Channel has come up, get the state and then start
2539 		 * the handshake.
2540 		 */
2541 		rv = ldc_status(ldcp->ldc_handle, &lstatus);
2542 		if (rv != 0) {
2543 			cmn_err(CE_WARN, "Unable to read channel state");
2544 		}
2545 		ldcp->ldc_status = lstatus;
2546 
2547 		D2(vswp, "%s: id(%ld) event(%llx) UP:  status(%ld)",
2548 			__func__, ldcp->ldc_id, event, ldcp->ldc_status);
2549 
2550 		vsw_restart_handshake(ldcp);
2551 
2552 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
2553 	}
2554 
2555 	if (event & LDC_EVT_READ) {
2556 		/*
2557 		 * Data available for reading.
2558 		 */
2559 		D2(vswp, "%s: id(ld) event(%llx) data READ",
2560 				__func__, ldcp->ldc_id, event);
2561 
2562 		vsw_process_pkt(ldcp);
2563 
2564 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
2565 
2566 		goto vsw_cb_exit;
2567 	}
2568 
2569 	if (event & LDC_EVT_RESET) {
2570 		rv = ldc_status(ldcp->ldc_handle, &lstatus);
2571 		if (rv != 0) {
2572 			cmn_err(CE_WARN, "Unable to read channel state");
2573 		} else {
2574 			ldcp->ldc_status = lstatus;
2575 		}
2576 		D2(vswp, "%s: id(%ld) event(%llx) RESET:  status (%ld)",
2577 			__func__, ldcp->ldc_id, event, ldcp->ldc_status);
2578 	}
2579 
2580 	if (event & LDC_EVT_DOWN) {
2581 		rv = ldc_status(ldcp->ldc_handle, &lstatus);
2582 		if (rv != 0) {
2583 			cmn_err(CE_WARN, "Unable to read channel state");
2584 		} else {
2585 			ldcp->ldc_status = lstatus;
2586 		}
2587 
2588 		D2(vswp, "%s: id(%ld) event(%llx) DOWN:  status (%ld)",
2589 			__func__, ldcp->ldc_id, event, ldcp->ldc_status);
2590 
2591 	}
2592 
2593 	/*
2594 	 * Catch either LDC_EVT_WRITE which we don't support or any
2595 	 * unknown event.
2596 	 */
2597 	if (event & ~(LDC_EVT_UP | LDC_EVT_RESET
2598 					| LDC_EVT_DOWN | LDC_EVT_READ)) {
2599 
2600 		DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)",
2601 			__func__, ldcp->ldc_id, event, ldcp->ldc_status);
2602 	}
2603 
2604 vsw_cb_exit:
2605 	mutex_exit(&ldcp->ldc_cblock);
2606 
2607 	/*
2608 	 * Let the drain function know we are finishing if it
2609 	 * is waiting.
2610 	 */
2611 	mutex_enter(&ldcp->drain_cv_lock);
2612 	if (ldcp->drain_state == VSW_LDC_DRAINING)
2613 		cv_signal(&ldcp->drain_cv);
2614 	mutex_exit(&ldcp->drain_cv_lock);
2615 
2616 	return (LDC_SUCCESS);
2617 }
2618 
2619 /*
2620  * (Re)start a handshake with our peer by sending them
2621  * our version info.
2622  */
2623 static void
2624 vsw_restart_handshake(vsw_ldc_t *ldcp)
2625 {
2626 	vsw_t		*vswp = ldcp->ldc_vswp;
2627 	vsw_port_t	*port;
2628 	vsw_ldc_list_t	*ldcl;
2629 
2630 	D1(vswp, "vsw_restart_handshake: enter");
2631 
2632 	port = ldcp->ldc_port;
2633 	ldcl = &port->p_ldclist;
2634 
2635 	WRITE_ENTER(&ldcl->lockrw);
2636 
2637 	D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__,
2638 		ldcp->lane_in.lstate, ldcp->lane_out.lstate);
2639 
2640 	vsw_free_lane_resources(ldcp, INBOUND);
2641 	vsw_free_lane_resources(ldcp, OUTBOUND);
2642 	RW_EXIT(&ldcl->lockrw);
2643 
2644 	ldcp->lane_in.lstate = 0;
2645 	ldcp->lane_out.lstate = 0;
2646 
2647 	/*
2648 	 * Remove parent port from any multicast groups
2649 	 * it may have registered with. Client must resend
2650 	 * multicast add command after handshake completes.
2651 	 */
2652 	(void) vsw_del_fdb(vswp, port);
2653 
2654 	vsw_del_mcst_port(port);
2655 
2656 	ldcp->hphase = VSW_MILESTONE0;
2657 
2658 	ldcp->peer_session = 0;
2659 	ldcp->session_status = 0;
2660 
2661 	/*
2662 	 * We now increment the transaction group id. This allows
2663 	 * us to identify and disard any tasks which are still pending
2664 	 * on the taskq and refer to the handshake session we are about
2665 	 * to restart. These stale messages no longer have any real
2666 	 * meaning.
2667 	 */
2668 	mutex_enter(&ldcp->hss_lock);
2669 	ldcp->hss_id++;
2670 	mutex_exit(&ldcp->hss_lock);
2671 
2672 	if (ldcp->hcnt++ > vsw_num_handshakes) {
2673 		cmn_err(CE_WARN, "exceeded number of permitted "
2674 			"handshake attempts (%d) on channel %ld",
2675 			ldcp->hcnt, ldcp->ldc_id);
2676 		return;
2677 	}
2678 
2679 	vsw_send_ver(ldcp);
2680 
2681 	D1(vswp, "vsw_restart_handshake: exit");
2682 }
2683 
2684 /*
2685  * returns 0 if legal for event signified by flag to have
2686  * occured at the time it did. Otherwise returns 1.
2687  */
2688 int
2689 vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag)
2690 {
2691 	vsw_t		*vswp = ldcp->ldc_vswp;
2692 	uint64_t	state;
2693 	uint64_t	phase;
2694 
2695 	if (dir == INBOUND)
2696 		state = ldcp->lane_in.lstate;
2697 	else
2698 		state = ldcp->lane_out.lstate;
2699 
2700 	phase = ldcp->hphase;
2701 
2702 	switch (flag) {
2703 	case VSW_VER_INFO_RECV:
2704 		if (phase > VSW_MILESTONE0) {
2705 			DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV"
2706 				" when in state %d\n", ldcp->ldc_id, phase);
2707 			vsw_restart_handshake(ldcp);
2708 			return (1);
2709 		}
2710 		break;
2711 
2712 	case VSW_VER_ACK_RECV:
2713 	case VSW_VER_NACK_RECV:
2714 		if (!(state & VSW_VER_INFO_SENT)) {
2715 			DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK"
2716 				" or VER_NACK when in state %d\n",
2717 				ldcp->ldc_id, phase);
2718 			vsw_restart_handshake(ldcp);
2719 			return (1);
2720 		} else
2721 			state &= ~VSW_VER_INFO_SENT;
2722 		break;
2723 
2724 	case VSW_ATTR_INFO_RECV:
2725 		if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) {
2726 			DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV"
2727 				" when in state %d\n", ldcp->ldc_id, phase);
2728 			vsw_restart_handshake(ldcp);
2729 			return (1);
2730 		}
2731 		break;
2732 
2733 	case VSW_ATTR_ACK_RECV:
2734 	case VSW_ATTR_NACK_RECV:
2735 		if (!(state & VSW_ATTR_INFO_SENT)) {
2736 			DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK"
2737 				" or ATTR_NACK when in state %d\n",
2738 				ldcp->ldc_id, phase);
2739 			vsw_restart_handshake(ldcp);
2740 			return (1);
2741 		} else
2742 			state &= ~VSW_ATTR_INFO_SENT;
2743 		break;
2744 
2745 	case VSW_DRING_INFO_RECV:
2746 		if (phase < VSW_MILESTONE1) {
2747 			DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV"
2748 				" when in state %d\n", ldcp->ldc_id, phase);
2749 			vsw_restart_handshake(ldcp);
2750 			return (1);
2751 		}
2752 		break;
2753 
2754 	case VSW_DRING_ACK_RECV:
2755 	case VSW_DRING_NACK_RECV:
2756 		if (!(state & VSW_DRING_INFO_SENT)) {
2757 			DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK"
2758 				" or DRING_NACK when in state %d\n",
2759 				ldcp->ldc_id, phase);
2760 			vsw_restart_handshake(ldcp);
2761 			return (1);
2762 		} else
2763 			state &= ~VSW_DRING_INFO_SENT;
2764 		break;
2765 
2766 	case VSW_RDX_INFO_RECV:
2767 		if (phase < VSW_MILESTONE3) {
2768 			DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV"
2769 				" when in state %d\n", ldcp->ldc_id, phase);
2770 			vsw_restart_handshake(ldcp);
2771 			return (1);
2772 		}
2773 		break;
2774 
2775 	case VSW_RDX_ACK_RECV:
2776 	case VSW_RDX_NACK_RECV:
2777 		if (!(state & VSW_RDX_INFO_SENT)) {
2778 			DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK"
2779 				" or RDX_NACK when in state %d\n",
2780 				ldcp->ldc_id, phase);
2781 			vsw_restart_handshake(ldcp);
2782 			return (1);
2783 		} else
2784 			state &= ~VSW_RDX_INFO_SENT;
2785 		break;
2786 
2787 	case VSW_MCST_INFO_RECV:
2788 		if (phase < VSW_MILESTONE3) {
2789 			DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV"
2790 				" when in state %d\n", ldcp->ldc_id, phase);
2791 			vsw_restart_handshake(ldcp);
2792 			return (1);
2793 		}
2794 		break;
2795 
2796 	default:
2797 		DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)",
2798 				ldcp->ldc_id, flag);
2799 		return (1);
2800 	}
2801 
2802 	if (dir == INBOUND)
2803 		ldcp->lane_in.lstate = state;
2804 	else
2805 		ldcp->lane_out.lstate = state;
2806 
2807 	D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id);
2808 
2809 	return (0);
2810 }
2811 
2812 void
2813 vsw_next_milestone(vsw_ldc_t *ldcp)
2814 {
2815 	vsw_t		*vswp = ldcp->ldc_vswp;
2816 
2817 	D1(vswp, "%s (chan %lld): enter (phase %ld)", __func__,
2818 		ldcp->ldc_id, ldcp->hphase);
2819 
2820 	DUMP_FLAGS(ldcp->lane_in.lstate);
2821 	DUMP_FLAGS(ldcp->lane_out.lstate);
2822 
2823 	switch (ldcp->hphase) {
2824 
2825 	case VSW_MILESTONE0:
2826 		/*
2827 		 * If we haven't started to handshake with our peer,
2828 		 * start to do so now.
2829 		 */
2830 		if (ldcp->lane_out.lstate == 0) {
2831 			D2(vswp, "%s: (chan %lld) starting handshake "
2832 				"with peer", __func__, ldcp->ldc_id);
2833 			vsw_restart_handshake(ldcp);
2834 		}
2835 
2836 		/*
2837 		 * Only way to pass this milestone is to have successfully
2838 		 * negotiated version info.
2839 		 */
2840 		if ((ldcp->lane_in.lstate & VSW_VER_ACK_SENT) &&
2841 			(ldcp->lane_out.lstate & VSW_VER_ACK_RECV)) {
2842 
2843 			D2(vswp, "%s: (chan %lld) leaving milestone 0",
2844 				__func__, ldcp->ldc_id);
2845 
2846 			/*
2847 			 * Next milestone is passed when attribute
2848 			 * information has been successfully exchanged.
2849 			 */
2850 			ldcp->hphase = VSW_MILESTONE1;
2851 			vsw_send_attr(ldcp);
2852 
2853 		}
2854 		break;
2855 
2856 	case VSW_MILESTONE1:
2857 		/*
2858 		 * Only way to pass this milestone is to have successfully
2859 		 * negotiated attribute information.
2860 		 */
2861 		if (ldcp->lane_in.lstate & VSW_ATTR_ACK_SENT) {
2862 
2863 			ldcp->hphase = VSW_MILESTONE2;
2864 
2865 			/*
2866 			 * If the peer device has said it wishes to
2867 			 * use descriptor rings then we send it our ring
2868 			 * info, otherwise we just set up a private ring
2869 			 * which we use an internal buffer
2870 			 */
2871 			if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE)
2872 				vsw_send_dring_info(ldcp);
2873 		}
2874 		break;
2875 
2876 
2877 	case VSW_MILESTONE2:
2878 		/*
2879 		 * If peer has indicated in its attribute message that
2880 		 * it wishes to use descriptor rings then the only way
2881 		 * to pass this milestone is for us to have received
2882 		 * valid dring info.
2883 		 *
2884 		 * If peer is not using descriptor rings then just fall
2885 		 * through.
2886 		 */
2887 		if ((ldcp->lane_in.xfer_mode == VIO_DRING_MODE) &&
2888 			(!(ldcp->lane_in.lstate & VSW_DRING_ACK_SENT)))
2889 			break;
2890 
2891 		D2(vswp, "%s: (chan %lld) leaving milestone 2",
2892 				__func__, ldcp->ldc_id);
2893 
2894 		ldcp->hphase = VSW_MILESTONE3;
2895 		vsw_send_rdx(ldcp);
2896 		break;
2897 
2898 	case VSW_MILESTONE3:
2899 		/*
2900 		 * Pass this milestone when all paramaters have been
2901 		 * successfully exchanged and RDX sent in both directions.
2902 		 *
2903 		 * Mark outbound lane as available to transmit data.
2904 		 */
2905 		if ((ldcp->lane_in.lstate & VSW_RDX_ACK_SENT) &&
2906 			(ldcp->lane_out.lstate & VSW_RDX_ACK_RECV)) {
2907 
2908 			D2(vswp, "%s: (chan %lld) leaving milestone 3",
2909 				__func__, ldcp->ldc_id);
2910 			D2(vswp, "%s: ** handshake complete **", __func__);
2911 			ldcp->lane_out.lstate |= VSW_LANE_ACTIVE;
2912 			ldcp->hphase = VSW_MILESTONE4;
2913 			ldcp->hcnt = 0;
2914 			DISPLAY_STATE();
2915 		}
2916 		break;
2917 
2918 	case VSW_MILESTONE4:
2919 		D2(vswp, "%s: (chan %lld) in milestone 4", __func__,
2920 							ldcp->ldc_id);
2921 		break;
2922 
2923 	default:
2924 		DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__,
2925 			ldcp->ldc_id, ldcp->hphase);
2926 	}
2927 
2928 	D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id,
2929 		ldcp->hphase);
2930 }
2931 
2932 /*
2933  * Check if major version is supported.
2934  *
2935  * Returns 0 if finds supported major number, and if necessary
2936  * adjusts the minor field.
2937  *
2938  * Returns 1 if can't match major number exactly. Sets mjor/minor
2939  * to next lowest support values, or to zero if no other values possible.
2940  */
2941 static int
2942 vsw_supported_version(vio_ver_msg_t *vp)
2943 {
2944 	int	i;
2945 
2946 	D1(NULL, "vsw_supported_version: enter");
2947 
2948 	for (i = 0; i < VSW_NUM_VER; i++) {
2949 		if (vsw_versions[i].ver_major == vp->ver_major) {
2950 			/*
2951 			 * Matching or lower major version found. Update
2952 			 * minor number if necessary.
2953 			 */
2954 			if (vp->ver_minor > vsw_versions[i].ver_minor) {
2955 				D2(NULL, "%s: adjusting minor value"
2956 					" from %d to %d", __func__,
2957 					vp->ver_minor,
2958 					vsw_versions[i].ver_minor);
2959 				vp->ver_minor = vsw_versions[i].ver_minor;
2960 			}
2961 
2962 			return (0);
2963 		}
2964 
2965 		if (vsw_versions[i].ver_major < vp->ver_major) {
2966 			if (vp->ver_minor > vsw_versions[i].ver_minor) {
2967 				D2(NULL, "%s: adjusting minor value"
2968 					" from %d to %d", __func__,
2969 					vp->ver_minor,
2970 					vsw_versions[i].ver_minor);
2971 				vp->ver_minor = vsw_versions[i].ver_minor;
2972 			}
2973 			return (1);
2974 		}
2975 	}
2976 
2977 	/* No match was possible, zero out fields */
2978 	vp->ver_major = 0;
2979 	vp->ver_minor = 0;
2980 
2981 	D1(NULL, "vsw_supported_version: exit");
2982 
2983 	return (1);
2984 }
2985 
2986 /*
2987  * Main routine for processing messages received over LDC.
2988  */
2989 static void
2990 vsw_process_pkt(void *arg)
2991 {
2992 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
2993 	vsw_t 		*vswp = ldcp->ldc_vswp;
2994 	size_t		msglen;
2995 	vio_msg_tag_t	tag;
2996 	def_msg_t	dmsg;
2997 	int 		rv = 0;
2998 
2999 	D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
3000 
3001 	/*
3002 	 * If channel is up read messages until channel is empty.
3003 	 */
3004 	do {
3005 		msglen = sizeof (dmsg);
3006 		rv = ldc_read(ldcp->ldc_handle, (caddr_t)&dmsg, &msglen);
3007 
3008 		if (rv != 0) {
3009 			DERR(vswp, "%s :ldc_read err id(%lld) rv(%d) "
3010 				"len(%d)\n", __func__, ldcp->ldc_id,
3011 							rv, msglen);
3012 			break;
3013 		}
3014 
3015 		if (msglen == 0) {
3016 			D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__,
3017 			ldcp->ldc_id);
3018 			break;
3019 		}
3020 
3021 		D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__,
3022 		    ldcp->ldc_id, msglen);
3023 
3024 		/*
3025 		 * Figure out what sort of packet we have gotten by
3026 		 * examining the msg tag, and then switch it appropriately.
3027 		 */
3028 		bcopy(&dmsg, &tag, sizeof (vio_msg_tag_t));
3029 
3030 		switch (tag.vio_msgtype) {
3031 		case VIO_TYPE_CTRL:
3032 			vsw_dispatch_ctrl_task(ldcp, &dmsg, tag);
3033 			break;
3034 		case VIO_TYPE_DATA:
3035 			vsw_process_data_pkt(ldcp, &dmsg, tag);
3036 			break;
3037 		case VIO_TYPE_ERR:
3038 			vsw_process_err_pkt(ldcp, &dmsg, tag);
3039 			break;
3040 		default:
3041 			DERR(vswp, "%s: Unknown tag(%lx) ", __func__,
3042 				"id(%lx)\n", tag.vio_msgtype, ldcp->ldc_id);
3043 			break;
3044 		}
3045 	} while (msglen);
3046 
3047 	D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id);
3048 }
3049 
3050 /*
3051  * Dispatch a task to process a VIO control message.
3052  */
3053 static void
3054 vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t tag)
3055 {
3056 	vsw_ctrl_task_t		*ctaskp = NULL;
3057 	vsw_port_t		*port = ldcp->ldc_port;
3058 	vsw_t			*vswp = port->p_vswp;
3059 
3060 	D1(vswp, "%s: enter", __func__);
3061 
3062 	/*
3063 	 * We need to handle RDX ACK messages in-band as once they
3064 	 * are exchanged it is possible that we will get an
3065 	 * immediate (legitimate) data packet.
3066 	 */
3067 	if ((tag.vio_subtype_env == VIO_RDX) &&
3068 		(tag.vio_subtype == VIO_SUBTYPE_ACK)) {
3069 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_ACK_RECV))
3070 			return;
3071 
3072 		ldcp->lane_out.lstate |= VSW_RDX_ACK_RECV;
3073 		vsw_next_milestone(ldcp);
3074 		D2(vswp, "%s (%ld) handling RDX_ACK in place", __func__,
3075 			ldcp->ldc_id);
3076 		return;
3077 	}
3078 
3079 	ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP);
3080 
3081 	if (ctaskp == NULL) {
3082 		DERR(vswp, "%s: unable to alloc space for ctrl"
3083 			" msg", __func__);
3084 		vsw_restart_handshake(ldcp);
3085 		return;
3086 	}
3087 
3088 	ctaskp->ldcp = ldcp;
3089 	bcopy((def_msg_t *)cpkt, &ctaskp->pktp, sizeof (def_msg_t));
3090 	mutex_enter(&ldcp->hss_lock);
3091 	ctaskp->hss_id = ldcp->hss_id;
3092 	mutex_exit(&ldcp->hss_lock);
3093 
3094 	/*
3095 	 * Dispatch task to processing taskq if port is not in
3096 	 * the process of being detached.
3097 	 */
3098 	mutex_enter(&port->state_lock);
3099 	if (port->state == VSW_PORT_INIT) {
3100 		if ((vswp->taskq_p == NULL) ||
3101 			(ddi_taskq_dispatch(vswp->taskq_p,
3102 			vsw_process_ctrl_pkt, ctaskp, DDI_NOSLEEP)
3103 							!= DDI_SUCCESS)) {
3104 			DERR(vswp, "%s: unable to dispatch task to taskq",
3105 				__func__);
3106 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
3107 			mutex_exit(&port->state_lock);
3108 			vsw_restart_handshake(ldcp);
3109 			return;
3110 		}
3111 	} else {
3112 		DWARN(vswp, "%s: port %d detaching, not dispatching "
3113 			"task", __func__, port->p_instance);
3114 	}
3115 
3116 	mutex_exit(&port->state_lock);
3117 
3118 	D2(vswp, "%s: dispatched task to taskq for chan %d", __func__,
3119 			ldcp->ldc_id);
3120 	D1(vswp, "%s: exit", __func__);
3121 }
3122 
3123 /*
3124  * Process a VIO ctrl message. Invoked from taskq.
3125  */
3126 static void
3127 vsw_process_ctrl_pkt(void *arg)
3128 {
3129 	vsw_ctrl_task_t	*ctaskp = (vsw_ctrl_task_t *)arg;
3130 	vsw_ldc_t	*ldcp = ctaskp->ldcp;
3131 	vsw_t 		*vswp = ldcp->ldc_vswp;
3132 	vio_msg_tag_t	tag;
3133 	uint16_t	env;
3134 
3135 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3136 
3137 	bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t));
3138 	env = tag.vio_subtype_env;
3139 
3140 	/* stale pkt check */
3141 	mutex_enter(&ldcp->hss_lock);
3142 	if (ctaskp->hss_id < ldcp->hss_id) {
3143 		DWARN(vswp, "%s: discarding stale packet belonging to"
3144 			" earlier (%ld) handshake session", __func__,
3145 			ctaskp->hss_id);
3146 		mutex_exit(&ldcp->hss_lock);
3147 		return;
3148 	}
3149 	mutex_exit(&ldcp->hss_lock);
3150 
3151 	/* session id check */
3152 	if (ldcp->session_status & VSW_PEER_SESSION) {
3153 		if (ldcp->peer_session != tag.vio_sid) {
3154 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
3155 				__func__, ldcp->ldc_id, tag.vio_sid);
3156 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
3157 			vsw_restart_handshake(ldcp);
3158 			return;
3159 		}
3160 	}
3161 
3162 	/*
3163 	 * Switch on vio_subtype envelope, then let lower routines
3164 	 * decide if its an INFO, ACK or NACK packet.
3165 	 */
3166 	switch (env) {
3167 	case VIO_VER_INFO:
3168 		vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp);
3169 		break;
3170 	case VIO_DRING_REG:
3171 		vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp);
3172 		break;
3173 	case VIO_DRING_UNREG:
3174 		vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp);
3175 		break;
3176 	case VIO_ATTR_INFO:
3177 		vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp);
3178 		break;
3179 	case VNET_MCAST_INFO:
3180 		vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp);
3181 		break;
3182 	case VIO_RDX:
3183 		vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp);
3184 		break;
3185 	default:
3186 		DERR(vswp, "%s : unknown vio_subtype_env (%x)\n",
3187 							__func__, env);
3188 	}
3189 
3190 	kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
3191 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3192 }
3193 
3194 /*
3195  * Version negotiation. We can end up here either because our peer
3196  * has responded to a handshake message we have sent it, or our peer
3197  * has initiated a handshake with us. If its the former then can only
3198  * be ACK or NACK, if its the later can only be INFO.
3199  *
3200  * If its an ACK we move to the next stage of the handshake, namely
3201  * attribute exchange. If its a NACK we see if we can specify another
3202  * version, if we can't we stop.
3203  *
3204  * If it is an INFO we reset all params associated with communication
3205  * in that direction over this channel (remember connection is
3206  * essentially 2 independent simplex channels).
3207  */
3208 void
3209 vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt)
3210 {
3211 	vio_ver_msg_t	*ver_pkt;
3212 	vsw_t 		*vswp = ldcp->ldc_vswp;
3213 
3214 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3215 
3216 	/*
3217 	 * We know this is a ctrl/version packet so
3218 	 * cast it into the correct structure.
3219 	 */
3220 	ver_pkt = (vio_ver_msg_t *)pkt;
3221 
3222 	switch (ver_pkt->tag.vio_subtype) {
3223 	case VIO_SUBTYPE_INFO:
3224 		D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n");
3225 
3226 		/*
3227 		 * Record the session id, which we will use from now
3228 		 * until we see another VER_INFO msg. Even then the
3229 		 * session id in most cases will be unchanged, execpt
3230 		 * if channel was reset.
3231 		 */
3232 		if ((ldcp->session_status & VSW_PEER_SESSION) &&
3233 			(ldcp->peer_session != ver_pkt->tag.vio_sid)) {
3234 			DERR(vswp, "%s: updating session id for chan %lld "
3235 				"from %llx to %llx", __func__, ldcp->ldc_id,
3236 				ldcp->peer_session, ver_pkt->tag.vio_sid);
3237 		}
3238 
3239 		ldcp->peer_session = ver_pkt->tag.vio_sid;
3240 		ldcp->session_status |= VSW_PEER_SESSION;
3241 
3242 		/* Legal message at this time ? */
3243 		if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV))
3244 			return;
3245 
3246 		/*
3247 		 * First check the device class. Currently only expect
3248 		 * to be talking to a network device. In the future may
3249 		 * also talk to another switch.
3250 		 */
3251 		if (ver_pkt->dev_class != VDEV_NETWORK) {
3252 			DERR(vswp, "%s: illegal device class %d", __func__,
3253 				ver_pkt->dev_class);
3254 
3255 			ver_pkt->tag.vio_sid = ldcp->local_session;
3256 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
3257 
3258 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
3259 
3260 			vsw_send_msg(ldcp, (void *)ver_pkt,
3261 					sizeof (vio_ver_msg_t));
3262 
3263 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
3264 			vsw_next_milestone(ldcp);
3265 			return;
3266 		} else {
3267 			ldcp->dev_class = ver_pkt->dev_class;
3268 		}
3269 
3270 		/*
3271 		 * Now check the version.
3272 		 */
3273 		if (vsw_supported_version(ver_pkt) == 0) {
3274 			/*
3275 			 * Support this major version and possibly
3276 			 * adjusted minor version.
3277 			 */
3278 
3279 			D2(vswp, "%s: accepted ver %d:%d", __func__,
3280 				ver_pkt->ver_major, ver_pkt->ver_minor);
3281 
3282 			/* Store accepted values */
3283 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
3284 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
3285 
3286 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3287 
3288 			ldcp->lane_in.lstate |= VSW_VER_ACK_SENT;
3289 		} else {
3290 			/*
3291 			 * NACK back with the next lower major/minor
3292 			 * pairing we support (if don't suuport any more
3293 			 * versions then they will be set to zero.
3294 			 */
3295 
3296 			D2(vswp, "%s: replying with ver %d:%d", __func__,
3297 				ver_pkt->ver_major, ver_pkt->ver_minor);
3298 
3299 			/* Store updated values */
3300 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
3301 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
3302 
3303 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
3304 
3305 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
3306 		}
3307 
3308 		DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
3309 		ver_pkt->tag.vio_sid = ldcp->local_session;
3310 		vsw_send_msg(ldcp, (void *)ver_pkt, sizeof (vio_ver_msg_t));
3311 
3312 		vsw_next_milestone(ldcp);
3313 		break;
3314 
3315 	case VIO_SUBTYPE_ACK:
3316 		D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__);
3317 
3318 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV))
3319 			return;
3320 
3321 		/* Store updated values */
3322 		ldcp->lane_in.ver_major = ver_pkt->ver_major;
3323 		ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
3324 
3325 
3326 		ldcp->lane_out.lstate |= VSW_VER_ACK_RECV;
3327 		vsw_next_milestone(ldcp);
3328 
3329 		break;
3330 
3331 	case VIO_SUBTYPE_NACK:
3332 		D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__);
3333 
3334 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV))
3335 			return;
3336 
3337 		/*
3338 		 * If our peer sent us a NACK with the ver fields set to
3339 		 * zero then there is nothing more we can do. Otherwise see
3340 		 * if we support either the version suggested, or a lesser
3341 		 * one.
3342 		 */
3343 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
3344 			DERR(vswp, "%s: peer unable to negotiate any "
3345 				"further.", __func__);
3346 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
3347 			vsw_next_milestone(ldcp);
3348 			return;
3349 		}
3350 
3351 		/*
3352 		 * Check to see if we support this major version or
3353 		 * a lower one. If we don't then maj/min will be set
3354 		 * to zero.
3355 		 */
3356 		(void) vsw_supported_version(ver_pkt);
3357 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
3358 			/* Nothing more we can do */
3359 			DERR(vswp, "%s: version negotiation failed.\n",
3360 								__func__);
3361 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
3362 			vsw_next_milestone(ldcp);
3363 		} else {
3364 			/* found a supported major version */
3365 			ldcp->lane_out.ver_major = ver_pkt->ver_major;
3366 			ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
3367 
3368 			D2(vswp, "%s: resending with updated values (%x, %x)",
3369 				__func__, ver_pkt->ver_major,
3370 				ver_pkt->ver_minor);
3371 
3372 			ldcp->lane_out.lstate |= VSW_VER_INFO_SENT;
3373 			ver_pkt->tag.vio_sid = ldcp->local_session;
3374 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
3375 
3376 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
3377 
3378 			vsw_send_msg(ldcp, (void *)ver_pkt,
3379 					sizeof (vio_ver_msg_t));
3380 
3381 			vsw_next_milestone(ldcp);
3382 
3383 		}
3384 		break;
3385 
3386 	default:
3387 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
3388 			ver_pkt->tag.vio_subtype);
3389 	}
3390 
3391 	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
3392 }
3393 
3394 /*
3395  * Process an attribute packet. We can end up here either because our peer
3396  * has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or our
3397  * peer has sent us an attribute INFO message
3398  *
3399  * If its an ACK we then move to the next stage of the handshake which
3400  * is to send our descriptor ring info to our peer. If its a NACK then
3401  * there is nothing more we can (currently) do.
3402  *
3403  * If we get a valid/acceptable INFO packet (and we have already negotiated
3404  * a version) we ACK back and set channel state to ATTR_RECV, otherwise we
3405  * NACK back and reset channel state to INACTIV.
3406  *
3407  * FUTURE: in time we will probably negotiate over attributes, but for
3408  * the moment unacceptable attributes are regarded as a fatal error.
3409  *
3410  */
3411 void
3412 vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt)
3413 {
3414 	vnet_attr_msg_t		*attr_pkt;
3415 	vsw_t			*vswp = ldcp->ldc_vswp;
3416 	vsw_port_t		*port = ldcp->ldc_port;
3417 	uint64_t		macaddr = 0;
3418 	int			i;
3419 
3420 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
3421 
3422 	/*
3423 	 * We know this is a ctrl/attr packet so
3424 	 * cast it into the correct structure.
3425 	 */
3426 	attr_pkt = (vnet_attr_msg_t *)pkt;
3427 
3428 	switch (attr_pkt->tag.vio_subtype) {
3429 	case VIO_SUBTYPE_INFO:
3430 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3431 
3432 		if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV))
3433 			return;
3434 
3435 		/*
3436 		 * If the attributes are unacceptable then we NACK back.
3437 		 */
3438 		if (vsw_check_attr(attr_pkt, ldcp->ldc_port)) {
3439 
3440 			DERR(vswp, "%s (chan %d): invalid attributes",
3441 				__func__, ldcp->ldc_id);
3442 
3443 			vsw_free_lane_resources(ldcp, INBOUND);
3444 
3445 			attr_pkt->tag.vio_sid = ldcp->local_session;
3446 			attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
3447 
3448 			DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
3449 			ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT;
3450 			vsw_send_msg(ldcp, (void *)attr_pkt,
3451 					sizeof (vnet_attr_msg_t));
3452 
3453 			vsw_next_milestone(ldcp);
3454 			return;
3455 		}
3456 
3457 		/*
3458 		 * Otherwise store attributes for this lane and update
3459 		 * lane state.
3460 		 */
3461 		ldcp->lane_in.mtu = attr_pkt->mtu;
3462 		ldcp->lane_in.addr = attr_pkt->addr;
3463 		ldcp->lane_in.addr_type = attr_pkt->addr_type;
3464 		ldcp->lane_in.xfer_mode = attr_pkt->xfer_mode;
3465 		ldcp->lane_in.ack_freq = attr_pkt->ack_freq;
3466 
3467 		macaddr = ldcp->lane_in.addr;
3468 		for (i = ETHERADDRL - 1; i >= 0; i--) {
3469 			port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF;
3470 			macaddr >>= 8;
3471 		}
3472 
3473 		/* create the fdb entry for this port/mac address */
3474 		(void) vsw_add_fdb(vswp, port);
3475 
3476 		/* setup device specifc xmit routines */
3477 		mutex_enter(&port->tx_lock);
3478 		if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE) {
3479 			D2(vswp, "%s: mode = VIO_DRING_MODE", __func__);
3480 			port->transmit = vsw_dringsend;
3481 		} else if (ldcp->lane_in.xfer_mode == VIO_DESC_MODE) {
3482 			D2(vswp, "%s: mode = VIO_DESC_MODE", __func__);
3483 			vsw_create_privring(ldcp);
3484 			port->transmit = vsw_descrsend;
3485 		}
3486 		mutex_exit(&port->tx_lock);
3487 
3488 		attr_pkt->tag.vio_sid = ldcp->local_session;
3489 		attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3490 
3491 		DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
3492 
3493 		ldcp->lane_in.lstate |= VSW_ATTR_ACK_SENT;
3494 
3495 		vsw_send_msg(ldcp, (void *)attr_pkt,
3496 					sizeof (vnet_attr_msg_t));
3497 
3498 		vsw_next_milestone(ldcp);
3499 		break;
3500 
3501 	case VIO_SUBTYPE_ACK:
3502 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3503 
3504 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV))
3505 			return;
3506 
3507 		ldcp->lane_out.lstate |= VSW_ATTR_ACK_RECV;
3508 		vsw_next_milestone(ldcp);
3509 		break;
3510 
3511 	case VIO_SUBTYPE_NACK:
3512 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3513 
3514 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV))
3515 			return;
3516 
3517 		ldcp->lane_out.lstate |= VSW_ATTR_NACK_RECV;
3518 		vsw_next_milestone(ldcp);
3519 		break;
3520 
3521 	default:
3522 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
3523 			attr_pkt->tag.vio_subtype);
3524 	}
3525 
3526 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
3527 }
3528 
3529 /*
3530  * Process a dring info packet. We can end up here either because our peer
3531  * has ACK/NACK'ed back to an earlier DRING msg we had sent it, or our
3532  * peer has sent us a dring INFO message.
3533  *
3534  * If we get a valid/acceptable INFO packet (and we have already negotiated
3535  * a version) we ACK back and update the lane state, otherwise we NACK back.
3536  *
3537  * FUTURE: nothing to stop client from sending us info on multiple dring's
3538  * but for the moment we will just use the first one we are given.
3539  *
3540  */
3541 void
3542 vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt)
3543 {
3544 	vio_dring_reg_msg_t	*dring_pkt;
3545 	vsw_t			*vswp = ldcp->ldc_vswp;
3546 	ldc_mem_info_t		minfo;
3547 	dring_info_t		*dp, *dbp;
3548 	int			dring_found = 0;
3549 
3550 	/*
3551 	 * We know this is a ctrl/dring packet so
3552 	 * cast it into the correct structure.
3553 	 */
3554 	dring_pkt = (vio_dring_reg_msg_t *)pkt;
3555 
3556 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
3557 
3558 	switch (dring_pkt->tag.vio_subtype) {
3559 	case VIO_SUBTYPE_INFO:
3560 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3561 
3562 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
3563 			return;
3564 
3565 		/*
3566 		 * If the dring params are unacceptable then we NACK back.
3567 		 */
3568 		if (vsw_check_dring_info(dring_pkt)) {
3569 
3570 			DERR(vswp, "%s (%lld): invalid dring info",
3571 				__func__, ldcp->ldc_id);
3572 
3573 			vsw_free_lane_resources(ldcp, INBOUND);
3574 
3575 			dring_pkt->tag.vio_sid = ldcp->local_session;
3576 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
3577 
3578 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
3579 
3580 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
3581 
3582 			vsw_send_msg(ldcp, (void *)dring_pkt,
3583 					sizeof (vio_dring_reg_msg_t));
3584 
3585 			vsw_next_milestone(ldcp);
3586 			return;
3587 		}
3588 
3589 		/*
3590 		 * Otherwise, attempt to map in the dring using the
3591 		 * cookie. If that succeeds we send back a unique dring
3592 		 * identifier that the sending side will use in future
3593 		 * to refer to this descriptor ring.
3594 		 */
3595 		dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
3596 
3597 		dp->num_descriptors = dring_pkt->num_descriptors;
3598 		dp->descriptor_size = dring_pkt->descriptor_size;
3599 		dp->options = dring_pkt->options;
3600 		dp->ncookies = dring_pkt->ncookies;
3601 
3602 		/*
3603 		 * Note: should only get one cookie. Enforced in
3604 		 * the ldc layer.
3605 		 */
3606 		bcopy(&dring_pkt->cookie[0], &dp->cookie[0],
3607 			sizeof (ldc_mem_cookie_t));
3608 
3609 		D2(vswp, "%s: num_desc %ld : desc_size %ld", __func__,
3610 			dp->num_descriptors, dp->descriptor_size);
3611 		D2(vswp, "%s: options 0x%lx: ncookies %ld", __func__,
3612 			dp->options, dp->ncookies);
3613 
3614 		if ((ldc_mem_dring_map(ldcp->ldc_handle, &dp->cookie[0],
3615 			dp->ncookies, dp->num_descriptors,
3616 			dp->descriptor_size, LDC_SHADOW_MAP,
3617 			&(dp->handle))) != 0) {
3618 
3619 			DERR(vswp, "%s: dring_map failed\n", __func__);
3620 
3621 			kmem_free(dp, sizeof (dring_info_t));
3622 			vsw_free_lane_resources(ldcp, INBOUND);
3623 
3624 			dring_pkt->tag.vio_sid = ldcp->local_session;
3625 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
3626 
3627 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
3628 
3629 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
3630 			vsw_send_msg(ldcp, (void *)dring_pkt,
3631 				sizeof (vio_dring_reg_msg_t));
3632 
3633 			vsw_next_milestone(ldcp);
3634 			return;
3635 		}
3636 
3637 		if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
3638 
3639 			DERR(vswp, "%s: dring_addr failed\n", __func__);
3640 
3641 			kmem_free(dp, sizeof (dring_info_t));
3642 			vsw_free_lane_resources(ldcp, INBOUND);
3643 
3644 			dring_pkt->tag.vio_sid = ldcp->local_session;
3645 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
3646 
3647 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
3648 
3649 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
3650 			vsw_send_msg(ldcp, (void *)dring_pkt,
3651 				sizeof (vio_dring_reg_msg_t));
3652 
3653 			vsw_next_milestone(ldcp);
3654 			return;
3655 		} else {
3656 			/* store the address of the pub part of ring */
3657 			dp->pub_addr = minfo.vaddr;
3658 		}
3659 
3660 		/* no private section as we are importing */
3661 		dp->priv_addr = NULL;
3662 
3663 		/*
3664 		 * Using simple mono increasing int for ident at
3665 		 * the moment.
3666 		 */
3667 		dp->ident = ldcp->next_ident;
3668 		ldcp->next_ident++;
3669 
3670 		dp->end_idx = 0;
3671 		dp->next = NULL;
3672 
3673 		/*
3674 		 * Link it onto the end of the list of drings
3675 		 * for this lane.
3676 		 */
3677 		if (ldcp->lane_in.dringp == NULL) {
3678 			D2(vswp, "%s: adding first INBOUND dring", __func__);
3679 			ldcp->lane_in.dringp = dp;
3680 		} else {
3681 			dbp = ldcp->lane_in.dringp;
3682 
3683 			while (dbp->next != NULL)
3684 				dbp = dbp->next;
3685 
3686 			dbp->next = dp;
3687 		}
3688 
3689 		/* acknowledge it */
3690 		dring_pkt->tag.vio_sid = ldcp->local_session;
3691 		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3692 		dring_pkt->dring_ident = dp->ident;
3693 
3694 		vsw_send_msg(ldcp, (void *)dring_pkt,
3695 				sizeof (vio_dring_reg_msg_t));
3696 
3697 		ldcp->lane_in.lstate |= VSW_DRING_ACK_SENT;
3698 		vsw_next_milestone(ldcp);
3699 		break;
3700 
3701 	case VIO_SUBTYPE_ACK:
3702 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3703 
3704 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV))
3705 			return;
3706 
3707 		/*
3708 		 * Peer is acknowledging our dring info and will have
3709 		 * sent us a dring identifier which we will use to
3710 		 * refer to this ring w.r.t. our peer.
3711 		 */
3712 		dp = ldcp->lane_out.dringp;
3713 		if (dp != NULL) {
3714 			/*
3715 			 * Find the ring this ident should be associated
3716 			 * with.
3717 			 */
3718 			if (vsw_dring_match(dp, dring_pkt)) {
3719 				dring_found = 1;
3720 
3721 			} else while (dp != NULL) {
3722 				if (vsw_dring_match(dp, dring_pkt)) {
3723 					dring_found = 1;
3724 					break;
3725 				}
3726 				dp = dp->next;
3727 			}
3728 
3729 			if (dring_found == 0) {
3730 				DERR(NULL, "%s: unrecognised ring cookie",
3731 					__func__);
3732 				vsw_restart_handshake(ldcp);
3733 				return;
3734 			}
3735 
3736 		} else {
3737 			DERR(vswp, "%s: DRING ACK received but no drings "
3738 				"allocated", __func__);
3739 			vsw_restart_handshake(ldcp);
3740 			return;
3741 		}
3742 
3743 		/* store ident */
3744 		dp->ident = dring_pkt->dring_ident;
3745 		ldcp->lane_out.lstate |= VSW_DRING_ACK_RECV;
3746 		vsw_next_milestone(ldcp);
3747 		break;
3748 
3749 	case VIO_SUBTYPE_NACK:
3750 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3751 
3752 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV))
3753 			return;
3754 
3755 		ldcp->lane_out.lstate |= VSW_DRING_NACK_RECV;
3756 		vsw_next_milestone(ldcp);
3757 		break;
3758 
3759 	default:
3760 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3761 			dring_pkt->tag.vio_subtype);
3762 	}
3763 
3764 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
3765 }
3766 
3767 /*
3768  * Process a request from peer to unregister a dring.
3769  *
3770  * For the moment we just restart the handshake if our
3771  * peer endpoint attempts to unregister a dring.
3772  */
3773 void
3774 vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt)
3775 {
3776 	vsw_t			*vswp = ldcp->ldc_vswp;
3777 	vio_dring_unreg_msg_t	*dring_pkt;
3778 
3779 	/*
3780 	 * We know this is a ctrl/dring packet so
3781 	 * cast it into the correct structure.
3782 	 */
3783 	dring_pkt = (vio_dring_unreg_msg_t *)pkt;
3784 
3785 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3786 
3787 	switch (dring_pkt->tag.vio_subtype) {
3788 	case VIO_SUBTYPE_INFO:
3789 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3790 
3791 		DWARN(vswp, "%s: restarting handshake..", __func__);
3792 		vsw_restart_handshake(ldcp);
3793 		break;
3794 
3795 	case VIO_SUBTYPE_ACK:
3796 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3797 
3798 		DWARN(vswp, "%s: restarting handshake..", __func__);
3799 		vsw_restart_handshake(ldcp);
3800 		break;
3801 
3802 	case VIO_SUBTYPE_NACK:
3803 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3804 
3805 		DWARN(vswp, "%s: restarting handshake..", __func__);
3806 		vsw_restart_handshake(ldcp);
3807 		break;
3808 
3809 	default:
3810 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3811 			dring_pkt->tag.vio_subtype);
3812 		vsw_restart_handshake(ldcp);
3813 	}
3814 
3815 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3816 }
3817 
/*
 * Turn the multicast message 'pkt' into a NACK carrying our session id
 * and send it back on channel 'ldcp'.
 *
 * Wrapped in do { } while (0) so that it behaves as a single statement
 * (e.g. in an unbraced if/else); callers supply the trailing semicolon.
 * Both arguments are evaluated more than once.
 */
#define	SND_MCST_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		vsw_send_msg((ldcp), (void *)(pkt), \
		    sizeof (vnet_mcast_msg_t)); \
	_NOTE(CONSTCOND) \
	} while (0)
3822 
3823 /*
3824  * Process a multicast request from a vnet.
3825  *
3826  * Vnet's specify a multicast address that they are interested in. This
3827  * address is used as a key into the hash table which forms the multicast
3828  * forwarding database (mFDB).
3829  *
3830  * The table keys are the multicast addresses, while the table entries
3831  * are pointers to lists of ports which wish to receive packets for the
3832  * specified multicast address.
3833  *
3834  * When a multicast packet is being switched we use the address as a key
3835  * into the hash table, and then walk the appropriate port list forwarding
3836  * the pkt to each port in turn.
3837  *
3838  * If a vnet is no longer interested in a particular multicast grouping
3839  * we simply find the correct location in the hash table and then delete
3840  * the relevant port from the port list.
3841  *
3842  * To deal with the case whereby a port is being deleted without first
3843  * removing itself from the lists in the hash table, we maintain a list
3844  * of multicast addresses the port has registered an interest in, within
3845  * the port structure itself. We then simply walk that list of addresses
3846  * using them as keys into the hash table and remove the port from the
3847  * appropriate lists.
3848  */
3849 static void
3850 vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt)
3851 {
3852 	vnet_mcast_msg_t	*mcst_pkt;
3853 	vsw_port_t		*port = ldcp->ldc_port;
3854 	vsw_t			*vswp = ldcp->ldc_vswp;
3855 	int			i;
3856 
3857 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3858 
3859 	/*
3860 	 * We know this is a ctrl/mcast packet so
3861 	 * cast it into the correct structure.
3862 	 */
3863 	mcst_pkt = (vnet_mcast_msg_t *)pkt;
3864 
3865 	switch (mcst_pkt->tag.vio_subtype) {
3866 	case VIO_SUBTYPE_INFO:
3867 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3868 
3869 		/*
3870 		 * Check if in correct state to receive a multicast
3871 		 * message (i.e. handshake complete). If not reset
3872 		 * the handshake.
3873 		 */
3874 		if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV))
3875 			return;
3876 
3877 		/*
3878 		 * Before attempting to add or remove address check
3879 		 * that they are valid multicast addresses.
3880 		 * If not, then NACK back.
3881 		 */
3882 		for (i = 0; i < mcst_pkt->count; i++) {
3883 			if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) {
3884 				DERR(vswp, "%s: invalid multicast address",
3885 								__func__);
3886 				SND_MCST_NACK(ldcp, mcst_pkt);
3887 				return;
3888 			}
3889 		}
3890 
3891 		/*
3892 		 * Now add/remove the addresses. If this fails we
3893 		 * NACK back.
3894 		 */
3895 		if (vsw_add_rem_mcst(mcst_pkt, port) != 0) {
3896 			SND_MCST_NACK(ldcp, mcst_pkt);
3897 			return;
3898 		}
3899 
3900 		mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3901 		mcst_pkt->tag.vio_sid = ldcp->local_session;
3902 
3903 		DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt);
3904 
3905 		vsw_send_msg(ldcp, (void *)mcst_pkt,
3906 					sizeof (vnet_mcast_msg_t));
3907 		break;
3908 
3909 	case VIO_SUBTYPE_ACK:
3910 		DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3911 
3912 		/*
3913 		 * We shouldn't ever get a multicast ACK message as
3914 		 * at the moment we never request multicast addresses
3915 		 * to be set on some other device. This may change in
3916 		 * the future if we have cascading switches.
3917 		 */
3918 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV))
3919 			return;
3920 
3921 				/* Do nothing */
3922 		break;
3923 
3924 	case VIO_SUBTYPE_NACK:
3925 		DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3926 
3927 		/*
3928 		 * We shouldn't get a multicast NACK packet for the
3929 		 * same reasons as we shouldn't get a ACK packet.
3930 		 */
3931 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV))
3932 			return;
3933 
3934 				/* Do nothing */
3935 		break;
3936 
3937 	default:
3938 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
3939 			mcst_pkt->tag.vio_subtype);
3940 	}
3941 
3942 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3943 }
3944 
3945 static void
3946 vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt)
3947 {
3948 	vio_rdx_msg_t	*rdx_pkt;
3949 	vsw_t		*vswp = ldcp->ldc_vswp;
3950 
3951 	/*
3952 	 * We know this is a ctrl/rdx packet so
3953 	 * cast it into the correct structure.
3954 	 */
3955 	rdx_pkt = (vio_rdx_msg_t *)pkt;
3956 
3957 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
3958 
3959 	switch (rdx_pkt->tag.vio_subtype) {
3960 	case VIO_SUBTYPE_INFO:
3961 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3962 
3963 		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_INFO_RECV))
3964 			return;
3965 
3966 		rdx_pkt->tag.vio_sid = ldcp->local_session;
3967 		rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3968 
3969 		DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt);
3970 
3971 		ldcp->lane_in.lstate |= VSW_RDX_ACK_SENT;
3972 
3973 		vsw_send_msg(ldcp, (void *)rdx_pkt,
3974 				sizeof (vio_rdx_msg_t));
3975 
3976 		vsw_next_milestone(ldcp);
3977 		break;
3978 
3979 	case VIO_SUBTYPE_ACK:
3980 		/*
3981 		 * Should be handled in-band by callback handler.
3982 		 */
3983 		DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__);
3984 		vsw_restart_handshake(ldcp);
3985 		break;
3986 
3987 	case VIO_SUBTYPE_NACK:
3988 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3989 
3990 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_NACK_RECV))
3991 			return;
3992 
3993 		ldcp->lane_out.lstate |= VSW_RDX_NACK_RECV;
3994 		vsw_next_milestone(ldcp);
3995 		break;
3996 
3997 	default:
3998 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3999 			rdx_pkt->tag.vio_subtype);
4000 	}
4001 
4002 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4003 }
4004 
4005 static void
4006 vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t tag)
4007 {
4008 	uint16_t	env = tag.vio_subtype_env;
4009 	vsw_t		*vswp = ldcp->ldc_vswp;
4010 
4011 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4012 
4013 	/* session id check */
4014 	if (ldcp->session_status & VSW_PEER_SESSION) {
4015 		if (ldcp->peer_session != tag.vio_sid) {
4016 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
4017 				__func__, ldcp->ldc_id, tag.vio_sid);
4018 			vsw_restart_handshake(ldcp);
4019 			return;
4020 		}
4021 	}
4022 
4023 	/*
4024 	 * It is an error for us to be getting data packets
4025 	 * before the handshake has completed.
4026 	 */
4027 	if (ldcp->hphase != VSW_MILESTONE4) {
4028 		DERR(vswp, "%s: got data packet before handshake complete "
4029 			"hphase %d (%x: %x)", __func__, ldcp->hphase,
4030 			ldcp->lane_in.lstate, ldcp->lane_out.lstate);
4031 		DUMP_FLAGS(ldcp->lane_in.lstate);
4032 		DUMP_FLAGS(ldcp->lane_out.lstate);
4033 		vsw_restart_handshake(ldcp);
4034 		return;
4035 	}
4036 
4037 	/*
4038 	 * Switch on vio_subtype envelope, then let lower routines
4039 	 * decide if its an INFO, ACK or NACK packet.
4040 	 */
4041 	if (env == VIO_DRING_DATA) {
4042 		vsw_process_data_dring_pkt(ldcp, dpkt);
4043 	} else if (env == VIO_PKT_DATA) {
4044 		vsw_process_data_raw_pkt(ldcp, dpkt);
4045 	} else if (env == VIO_DESC_DATA) {
4046 		vsw_process_data_ibnd_pkt(ldcp, dpkt);
4047 	} else {
4048 		DERR(vswp, "%s : unknown vio_subtype_env (%x)\n",
4049 							__func__, env);
4050 	}
4051 
4052 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4053 }
4054 
/*
 * Turn the dring data message 'pkt' into a NACK carrying our session id
 * and send it back on channel 'ldcp'.
 *
 * Wrapped in do { } while (0) so that it behaves as a single statement
 * (e.g. in an unbraced if/else); callers supply the trailing semicolon.
 * Both arguments are evaluated more than once.
 */
#define	SND_DRING_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		vsw_send_msg((ldcp), (void *)(pkt), \
		    sizeof (vio_dring_msg_t)); \
	_NOTE(CONSTCOND) \
	} while (0)
4059 
4060 static void
4061 vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt)
4062 {
4063 	vio_dring_msg_t		*dring_pkt;
4064 	vnet_public_desc_t	*pub_addr = NULL;
4065 	vsw_private_desc_t	*priv_addr = NULL;
4066 	dring_info_t		*dp = NULL;
4067 	vsw_t			*vswp = ldcp->ldc_vswp;
4068 	mblk_t			*mp = NULL;
4069 	mblk_t			*bp = NULL;
4070 	mblk_t			*bpt = NULL;
4071 	size_t			nbytes = 0;
4072 	size_t			off = 0;
4073 	uint64_t		ncookies = 0;
4074 	uint64_t		chain = 0;
4075 	uint64_t		j, len, num;
4076 	uint32_t		start, end, datalen;
4077 	int			i, last_sync, rv;
4078 	boolean_t		ack_needed = B_FALSE;
4079 	boolean_t		sync_needed = B_TRUE;
4080 
4081 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4082 
4083 	/*
4084 	 * We know this is a data/dring packet so
4085 	 * cast it into the correct structure.
4086 	 */
4087 	dring_pkt = (vio_dring_msg_t *)dpkt;
4088 
4089 	/*
4090 	 * Switch on the vio_subtype. If its INFO then we need to
4091 	 * process the data. If its an ACK we need to make sure
4092 	 * it makes sense (i.e did we send an earlier data/info),
4093 	 * and if its a NACK then we maybe attempt a retry.
4094 	 */
4095 	switch (dring_pkt->tag.vio_subtype) {
4096 	case VIO_SUBTYPE_INFO:
4097 		D2(vswp, "%s(%lld): VIO_SUBTYPE_INFO", __func__, ldcp->ldc_id);
4098 
4099 		if ((dp = vsw_ident2dring(&ldcp->lane_in,
4100 				dring_pkt->dring_ident)) == NULL) {
4101 
4102 			DERR(vswp, "%s(%lld): unable to find dring from "
4103 				"ident 0x%llx", __func__, ldcp->ldc_id,
4104 				dring_pkt->dring_ident);
4105 
4106 			SND_DRING_NACK(ldcp, dring_pkt);
4107 			return;
4108 		}
4109 
4110 		start = end = 0;
4111 		start = dring_pkt->start_idx;
4112 		end = dring_pkt->end_idx;
4113 
4114 		D3(vswp, "%s(%lld): start index %ld : end %ld\n",
4115 			__func__, ldcp->ldc_id, start, end);
4116 
4117 		/* basic sanity check */
4118 		len = dp->num_descriptors;
4119 		if (end > len) {
4120 			DERR(vswp, "%s(%lld): endpoint %lld outside ring"
4121 				" length %lld", __func__, ldcp->ldc_id,
4122 				end, len);
4123 
4124 			SND_DRING_NACK(ldcp, dring_pkt);
4125 			return;
4126 		}
4127 
4128 		/* sync data */
4129 		if ((rv = ldc_mem_dring_acquire(dp->handle,
4130 						start, end)) != 0) {
4131 			DERR(vswp, "%s(%lld): unable to acquire dring : err %d",
4132 				__func__, ldcp->ldc_id, rv);
4133 			return;
4134 		}
4135 
4136 		pub_addr = (vnet_public_desc_t *)dp->pub_addr;
4137 
4138 		j = num = 0;
4139 
4140 		/* calculate # descriptors taking into a/c wrap around */
4141 		num = end >= start ? end - start + 1: (len - start + 1) + end;
4142 
4143 		last_sync = start;
4144 
4145 		for (i = start; j < num; i = (i + 1) % len, j++) {
4146 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
4147 
4148 			/*
4149 			 * Data is padded to align on 8 byte boundary,
4150 			 * datalen is actual data length, i.e. minus that
4151 			 * padding.
4152 			 */
4153 			datalen = pub_addr->nbytes;
4154 
4155 			/*
4156 			 * Does peer wish us to ACK when we have finished
4157 			 * with this descriptor ?
4158 			 */
4159 			if (pub_addr->hdr.ack)
4160 				ack_needed = B_TRUE;
4161 
4162 			D2(vswp, "%s(%lld): processing desc %lld at pos"
4163 				" 0x%llx : dstate 0x%lx : datalen 0x%lx",
4164 				__func__, ldcp->ldc_id, i, pub_addr,
4165 				pub_addr->hdr.dstate, datalen);
4166 
4167 			/*
4168 			 * XXXX : Is it a fatal error to be told to
4169 			 * process a packet when the READY bit is not
4170 			 * set ?
4171 			 */
4172 			if (pub_addr->hdr.dstate != VIO_DESC_READY) {
4173 				DERR(vswp, "%s(%d): descriptor %lld at pos "
4174 				" 0x%llx not READY (0x%lx)", __func__,
4175 				ldcp->ldc_id, i, pub_addr,
4176 				pub_addr->hdr.dstate);
4177 
4178 				SND_DRING_NACK(ldcp, dring_pkt);
4179 				(void) ldc_mem_dring_release(dp->handle,
4180 					start, end);
4181 				return;
4182 			}
4183 
4184 			/*
4185 			 * Mark that we are starting to process descriptor.
4186 			 */
4187 			pub_addr->hdr.dstate = VIO_DESC_ACCEPTED;
4188 
4189 			/*
4190 			 * allocb(9F) returns an aligned data block. We
4191 			 * need to ensure that we ask ldc for an aligned
4192 			 * number of bytes also.
4193 			 */
4194 			nbytes = datalen;
4195 			if (nbytes & 0x7) {
4196 				off = 8 - (nbytes & 0x7);
4197 				nbytes += off;
4198 			}
4199 			mp = allocb(datalen, BPRI_MED);
4200 			if (mp == NULL) {
4201 				DERR(vswp, "%s(%lld): allocb failed",
4202 					__func__, ldcp->ldc_id);
4203 				(void) ldc_mem_dring_release(dp->handle,
4204 					start, end);
4205 				return;
4206 			}
4207 
4208 			ncookies = pub_addr->ncookies;
4209 			rv = ldc_mem_copy(ldcp->ldc_handle,
4210 				(caddr_t)mp->b_rptr, 0, &nbytes,
4211 				pub_addr->memcookie, ncookies,
4212 				LDC_COPY_IN);
4213 
4214 			if (rv != 0) {
4215 				DERR(vswp, "%s(%d): unable to copy in "
4216 					"data from %d cookies", __func__,
4217 					ldcp->ldc_id, ncookies);
4218 				freemsg(mp);
4219 				(void) ldc_mem_dring_release(dp->handle,
4220 					start, end);
4221 				return;
4222 			} else {
4223 				D2(vswp, "%s(%d): copied in %ld bytes"
4224 					" using %d cookies", __func__,
4225 					ldcp->ldc_id, nbytes, ncookies);
4226 			}
4227 
4228 			/* point to the actual end of data */
4229 			mp->b_wptr = mp->b_rptr + datalen;
4230 
4231 			/* build a chain of received packets */
4232 			if (bp == NULL) {
4233 				/* first pkt */
4234 				bp = mp;
4235 				bp->b_next = bp->b_prev = NULL;
4236 				bpt = bp;
4237 				chain = 1;
4238 			} else {
4239 				mp->b_next = NULL;
4240 				mp->b_prev = bpt;
4241 				bpt->b_next = mp;
4242 				bpt = mp;
4243 				chain++;
4244 			}
4245 
4246 			/* mark we are finished with this descriptor */
4247 			pub_addr->hdr.dstate = VIO_DESC_DONE;
4248 
4249 			/*
4250 			 * Send an ACK back to peer if requested, and sync
4251 			 * the rings up to this point so the remote side sees
4252 			 * the descriptor flag in a consistent state.
4253 			 */
4254 			if (ack_needed) {
4255 				if ((rv = ldc_mem_dring_release(
4256 					dp->handle, last_sync, i)) != 0) {
4257 					DERR(vswp, "%s(%lld): unable to sync"
4258 						" from %d to %d", __func__,
4259 						ldcp->ldc_id, last_sync, i);
4260 				}
4261 
4262 				ack_needed = B_FALSE;
4263 
4264 				if (i == end)
4265 					sync_needed = B_FALSE;
4266 				else
4267 					sync_needed = B_TRUE;
4268 
4269 				last_sync = (i + 1) % len;
4270 
4271 				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
4272 				dring_pkt->tag.vio_sid = ldcp->local_session;
4273 				vsw_send_msg(ldcp, (void *)dring_pkt,
4274 					sizeof (vio_dring_msg_t));
4275 			}
4276 		}
4277 
4278 		if (sync_needed) {
4279 			if ((rv = ldc_mem_dring_release(dp->handle,
4280 					last_sync, end)) != 0) {
4281 				DERR(vswp, "%s(%lld): unable to sync"
4282 					" from %d to %d", __func__,
4283 					ldcp->ldc_id, last_sync, end);
4284 			}
4285 		}
4286 
4287 		/* send the chain of packets to be switched */
4288 		D3(vswp, "%s(%lld): switching chain of %d msgs", __func__,
4289 			ldcp->ldc_id, chain);
4290 		vsw_switch_frame(vswp, bp, VSW_VNETPORT,
4291 					ldcp->ldc_port, NULL);
4292 
4293 		break;
4294 
4295 	case VIO_SUBTYPE_ACK:
4296 		D2(vswp, "%s(%lld): VIO_SUBTYPE_ACK", __func__, ldcp->ldc_id);
4297 		/*
4298 		 * Verify that the relevant descriptors are all
4299 		 * marked as DONE
4300 		 */
4301 		if ((dp = vsw_ident2dring(&ldcp->lane_out,
4302 			dring_pkt->dring_ident)) == NULL) {
4303 			DERR(vswp, "%s: unknown ident in ACK", __func__);
4304 			return;
4305 		}
4306 
4307 		pub_addr = (vnet_public_desc_t *)dp->pub_addr;
4308 		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
4309 
4310 		start = end = 0;
4311 		start = dring_pkt->start_idx;
4312 		end = dring_pkt->end_idx;
4313 		len = dp->num_descriptors;
4314 
4315 
4316 		j = num = 0;
4317 		/* calculate # descriptors taking into a/c wrap around */
4318 		num = end >= start ? end - start + 1: (len - start + 1) + end;
4319 
4320 		D2(vswp, "%s(%lld): start index %ld : end %ld : num %ld\n",
4321 			__func__, ldcp->ldc_id, start, end, num);
4322 
4323 		for (i = start; j < num; i = (i + 1) % len, j++) {
4324 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
4325 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
4326 
4327 			if (pub_addr->hdr.dstate != VIO_DESC_DONE) {
4328 				DERR(vswp, "%s: descriptor %lld at pos "
4329 					" 0x%llx not DONE (0x%lx)\n", __func__,
4330 					i, pub_addr, pub_addr->hdr.dstate);
4331 				return;
4332 			} else {
4333 				/* clear all the fields */
4334 				bzero(priv_addr->datap, priv_addr->datalen);
4335 				priv_addr->datalen = 0;
4336 
4337 				pub_addr->hdr.dstate = VIO_DESC_FREE;
4338 				pub_addr->hdr.ack = 0;
4339 				priv_addr->dstate = VIO_DESC_FREE;
4340 
4341 				D3(vswp, "clearing descp %d : pub state "
4342 					"0x%llx : priv state 0x%llx", i,
4343 					pub_addr->hdr.dstate,
4344 					priv_addr->dstate);
4345 			}
4346 		}
4347 
4348 		break;
4349 
4350 	case VIO_SUBTYPE_NACK:
4351 		DWARN(vswp, "%s(%lld): VIO_SUBTYPE_NACK",
4352 						__func__, ldcp->ldc_id);
4353 		/*
4354 		 * Something is badly wrong if we are getting NACK's
4355 		 * for our data pkts. So reset the channel.
4356 		 */
4357 		vsw_restart_handshake(ldcp);
4358 
4359 		break;
4360 
4361 	default:
4362 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
4363 			ldcp->ldc_id, dring_pkt->tag.vio_subtype);
4364 	}
4365 
4366 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
4367 }
4368 
4369 /*
4370  * VIO_PKT_DATA (a.k.a raw data mode )
4371  *
4372  * Note - currently not supported. Do nothing.
4373  */
4374 static void
4375 vsw_process_data_raw_pkt(vsw_ldc_t *ldcp, void *dpkt)
4376 {
4377 	_NOTE(ARGUNUSED(dpkt))
4378 
4379 	D1(NULL, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
4380 
4381 	DERR(NULL, "%s (%lld): currently  not supported",
4382 						__func__, ldcp->ldc_id);
4383 
4384 	D1(NULL, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
4385 }
4386 
/*
 * Turn the message 'pkt' into a NACK stamped with our local session id
 * and send it back over channel 'ldcp'.
 *
 * The statements are grouped in a do { } while (0) so that the macro
 * behaves as a single statement (e.g. as the body of an unbraced if),
 * and both arguments are parenthesised so that expressions may be
 * passed. Each argument is evaluated more than once, so it must not
 * have side effects. The trailing semicolon is retained so that
 * existing invocations, written with or without their own semicolon,
 * continue to compile.
 *
 * NOTE(review): the in-band handler in this file reaches the tag of a
 * vio_ibnd_desc_t through 'hdr.tag', whereas this macro uses 'tag'
 * directly - confirm which message type 'pkt' is expected to be.
 */
#define	SND_IBND_DESC_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		vsw_send_msg((ldcp), (void *)(pkt), \
			sizeof (vio_ibnd_desc_t)); \
	} while (0);
4391 
4392 /*
4393  * Process an in-band descriptor message (most likely from
4394  * OBP).
4395  */
4396 static void
4397 vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt)
4398 {
4399 	vio_ibnd_desc_t		*ibnd_desc;
4400 	dring_info_t		*dp = NULL;
4401 	vsw_private_desc_t	*priv_addr = NULL;
4402 	vsw_t			*vswp = ldcp->ldc_vswp;
4403 	mblk_t			*mp = NULL;
4404 	size_t			nbytes = 0;
4405 	size_t			off = 0;
4406 	uint64_t		idx = 0;
4407 	uint32_t		datalen = 0;
4408 	uint64_t		ncookies = 0;
4409 	int			rv;
4410 
4411 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4412 
4413 	ibnd_desc = (vio_ibnd_desc_t *)pkt;
4414 
4415 	switch (ibnd_desc->hdr.tag.vio_subtype) {
4416 	case VIO_SUBTYPE_INFO:
4417 		D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
4418 
4419 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
4420 			return;
4421 
4422 		/*
4423 		 * Data is padded to align on a 8 byte boundary,
4424 		 * nbytes is actual data length, i.e. minus that
4425 		 * padding.
4426 		 */
4427 		datalen = ibnd_desc->nbytes;
4428 
4429 		D2(vswp, "%s(%lld): processing inband desc : "
4430 			": datalen 0x%lx", __func__, ldcp->ldc_id, datalen);
4431 
4432 		ncookies = ibnd_desc->ncookies;
4433 
4434 		/*
4435 		 * allocb(9F) returns an aligned data block. We
4436 		 * need to ensure that we ask ldc for an aligned
4437 		 * number of bytes also.
4438 		 */
4439 		nbytes = datalen;
4440 		if (nbytes & 0x7) {
4441 			off = 8 - (nbytes & 0x7);
4442 			nbytes += off;
4443 		}
4444 
4445 		mp = allocb(datalen, BPRI_MED);
4446 		if (mp == NULL) {
4447 			DERR(vswp, "%s(%lld): allocb failed",
4448 					__func__, ldcp->ldc_id);
4449 			return;
4450 		}
4451 
4452 		rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr,
4453 			0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies,
4454 			LDC_COPY_IN);
4455 
4456 		if (rv != 0) {
4457 			DERR(vswp, "%s(%d): unable to copy in data from "
4458 				"%d cookie(s)", __func__,
4459 				ldcp->ldc_id, ncookies);
4460 			freemsg(mp);
4461 			return;
4462 		} else {
4463 			D2(vswp, "%s(%d): copied in %ld bytes using %d "
4464 				"cookies", __func__, ldcp->ldc_id, nbytes,
4465 				ncookies);
4466 		}
4467 
4468 		/* point to the actual end of data */
4469 		mp->b_wptr = mp->b_rptr + datalen;
4470 
4471 		/*
4472 		 * We ACK back every in-band descriptor message we process
4473 		 */
4474 		ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK;
4475 		ibnd_desc->hdr.tag.vio_sid = ldcp->local_session;
4476 		vsw_send_msg(ldcp, (void *)ibnd_desc,
4477 				sizeof (vio_ibnd_desc_t));
4478 
4479 		/* send the packet to be switched */
4480 		vsw_switch_frame(vswp, mp, VSW_VNETPORT,
4481 					ldcp->ldc_port, NULL);
4482 
4483 		break;
4484 
4485 	case VIO_SUBTYPE_ACK:
4486 		D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
4487 
4488 		/* Verify the ACK is valid */
4489 		idx = ibnd_desc->hdr.desc_handle;
4490 
4491 		if (idx >= VSW_RING_NUM_EL) {
4492 			cmn_err(CE_WARN, "%s: corrupted ACK received "
4493 				"(idx %ld)", __func__, idx);
4494 			return;
4495 		}
4496 
4497 		if ((dp = ldcp->lane_out.dringp) == NULL) {
4498 			DERR(vswp, "%s: no dring found", __func__);
4499 			return;
4500 		}
4501 
4502 		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
4503 
4504 		/* move to correct location in ring */
4505 		priv_addr += idx;
4506 
4507 		/*
4508 		 * When we sent the in-band message to our peer we
4509 		 * marked the copy in our private ring as READY. We now
4510 		 * check that the descriptor we are being ACK'ed for is in
4511 		 * fact READY, i.e. it is one we have shared with our peer.
4512 		 */
4513 		if (priv_addr->dstate != VIO_DESC_READY) {
4514 			cmn_err(CE_WARN, "%s: (%ld) desc at index %ld not "
4515 				"READY (0x%lx)", __func__, ldcp->ldc_id, idx,
4516 				priv_addr->dstate);
4517 			cmn_err(CE_CONT, "%s: bound %d: ncookies %ld\n",
4518 				__func__, priv_addr->bound,
4519 				priv_addr->ncookies);
4520 			cmn_err(CE_CONT, "datalen %ld\n", priv_addr->datalen);
4521 			return;
4522 		} else {
4523 			D2(vswp, "%s: (%lld) freeing descp at %lld", __func__,
4524 				ldcp->ldc_id, idx);
4525 
4526 			/* release resources associated with sent msg */
4527 			bzero(priv_addr->datap, priv_addr->datalen);
4528 			priv_addr->datalen = 0;
4529 			priv_addr->dstate = VIO_DESC_FREE;
4530 		}
4531 		break;
4532 
4533 	case VIO_SUBTYPE_NACK:
4534 		DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
4535 
4536 		/*
4537 		 * We should only get a NACK if our peer doesn't like
4538 		 * something about a message we have sent it. If this
4539 		 * happens we just release the resources associated with
4540 		 * the message. (We are relying on higher layers to decide
4541 		 * whether or not to resend.
4542 		 */
4543 
4544 		/* limit check */
4545 		idx = ibnd_desc->hdr.desc_handle;
4546 
4547 		if (idx >= VSW_RING_NUM_EL) {
4548 			DERR(vswp, "%s: corrupted NACK received (idx %lld)",
4549 				__func__, idx);
4550 			return;
4551 		}
4552 
4553 		if ((dp = ldcp->lane_out.dringp) == NULL) {
4554 			DERR(vswp, "%s: no dring found", __func__);
4555 			return;
4556 		}
4557 
4558 		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
4559 
4560 		/* move to correct location in ring */
4561 		priv_addr += idx;
4562 
4563 		/* release resources associated with sent msg */
4564 		bzero(priv_addr->datap, priv_addr->datalen);
4565 		priv_addr->datalen = 0;
4566 		priv_addr->dstate = VIO_DESC_FREE;
4567 
4568 		break;
4569 
4570 	default:
4571 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
4572 			ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype);
4573 	}
4574 
4575 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
4576 }
4577 
4578 static void
4579 vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t tag)
4580 {
4581 	_NOTE(ARGUNUSED(epkt))
4582 
4583 	vsw_t		*vswp = ldcp->ldc_vswp;
4584 	uint16_t	env = tag.vio_subtype_env;
4585 
4586 	D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
4587 
4588 	/*
4589 	 * Error vio_subtypes have yet to be defined. So for
4590 	 * the moment we can't do anything.
4591 	 */
4592 	D2(vswp, "%s: (%x) vio_subtype env", __func__, env);
4593 
4594 	D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
4595 }
4596 
4597 /*
4598  * Switch the given ethernet frame when operating in layer 2 mode.
4599  *
4600  * vswp: pointer to the vsw instance
4601  * mp: pointer to chain of ethernet frame(s) to be switched
4602  * caller: identifies the source of this frame as:
4603  * 		1. VSW_VNETPORT - a vsw port (connected to a vnet).
4604  *		2. VSW_PHYSDEV - the physical ethernet device
4605  *		3. VSW_LOCALDEV - vsw configured as a virtual interface
4606  * arg: argument provided by the caller.
4607  *		1. for VNETPORT - pointer to the corresponding vsw_port_t.
4608  *		2. for PHYSDEV - NULL
4609  *		3. for LOCALDEV - pointer to to this vsw_t(self)
4610  */
4611 void
4612 vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
4613 			vsw_port_t *arg, mac_resource_handle_t mrh)
4614 {
4615 	struct ether_header	*ehp;
4616 	vsw_port_t		*port = NULL;
4617 	mblk_t			*bp, *ret_m;
4618 	mblk_t			*nmp = NULL;
4619 	vsw_port_list_t		*plist = &vswp->plist;
4620 
4621 	D1(vswp, "%s: enter (caller %d)", __func__, caller);
4622 
4623 	/*
4624 	 * PERF: rather than breaking up the chain here, scan it
4625 	 * to find all mblks heading to same destination and then
4626 	 * pass that sub-chain to the lower transmit functions.
4627 	 */
4628 
4629 	/* process the chain of packets */
4630 	bp = mp;
4631 	while (bp) {
4632 		mp = bp;
4633 		bp = bp->b_next;
4634 		mp->b_next = mp->b_prev = NULL;
4635 		ehp = (struct ether_header *)mp->b_rptr;
4636 
4637 		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
4638 			__func__, MBLKSIZE(mp), MBLKL(mp));
4639 
4640 		READ_ENTER(&vswp->if_lockrw);
4641 		if (ether_cmp(&ehp->ether_dhost, &vswp->if_addr) == 0) {
4642 			/*
4643 			 * If destination is VSW_LOCALDEV (vsw as an eth
4644 			 * interface) and if the device is up & running,
4645 			 * send the packet up the stack on this host.
4646 			 * If the virtual interface is down, drop the packet.
4647 			 */
4648 			if (caller != VSW_LOCALDEV) {
4649 				if (vswp->if_state & VSW_IF_UP) {
4650 					RW_EXIT(&vswp->if_lockrw);
4651 					mac_rx(vswp->if_mh, mrh, mp);
4652 				} else {
4653 					RW_EXIT(&vswp->if_lockrw);
4654 					/* Interface down, drop pkt */
4655 					freemsg(mp);
4656 				}
4657 			} else {
4658 				RW_EXIT(&vswp->if_lockrw);
4659 				freemsg(mp);
4660 			}
4661 			continue;
4662 		}
4663 		RW_EXIT(&vswp->if_lockrw);
4664 
4665 		READ_ENTER(&plist->lockrw);
4666 		port = vsw_lookup_fdb(vswp, ehp);
4667 		if (port) {
4668 			/*
4669 			 * Mark the port as in-use.
4670 			 */
4671 			mutex_enter(&port->ref_lock);
4672 			port->ref_cnt++;
4673 			mutex_exit(&port->ref_lock);
4674 			RW_EXIT(&plist->lockrw);
4675 
4676 			/*
4677 			 * If plumbed and in promisc mode then copy msg
4678 			 * and send up the stack.
4679 			 */
4680 			READ_ENTER(&vswp->if_lockrw);
4681 			if (VSW_U_P(vswp->if_state)) {
4682 				RW_EXIT(&vswp->if_lockrw);
4683 				nmp = copymsg(mp);
4684 				if (nmp)
4685 					mac_rx(vswp->if_mh, mrh, nmp);
4686 			} else {
4687 				RW_EXIT(&vswp->if_lockrw);
4688 			}
4689 
4690 			/*
4691 			 * If the destination is in FDB, the packet
4692 			 * should be forwarded to the correponding
4693 			 * vsw_port (connected to a vnet device -
4694 			 * VSW_VNETPORT)
4695 			 */
4696 			(void) vsw_portsend(port, mp);
4697 
4698 			/*
4699 			 * Decrement use count in port and check if
4700 			 * should wake delete thread.
4701 			 */
4702 			mutex_enter(&port->ref_lock);
4703 			port->ref_cnt--;
4704 			if (port->ref_cnt == 0)
4705 				cv_signal(&port->ref_cv);
4706 			mutex_exit(&port->ref_lock);
4707 		} else {
4708 			RW_EXIT(&plist->lockrw);
4709 			/*
4710 			 * Destination not in FDB.
4711 			 *
4712 			 * If the destination is broadcast or
4713 			 * multicast forward the packet to all
4714 			 * (VNETPORTs, PHYSDEV, LOCALDEV),
4715 			 * except the caller.
4716 			 */
4717 			if (IS_BROADCAST(ehp)) {
4718 				D3(vswp, "%s: BROADCAST pkt", __func__);
4719 				(void) vsw_forward_all(vswp, mp,
4720 								caller, arg);
4721 			} else if (IS_MULTICAST(ehp)) {
4722 				D3(vswp, "%s: MULTICAST pkt", __func__);
4723 				(void) vsw_forward_grp(vswp, mp,
4724 							caller, arg);
4725 			} else {
4726 				/*
4727 				 * If the destination is unicast, and came
4728 				 * from either a logical network device or
4729 				 * the switch itself when it is plumbed, then
4730 				 * send it out on the physical device and also
4731 				 * up the stack if the logical interface is
4732 				 * in promiscious mode.
4733 				 *
4734 				 * NOTE:  The assumption here is that if we
4735 				 * cannot find the destination in our fdb, its
4736 				 * a unicast address, and came from either a
4737 				 * vnet or down the stack (when plumbed) it
4738 				 * must be destinded for an ethernet device
4739 				 * outside our ldoms.
4740 				 */
4741 				if (caller == VSW_VNETPORT) {
4742 					READ_ENTER(&vswp->if_lockrw);
4743 					if (VSW_U_P(vswp->if_state)) {
4744 						RW_EXIT(&vswp->if_lockrw);
4745 						nmp = copymsg(mp);
4746 						if (nmp)
4747 							mac_rx(vswp->if_mh,
4748 								mrh, nmp);
4749 					} else {
4750 						RW_EXIT(&vswp->if_lockrw);
4751 					}
4752 					if ((ret_m = vsw_tx_msg(vswp, mp))
4753 								!= NULL) {
4754 						DERR(vswp, "%s: drop mblks to "
4755 							"phys dev", __func__);
4756 						freemsg(ret_m);
4757 					}
4758 
4759 				} else if (caller == VSW_PHYSDEV) {
4760 					/*
4761 					 * Pkt seen because card in promisc
4762 					 * mode. Send up stack if plumbed in
4763 					 * promisc mode, else drop it.
4764 					 */
4765 					READ_ENTER(&vswp->if_lockrw);
4766 					if (VSW_U_P(vswp->if_state)) {
4767 						RW_EXIT(&vswp->if_lockrw);
4768 						mac_rx(vswp->if_mh, mrh, mp);
4769 					} else {
4770 						RW_EXIT(&vswp->if_lockrw);
4771 						freemsg(mp);
4772 					}
4773 
4774 				} else if (caller == VSW_LOCALDEV) {
4775 					/*
4776 					 * Pkt came down the stack, send out
4777 					 * over physical device.
4778 					 */
4779 					if ((ret_m = vsw_tx_msg(vswp, mp))
4780 								!= NULL) {
4781 						DERR(vswp, "%s: drop mblks to "
4782 							"phys dev", __func__);
4783 						freemsg(ret_m);
4784 					}
4785 				}
4786 			}
4787 		}
4788 	}
4789 	D1(vswp, "%s: exit\n", __func__);
4790 }
4791 
4792 /*
4793  * Switch ethernet frame when in layer 3 mode (i.e. using IP
4794  * layer to do the routing).
4795  *
4796  * There is a large amount of overlap between this function and
4797  * vsw_switch_l2_frame. At some stage we need to revisit and refactor
4798  * both these functions.
4799  */
4800 void
4801 vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
4802 			vsw_port_t *arg, mac_resource_handle_t mrh)
4803 {
4804 	struct ether_header	*ehp;
4805 	vsw_port_t		*port = NULL;
4806 	mblk_t			*bp = NULL;
4807 	vsw_port_list_t		*plist = &vswp->plist;
4808 
4809 	D1(vswp, "%s: enter (caller %d)", __func__, caller);
4810 
4811 	/*
4812 	 * In layer 3 mode should only ever be switching packets
4813 	 * between IP layer and vnet devices. So make sure thats
4814 	 * who is invoking us.
4815 	 */
4816 	if ((caller != VSW_LOCALDEV) && (caller != VSW_VNETPORT)) {
4817 		DERR(vswp, "%s: unexpected caller (%d)", __func__, caller);
4818 		freemsgchain(mp);
4819 		return;
4820 	}
4821 
4822 	/* process the chain of packets */
4823 	bp = mp;
4824 	while (bp) {
4825 		mp = bp;
4826 		bp = bp->b_next;
4827 		mp->b_next = mp->b_prev = NULL;
4828 		ehp = (struct ether_header *)mp->b_rptr;
4829 
4830 		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
4831 			__func__, MBLKSIZE(mp), MBLKL(mp));
4832 
4833 		READ_ENTER(&plist->lockrw);
4834 		port = vsw_lookup_fdb(vswp, ehp);
4835 		if (port) {
4836 			/*
4837 			 * Mark port as in-use.
4838 			 */
4839 			mutex_enter(&port->ref_lock);
4840 			port->ref_cnt++;
4841 			mutex_exit(&port->ref_lock);
4842 			RW_EXIT(&plist->lockrw);
4843 
4844 			D2(vswp, "%s: sending to target port", __func__);
4845 			(void) vsw_portsend(port, mp);
4846 
4847 			/*
4848 			 * Finished with port so decrement ref count and
4849 			 * check if should wake delete thread.
4850 			 */
4851 			mutex_enter(&port->ref_lock);
4852 			port->ref_cnt--;
4853 			if (port->ref_cnt == 0)
4854 				cv_signal(&port->ref_cv);
4855 			mutex_exit(&port->ref_lock);
4856 		} else {
4857 			RW_EXIT(&plist->lockrw);
4858 			/*
4859 			 * Destination not in FDB
4860 			 *
4861 			 * If the destination is broadcast or
4862 			 * multicast forward the packet to all
4863 			 * (VNETPORTs, PHYSDEV, LOCALDEV),
4864 			 * except the caller.
4865 			 */
4866 			if (IS_BROADCAST(ehp)) {
4867 				D2(vswp, "%s: BROADCAST pkt", __func__);
4868 				(void) vsw_forward_all(vswp, mp,
4869 								caller, arg);
4870 			} else if (IS_MULTICAST(ehp)) {
4871 				D2(vswp, "%s: MULTICAST pkt", __func__);
4872 				(void) vsw_forward_grp(vswp, mp,
4873 							caller, arg);
4874 			} else {
4875 				/*
4876 				 * Unicast pkt from vnet that we don't have
4877 				 * an FDB entry for, so must be destinded for
4878 				 * the outside world. Attempt to send up to the
4879 				 * IP layer to allow it to deal with it.
4880 				 */
4881 				if (caller == VSW_VNETPORT) {
4882 					READ_ENTER(&vswp->if_lockrw);
4883 					if (vswp->if_state & VSW_IF_UP) {
4884 						RW_EXIT(&vswp->if_lockrw);
4885 						D2(vswp, "%s: sending up",
4886 							__func__);
4887 						mac_rx(vswp->if_mh, mrh, mp);
4888 					} else {
4889 						RW_EXIT(&vswp->if_lockrw);
4890 						/* Interface down, drop pkt */
4891 						D2(vswp, "%s I/F down",
4892 								__func__);
4893 						freemsg(mp);
4894 					}
4895 				}
4896 			}
4897 		}
4898 	}
4899 
4900 	D1(vswp, "%s: exit", __func__);
4901 }
4902 
4903 /*
4904  * Forward the ethernet frame to all ports (VNETPORTs, PHYSDEV, LOCALDEV),
4905  * except the caller (port on which frame arrived).
4906  */
4907 static int
4908 vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
4909 {
4910 	vsw_port_list_t	*plist = &vswp->plist;
4911 	vsw_port_t	*portp;
4912 	mblk_t		*nmp = NULL;
4913 	mblk_t		*ret_m = NULL;
4914 	int		skip_port = 0;
4915 
4916 	D1(vswp, "vsw_forward_all: enter\n");
4917 
4918 	/*
4919 	 * Broadcast message from inside ldoms so send to outside
4920 	 * world if in either of layer 2 modes.
4921 	 */
4922 	if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) ||
4923 		(vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) &&
4924 		((caller == VSW_LOCALDEV) || (caller == VSW_VNETPORT))) {
4925 
4926 		nmp = dupmsg(mp);
4927 		if (nmp) {
4928 			if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) {
4929 				DERR(vswp, "%s: dropping pkt(s) "
4930 				"consisting of %ld bytes of data for"
4931 				" physical device", __func__, MBLKL(ret_m));
4932 			freemsg(ret_m);
4933 			}
4934 		}
4935 	}
4936 
4937 	if (caller == VSW_VNETPORT)
4938 		skip_port = 1;
4939 
4940 	/*
4941 	 * Broadcast message from other vnet (layer 2 or 3) or outside
4942 	 * world (layer 2 only), send up stack if plumbed.
4943 	 */
4944 	if ((caller == VSW_PHYSDEV) || (caller == VSW_VNETPORT)) {
4945 		READ_ENTER(&vswp->if_lockrw);
4946 		if (vswp->if_state & VSW_IF_UP) {
4947 			RW_EXIT(&vswp->if_lockrw);
4948 			nmp = copymsg(mp);
4949 			if (nmp)
4950 				mac_rx(vswp->if_mh, NULL, nmp);
4951 		} else {
4952 			RW_EXIT(&vswp->if_lockrw);
4953 		}
4954 	}
4955 
4956 	/* send it to all VNETPORTs */
4957 	READ_ENTER(&plist->lockrw);
4958 	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
4959 		D2(vswp, "vsw_forward_all: port %d", portp->p_instance);
4960 		/*
4961 		 * Caution ! - don't reorder these two checks as arg
4962 		 * will be NULL if the caller is PHYSDEV. skip_port is
4963 		 * only set if caller is VNETPORT.
4964 		 */
4965 		if ((skip_port) && (portp == arg))
4966 			continue;
4967 		else {
4968 			nmp = dupmsg(mp);
4969 			if (nmp) {
4970 				(void) vsw_portsend(portp, nmp);
4971 			} else {
4972 				DERR(vswp, "vsw_forward_all: nmp NULL");
4973 			}
4974 		}
4975 	}
4976 	RW_EXIT(&plist->lockrw);
4977 
4978 	freemsg(mp);
4979 
4980 	D1(vswp, "vsw_forward_all: exit\n");
4981 	return (0);
4982 }
4983 
4984 /*
4985  * Forward pkts to any devices or interfaces which have registered
4986  * an interest in them (i.e. multicast groups).
4987  */
4988 static int
4989 vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
4990 {
4991 	struct ether_header	*ehp = (struct ether_header *)mp->b_rptr;
4992 	mfdb_ent_t		*entp = NULL;
4993 	mfdb_ent_t		*tpp = NULL;
4994 	vsw_port_t 		*port;
4995 	uint64_t		key = 0;
4996 	mblk_t			*nmp = NULL;
4997 	mblk_t			*ret_m = NULL;
4998 	boolean_t		check_if = B_TRUE;
4999 
5000 	/*
5001 	 * Convert address to hash table key
5002 	 */
5003 	KEY_HASH(key, ehp->ether_dhost);
5004 
5005 	D1(vswp, "%s: key 0x%llx", __func__, key);
5006 
5007 	/*
5008 	 * If pkt came from either a vnet or down the stack (if we are
5009 	 * plumbed) and we are in layer 2 mode, then we send the pkt out
5010 	 * over the physical adapter, and then check to see if any other
5011 	 * vnets are interested in it.
5012 	 */
5013 	if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) ||
5014 		(vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) &&
5015 		((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV))) {
5016 		nmp = dupmsg(mp);
5017 		if (nmp) {
5018 			if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) {
5019 				DERR(vswp, "%s: dropping pkt(s) "
5020 					"consisting of %ld bytes of "
5021 					"data for physical device",
5022 					__func__, MBLKL(ret_m));
5023 				freemsg(ret_m);
5024 			}
5025 		}
5026 	}
5027 
5028 	READ_ENTER(&vswp->mfdbrw);
5029 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)key,
5030 				(mod_hash_val_t *)&entp) != 0) {
5031 		D3(vswp, "%s: no table entry found for addr 0x%llx",
5032 								__func__, key);
5033 	} else {
5034 		/*
5035 		 * Send to list of devices associated with this address...
5036 		 */
5037 		for (tpp = entp; tpp != NULL; tpp = tpp->nextp) {
5038 
5039 			/* dont send to ourselves */
5040 			if ((caller == VSW_VNETPORT) &&
5041 				(tpp->d_addr == (void *)arg)) {
5042 				port = (vsw_port_t *)tpp->d_addr;
5043 				D3(vswp, "%s: not sending to ourselves"
5044 					" : port %d", __func__,
5045 					port->p_instance);
5046 				continue;
5047 
5048 			} else if ((caller == VSW_LOCALDEV) &&
5049 				(tpp->d_type == VSW_LOCALDEV)) {
5050 				D3(vswp, "%s: not sending back up stack",
5051 					__func__);
5052 				continue;
5053 			}
5054 
5055 			if (tpp->d_type == VSW_VNETPORT) {
5056 				port = (vsw_port_t *)tpp->d_addr;
5057 				D3(vswp, "%s: sending to port %ld for "
5058 					" addr 0x%llx", __func__,
5059 					port->p_instance, key);
5060 
5061 				nmp = dupmsg(mp);
5062 				if (nmp)
5063 					(void) vsw_portsend(port, nmp);
5064 			} else {
5065 				if (vswp->if_state & VSW_IF_UP) {
5066 					nmp = copymsg(mp);
5067 					if (nmp)
5068 						mac_rx(vswp->if_mh, NULL, nmp);
5069 					check_if = B_FALSE;
5070 					D3(vswp, "%s: sending up stack"
5071 						" for addr 0x%llx", __func__,
5072 						key);
5073 				}
5074 			}
5075 		}
5076 	}
5077 
5078 	RW_EXIT(&vswp->mfdbrw);
5079 
5080 	/*
5081 	 * If the pkt came from either a vnet or from physical device,
5082 	 * and if we havent already sent the pkt up the stack then we
5083 	 * check now if we can/should (i.e. the interface is plumbed
5084 	 * and in promisc mode).
5085 	 */
5086 	if ((check_if) &&
5087 		((caller == VSW_VNETPORT) || (caller == VSW_PHYSDEV))) {
5088 		READ_ENTER(&vswp->if_lockrw);
5089 		if (VSW_U_P(vswp->if_state)) {
5090 			RW_EXIT(&vswp->if_lockrw);
5091 			D3(vswp, "%s: (caller %d) finally sending up stack"
5092 				" for addr 0x%llx", __func__, caller, key);
5093 			nmp = copymsg(mp);
5094 			if (nmp)
5095 				mac_rx(vswp->if_mh, NULL, nmp);
5096 		} else {
5097 			RW_EXIT(&vswp->if_lockrw);
5098 		}
5099 	}
5100 
5101 	freemsg(mp);
5102 
5103 	D1(vswp, "%s: exit", __func__);
5104 
5105 	return (0);
5106 }
5107 
5108 /* transmit the packet over the given port */
5109 static int
5110 vsw_portsend(vsw_port_t *port, mblk_t *mp)
5111 {
5112 	vsw_ldc_list_t 	*ldcl = &port->p_ldclist;
5113 	vsw_ldc_t 	*ldcp;
5114 	int		status = 0;
5115 
5116 
5117 	READ_ENTER(&ldcl->lockrw);
5118 	/*
5119 	 * Note for now, we have a single channel.
5120 	 */
5121 	ldcp = ldcl->head;
5122 	if (ldcp == NULL) {
5123 		DERR(port->p_vswp, "vsw_portsend: no ldc: dropping packet\n");
5124 		freemsg(mp);
5125 		RW_EXIT(&ldcl->lockrw);
5126 		return (1);
5127 	}
5128 
5129 	/*
5130 	 * Send the message out using the appropriate
5131 	 * transmit function which will free mblock when it
5132 	 * is finished with it.
5133 	 */
5134 	mutex_enter(&port->tx_lock);
5135 	if (port->transmit != NULL)
5136 		status = (*port->transmit)(ldcp, mp);
5137 	else {
5138 		freemsg(mp);
5139 	}
5140 	mutex_exit(&port->tx_lock);
5141 
5142 	RW_EXIT(&ldcl->lockrw);
5143 
5144 	return (status);
5145 }
5146 
5147 /*
5148  * Send packet out via descriptor ring to a logical device.
5149  */
5150 static int
5151 vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp)
5152 {
5153 	vio_dring_msg_t		dring_pkt;
5154 	dring_info_t		*dp = NULL;
5155 	vsw_private_desc_t	*priv_desc = NULL;
5156 	vsw_t			*vswp = ldcp->ldc_vswp;
5157 	mblk_t			*bp;
5158 	size_t			n, size;
5159 	caddr_t			bufp;
5160 	int			idx;
5161 	int			status = LDC_TX_SUCCESS;
5162 
5163 	D1(vswp, "%s(%lld): enter\n", __func__, ldcp->ldc_id);
5164 
5165 	/* TODO: make test a macro */
5166 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
5167 		(ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
5168 		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
5169 			"packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
5170 			ldcp->lane_out.lstate);
5171 		freemsg(mp);
5172 		return (LDC_TX_FAILURE);
5173 	}
5174 
5175 	/*
5176 	 * Note - using first ring only, this may change
5177 	 * in the future.
5178 	 */
5179 	if ((dp = ldcp->lane_out.dringp) == NULL) {
5180 		DERR(vswp, "%s(%lld): no dring for outbound lane on"
5181 			" channel %d", __func__, ldcp->ldc_id, ldcp->ldc_id);
5182 		freemsg(mp);
5183 		return (LDC_TX_FAILURE);
5184 	}
5185 
5186 	mutex_enter(&dp->dlock);
5187 
5188 	size = msgsize(mp);
5189 	if (size > (size_t)ETHERMAX) {
5190 		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
5191 		    ldcp->ldc_id, size);
5192 		status = LDC_TX_FAILURE;
5193 		goto vsw_dringsend_free_exit;
5194 	}
5195 
5196 	/*
5197 	 * Find a free descriptor
5198 	 *
5199 	 * Note: for the moment we are assuming that we will only
5200 	 * have one dring going from the switch to each of its
5201 	 * peers. This may change in the future.
5202 	 */
5203 	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
5204 		DERR(vswp, "%s(%lld): no descriptor available for ring "
5205 			"at 0x%llx", __func__, ldcp->ldc_id, dp);
5206 
5207 		/* nothing more we can do */
5208 		status = LDC_TX_NORESOURCES;
5209 		goto vsw_dringsend_free_exit;
5210 	} else {
5211 		D2(vswp, "%s(%lld): free private descriptor found at pos "
5212 			"%ld addr 0x%llx\n", __func__, ldcp->ldc_id, idx,
5213 			priv_desc);
5214 	}
5215 
5216 	/* copy data into the descriptor */
5217 	bufp = priv_desc->datap;
5218 	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
5219 		n = MBLKL(bp);
5220 		bcopy(bp->b_rptr, bufp, n);
5221 		bufp += n;
5222 	}
5223 
5224 	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
5225 	priv_desc->dstate = VIO_DESC_READY;
5226 
5227 	/*
5228 	 * Copy relevant sections of private descriptor
5229 	 * to public section
5230 	 */
5231 	vsw_dring_priv2pub(priv_desc);
5232 
5233 	/*
5234 	 * Send a vio_dring_msg to peer to prompt them to read
5235 	 * the updated descriptor ring.
5236 	 */
5237 	dring_pkt.tag.vio_msgtype = VIO_TYPE_DATA;
5238 	dring_pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
5239 	dring_pkt.tag.vio_subtype_env = VIO_DRING_DATA;
5240 	dring_pkt.tag.vio_sid = ldcp->local_session;
5241 
5242 	/* Note - for now using first ring */
5243 	dring_pkt.dring_ident = dp->ident;
5244 
5245 	/*
5246 	 * Access to the seq_num is implicitly protected by the
5247 	 * fact that we have only one dring associated with the
5248 	 * lane currently and we hold the associated dring lock.
5249 	 */
5250 	dring_pkt.seq_num = ldcp->lane_out.seq_num++;
5251 
5252 	/* Note - only updating single descrip at time at the moment */
5253 	dring_pkt.start_idx = idx;
5254 	dring_pkt.end_idx = idx;
5255 
5256 	D3(vswp, "%s(%lld): dring 0x%llx : ident 0x%llx\n", __func__,
5257 		ldcp->ldc_id, dp, dring_pkt.dring_ident);
5258 	D3(vswp, "%s(%lld): start %lld : end %lld : seq %lld\n", __func__,
5259 		ldcp->ldc_id, dring_pkt.start_idx, dring_pkt.end_idx,
5260 		dring_pkt.seq_num);
5261 
5262 	vsw_send_msg(ldcp, (void *)&dring_pkt, sizeof (vio_dring_msg_t));
5263 
5264 vsw_dringsend_free_exit:
5265 
5266 	mutex_exit(&dp->dlock);
5267 
5268 	/* free the message block */
5269 	freemsg(mp);
5270 
5271 	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
5272 	return (status);
5273 }
5274 
5275 /*
5276  * Send an in-band descriptor message over ldc.
5277  */
5278 static int
5279 vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp)
5280 {
5281 	vsw_t			*vswp = ldcp->ldc_vswp;
5282 	vio_ibnd_desc_t		ibnd_msg;
5283 	vsw_private_desc_t	*priv_desc = NULL;
5284 	dring_info_t		*dp = NULL;
5285 	size_t			n, size = 0;
5286 	caddr_t			bufp;
5287 	mblk_t			*bp;
5288 	int			idx, i;
5289 	int			status = LDC_TX_SUCCESS;
5290 	static int		warn_msg = 1;
5291 
5292 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
5293 
5294 	ASSERT(mp != NULL);
5295 
5296 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
5297 		(ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
5298 		DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt",
5299 			__func__, ldcp->ldc_id, ldcp->ldc_status,
5300 			ldcp->lane_out.lstate);
5301 		freemsg(mp);
5302 		return (LDC_TX_FAILURE);
5303 	}
5304 
5305 	/*
5306 	 * only expect single dring to exist, which we use
5307 	 * as an internal buffer, rather than a transfer channel.
5308 	 */
5309 	if ((dp = ldcp->lane_out.dringp) == NULL) {
5310 		DERR(vswp, "%s(%lld): no dring for outbound lane",
5311 			__func__, ldcp->ldc_id);
5312 		DERR(vswp, "%s(%lld) status(%d) state (0x%llx)",
5313 			__func__, ldcp->ldc_id, ldcp->ldc_status,
5314 			ldcp->lane_out.lstate);
5315 		freemsg(mp);
5316 		return (LDC_TX_FAILURE);
5317 	}
5318 
5319 	mutex_enter(&dp->dlock);
5320 
5321 	size = msgsize(mp);
5322 	if (size > (size_t)ETHERMAX) {
5323 		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
5324 		    ldcp->ldc_id, size);
5325 		status = LDC_TX_FAILURE;
5326 		goto vsw_descrsend_free_exit;
5327 	}
5328 
5329 	/*
5330 	 * Find a free descriptor in our buffer ring
5331 	 */
5332 	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
5333 		if (warn_msg) {
5334 			DERR(vswp, "%s(%lld): no descriptor available for ring "
5335 			"at 0x%llx", __func__, ldcp->ldc_id, dp);
5336 			warn_msg = 0;
5337 		}
5338 
5339 		/* nothing more we can do */
5340 		status = LDC_TX_NORESOURCES;
5341 		goto vsw_descrsend_free_exit;
5342 	} else {
5343 		D2(vswp, "%s(%lld): free private descriptor found at pos "
5344 			"%ld addr 0x%x\n", __func__, ldcp->ldc_id, idx,
5345 			priv_desc);
5346 		warn_msg = 1;
5347 	}
5348 
5349 	/* copy data into the descriptor */
5350 	bufp = priv_desc->datap;
5351 	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
5352 		n = MBLKL(bp);
5353 		bcopy(bp->b_rptr, bufp, n);
5354 		bufp += n;
5355 	}
5356 
5357 	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
5358 	priv_desc->dstate = VIO_DESC_READY;
5359 
5360 	/* create and send the in-band descp msg */
5361 	ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA;
5362 	ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO;
5363 	ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA;
5364 	ibnd_msg.hdr.tag.vio_sid = ldcp->local_session;
5365 
5366 	/*
5367 	 * Access to the seq_num is implicitly protected by the
5368 	 * fact that we have only one dring associated with the
5369 	 * lane currently and we hold the associated dring lock.
5370 	 */
5371 	ibnd_msg.hdr.seq_num = ldcp->lane_out.seq_num++;
5372 
5373 	/*
5374 	 * Copy the mem cookies describing the data from the
5375 	 * private region of the descriptor ring into the inband
5376 	 * descriptor.
5377 	 */
5378 	for (i = 0; i < priv_desc->ncookies; i++) {
5379 		bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i],
5380 			sizeof (ldc_mem_cookie_t));
5381 	}
5382 
5383 	ibnd_msg.hdr.desc_handle = idx;
5384 	ibnd_msg.ncookies = priv_desc->ncookies;
5385 	ibnd_msg.nbytes = size;
5386 
5387 	vsw_send_msg(ldcp, (void *)&ibnd_msg, sizeof (vio_ibnd_desc_t));
5388 
5389 vsw_descrsend_free_exit:
5390 
5391 	mutex_exit(&dp->dlock);
5392 
5393 	/* free the allocated message blocks */
5394 	freemsg(mp);
5395 
5396 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
5397 	return (status);
5398 }
5399 
5400 static void
5401 vsw_send_ver(vsw_ldc_t *ldcp)
5402 {
5403 	vsw_t		*vswp = ldcp->ldc_vswp;
5404 	lane_t		*lp = &ldcp->lane_out;
5405 	vio_ver_msg_t	ver_msg;
5406 
5407 	D1(vswp, "%s enter", __func__);
5408 
5409 	ver_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
5410 	ver_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
5411 	ver_msg.tag.vio_subtype_env = VIO_VER_INFO;
5412 	ver_msg.tag.vio_sid = ldcp->local_session;
5413 
5414 	ver_msg.ver_major = vsw_versions[0].ver_major;
5415 	ver_msg.ver_minor = vsw_versions[0].ver_minor;
5416 	ver_msg.dev_class = VDEV_NETWORK_SWITCH;
5417 
5418 	lp->lstate |= VSW_VER_INFO_SENT;
5419 	lp->ver_major = ver_msg.ver_major;
5420 	lp->ver_minor = ver_msg.ver_minor;
5421 
5422 	DUMP_TAG(ver_msg.tag);
5423 
5424 	vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t));
5425 
5426 	D1(vswp, "%s (%d): exit", __func__, ldcp->ldc_id);
5427 }
5428 
5429 static void
5430 vsw_send_attr(vsw_ldc_t *ldcp)
5431 {
5432 	vsw_t			*vswp = ldcp->ldc_vswp;
5433 	lane_t			*lp = &ldcp->lane_out;
5434 	vnet_attr_msg_t		attr_msg;
5435 
5436 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
5437 
5438 	/*
5439 	 * Subtype is set to INFO by default
5440 	 */
5441 	attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
5442 	attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
5443 	attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO;
5444 	attr_msg.tag.vio_sid = ldcp->local_session;
5445 
5446 	/* payload copied from default settings for lane */
5447 	attr_msg.mtu = lp->mtu;
5448 	attr_msg.addr_type = lp->addr_type;
5449 	attr_msg.xfer_mode = lp->xfer_mode;
5450 	attr_msg.ack_freq = lp->xfer_mode;
5451 
5452 	READ_ENTER(&vswp->if_lockrw);
5453 	bcopy(&(vswp->if_addr), &(attr_msg.addr), ETHERADDRL);
5454 	RW_EXIT(&vswp->if_lockrw);
5455 
5456 	ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT;
5457 
5458 	DUMP_TAG(attr_msg.tag);
5459 
5460 	vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t));
5461 
5462 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
5463 }
5464 
5465 /*
5466  * Create dring info msg (which also results in the creation of
5467  * a dring).
5468  */
5469 static vio_dring_reg_msg_t *
5470 vsw_create_dring_info_pkt(vsw_ldc_t *ldcp)
5471 {
5472 	vio_dring_reg_msg_t	*mp;
5473 	dring_info_t		*dp;
5474 	vsw_t			*vswp = ldcp->ldc_vswp;
5475 
5476 	D1(vswp, "vsw_create_dring_info_pkt enter\n");
5477 
5478 	/*
5479 	 * If we can't create a dring, obviously no point sending
5480 	 * a message.
5481 	 */
5482 	if ((dp = vsw_create_dring(ldcp)) == NULL)
5483 		return (NULL);
5484 
5485 	mp = kmem_zalloc(sizeof (vio_dring_reg_msg_t), KM_SLEEP);
5486 
5487 	mp->tag.vio_msgtype = VIO_TYPE_CTRL;
5488 	mp->tag.vio_subtype = VIO_SUBTYPE_INFO;
5489 	mp->tag.vio_subtype_env = VIO_DRING_REG;
5490 	mp->tag.vio_sid = ldcp->local_session;
5491 
5492 	/* payload */
5493 	mp->num_descriptors = dp->num_descriptors;
5494 	mp->descriptor_size = dp->descriptor_size;
5495 	mp->options = dp->options;
5496 	mp->ncookies = dp->ncookies;
5497 	bcopy(&dp->cookie[0], &mp->cookie[0], sizeof (ldc_mem_cookie_t));
5498 
5499 	mp->dring_ident = 0;
5500 
5501 	D1(vswp, "vsw_create_dring_info_pkt exit\n");
5502 
5503 	return (mp);
5504 }
5505 
5506 static void
5507 vsw_send_dring_info(vsw_ldc_t *ldcp)
5508 {
5509 	vio_dring_reg_msg_t	*dring_msg;
5510 	vsw_t			*vswp = ldcp->ldc_vswp;
5511 
5512 	D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id);
5513 
5514 	dring_msg = vsw_create_dring_info_pkt(ldcp);
5515 	if (dring_msg == NULL) {
5516 		cmn_err(CE_WARN, "vsw_send_dring_info: error creating msg");
5517 		return;
5518 	}
5519 
5520 	ldcp->lane_out.lstate |= VSW_DRING_INFO_SENT;
5521 
5522 	DUMP_TAG_PTR((vio_msg_tag_t *)dring_msg);
5523 
5524 	vsw_send_msg(ldcp, dring_msg,
5525 		sizeof (vio_dring_reg_msg_t));
5526 
5527 	kmem_free(dring_msg, sizeof (vio_dring_reg_msg_t));
5528 
5529 	D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id);
5530 }
5531 
5532 static void
5533 vsw_send_rdx(vsw_ldc_t *ldcp)
5534 {
5535 	vsw_t		*vswp = ldcp->ldc_vswp;
5536 	vio_rdx_msg_t	rdx_msg;
5537 
5538 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
5539 
5540 	rdx_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
5541 	rdx_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
5542 	rdx_msg.tag.vio_subtype_env = VIO_RDX;
5543 	rdx_msg.tag.vio_sid = ldcp->local_session;
5544 
5545 	ldcp->lane_out.lstate |= VSW_RDX_INFO_SENT;
5546 
5547 	DUMP_TAG(rdx_msg.tag);
5548 
5549 	vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t));
5550 
5551 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
5552 }
5553 
5554 /*
5555  * Generic routine to send message out over ldc channel.
5556  */
5557 static void
5558 vsw_send_msg(vsw_ldc_t *ldcp, void *msgp, int size)
5559 {
5560 	int		rv;
5561 	size_t		msglen = size;
5562 	vio_msg_tag_t	*tag = (vio_msg_tag_t *)msgp;
5563 	vsw_t		*vswp = ldcp->ldc_vswp;
5564 
5565 	D1(vswp, "vsw_send_msg (%lld) enter : sending %d bytes",
5566 			ldcp->ldc_id, size);
5567 
5568 	D2(vswp, "send_msg: type 0x%llx", tag->vio_msgtype);
5569 	D2(vswp, "send_msg: stype 0x%llx", tag->vio_subtype);
5570 	D2(vswp, "send_msg: senv 0x%llx", tag->vio_subtype_env);
5571 
5572 	mutex_enter(&ldcp->ldc_txlock);
5573 	do {
5574 		msglen = size;
5575 		rv = ldc_write(ldcp->ldc_handle, (caddr_t)msgp, &msglen);
5576 	} while (rv == EWOULDBLOCK && --vsw_wretries > 0);
5577 
5578 	mutex_exit(&ldcp->ldc_txlock);
5579 
5580 	if ((rv != 0) || (msglen != size)) {
5581 		DERR(vswp, "vsw_send_msg:ldc_write failed: chan(%lld) "
5582 			"rv(%d) size (%d) msglen(%d)\n", ldcp->ldc_id,
5583 			rv, size, msglen);
5584 	}
5585 
5586 	D1(vswp, "vsw_send_msg (%lld) exit : sent %d bytes",
5587 			ldcp->ldc_id, msglen);
5588 }
5589 
5590 /*
5591  * Add an entry into FDB, for the given mac address and port_id.
5592  * Returns 0 on success, 1 on failure.
5593  *
5594  * Lock protecting FDB must be held by calling process.
5595  */
5596 static int
5597 vsw_add_fdb(vsw_t *vswp, vsw_port_t *port)
5598 {
5599 	uint64_t	addr = 0;
5600 
5601 	D1(vswp, "%s: enter", __func__);
5602 
5603 	KEY_HASH(addr, port->p_macaddr);
5604 
5605 	D2(vswp, "%s: key = 0x%llx", __func__, addr);
5606 
5607 	/*
5608 	 * Note: duplicate keys will be rejected by mod_hash.
5609 	 */
5610 	if (mod_hash_insert(vswp->fdb, (mod_hash_key_t)addr,
5611 				(mod_hash_val_t)port) != 0) {
5612 		DERR(vswp, "%s: unable to add entry into fdb.", __func__);
5613 		return (1);
5614 	}
5615 
5616 	D1(vswp, "%s: exit", __func__);
5617 	return (0);
5618 }
5619 
5620 /*
5621  * Remove an entry from FDB.
5622  * Returns 0 on success, 1 on failure.
5623  */
5624 static int
5625 vsw_del_fdb(vsw_t *vswp, vsw_port_t *port)
5626 {
5627 	uint64_t	addr = 0;
5628 
5629 	D1(vswp, "%s: enter", __func__);
5630 
5631 	KEY_HASH(addr, port->p_macaddr);
5632 
5633 	D2(vswp, "%s: key = 0x%llx", __func__, addr);
5634 
5635 	(void) mod_hash_destroy(vswp->fdb, (mod_hash_val_t)addr);
5636 
5637 	D1(vswp, "%s: enter", __func__);
5638 
5639 	return (0);
5640 }
5641 
5642 /*
5643  * Search fdb for a given mac address.
5644  * Returns pointer to the entry if found, else returns NULL.
5645  */
5646 static vsw_port_t *
5647 vsw_lookup_fdb(vsw_t *vswp, struct ether_header *ehp)
5648 {
5649 	uint64_t	key = 0;
5650 	vsw_port_t	*port = NULL;
5651 
5652 	D1(vswp, "%s: enter", __func__);
5653 
5654 	KEY_HASH(key, ehp->ether_dhost);
5655 
5656 	D2(vswp, "%s: key = 0x%llx", __func__, key);
5657 
5658 	if (mod_hash_find(vswp->fdb, (mod_hash_key_t)key,
5659 				(mod_hash_val_t *)&port) != 0) {
5660 		return (NULL);
5661 	}
5662 
5663 	D1(vswp, "%s: exit", __func__);
5664 
5665 	return (port);
5666 }
5667 
5668 /*
5669  * Add or remove multicast address(es).
5670  *
5671  * Returns 0 on success, 1 on failure.
5672  */
5673 static int
5674 vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port)
5675 {
5676 	mcst_addr_t		*mcst_p = NULL;
5677 	vsw_t			*vswp = port->p_vswp;
5678 	uint64_t		addr = 0x0;
5679 	int			i;
5680 
5681 	D1(vswp, "%s: enter", __func__);
5682 
5683 	D2(vswp, "%s: %d addresses", __func__, mcst_pkt->count);
5684 
5685 	for (i = 0; i < mcst_pkt->count; i++) {
5686 		/*
5687 		 * Convert address into form that can be used
5688 		 * as hash table key.
5689 		 */
5690 		KEY_HASH(addr, mcst_pkt->mca[i]);
5691 
5692 		/*
5693 		 * Add or delete the specified address/port combination.
5694 		 */
5695 		if (mcst_pkt->set == 0x1) {
5696 			D3(vswp, "%s: adding multicast address 0x%llx for "
5697 				"port %ld", __func__, addr, port->p_instance);
5698 			if (vsw_add_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
5699 				/*
5700 				 * Update the list of multicast
5701 				 * addresses contained within the
5702 				 * port structure to include this new
5703 				 * one.
5704 				 */
5705 				mcst_p = kmem_alloc(sizeof (mcst_addr_t),
5706 								KM_NOSLEEP);
5707 				if (mcst_p == NULL) {
5708 					DERR(vswp, "%s: unable to alloc mem",
5709 						__func__);
5710 					return (1);
5711 				}
5712 
5713 				mcst_p->nextp = NULL;
5714 				mcst_p->addr = addr;
5715 
5716 				mutex_enter(&port->mca_lock);
5717 				mcst_p->nextp = port->mcap;
5718 				port->mcap = mcst_p;
5719 				mutex_exit(&port->mca_lock);
5720 
5721 				/*
5722 				 * Program the address into HW. If the addr
5723 				 * has already been programmed then the MAC
5724 				 * just increments a ref counter (which is
5725 				 * used when the address is being deleted)
5726 				 *
5727 				 * Note:
5728 				 * For the moment we dont care if this
5729 				 * succeeds because the card must be in
5730 				 * promics mode. When we have the ability
5731 				 * to program multiple unicst address into
5732 				 * the card then we will need to check this
5733 				 * return value.
5734 				 */
5735 				if (vswp->mh != NULL)
5736 					(void) mac_multicst_add(vswp->mh,
5737 						(uchar_t *)&mcst_pkt->mca[i]);
5738 
5739 			} else {
5740 				DERR(vswp, "%s: error adding multicast "
5741 					"address 0x%llx for port %ld",
5742 					__func__, addr, port->p_instance);
5743 				return (1);
5744 			}
5745 		} else {
5746 			/*
5747 			 * Delete an entry from the multicast hash
5748 			 * table and update the address list
5749 			 * appropriately.
5750 			 */
5751 			if (vsw_del_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
5752 				D3(vswp, "%s: deleting multicast address "
5753 					"0x%llx for port %ld", __func__, addr,
5754 					port->p_instance);
5755 
5756 				vsw_del_addr(VSW_VNETPORT, port, addr);
5757 
5758 				/*
5759 				 * Remove the address from HW. The address
5760 				 * will actually only be removed once the ref
5761 				 * count within the MAC layer has dropped to
5762 				 * zero. I.e. we can safely call this fn even
5763 				 * if other ports are interested in this
5764 				 * address.
5765 				 */
5766 				if (vswp->mh != NULL)
5767 					(void) mac_multicst_remove(vswp->mh,
5768 						(uchar_t *)&mcst_pkt->mca[i]);
5769 
5770 			} else {
5771 				DERR(vswp, "%s: error deleting multicast "
5772 					"addr 0x%llx for port %ld",
5773 					__func__, addr, port->p_instance);
5774 				return (1);
5775 			}
5776 		}
5777 	}
5778 	D1(vswp, "%s: exit", __func__);
5779 	return (0);
5780 }
5781 
5782 /*
5783  * Add a new multicast entry.
5784  *
5785  * Search hash table based on address. If match found then
5786  * update associated val (which is chain of ports), otherwise
5787  * create new key/val (addr/port) pair and insert into table.
5788  */
5789 static int
5790 vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
5791 {
5792 	int		dup = 0;
5793 	int		rv = 0;
5794 	mfdb_ent_t	*ment = NULL;
5795 	mfdb_ent_t	*tmp_ent = NULL;
5796 	mfdb_ent_t	*new_ent = NULL;
5797 	void		*tgt = NULL;
5798 
5799 	if (devtype == VSW_VNETPORT) {
5800 		/*
5801 		 * Being invoked from a vnet.
5802 		 */
5803 		ASSERT(arg != NULL);
5804 		tgt = arg;
5805 		D2(NULL, "%s: port %d : address 0x%llx", __func__,
5806 			((vsw_port_t *)arg)->p_instance, addr);
5807 	} else {
5808 		/*
5809 		 * We are being invoked via the m_multicst mac entry
5810 		 * point.
5811 		 */
5812 		D2(NULL, "%s: address 0x%llx", __func__, addr);
5813 		tgt = (void *)vswp;
5814 	}
5815 
5816 	WRITE_ENTER(&vswp->mfdbrw);
5817 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
5818 				(mod_hash_val_t *)&ment) != 0) {
5819 
5820 		/* address not currently in table */
5821 		ment = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
5822 		ment->d_addr = (void *)tgt;
5823 		ment->d_type = devtype;
5824 		ment->nextp = NULL;
5825 
5826 		if (mod_hash_insert(vswp->mfdb, (mod_hash_key_t)addr,
5827 			(mod_hash_val_t)ment) != 0) {
5828 			DERR(vswp, "%s: hash table insertion failed", __func__);
5829 			kmem_free(ment, sizeof (mfdb_ent_t));
5830 			rv = 1;
5831 		} else {
5832 			D2(vswp, "%s: added initial entry for 0x%llx to "
5833 				"table", __func__, addr);
5834 		}
5835 	} else {
5836 		/*
5837 		 * Address in table. Check to see if specified port
5838 		 * is already associated with the address. If not add
5839 		 * it now.
5840 		 */
5841 		tmp_ent = ment;
5842 		while (tmp_ent != NULL) {
5843 			if (tmp_ent->d_addr == (void *)tgt) {
5844 				if (devtype == VSW_VNETPORT) {
5845 					DERR(vswp, "%s: duplicate port entry "
5846 						"found for portid %ld and key "
5847 						"0x%llx", __func__,
5848 						((vsw_port_t *)arg)->p_instance,
5849 						addr);
5850 				} else {
5851 					DERR(vswp, "%s: duplicate entry found"
5852 						"for key 0x%llx",
5853 						__func__, addr);
5854 				}
5855 				rv = 1;
5856 				dup = 1;
5857 				break;
5858 			}
5859 			tmp_ent = tmp_ent->nextp;
5860 		}
5861 
5862 		/*
5863 		 * Port not on list so add it to end now.
5864 		 */
5865 		if (0 == dup) {
5866 			D2(vswp, "%s: added entry for 0x%llx to table",
5867 				__func__, addr);
5868 			new_ent = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
5869 			new_ent->d_addr = (void *)tgt;
5870 			new_ent->d_type = devtype;
5871 			new_ent->nextp = NULL;
5872 
5873 			tmp_ent = ment;
5874 			while (tmp_ent->nextp != NULL)
5875 				tmp_ent = tmp_ent->nextp;
5876 
5877 			tmp_ent->nextp = new_ent;
5878 		}
5879 	}
5880 
5881 	RW_EXIT(&vswp->mfdbrw);
5882 	return (rv);
5883 }
5884 
5885 /*
5886  * Remove a multicast entry from the hashtable.
5887  *
5888  * Search hash table based on address. If match found, scan
5889  * list of ports associated with address. If specified port
5890  * found remove it from list.
5891  */
5892 static int
5893 vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
5894 {
5895 	mfdb_ent_t	*ment = NULL;
5896 	mfdb_ent_t	*curr_p, *prev_p;
5897 	void		*tgt = NULL;
5898 
5899 	D1(vswp, "%s: enter", __func__);
5900 
5901 	if (devtype == VSW_VNETPORT) {
5902 		tgt = (vsw_port_t *)arg;
5903 		D2(vswp, "%s: removing port %d from mFDB for address"
5904 			" 0x%llx", __func__, ((vsw_port_t *)tgt)->p_instance,
5905 			addr);
5906 	} else {
5907 		D2(vswp, "%s: removing entry", __func__);
5908 		tgt = (void *)vswp;
5909 	}
5910 
5911 	WRITE_ENTER(&vswp->mfdbrw);
5912 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
5913 				(mod_hash_val_t *)&ment) != 0) {
5914 		D2(vswp, "%s: address 0x%llx not in table", __func__, addr);
5915 		RW_EXIT(&vswp->mfdbrw);
5916 		return (1);
5917 	}
5918 
5919 	prev_p = curr_p = ment;
5920 
5921 	while (curr_p != NULL) {
5922 		if (curr_p->d_addr == (void *)tgt) {
5923 			if (devtype == VSW_VNETPORT) {
5924 				D2(vswp, "%s: port %d found", __func__,
5925 					((vsw_port_t *)tgt)->p_instance);
5926 			} else {
5927 				D2(vswp, "%s: instance found", __func__);
5928 			}
5929 
5930 			if (prev_p == curr_p) {
5931 				/*
5932 				 * head of list, if no other element is in
5933 				 * list then destroy this entry, otherwise
5934 				 * just replace it with updated value.
5935 				 */
5936 				ment = curr_p->nextp;
5937 				kmem_free(curr_p, sizeof (mfdb_ent_t));
5938 				if (ment == NULL) {
5939 					(void) mod_hash_destroy(vswp->mfdb,
5940 							(mod_hash_val_t)addr);
5941 				} else {
5942 					(void) mod_hash_replace(vswp->mfdb,
5943 							(mod_hash_key_t)addr,
5944 							(mod_hash_val_t)ment);
5945 				}
5946 			} else {
5947 				/*
5948 				 * Not head of list, no need to do
5949 				 * replacement, just adjust list pointers.
5950 				 */
5951 				prev_p->nextp = curr_p->nextp;
5952 				kmem_free(curr_p, sizeof (mfdb_ent_t));
5953 			}
5954 			break;
5955 		}
5956 
5957 		prev_p = curr_p;
5958 		curr_p = curr_p->nextp;
5959 	}
5960 
5961 	RW_EXIT(&vswp->mfdbrw);
5962 
5963 	D1(vswp, "%s: exit", __func__);
5964 
5965 	return (0);
5966 }
5967 
5968 /*
5969  * Port is being deleted, but has registered an interest in one
5970  * or more multicast groups. Using the list of addresses maintained
5971  * within the port structure find the appropriate entry in the hash
5972  * table and remove this port from the list of interested ports.
5973  */
5974 static void
5975 vsw_del_mcst_port(vsw_port_t *port)
5976 {
5977 	mcst_addr_t	*mcst_p = NULL;
5978 	vsw_t		*vswp = port->p_vswp;
5979 
5980 	D1(vswp, "%s: enter", __func__);
5981 
5982 	mutex_enter(&port->mca_lock);
5983 	while (port->mcap != NULL) {
5984 		(void) vsw_del_mcst(vswp, VSW_VNETPORT,
5985 					port->mcap->addr, port);
5986 
5987 		mcst_p = port->mcap->nextp;
5988 		kmem_free(port->mcap, sizeof (mcst_addr_t));
5989 		port->mcap = mcst_p;
5990 	}
5991 	mutex_exit(&port->mca_lock);
5992 
5993 	D1(vswp, "%s: exit", __func__);
5994 }
5995 
5996 /*
5997  * This vsw instance is detaching, but has registered an interest in one
5998  * or more multicast groups. Using the list of addresses maintained
5999  * within the vsw structure find the appropriate entry in the hash
6000  * table and remove this instance from the list of interested ports.
6001  */
6002 static void
6003 vsw_del_mcst_vsw(vsw_t *vswp)
6004 {
6005 	mcst_addr_t	*next_p = NULL;
6006 
6007 	D1(vswp, "%s: enter", __func__);
6008 
6009 	mutex_enter(&vswp->mca_lock);
6010 
6011 	while (vswp->mcap != NULL) {
6012 		DERR(vswp, "%s: deleting addr 0x%llx",
6013 			__func__, vswp->mcap->addr);
6014 		(void) vsw_del_mcst(vswp, VSW_LOCALDEV,
6015 				vswp->mcap->addr, NULL);
6016 
6017 		next_p = vswp->mcap->nextp;
6018 		kmem_free(vswp->mcap, sizeof (mcst_addr_t));
6019 		vswp->mcap = next_p;
6020 	}
6021 
6022 	vswp->mcap = NULL;
6023 	mutex_exit(&vswp->mca_lock);
6024 
6025 	D1(vswp, "%s: exit", __func__);
6026 }
6027 
6028 
6029 /*
6030  * Remove the specified address from the list of address maintained
6031  * in this port node.
6032  */
6033 static void
6034 vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr)
6035 {
6036 	vsw_t		*vswp = NULL;
6037 	vsw_port_t	*port = NULL;
6038 	mcst_addr_t	*prev_p = NULL;
6039 	mcst_addr_t	*curr_p = NULL;
6040 
6041 	D1(NULL, "%s: enter : devtype %d : addr 0x%llx",
6042 		__func__, devtype, addr);
6043 
6044 	if (devtype == VSW_VNETPORT) {
6045 		port = (vsw_port_t *)arg;
6046 		mutex_enter(&port->mca_lock);
6047 		prev_p = curr_p = port->mcap;
6048 	} else {
6049 		vswp = (vsw_t *)arg;
6050 		mutex_enter(&vswp->mca_lock);
6051 		prev_p = curr_p = vswp->mcap;
6052 	}
6053 
6054 	while (curr_p != NULL) {
6055 		if (curr_p->addr == addr) {
6056 			D2(NULL, "%s: address found", __func__);
6057 			/* match found */
6058 			if (prev_p == curr_p) {
6059 				/* list head */
6060 				if (devtype == VSW_VNETPORT)
6061 					port->mcap = curr_p->nextp;
6062 				else
6063 					vswp->mcap = curr_p->nextp;
6064 			} else {
6065 				prev_p->nextp = curr_p->nextp;
6066 			}
6067 			kmem_free(curr_p, sizeof (mcst_addr_t));
6068 			break;
6069 		} else {
6070 			prev_p = curr_p;
6071 			curr_p = curr_p->nextp;
6072 		}
6073 	}
6074 
6075 	if (devtype == VSW_VNETPORT)
6076 		mutex_exit(&port->mca_lock);
6077 	else
6078 		mutex_exit(&vswp->mca_lock);
6079 
6080 	D1(NULL, "%s: exit", __func__);
6081 }
6082 
6083 /*
6084  * Creates a descriptor ring (dring) and links it into the
6085  * link of outbound drings for this channel.
6086  *
6087  * Returns NULL if creation failed.
6088  */
6089 static dring_info_t *
6090 vsw_create_dring(vsw_ldc_t *ldcp)
6091 {
6092 	vsw_private_desc_t	*priv_addr = NULL;
6093 	vsw_t			*vswp = ldcp->ldc_vswp;
6094 	ldc_mem_info_t		minfo;
6095 	dring_info_t		*dp, *tp;
6096 	int			i;
6097 
6098 	dp = (dring_info_t *)kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
6099 
6100 	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
6101 
6102 	/* create public section of ring */
6103 	if ((ldc_mem_dring_create(VSW_RING_NUM_EL,
6104 			VSW_PUB_SIZE, &dp->handle)) != 0) {
6105 
6106 		DERR(vswp, "vsw_create_dring(%lld): ldc dring create "
6107 			"failed", ldcp->ldc_id);
6108 		goto create_fail_exit;
6109 	}
6110 
6111 	ASSERT(dp->handle != NULL);
6112 
6113 	/*
6114 	 * Get the base address of the public section of the ring.
6115 	 */
6116 	if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
6117 		DERR(vswp, "vsw_create_dring(%lld): dring info failed\n",
6118 			ldcp->ldc_id);
6119 		goto dring_fail_exit;
6120 	} else {
6121 		ASSERT(minfo.vaddr != 0);
6122 		dp->pub_addr = minfo.vaddr;
6123 	}
6124 
6125 	dp->num_descriptors = VSW_RING_NUM_EL;
6126 	dp->descriptor_size = VSW_PUB_SIZE;
6127 	dp->options = VIO_TX_DRING;
6128 	dp->ncookies = 1;	/* guaranteed by ldc */
6129 
6130 	/*
6131 	 * create private portion of ring
6132 	 */
6133 	dp->priv_addr = (vsw_private_desc_t *)kmem_zalloc(
6134 		(sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL), KM_SLEEP);
6135 
6136 	if (vsw_setup_ring(ldcp, dp)) {
6137 		DERR(vswp, "%s: unable to setup ring", __func__);
6138 		goto dring_fail_exit;
6139 	}
6140 
6141 	/* haven't used any descriptors yet */
6142 	dp->end_idx = 0;
6143 
6144 	/* bind dring to the channel */
6145 	if ((ldc_mem_dring_bind(ldcp->ldc_handle, dp->handle,
6146 		LDC_SHADOW_MAP, LDC_MEM_RW,
6147 		&dp->cookie[0], &dp->ncookies)) != 0) {
6148 		DERR(vswp, "vsw_create_dring: unable to bind to channel "
6149 			"%lld", ldcp->ldc_id);
6150 		goto dring_fail_exit;
6151 	}
6152 
6153 	/*
6154 	 * Only ever create rings for outgoing lane. Link it onto
6155 	 * end of list.
6156 	 */
6157 	if (ldcp->lane_out.dringp == NULL) {
6158 		D2(vswp, "vsw_create_dring: adding first outbound ring");
6159 		ldcp->lane_out.dringp = dp;
6160 	} else {
6161 		tp = ldcp->lane_out.dringp;
6162 		while (tp->next != NULL)
6163 			tp = tp->next;
6164 
6165 		tp->next = dp;
6166 	}
6167 
6168 	return (dp);
6169 
6170 dring_fail_exit:
6171 	(void) ldc_mem_dring_destroy(dp->handle);
6172 
6173 create_fail_exit:
6174 	if (dp->priv_addr != NULL) {
6175 		priv_addr = dp->priv_addr;
6176 		for (i = 0; i < VSW_RING_NUM_EL; i++) {
6177 			if (priv_addr->memhandle != NULL)
6178 				(void) ldc_mem_free_handle(
6179 						priv_addr->memhandle);
6180 			priv_addr++;
6181 		}
6182 		kmem_free(dp->priv_addr,
6183 			(sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL));
6184 	}
6185 	mutex_destroy(&dp->dlock);
6186 
6187 	kmem_free(dp, sizeof (dring_info_t));
6188 	return (NULL);
6189 }
6190 
6191 /*
6192  * Create a ring consisting of just a private portion and link
6193  * it into the list of rings for the outbound lane.
6194  *
6195  * These type of rings are used primarily for temporary data
6196  * storage (i.e. as data buffers).
6197  */
6198 void
6199 vsw_create_privring(vsw_ldc_t *ldcp)
6200 {
6201 	dring_info_t		*dp, *tp;
6202 	vsw_t			*vswp = ldcp->ldc_vswp;
6203 
6204 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
6205 
6206 	dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
6207 
6208 	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
6209 
6210 	/* no public section */
6211 	dp->pub_addr = NULL;
6212 
6213 	dp->priv_addr = kmem_zalloc((sizeof (vsw_private_desc_t) *
6214 					VSW_RING_NUM_EL), KM_SLEEP);
6215 
6216 	if (vsw_setup_ring(ldcp, dp)) {
6217 		DERR(vswp, "%s: setup of ring failed", __func__);
6218 		kmem_free(dp->priv_addr,
6219 			(sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL));
6220 		mutex_destroy(&dp->dlock);
6221 		kmem_free(dp, sizeof (dring_info_t));
6222 		return;
6223 	}
6224 
6225 	/* haven't used any descriptors yet */
6226 	dp->end_idx = 0;
6227 
6228 	/*
6229 	 * Only ever create rings for outgoing lane. Link it onto
6230 	 * end of list.
6231 	 */
6232 	if (ldcp->lane_out.dringp == NULL) {
6233 		D2(vswp, "%s: adding first outbound privring", __func__);
6234 		ldcp->lane_out.dringp = dp;
6235 	} else {
6236 		tp = ldcp->lane_out.dringp;
6237 		while (tp->next != NULL)
6238 			tp = tp->next;
6239 
6240 		tp->next = dp;
6241 	}
6242 
6243 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
6244 }
6245 
6246 /*
6247  * Setup the descriptors in the dring. Returns 0 on success, 1 on
6248  * failure.
6249  */
6250 int
6251 vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp)
6252 {
6253 	vnet_public_desc_t	*pub_addr = NULL;
6254 	vsw_private_desc_t	*priv_addr = NULL;
6255 	vsw_t			*vswp = ldcp->ldc_vswp;
6256 	uint64_t		*tmpp;
6257 	uint64_t		offset = 0;
6258 	uint32_t		ncookies = 0;
6259 	static char		*name = "vsw_setup_ring";
6260 	int			i, j, rv;
6261 
6262 	/* note - public section may be null */
6263 	priv_addr = dp->priv_addr;
6264 	pub_addr = dp->pub_addr;
6265 
6266 	/*
6267 	 * Allocate the region of memory which will be used to hold
6268 	 * the data the descriptors will refer to.
6269 	 */
6270 	dp->data_sz = (VSW_RING_NUM_EL * VSW_RING_EL_DATA_SZ);
6271 	dp->data_addr = kmem_alloc(dp->data_sz, KM_SLEEP);
6272 
6273 	D2(vswp, "%s: allocated %lld bytes at 0x%llx\n", name,
6274 		dp->data_sz, dp->data_addr);
6275 
6276 	tmpp = (uint64_t *)dp->data_addr;
6277 	offset = VSW_RING_EL_DATA_SZ / sizeof (tmpp);
6278 
6279 	/*
6280 	 * Initialise some of the private and public (if they exist)
6281 	 * descriptor fields.
6282 	 */
6283 	for (i = 0; i < VSW_RING_NUM_EL; i++) {
6284 		if ((ldc_mem_alloc_handle(ldcp->ldc_handle,
6285 			&priv_addr->memhandle)) != 0) {
6286 			DERR(vswp, "%s: alloc mem handle failed", name);
6287 			goto setup_ring_cleanup;
6288 		}
6289 
6290 		priv_addr->datap = (void *)tmpp;
6291 
6292 		rv = ldc_mem_bind_handle(priv_addr->memhandle,
6293 			(caddr_t)priv_addr->datap, VSW_RING_EL_DATA_SZ,
6294 			LDC_SHADOW_MAP, LDC_MEM_R|LDC_MEM_W,
6295 			&(priv_addr->memcookie[0]), &ncookies);
6296 		if (rv != 0) {
6297 			DERR(vswp, "%s(%lld): ldc_mem_bind_handle failed "
6298 				"(rv %d)", name, ldcp->ldc_id, rv);
6299 			goto setup_ring_cleanup;
6300 		}
6301 		priv_addr->bound = 1;
6302 
6303 		D2(vswp, "%s: %d: memcookie 0 : addr 0x%llx : size 0x%llx",
6304 			name, i, priv_addr->memcookie[0].addr,
6305 			priv_addr->memcookie[0].size);
6306 
6307 		if (ncookies >= (uint32_t)(VSW_MAX_COOKIES + 1)) {
6308 			DERR(vswp, "%s(%lld) ldc_mem_bind_handle returned "
6309 				"invalid num of cookies (%d) for size 0x%llx",
6310 				name, ldcp->ldc_id, ncookies,
6311 				VSW_RING_EL_DATA_SZ);
6312 
6313 			goto setup_ring_cleanup;
6314 		} else {
6315 			for (j = 1; j < ncookies; j++) {
6316 				rv = ldc_mem_nextcookie(priv_addr->memhandle,
6317 					&(priv_addr->memcookie[j]));
6318 				if (rv != 0) {
6319 					DERR(vswp, "%s: ldc_mem_nextcookie "
6320 						"failed rv (%d)", name, rv);
6321 					goto setup_ring_cleanup;
6322 				}
6323 				D3(vswp, "%s: memcookie %d : addr 0x%llx : "
6324 					"size 0x%llx", name, j,
6325 					priv_addr->memcookie[j].addr,
6326 					priv_addr->memcookie[j].size);
6327 			}
6328 
6329 		}
6330 		priv_addr->ncookies = ncookies;
6331 		priv_addr->dstate = VIO_DESC_FREE;
6332 
6333 		if (pub_addr != NULL) {
6334 
6335 			/* link pub and private sides */
6336 			priv_addr->descp = pub_addr;
6337 
6338 			pub_addr->hdr.dstate = VIO_DESC_FREE;
6339 			pub_addr++;
6340 		}
6341 
6342 		/*
6343 		 * move to next element in the dring and the next
6344 		 * position in the data buffer.
6345 		 */
6346 		priv_addr++;
6347 		tmpp += offset;
6348 	}
6349 
6350 	return (0);
6351 
6352 setup_ring_cleanup:
6353 	priv_addr = dp->priv_addr;
6354 
6355 	for (i = 0; i < VSW_RING_NUM_EL; i++) {
6356 		(void) ldc_mem_unbind_handle(priv_addr->memhandle);
6357 		(void) ldc_mem_free_handle(priv_addr->memhandle);
6358 
6359 		priv_addr++;
6360 	}
6361 	kmem_free(dp->data_addr, dp->data_sz);
6362 
6363 	return (1);
6364 }
6365 
6366 /*
6367  * Searches the private section of a ring for a free descriptor,
6368  * starting at the location of the last free descriptor found
6369  * previously.
6370  *
6371  * Returns 0 if free descriptor is available, 1 otherwise.
6372  *
6373  * FUTURE: might need to return contiguous range of descriptors
6374  * as dring info msg assumes all will be contiguous.
6375  */
6376 static int
6377 vsw_dring_find_free_desc(dring_info_t *dringp,
6378 		vsw_private_desc_t **priv_p, int *idx)
6379 {
6380 	vsw_private_desc_t	*addr;
6381 	uint64_t		i;
6382 	uint64_t		j = 0;
6383 	uint64_t		start = dringp->end_idx;
6384 	int			num = VSW_RING_NUM_EL;
6385 	int			ret = 1;
6386 
6387 	D1(NULL, "%s enter\n", __func__);
6388 
6389 	addr = dringp->priv_addr;
6390 
6391 	D2(NULL, "%s: searching ring, dringp 0x%llx : start pos %lld",
6392 			__func__, dringp, start);
6393 
6394 	for (i = start; j < num; i = (i + 1) % num, j++) {
6395 		addr = (vsw_private_desc_t *)dringp->priv_addr + i;
6396 		D2(NULL, "%s: descriptor %lld : dstate 0x%llx\n",
6397 			__func__, i, addr->dstate);
6398 		if (addr->dstate == VIO_DESC_FREE) {
6399 			D2(NULL, "%s: descriptor %lld is available",
6400 								__func__, i);
6401 			*priv_p = addr;
6402 			*idx = i;
6403 			dringp->end_idx = (i + 1) % num;
6404 			ret = 0;
6405 			break;
6406 		}
6407 	}
6408 
6409 	/* ring full */
6410 	if (ret == 1) {
6411 		D2(NULL, "%s: no desp free: started at %d", __func__, start);
6412 	}
6413 
6414 	D1(NULL, "%s: exit\n", __func__);
6415 
6416 	return (ret);
6417 }
6418 
6419 /*
6420  * Copy relevant fields from the private descriptor into the
6421  * associated public side.
6422  */
6423 static void
6424 vsw_dring_priv2pub(vsw_private_desc_t *priv)
6425 {
6426 	vnet_public_desc_t	*pub;
6427 	int			i;
6428 
6429 	D1(NULL, "vsw_dring_priv2pub enter\n");
6430 
6431 	pub = priv->descp;
6432 
6433 	pub->ncookies = priv->ncookies;
6434 	pub->nbytes = priv->datalen;
6435 
6436 	for (i = 0; i < pub->ncookies; i++) {
6437 		bcopy(&priv->memcookie[i], &pub->memcookie[i],
6438 			sizeof (ldc_mem_cookie_t));
6439 	}
6440 
6441 	pub->hdr.ack = 1;
6442 	pub->hdr.dstate = VIO_DESC_READY;
6443 
6444 	D1(NULL, "vsw_dring_priv2pub exit");
6445 }
6446 
6447 /*
6448  * Map from a dring identifier to the ring itself. Returns
6449  * pointer to ring or NULL if no match found.
6450  */
6451 static dring_info_t *
6452 vsw_ident2dring(lane_t *lane, uint64_t ident)
6453 {
6454 	dring_info_t	*dp = NULL;
6455 
6456 	if ((dp = lane->dringp) == NULL) {
6457 		return (NULL);
6458 	} else {
6459 		if (dp->ident == ident)
6460 			return (dp);
6461 
6462 		while (dp != NULL) {
6463 			if (dp->ident == ident)
6464 				break;
6465 			dp = dp->next;
6466 		}
6467 	}
6468 
6469 	return (dp);
6470 }
6471 
6472 /*
6473  * Set the default lane attributes. These are copied into
6474  * the attr msg we send to our peer. If they are not acceptable
6475  * then (currently) the handshake ends.
6476  */
6477 static void
6478 vsw_set_lane_attr(vsw_t *vswp, lane_t *lp)
6479 {
6480 	bzero(lp, sizeof (lane_t));
6481 
6482 	READ_ENTER(&vswp->if_lockrw);
6483 	ether_copy(&(vswp->if_addr), &(lp->addr));
6484 	RW_EXIT(&vswp->if_lockrw);
6485 
6486 	lp->mtu = VSW_MTU;
6487 	lp->addr_type = ADDR_TYPE_MAC;
6488 	lp->xfer_mode = VIO_DRING_MODE;
6489 	lp->ack_freq = 0;	/* for shared mode */
6490 	lp->seq_num = VNET_ISS;
6491 }
6492 
6493 /*
6494  * Verify that the attributes are acceptable.
6495  *
6496  * FUTURE: If some attributes are not acceptable, change them
6497  * our desired values.
6498  */
6499 static int
6500 vsw_check_attr(vnet_attr_msg_t *pkt, vsw_port_t *port)
6501 {
6502 	int	ret = 0;
6503 
6504 	D1(NULL, "vsw_check_attr enter\n");
6505 
6506 	/*
6507 	 * Note we currently only support in-band descriptors
6508 	 * and descriptor rings, not packet based transfer (VIO_PKT_MODE)
6509 	 */
6510 	if ((pkt->xfer_mode != VIO_DESC_MODE) &&
6511 			(pkt->xfer_mode != VIO_DRING_MODE)) {
6512 		D2(NULL, "vsw_check_attr: unknown mode %x\n",
6513 			pkt->xfer_mode);
6514 		ret = 1;
6515 	}
6516 
6517 	/* Only support MAC addresses at moment. */
6518 	if ((pkt->addr_type != ADDR_TYPE_MAC) || (pkt->addr == 0)) {
6519 		D2(NULL, "vsw_check_attr: invalid addr_type %x, "
6520 			"or address 0x%llx\n", pkt->addr_type,
6521 			pkt->addr);
6522 		ret = 1;
6523 	}
6524 
6525 	/*
6526 	 * MAC address supplied by device should match that stored
6527 	 * in the vsw-port OBP node. Need to decide what to do if they
6528 	 * don't match, for the moment just warn but don't fail.
6529 	 */
6530 	if (bcmp(&pkt->addr, &port->p_macaddr, ETHERADDRL) != 0) {
6531 		DERR(NULL, "vsw_check_attr: device supplied address "
6532 			"0x%llx doesn't match node address 0x%llx\n",
6533 			pkt->addr, port->p_macaddr);
6534 	}
6535 
6536 	/*
6537 	 * Ack freq only makes sense in pkt mode, in shared
6538 	 * mode the ring descriptors say whether or not to
6539 	 * send back an ACK.
6540 	 */
6541 	if ((pkt->xfer_mode == VIO_DRING_MODE) &&
6542 				(pkt->ack_freq > 0)) {
6543 		D2(NULL, "vsw_check_attr: non zero ack freq "
6544 			" in SHM mode\n");
6545 		ret = 1;
6546 	}
6547 
6548 	/*
6549 	 * Note: for the moment we only support ETHER
6550 	 * frames. This may change in the future.
6551 	 */
6552 	if ((pkt->mtu > VSW_MTU) || (pkt->mtu <= 0)) {
6553 		D2(NULL, "vsw_check_attr: invalid MTU (0x%llx)\n",
6554 			pkt->mtu);
6555 		ret = 1;
6556 	}
6557 
6558 	D1(NULL, "vsw_check_attr exit\n");
6559 
6560 	return (ret);
6561 }
6562 
6563 /*
6564  * Returns 1 if there is a problem, 0 otherwise.
6565  */
6566 static int
6567 vsw_check_dring_info(vio_dring_reg_msg_t *pkt)
6568 {
6569 	_NOTE(ARGUNUSED(pkt))
6570 
6571 	int	ret = 0;
6572 
6573 	D1(NULL, "vsw_check_dring_info enter\n");
6574 
6575 	if ((pkt->num_descriptors == 0) ||
6576 		(pkt->descriptor_size == 0) ||
6577 		(pkt->ncookies != 1)) {
6578 		DERR(NULL, "vsw_check_dring_info: invalid dring msg");
6579 		ret = 1;
6580 	}
6581 
6582 	D1(NULL, "vsw_check_dring_info exit\n");
6583 
6584 	return (ret);
6585 }
6586 
6587 /*
6588  * Returns 1 if two memory cookies match. Otherwise returns 0.
6589  */
6590 static int
6591 vsw_mem_cookie_match(ldc_mem_cookie_t *m1, ldc_mem_cookie_t *m2)
6592 {
6593 	if ((m1->addr != m2->addr) ||
6594 		(m2->size != m2->size)) {
6595 		return (0);
6596 	} else {
6597 		return (1);
6598 	}
6599 }
6600 
6601 /*
6602  * Returns 1 if ring described in reg message matches that
6603  * described by dring_info structure. Otherwise returns 0.
6604  */
6605 static int
6606 vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg)
6607 {
6608 	if ((msg->descriptor_size != dp->descriptor_size) ||
6609 		(msg->num_descriptors != dp->num_descriptors) ||
6610 		(msg->ncookies != dp->ncookies) ||
6611 		!(vsw_mem_cookie_match(&msg->cookie[0], &dp->cookie[0]))) {
6612 		return (0);
6613 	} else {
6614 		return (1);
6615 	}
6616 
6617 }
6618 
6619 static caddr_t
6620 vsw_print_ethaddr(uint8_t *a, char *ebuf)
6621 {
6622 	(void) sprintf(ebuf, "%x:%x:%x:%x:%x:%x",
6623 	    a[0], a[1], a[2], a[3], a[4], a[5]);
6624 	return (ebuf);
6625 }
6626 
6627 /*
6628  * Reset and free all the resources associated with
6629  * the channel.
6630  */
6631 static void
6632 vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir)
6633 {
6634 	dring_info_t		*dp, *dpp;
6635 	lane_t			*lp = NULL;
6636 	int			rv = 0;
6637 
6638 	ASSERT(ldcp != NULL);
6639 
6640 	D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id);
6641 
6642 	if (dir == INBOUND) {
6643 		D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane"
6644 			" of channel %lld", __func__, ldcp->ldc_id);
6645 		lp = &ldcp->lane_in;
6646 	} else {
6647 		D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane"
6648 			" of channel %lld", __func__, ldcp->ldc_id);
6649 		lp = &ldcp->lane_out;
6650 	}
6651 
6652 	lp->lstate = VSW_LANE_INACTIV;
6653 	lp->seq_num = VNET_ISS;
6654 	if (lp->dringp) {
6655 		if (dir == INBOUND) {
6656 			dp = lp->dringp;
6657 			while (dp != NULL) {
6658 				dpp = dp->next;
6659 				if (dp->handle != NULL)
6660 					(void) ldc_mem_dring_unmap(dp->handle);
6661 				kmem_free(dp, sizeof (dring_info_t));
6662 				dp = dpp;
6663 			}
6664 		} else {
6665 			/*
6666 			 * unbind, destroy exported dring, free dring struct
6667 			 */
6668 			dp = lp->dringp;
6669 			rv = vsw_free_ring(dp);
6670 		}
6671 		if (rv == 0) {
6672 			lp->dringp = NULL;
6673 		}
6674 	}
6675 
6676 	D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id);
6677 }
6678 
6679 /*
6680  * Free ring and all associated resources.
6681  */
6682 static int
6683 vsw_free_ring(dring_info_t *dp)
6684 {
6685 	vsw_private_desc_t	*paddr = NULL;
6686 	dring_info_t		*dpp;
6687 	int			i, rv = 1;
6688 
6689 	while (dp != NULL) {
6690 		mutex_enter(&dp->dlock);
6691 		dpp = dp->next;
6692 		if (dp->priv_addr != NULL) {
6693 			/*
6694 			 * First unbind and free the memory handles
6695 			 * stored in each descriptor within the ring.
6696 			 */
6697 			for (i = 0; i < VSW_RING_NUM_EL; i++) {
6698 				paddr = (vsw_private_desc_t *)
6699 						dp->priv_addr + i;
6700 				if (paddr->memhandle != NULL) {
6701 					if (paddr->bound == 1) {
6702 						rv = ldc_mem_unbind_handle(
6703 							paddr->memhandle);
6704 
6705 						if (rv != 0) {
6706 							DERR(NULL, "error "
6707 							"unbinding handle for "
6708 							"ring 0x%llx at pos %d",
6709 							dp, i);
6710 							mutex_exit(&dp->dlock);
6711 							return (rv);
6712 						}
6713 						paddr->bound = 0;
6714 					}
6715 
6716 					rv = ldc_mem_free_handle(
6717 							paddr->memhandle);
6718 					if (rv != 0) {
6719 						DERR(NULL, "error freeing "
6720 							"handle for ring "
6721 							"0x%llx at pos %d",
6722 							dp, i);
6723 						mutex_exit(&dp->dlock);
6724 						return (rv);
6725 					}
6726 					paddr->memhandle = NULL;
6727 				}
6728 			}
6729 			kmem_free(dp->priv_addr, (sizeof (vsw_private_desc_t)
6730 					* VSW_RING_NUM_EL));
6731 		}
6732 
6733 		/*
6734 		 * Now unbind and destroy the ring itself.
6735 		 */
6736 		if (dp->handle != NULL) {
6737 			(void) ldc_mem_dring_unbind(dp->handle);
6738 			(void) ldc_mem_dring_destroy(dp->handle);
6739 		}
6740 
6741 		if (dp->data_addr != NULL) {
6742 			kmem_free(dp->data_addr, dp->data_sz);
6743 		}
6744 
6745 		mutex_exit(&dp->dlock);
6746 		mutex_destroy(&dp->dlock);
6747 		kmem_free(dp, sizeof (dring_info_t));
6748 
6749 		dp = dpp;
6750 	}
6751 	return (0);
6752 }
6753 
6754 /*
6755  * Debugging routines
6756  */
6757 static void
6758 display_state(void)
6759 {
6760 	vsw_t		*vswp;
6761 	vsw_port_list_t	*plist;
6762 	vsw_port_t 	*port;
6763 	vsw_ldc_list_t	*ldcl;
6764 	vsw_ldc_t 	*ldcp;
6765 
6766 	cmn_err(CE_NOTE, "***** system state *****");
6767 
6768 	for (vswp = vsw_head; vswp; vswp = vswp->next) {
6769 		plist = &vswp->plist;
6770 		READ_ENTER(&plist->lockrw);
6771 		cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n",
6772 			vswp->instance, plist->num_ports);
6773 
6774 		for (port = plist->head; port != NULL; port = port->p_next) {
6775 			ldcl = &port->p_ldclist;
6776 			cmn_err(CE_CONT, "port %d : %d ldcs attached\n",
6777 				port->p_instance, ldcl->num_ldcs);
6778 			READ_ENTER(&ldcl->lockrw);
6779 			ldcp = ldcl->head;
6780 			for (; ldcp != NULL; ldcp = ldcp->ldc_next) {
6781 				cmn_err(CE_CONT, "chan %lu : dev %d : "
6782 					"status %d : phase %u\n",
6783 					ldcp->ldc_id, ldcp->dev_class,
6784 					ldcp->ldc_status, ldcp->hphase);
6785 				cmn_err(CE_CONT, "chan %lu : lsession %lu : "
6786 					"psession %lu\n",
6787 					ldcp->ldc_id,
6788 					ldcp->local_session,
6789 					ldcp->peer_session);
6790 
6791 				cmn_err(CE_CONT, "Inbound lane:\n");
6792 				display_lane(&ldcp->lane_in);
6793 				cmn_err(CE_CONT, "Outbound lane:\n");
6794 				display_lane(&ldcp->lane_out);
6795 			}
6796 			RW_EXIT(&ldcl->lockrw);
6797 		}
6798 		RW_EXIT(&plist->lockrw);
6799 	}
6800 	cmn_err(CE_NOTE, "***** system state *****");
6801 }
6802 
6803 static void
6804 display_lane(lane_t *lp)
6805 {
6806 	dring_info_t	*drp;
6807 
6808 	cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n",
6809 		lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu);
6810 	cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n",
6811 		lp->addr_type, lp->addr, lp->xfer_mode);
6812 	cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp);
6813 
6814 	cmn_err(CE_CONT, "Dring info:\n");
6815 	for (drp = lp->dringp; drp != NULL; drp = drp->next) {
6816 		cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n",
6817 			drp->num_descriptors, drp->descriptor_size);
6818 		cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->handle);
6819 		cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n",
6820 			(uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr);
6821 		cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n",
6822 			drp->ident, drp->end_idx);
6823 		display_ring(drp);
6824 	}
6825 }
6826 
6827 static void
6828 display_ring(dring_info_t *dringp)
6829 {
6830 	uint64_t		i;
6831 	uint64_t		priv_count = 0;
6832 	uint64_t		pub_count = 0;
6833 	vnet_public_desc_t	*pub_addr = NULL;
6834 	vsw_private_desc_t	*priv_addr = NULL;
6835 
6836 	for (i = 0; i < VSW_RING_NUM_EL; i++) {
6837 		if (dringp->pub_addr != NULL) {
6838 			pub_addr = (vnet_public_desc_t *)dringp->pub_addr + i;
6839 
6840 			if (pub_addr->hdr.dstate == VIO_DESC_FREE)
6841 				pub_count++;
6842 		}
6843 
6844 		if (dringp->priv_addr != NULL) {
6845 			priv_addr =
6846 				(vsw_private_desc_t *)dringp->priv_addr + i;
6847 
6848 			if (priv_addr->dstate == VIO_DESC_FREE)
6849 				priv_count++;
6850 		}
6851 	}
6852 	cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n",
6853 			i, priv_count, pub_count);
6854 }
6855 
6856 static void
6857 dump_flags(uint64_t state)
6858 {
6859 	int	i;
6860 
6861 	typedef struct flag_name {
6862 		int	flag_val;
6863 		char	*flag_name;
6864 	} flag_name_t;
6865 
6866 	flag_name_t	flags[] = {
6867 		VSW_VER_INFO_SENT, "VSW_VER_INFO_SENT",
6868 		VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV",
6869 		VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV",
6870 		VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT",
6871 		VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV",
6872 		VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT",
6873 		VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT",
6874 		VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV",
6875 		VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT",
6876 		VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV",
6877 		VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT",
6878 		VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV",
6879 		VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT",
6880 		VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV",
6881 		VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT",
6882 		VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV",
6883 		VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT",
6884 		VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV",
6885 		VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT",
6886 		VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV",
6887 		VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT",
6888 		VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV",
6889 		VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT",
6890 		VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV",
6891 		VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT",
6892 		VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV",
6893 		VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT",
6894 		VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV",
6895 		VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT",
6896 		VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV",
6897 		VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE"};
6898 
6899 	DERR(NULL, "DUMP_FLAGS: %llx\n", state);
6900 	for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) {
6901 		if (state & flags[i].flag_val)
6902 			DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name);
6903 	}
6904 }
6905