xref: /titanic_44/usr/src/uts/sun4v/io/vsw.c (revision ef69670ded4ed2349f664bb59f0d513cc0364906)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/errno.h>
31 #include <sys/debug.h>
32 #include <sys/time.h>
33 #include <sys/sysmacros.h>
34 #include <sys/systm.h>
35 #include <sys/user.h>
36 #include <sys/stropts.h>
37 #include <sys/stream.h>
38 #include <sys/strlog.h>
39 #include <sys/strsubr.h>
40 #include <sys/cmn_err.h>
41 #include <sys/cpu.h>
42 #include <sys/kmem.h>
43 #include <sys/conf.h>
44 #include <sys/ddi.h>
45 #include <sys/sunddi.h>
46 #include <sys/ksynch.h>
47 #include <sys/stat.h>
48 #include <sys/kstat.h>
49 #include <sys/vtrace.h>
50 #include <sys/strsun.h>
51 #include <sys/dlpi.h>
52 #include <sys/ethernet.h>
53 #include <net/if.h>
54 #include <sys/varargs.h>
55 #include <sys/machsystm.h>
56 #include <sys/modctl.h>
57 #include <sys/modhash.h>
58 #include <sys/mac.h>
59 #include <sys/mac_ether.h>
60 #include <sys/taskq.h>
61 #include <sys/note.h>
62 #include <sys/mach_descrip.h>
63 #include <sys/mac.h>
64 #include <sys/mdeg.h>
65 #include <sys/ldc.h>
66 #include <sys/vsw_fdb.h>
67 #include <sys/vsw.h>
68 #include <sys/vio_mailbox.h>
69 #include <sys/vnet_mailbox.h>
70 #include <sys/vnet_common.h>
71 #include <sys/vio_util.h>
72 #include <sys/sdt.h>
73 
74 /*
75  * Function prototypes.
76  */
77 static	int vsw_attach(dev_info_t *, ddi_attach_cmd_t);
78 static	int vsw_detach(dev_info_t *, ddi_detach_cmd_t);
79 static	int vsw_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
80 static	void vsw_get_md_properties(vsw_t *vswp);
81 static	int vsw_get_physaddr(vsw_t *);
82 static	int vsw_setup_layer2(vsw_t *);
83 static	int vsw_setup_layer3(vsw_t *);
84 
85 /* MAC layer routines */
86 static	int vsw_mac_attach(vsw_t *vswp);
87 static	void vsw_mac_detach(vsw_t *vswp);
88 static	int vsw_get_hw_maddr(vsw_t *);
89 static	int vsw_set_hw(vsw_t *, vsw_port_t *);
90 static	int vsw_set_hw_promisc(vsw_t *, vsw_port_t *);
91 static	int vsw_unset_hw(vsw_t *, vsw_port_t *);
92 static	int vsw_unset_hw_promisc(vsw_t *, vsw_port_t *);
93 static	int vsw_reconfig_hw(vsw_t *);
94 static void vsw_rx_cb(void *, mac_resource_handle_t, mblk_t *);
95 static mblk_t *vsw_tx_msg(vsw_t *, mblk_t *);
96 static int vsw_mac_register(vsw_t *);
97 static int vsw_mac_unregister(vsw_t *);
98 static int vsw_m_stat(void *, uint_t, uint64_t *);
99 static void vsw_m_stop(void *arg);
100 static int vsw_m_start(void *arg);
101 static int vsw_m_unicst(void *arg, const uint8_t *);
102 static int vsw_m_multicst(void *arg, boolean_t, const uint8_t *);
103 static int vsw_m_promisc(void *arg, boolean_t);
104 static mblk_t *vsw_m_tx(void *arg, mblk_t *);
105 
106 /* MDEG routines */
107 static	void vsw_mdeg_register(vsw_t *vswp);
108 static	void vsw_mdeg_unregister(vsw_t *vswp);
109 static	int vsw_mdeg_cb(void *cb_argp, mdeg_result_t *);
110 
111 /* Port add/deletion routines */
112 static	int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
113 static	int vsw_port_attach(vsw_t *vswp, int p_instance,
114 	uint64_t *ldcids, int nids, struct ether_addr *macaddr);
115 static	int vsw_detach_ports(vsw_t *vswp);
116 static	int vsw_port_detach(vsw_t *vswp, int p_instance);
117 static	int vsw_port_delete(vsw_port_t *port);
118 static	int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id);
119 static	int vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id);
120 static	int vsw_init_ldcs(vsw_port_t *port);
121 static	int vsw_uninit_ldcs(vsw_port_t *port);
122 static	int vsw_ldc_init(vsw_ldc_t *ldcp);
123 static	int vsw_ldc_uninit(vsw_ldc_t *ldcp);
124 static	int vsw_drain_ldcs(vsw_port_t *port);
125 static	int vsw_drain_port_taskq(vsw_port_t *port);
126 static	void vsw_marker_task(void *);
127 static	vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);
128 static	int vsw_plist_del_node(vsw_t *, vsw_port_t *port);
129 
130 /* Interrupt routines */
131 static	uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg);
132 
133 /* Handshake routines */
134 static	void vsw_restart_handshake(vsw_ldc_t *);
135 static	int vsw_check_flag(vsw_ldc_t *, int, uint64_t);
136 static	void vsw_next_milestone(vsw_ldc_t *);
137 static	int vsw_supported_version(vio_ver_msg_t *);
138 
139 /* Data processing routines */
140 static void vsw_process_pkt(void *);
141 static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t);
142 static void vsw_process_ctrl_pkt(void *);
143 static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *);
144 static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *);
145 static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *);
146 static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *);
147 static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *);
148 static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *);
149 static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t);
150 static void vsw_process_data_dring_pkt(vsw_ldc_t *, void *);
151 static void vsw_process_data_raw_pkt(vsw_ldc_t *, void *);
152 static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *);
153 static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t);
154 
155 /* Switching/data transmit routines */
156 static	void vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
157 	    vsw_port_t *port, mac_resource_handle_t);
158 static	void vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
159 	    vsw_port_t *port, mac_resource_handle_t);
160 static	int vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller,
161 	    vsw_port_t *port);
162 static	int vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller,
163 	    vsw_port_t *port);
164 static	int vsw_portsend(vsw_port_t *, mblk_t *);
165 static	int vsw_dringsend(vsw_ldc_t *, mblk_t *);
166 static	int vsw_descrsend(vsw_ldc_t *, mblk_t *);
167 
168 /* Packet creation routines */
169 static void vsw_send_ver(vsw_ldc_t *);
170 static void vsw_send_attr(vsw_ldc_t *);
171 static vio_dring_reg_msg_t *vsw_create_dring_info_pkt(vsw_ldc_t *);
172 static void vsw_send_dring_info(vsw_ldc_t *);
173 static void vsw_send_rdx(vsw_ldc_t *);
174 
175 static void vsw_send_msg(vsw_ldc_t *, void *, int);
176 
177 /* Forwarding database (FDB) routines */
178 static	int vsw_add_fdb(vsw_t *vswp, vsw_port_t *port);
179 static	int vsw_del_fdb(vsw_t *vswp, vsw_port_t *port);
180 static	vsw_port_t *vsw_lookup_fdb(vsw_t *vswp, struct ether_header *);
181 static	int vsw_add_rem_mcst(vnet_mcast_msg_t *, vsw_port_t *);
182 static	int vsw_add_mcst(vsw_t *, uint8_t, uint64_t, void *);
183 static	int vsw_del_mcst(vsw_t *, uint8_t, uint64_t, void *);
184 static	void vsw_del_addr(uint8_t, void *, uint64_t);
185 static	void vsw_del_mcst_port(vsw_port_t *);
186 static	void vsw_del_mcst_vsw(vsw_t *);
187 
188 /* Dring routines */
189 static dring_info_t *vsw_create_dring(vsw_ldc_t *);
190 static void vsw_create_privring(vsw_ldc_t *);
191 static int vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp);
192 static int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **,
193     int *);
194 static dring_info_t *vsw_ident2dring(lane_t *, uint64_t);
195 
196 static void vsw_set_lane_attr(vsw_t *, lane_t *);
197 static int vsw_check_attr(vnet_attr_msg_t *, vsw_port_t *);
198 static int vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg);
199 static int vsw_mem_cookie_match(ldc_mem_cookie_t *, ldc_mem_cookie_t *);
200 static int vsw_check_dring_info(vio_dring_reg_msg_t *);
201 
202 /* Misc support routines */
203 static	caddr_t vsw_print_ethaddr(uint8_t *addr, char *ebuf);
204 static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t);
205 static int vsw_free_ring(dring_info_t *);
206 
207 
208 /* Debugging routines */
209 static void dump_flags(uint64_t);
210 static void display_state(void);
211 static void display_lane(lane_t *);
212 static void display_ring(dring_info_t *);
213 
214 int	vsw_num_handshakes = 3;		/* # of handshake attempts */
215 int	vsw_wretries = 100;		/* # of write attempts */
216 int	vsw_chain_len = 150;		/* max # of mblks in msg chain */
217 int	vsw_desc_delay = 0;		/* delay in us */
218 int	vsw_read_attempts = 5;		/* # of reads of descriptor */
219 
220 uint32_t	vsw_mblk_size = VSW_MBLK_SIZE;
221 uint32_t	vsw_num_mblks = VSW_NUM_MBLKS;
222 
223 
224 /*
225  * mode specific frame switching function
226  */
227 void		(*vsw_switch_frame)(vsw_t *, mblk_t *, int, vsw_port_t *,
228 			mac_resource_handle_t);
229 
230 static	mac_callbacks_t	vsw_m_callbacks = {
231 	0,
232 	vsw_m_stat,
233 	vsw_m_start,
234 	vsw_m_stop,
235 	vsw_m_promisc,
236 	vsw_m_multicst,
237 	vsw_m_unicst,
238 	vsw_m_tx,
239 	NULL,
240 	NULL,
241 	NULL
242 };
243 
244 static	struct	cb_ops	vsw_cb_ops = {
245 	nulldev,			/* cb_open */
246 	nulldev,			/* cb_close */
247 	nodev,				/* cb_strategy */
248 	nodev,				/* cb_print */
249 	nodev,				/* cb_dump */
250 	nodev,				/* cb_read */
251 	nodev,				/* cb_write */
252 	nodev,				/* cb_ioctl */
253 	nodev,				/* cb_devmap */
254 	nodev,				/* cb_mmap */
255 	nodev,				/* cb_segmap */
256 	nochpoll,			/* cb_chpoll */
257 	ddi_prop_op,			/* cb_prop_op */
258 	NULL,				/* cb_stream */
259 	D_MP,				/* cb_flag */
260 	CB_REV,				/* rev */
261 	nodev,				/* int (*cb_aread)() */
262 	nodev				/* int (*cb_awrite)() */
263 };
264 
265 static	struct	dev_ops	vsw_ops = {
266 	DEVO_REV,		/* devo_rev */
267 	0,			/* devo_refcnt */
268 	vsw_getinfo,		/* devo_getinfo */
269 	nulldev,		/* devo_identify */
270 	nulldev,		/* devo_probe */
271 	vsw_attach,		/* devo_attach */
272 	vsw_detach,		/* devo_detach */
273 	nodev,			/* devo_reset */
274 	&vsw_cb_ops,		/* devo_cb_ops */
275 	(struct bus_ops *)NULL,	/* devo_bus_ops */
276 	ddi_power		/* devo_power */
277 };
278 
279 extern	struct	mod_ops	mod_driverops;
280 static struct modldrv vswmodldrv = {
281 	&mod_driverops,
282 	"sun4v Virtual Switch Driver %I%",
283 	&vsw_ops,
284 };
285 
/*
 * Acquire/release both per-channel locks.
 *
 * The callback lock is always taken before the transmit lock and the
 * two are dropped in the reverse order. Each macro is wrapped in
 * do { } while (0) so that it expands to exactly one statement and
 * therefore behaves correctly as the body of an unbraced if/else.
 * Invocations must be terminated with a semicolon.
 */
#define	LDC_ENTER_LOCK(ldcp)	\
	do {	\
		mutex_enter(&((ldcp)->ldc_cblock));	\
		mutex_enter(&((ldcp)->ldc_txlock));	\
	} while (0)
#define	LDC_EXIT_LOCK(ldcp)	\
	do {	\
		mutex_exit(&((ldcp)->ldc_txlock));	\
		mutex_exit(&((ldcp)->ldc_cblock));	\
	} while (0)
292 
293 /* Driver soft state ptr  */
294 static void	*vsw_state;
295 
296 /*
297  * Linked list of "vsw_t" structures - one per instance.
298  */
299 vsw_t		*vsw_head = NULL;
300 krwlock_t	vsw_rw;
301 
302 /*
303  * Property names
304  */
305 static char vdev_propname[] = "virtual-device";
306 static char vsw_propname[] = "virtual-network-switch";
307 static char physdev_propname[] = "vsw-phys-dev";
308 static char smode_propname[] = "vsw-switch-mode";
309 static char macaddr_propname[] = "local-mac-address";
310 static char remaddr_propname[] = "remote-mac-address";
311 static char ldcids_propname[] = "ldc-ids";
312 static char chan_propname[] = "channel-endpoint";
313 static char id_propname[] = "id";
314 static char reg_propname[] = "reg";
315 
316 /* supported versions */
317 static	ver_sup_t	vsw_versions[] = { {1, 0} };
318 
319 /*
320  * Matching criteria passed to the MDEG to register interest
321  * in changes to 'virtual-device-port' nodes identified by their
322  * 'id' property.
323  */
324 static md_prop_match_t vport_prop_match[] = {
325 	{ MDET_PROP_VAL,    "id"   },
326 	{ MDET_LIST_END,    NULL    }
327 };
328 
329 static mdeg_node_match_t vport_match = { "virtual-device-port",
330 						vport_prop_match };
331 
332 /*
333  * Specification of an MD node passed to the MDEG to filter any
334  * 'vport' nodes that do not belong to the specified node. This
335  * template is copied for each vsw instance and filled in with
336  * the appropriate 'cfg-handle' value before being passed to the MDEG.
337  */
338 static mdeg_prop_spec_t vsw_prop_template[] = {
339 	{ MDET_PROP_STR,    "name",		vsw_propname },
340 	{ MDET_PROP_VAL,    "cfg-handle",	NULL	},
341 	{ MDET_LIST_END,    NULL,		NULL	}
342 };
343 
344 #define	VSW_SET_MDEG_PROP_INST(specp, val)	(specp)[1].ps_val = (val);
345 
346 /*
347  * Print debug messages - set to 0x1f to enable all msgs
348  * or 0x0 to turn all off.
349  */
350 int vswdbg = 0x0;
351 
352 /*
353  * debug levels:
354  * 0x01:	Function entry/exit tracing
355  * 0x02:	Internal function messages
356  * 0x04:	Verbose internal messages
357  * 0x08:	Warning messages
358  * 0x10:	Error messages
359  */
360 
361 static void
362 vswdebug(vsw_t *vswp, const char *fmt, ...)
363 {
364 	char buf[512];
365 	va_list ap;
366 
367 	va_start(ap, fmt);
368 	(void) vsprintf(buf, fmt, ap);
369 	va_end(ap);
370 
371 	if (vswp == NULL)
372 		cmn_err(CE_CONT, "%s\n", buf);
373 	else
374 		cmn_err(CE_CONT, "vsw%d: %s\n", vswp->instance, buf);
375 }
376 
377 /*
378  * For the moment the state dump routines have their own
379  * private flag.
380  */
381 #define	DUMP_STATE	0
382 
383 #if DUMP_STATE
384 
385 #define	DUMP_TAG(tag) \
386 {			\
387 	D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype); \
388 	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype);	\
389 	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env);	\
390 }
391 
392 #define	DUMP_TAG_PTR(tag) \
393 {			\
394 	D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype); \
395 	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype);	\
396 	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env);	\
397 }
398 
399 #define	DUMP_FLAGS(flags) dump_flags(flags);
400 #define	DISPLAY_STATE()	display_state()
401 
402 #else
403 
404 #define	DUMP_TAG(tag)
405 #define	DUMP_TAG_PTR(tag)
406 #define	DUMP_FLAGS(state)
407 #define	DISPLAY_STATE()
408 
409 #endif	/* DUMP_STATE */
410 
411 #ifdef DEBUG
412 
413 #define	D1		\
414 if (vswdbg & 0x01)	\
415 	vswdebug
416 
417 #define	D2		\
418 if (vswdbg & 0x02)	\
419 	vswdebug
420 
421 #define	D3		\
422 if (vswdbg & 0x04)	\
423 	vswdebug
424 
425 #define	DWARN		\
426 if (vswdbg & 0x08)	\
427 	vswdebug
428 
429 #define	DERR		\
430 if (vswdbg & 0x10)	\
431 	vswdebug
432 
433 #else
434 
435 #define	DERR		if (0)	vswdebug
436 #define	DWARN		if (0)	vswdebug
437 #define	D1		if (0)	vswdebug
438 #define	D2		if (0)	vswdebug
439 #define	D3		if (0)	vswdebug
440 
441 #endif	/* DEBUG */
442 
443 static struct modlinkage modlinkage = {
444 	MODREV_1,
445 	&vswmodldrv,
446 	NULL
447 };
448 
449 int
450 _init(void)
451 {
452 	int status;
453 
454 	rw_init(&vsw_rw, NULL, RW_DRIVER, NULL);
455 
456 	status = ddi_soft_state_init(&vsw_state, sizeof (vsw_t), 1);
457 	if (status != 0) {
458 		return (status);
459 	}
460 
461 	mac_init_ops(&vsw_ops, "vsw");
462 	status = mod_install(&modlinkage);
463 	if (status != 0) {
464 		ddi_soft_state_fini(&vsw_state);
465 	}
466 	return (status);
467 }
468 
469 int
470 _fini(void)
471 {
472 	int status;
473 
474 	status = mod_remove(&modlinkage);
475 	if (status != 0)
476 		return (status);
477 	mac_fini_ops(&vsw_ops);
478 	ddi_soft_state_fini(&vsw_state);
479 
480 	rw_destroy(&vsw_rw);
481 
482 	return (status);
483 }
484 
485 int
486 _info(struct modinfo *modinfop)
487 {
488 	return (mod_info(&modlinkage, modinfop));
489 }
490 
491 static int
492 vsw_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
493 {
494 	vsw_t		*vswp;
495 	int		instance, i;
496 	char		hashname[MAXNAMELEN];
497 	char		qname[TASKQ_NAMELEN];
498 	int		rv = 1;
499 	enum		{ PROG_init = 0x0, PROG_if_lock = 0x1,
500 				PROG_fdb = 0x2, PROG_mfdb = 0x4,
501 				PROG_report_dev = 0x8, PROG_plist = 0x10,
502 				PROG_taskq = 0x20}
503 			progress;
504 
505 	progress = PROG_init;
506 
507 	switch (cmd) {
508 	case DDI_ATTACH:
509 		break;
510 	case DDI_RESUME:
511 		/* nothing to do for this non-device */
512 		return (DDI_SUCCESS);
513 	case DDI_PM_RESUME:
514 	default:
515 		return (DDI_FAILURE);
516 	}
517 
518 	instance = ddi_get_instance(dip);
519 	if (ddi_soft_state_zalloc(vsw_state, instance) != DDI_SUCCESS) {
520 		DERR(NULL, "vsw%d: ddi_soft_state_zalloc failed", instance);
521 		return (DDI_FAILURE);
522 	}
523 	vswp = ddi_get_soft_state(vsw_state, instance);
524 
525 	if (vswp == NULL) {
526 		DERR(NULL, "vsw%d: ddi_get_soft_state failed", instance);
527 		goto vsw_attach_fail;
528 	}
529 
530 	vswp->dip = dip;
531 	vswp->instance = instance;
532 	ddi_set_driver_private(dip, (caddr_t)vswp);
533 
534 	rw_init(&vswp->if_lockrw, NULL, RW_DRIVER, NULL);
535 
536 	progress |= PROG_if_lock;
537 
538 	/*
539 	 * Get the various properties such as physical device name
540 	 * (vsw-phys-dev), switch mode etc from the MD.
541 	 */
542 	vsw_get_md_properties(vswp);
543 
544 	/* setup the unicast forwarding database  */
545 	(void) snprintf(hashname, MAXNAMELEN, "vsw_unicst_table-%d",
546 							vswp->instance);
547 	D2(vswp, "creating unicast hash table (%s)...", hashname);
548 	vswp->fdb = mod_hash_create_ptrhash(hashname, VSW_NCHAINS,
549 		mod_hash_null_valdtor, sizeof (void *));
550 
551 	progress |= PROG_fdb;
552 
553 	/* setup the multicast fowarding database */
554 	(void) snprintf(hashname, MAXNAMELEN, "vsw_mcst_table-%d",
555 							vswp->instance);
556 	D2(vswp, "creating multicast hash table %s)...", hashname);
557 	rw_init(&vswp->mfdbrw, NULL, RW_DRIVER, NULL);
558 	vswp->mfdb = mod_hash_create_ptrhash(hashname, VSW_NCHAINS,
559 			mod_hash_null_valdtor, sizeof (void *));
560 
561 	progress |= PROG_mfdb;
562 
563 	/*
564 	 * create lock protecting list of multicast addresses
565 	 * which could come via m_multicst() entry point when plumbed.
566 	 */
567 	mutex_init(&vswp->mca_lock, NULL, MUTEX_DRIVER, NULL);
568 	vswp->mcap = NULL;
569 
570 	ddi_report_dev(vswp->dip);
571 
572 	progress |= PROG_report_dev;
573 
574 	WRITE_ENTER(&vsw_rw);
575 	vswp->next = vsw_head;
576 	vsw_head = vswp;
577 	RW_EXIT(&vsw_rw);
578 
579 	/* setup the port list */
580 	rw_init(&vswp->plist.lockrw, NULL, RW_DRIVER, NULL);
581 	vswp->plist.head = NULL;
582 
583 	progress |= PROG_plist;
584 
585 	/*
586 	 * Create the taskq which will process all the VIO
587 	 * control messages.
588 	 */
589 	(void) snprintf(qname, TASKQ_NAMELEN, "vsw_taskq%d", vswp->instance);
590 	if ((vswp->taskq_p = ddi_taskq_create(vswp->dip, qname, 1,
591 					TASKQ_DEFAULTPRI, 0)) == NULL) {
592 		cmn_err(CE_WARN, "Unable to create task queue");
593 		goto vsw_attach_fail;
594 	}
595 
596 	progress |= PROG_taskq;
597 
598 	/* select best switching mode */
599 	for (i = 0; i < vswp->smode_num; i++) {
600 		vswp->smode_idx = i;
601 		switch (vswp->smode[i]) {
602 		case VSW_LAYER2:
603 		case VSW_LAYER2_PROMISC:
604 			rv = vsw_setup_layer2(vswp);
605 			break;
606 
607 		case VSW_LAYER3:
608 			rv = vsw_setup_layer3(vswp);
609 			break;
610 
611 		default:
612 			DERR(vswp, "unknown switch mode");
613 			rv = 1;
614 			break;
615 		}
616 
617 		if (rv == 0)
618 			break;
619 	}
620 
621 	if (rv == 1) {
622 		cmn_err(CE_WARN, "Unable to setup switching mode");
623 		goto vsw_attach_fail;
624 	}
625 
626 	D2(vswp, "Operating in mode %d", vswp->smode[vswp->smode_idx]);
627 
628 	/*
629 	 * Register with the MAC layer as a network device so
630 	 * we can be plumbed if desired.
631 	 *
632 	 * Do this in both layer 2 and layer 3 mode.
633 	 */
634 	vswp->if_state &= ~VSW_IF_UP;
635 	if (vswp->mdprops & (VSW_MD_MACADDR | VSW_DEV_MACADDR)) {
636 		if (vsw_mac_register(vswp) != 0) {
637 			cmn_err(CE_WARN, "Unable to register as provider "
638 				" with MAC layer, continuing with attach");
639 		}
640 	}
641 
642 	/* prevent auto-detaching */
643 	if (ddi_prop_update_int(DDI_DEV_T_NONE, vswp->dip,
644 				DDI_NO_AUTODETACH, 1) != DDI_SUCCESS) {
645 		cmn_err(CE_NOTE, "Unable to set \"%s\" property for "
646 			"instance %u", DDI_NO_AUTODETACH, instance);
647 	}
648 
649 	/*
650 	 * Now we have everything setup, register for MD change
651 	 * events.
652 	 */
653 	vsw_mdeg_register(vswp);
654 
655 	return (DDI_SUCCESS);
656 
657 vsw_attach_fail:
658 	DERR(NULL, "vsw_attach: failed");
659 
660 	if (progress & PROG_taskq)
661 		ddi_taskq_destroy(vswp->taskq_p);
662 
663 	if (progress & PROG_plist)
664 		rw_destroy(&vswp->plist.lockrw);
665 
666 	if (progress & PROG_report_dev) {
667 		ddi_remove_minor_node(dip, NULL);
668 		mutex_destroy(&vswp->mca_lock);
669 	}
670 
671 	if (progress & PROG_mfdb) {
672 		mod_hash_destroy_hash(vswp->mfdb);
673 		vswp->mfdb = NULL;
674 		rw_destroy(&vswp->mfdbrw);
675 	}
676 
677 	if (progress & PROG_fdb) {
678 		mod_hash_destroy_hash(vswp->fdb);
679 		vswp->fdb = NULL;
680 	}
681 
682 	if (progress & PROG_if_lock)
683 		rw_destroy(&vswp->if_lockrw);
684 
685 	ddi_soft_state_free(vsw_state, instance);
686 	return (DDI_FAILURE);
687 }
688 
689 static int
690 vsw_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
691 {
692 	vio_mblk_pool_t		*poolp, *npoolp;
693 	vsw_t			**vswpp, *vswp;
694 	int 			instance;
695 
696 	instance = ddi_get_instance(dip);
697 	vswp = ddi_get_soft_state(vsw_state, instance);
698 
699 	if (vswp == NULL) {
700 		return (DDI_FAILURE);
701 	}
702 
703 	switch (cmd) {
704 	case DDI_DETACH:
705 		break;
706 	case DDI_SUSPEND:
707 	case DDI_PM_SUSPEND:
708 	default:
709 		return (DDI_FAILURE);
710 	}
711 
712 	D2(vswp, "detaching instance %d", instance);
713 
714 	if (vswp->mdprops & (VSW_MD_MACADDR | VSW_DEV_MACADDR)) {
715 		if (vsw_mac_unregister(vswp) != 0) {
716 			cmn_err(CE_WARN, "Unable to detach from MAC layer");
717 			return (DDI_FAILURE);
718 		}
719 		rw_destroy(&vswp->if_lockrw);
720 	}
721 
722 	vsw_mdeg_unregister(vswp);
723 
724 	/* remove mac layer callback */
725 	if ((vswp->mh != NULL) && (vswp->mrh != NULL)) {
726 		mac_rx_remove(vswp->mh, vswp->mrh);
727 		vswp->mrh = NULL;
728 	}
729 
730 	if (vsw_detach_ports(vswp) != 0) {
731 		cmn_err(CE_WARN, "Unable to detach ports");
732 		return (DDI_FAILURE);
733 	}
734 
735 	/*
736 	 * Now that the ports have been deleted, stop and close
737 	 * the physical device.
738 	 */
739 	if (vswp->mh != NULL) {
740 		mac_stop(vswp->mh);
741 		mac_close(vswp->mh);
742 
743 		vswp->mh = NULL;
744 		vswp->txinfo = NULL;
745 	}
746 
747 	/*
748 	 * Destroy any free pools that may still exist.
749 	 */
750 	poolp = vswp->rxh;
751 	while (poolp != NULL) {
752 		npoolp = vswp->rxh = poolp->nextp;
753 		if (vio_destroy_mblks(poolp) != 0) {
754 			vswp->rxh = poolp;
755 			return (DDI_FAILURE);
756 		}
757 		poolp = npoolp;
758 	}
759 
760 	/*
761 	 * Remove this instance from any entries it may be on in
762 	 * the hash table by using the list of addresses maintained
763 	 * in the vsw_t structure.
764 	 */
765 	vsw_del_mcst_vsw(vswp);
766 
767 	vswp->mcap = NULL;
768 	mutex_destroy(&vswp->mca_lock);
769 
770 	/*
771 	 * By now any pending tasks have finished and the underlying
772 	 * ldc's have been destroyed, so its safe to delete the control
773 	 * message taskq.
774 	 */
775 	if (vswp->taskq_p != NULL)
776 		ddi_taskq_destroy(vswp->taskq_p);
777 
778 	/*
779 	 * At this stage all the data pointers in the hash table
780 	 * should be NULL, as all the ports have been removed and will
781 	 * have deleted themselves from the port lists which the data
782 	 * pointers point to. Hence we can destroy the table using the
783 	 * default destructors.
784 	 */
785 	D2(vswp, "vsw_detach: destroying hash tables..");
786 	mod_hash_destroy_hash(vswp->fdb);
787 	vswp->fdb = NULL;
788 
789 	WRITE_ENTER(&vswp->mfdbrw);
790 	mod_hash_destroy_hash(vswp->mfdb);
791 	vswp->mfdb = NULL;
792 	RW_EXIT(&vswp->mfdbrw);
793 	rw_destroy(&vswp->mfdbrw);
794 
795 	ddi_remove_minor_node(dip, NULL);
796 
797 	rw_destroy(&vswp->plist.lockrw);
798 	WRITE_ENTER(&vsw_rw);
799 	for (vswpp = &vsw_head; *vswpp; vswpp = &(*vswpp)->next) {
800 		if (*vswpp == vswp) {
801 			*vswpp = vswp->next;
802 			break;
803 		}
804 	}
805 	RW_EXIT(&vsw_rw);
806 	ddi_soft_state_free(vsw_state, instance);
807 
808 	return (DDI_SUCCESS);
809 }
810 
811 static int
812 vsw_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
813 {
814 	_NOTE(ARGUNUSED(dip))
815 
816 	vsw_t	*vswp = NULL;
817 	dev_t	dev = (dev_t)arg;
818 	int	instance;
819 
820 	instance = getminor(dev);
821 
822 	switch (infocmd) {
823 	case DDI_INFO_DEVT2DEVINFO:
824 		if ((vswp = ddi_get_soft_state(vsw_state, instance)) == NULL) {
825 			*result = NULL;
826 			return (DDI_FAILURE);
827 		}
828 		*result = vswp->dip;
829 		return (DDI_SUCCESS);
830 
831 	case DDI_INFO_DEVT2INSTANCE:
832 		*result = (void *)(uintptr_t)instance;
833 		return (DDI_SUCCESS);
834 
835 	default:
836 		*result = NULL;
837 		return (DDI_FAILURE);
838 	}
839 }
840 
841 /*
842  * Get the properties from our MD node.
843  */
844 static void
845 vsw_get_md_properties(vsw_t *vswp)
846 {
847 	md_t		*mdp = NULL;
848 	int		num_nodes = 0;
849 	int		len = 0, listsz = 0;
850 	int		num_vdev = 0;
851 	int		i, idx;
852 	boolean_t	found_node = B_FALSE;
853 	char		*smode = NULL;
854 	char		*curr_mode = NULL;
855 	char		*physname = NULL;
856 	char		*node_name = NULL;
857 	char		*dev;
858 	uint64_t 	macaddr = 0;
859 	uint64_t	md_inst, obp_inst;
860 	mde_cookie_t	*listp = NULL;
861 	mde_cookie_t	rootnode;
862 
863 	D1(vswp, "%s: enter", __func__);
864 
865 	/*
866 	 * Further down we compare the obp 'reg' property to the
867 	 * 'cfg-handle' property in the vsw MD node to determine
868 	 * if the node refers to this particular instance. So if
869 	 * we can't read the obp value then there is no point
870 	 * in proceeding further.
871 	 */
872 	if (ddi_prop_exists(DDI_DEV_T_ANY, vswp->dip,
873 			DDI_PROP_DONTPASS, reg_propname) != 1) {
874 		cmn_err(CE_WARN, "Unable to read %s property "
875 			"from OBP device node", reg_propname);
876 		return;
877 	}
878 
879 	obp_inst = ddi_prop_get_int(DDI_DEV_T_ANY, vswp->dip,
880 		DDI_PROP_DONTPASS, reg_propname, 0);
881 
882 	D2(vswp, "%s: obp_inst 0x%llx", __func__, obp_inst);
883 
884 	if ((mdp = md_get_handle()) == NULL) {
885 		DERR(vswp, "%s: unable to init MD", __func__);
886 		return;
887 	}
888 
889 	if ((num_nodes = md_node_count(mdp)) <= 0) {
890 		DERR(vswp, "%s: invalid number of  nodes found %d",
891 			__func__, num_nodes);
892 		(void) md_fini_handle(mdp);
893 		return;
894 	}
895 
896 	D2(vswp, "%s: %d nodes in total in MD", __func__, num_nodes);
897 
898 	/* allocate enough space for node list */
899 	listsz = num_nodes * sizeof (mde_cookie_t);
900 	listp = kmem_zalloc(listsz, KM_SLEEP);
901 
902 	rootnode = md_root_node(mdp);
903 
904 	/* Get the list of virtual devices */
905 	num_vdev = md_scan_dag(mdp, rootnode,
906 		md_find_name(mdp, vdev_propname),
907 		md_find_name(mdp, "fwd"), listp);
908 
909 	if (num_vdev <= 0) {
910 		DERR(vswp, "%s: didn't find any virtual-device nodes in MD",
911 			__func__);
912 		goto md_prop_exit;
913 	}
914 
915 	D2(vswp, "%s: %d virtual-device nodes found", __func__, num_vdev);
916 
917 	/* Look for the virtual switch nodes in the list */
918 	for (idx = 0; idx < num_vdev; idx++) {
919 		if (md_get_prop_str(mdp, listp[idx],
920 				"name", &node_name) != 0) {
921 			DERR(vswp, "%s: unable to get node name", __func__);
922 			continue;
923 
924 		}
925 
926 		if (strcmp(node_name, vsw_propname) == 0) {
927 			/* Virtual switch node */
928 			if (md_get_prop_val(mdp, listp[idx],
929 				"cfg-handle", &md_inst) != 0) {
930 				DERR(vswp, "%s: unable to get cfg-handle from"
931 					" node %d", __func__, idx);
932 				goto md_prop_exit;
933 			} else if (md_inst == obp_inst) {
934 				D2(vswp, "%s: found matching node (%d)"
935 					" 0x%llx == 0x%llx", __func__, idx,
936 					md_inst, obp_inst);
937 				found_node = B_TRUE;
938 				break;
939 			}
940 		}
941 	}
942 
943 	if (!found_node) {
944 		DWARN(vswp, "%s: couldn't find correct vsw node", __func__);
945 		goto md_prop_exit;
946 	}
947 
948 	/*
949 	 * Now, having found the correct node, get the various properties.
950 	 */
951 
952 	if (md_get_prop_data(mdp, listp[idx], physdev_propname,
953 				(uint8_t **)(&physname), &len) != 0) {
954 		cmn_err(CE_WARN, "%s: unable to get name(s) of physical "
955 			"device(s) from MD", __func__);
956 	} else if ((strlen(physname) + 1) > LIFNAMSIZ) {
957 		cmn_err(CE_WARN, "%s is too long a device name", physname);
958 	} else {
959 		(void) strncpy(vswp->physname, physname, strlen(physname) + 1);
960 		vswp->mdprops |= VSW_MD_PHYSNAME;
961 		D2(vswp, "%s: using first device specified (%s)",
962 			__func__, vswp->physname);
963 	}
964 
965 #ifdef DEBUG
966 	/*
967 	 * As a temporary measure to aid testing we check to see if there
968 	 * is a vsw.conf file present. If there is we use the value of the
969 	 * vsw_physname property in the file as the name of the physical
970 	 * device, overriding the value from the MD.
971 	 *
972 	 * There may be multiple devices listed, but for the moment
973 	 * we just use the first one.
974 	 */
975 	if (ddi_prop_lookup_string(DDI_DEV_T_ANY, vswp->dip, 0,
976 		"vsw_physname", &dev) == DDI_PROP_SUCCESS) {
977 		if ((strlen(dev) + 1) > LIFNAMSIZ) {
978 			cmn_err(CE_WARN, "%s is too long a device name", dev);
979 		} else {
980 			cmn_err(CE_NOTE, "%s: using device name (%s) from "
981 				"config file", __func__, dev);
982 
983 			(void) strncpy(vswp->physname, dev, strlen(dev) + 1);
984 			vswp->mdprops |= VSW_MD_PHYSNAME;
985 		}
986 
987 		ddi_prop_free(dev);
988 
989 	}
990 #endif
991 
992 	/* mac address for vswitch device itself */
993 	if (md_get_prop_val(mdp, listp[idx],
994 			macaddr_propname, &macaddr) != 0) {
995 		cmn_err(CE_WARN, "!Unable to get MAC address from MD");
996 
997 		/*
998 		 * Fallback to using the mac address of the physical
999 		 * device.
1000 		 */
1001 		if (vsw_get_physaddr(vswp) == 0) {
1002 			cmn_err(CE_NOTE, "!Using MAC address from physical "
1003 				"device (%s)", vswp->physname);
1004 		}
1005 	} else {
1006 		READ_ENTER(&vswp->if_lockrw);
1007 		for (i = ETHERADDRL - 1; i >= 0; i--) {
1008 			vswp->if_addr.ether_addr_octet[i] = macaddr & 0xFF;
1009 			macaddr >>= 8;
1010 		}
1011 		RW_EXIT(&vswp->if_lockrw);
1012 		vswp->mdprops |= VSW_MD_MACADDR;
1013 	}
1014 
1015 	/*
1016 	 * Get the switch-mode property. The modes are listed in
1017 	 * decreasing order of preference, i.e. prefered mode is
1018 	 * first item in list.
1019 	 */
1020 	len = 0;
1021 	vswp->smode_num = 0;
1022 	if (md_get_prop_data(mdp, listp[idx], smode_propname,
1023 				(uint8_t **)(&smode), &len) != 0) {
1024 		/*
1025 		 * Unable to get switch-mode property from MD, nothing
1026 		 * more we can do.
1027 		 */
1028 		cmn_err(CE_WARN, "!unable to get switch mode property");
1029 		goto md_prop_exit;
1030 	}
1031 
1032 	curr_mode = smode;
1033 	/*
1034 	 * Modes of operation:
1035 	 * 'switched'	 - layer 2 switching, underlying HW in
1036 	 *			programmed mode.
1037 	 * 'promiscuous' - layer 2 switching, underlying HW in
1038 	 *			promiscuous mode.
1039 	 * 'routed'	 - layer 3 (i.e. IP) routing, underlying HW
1040 	 *			in non-promiscuous mode.
1041 	 */
1042 	while ((curr_mode < (smode + len)) && (vswp->smode_num < NUM_SMODES)) {
1043 		D2(vswp, "%s: curr_mode = [%s]", __func__, curr_mode);
1044 		if (strcmp(curr_mode, "switched") == 0) {
1045 			vswp->smode[vswp->smode_num++] = VSW_LAYER2;
1046 		} else if (strcmp(curr_mode, "promiscuous") == 0) {
1047 			vswp->smode[vswp->smode_num++] = VSW_LAYER2_PROMISC;
1048 		} else if (strcmp(curr_mode, "routed") == 0) {
1049 			vswp->smode[vswp->smode_num++] = VSW_LAYER3;
1050 		} else {
1051 			cmn_err(CE_WARN, "Unknown switch mode %s, setting to"
1052 				" default switched mode", curr_mode);
1053 			vswp->smode[vswp->smode_num++] = VSW_LAYER2;
1054 		}
1055 		curr_mode += strlen(curr_mode) + 1;
1056 	}
1057 
1058 	D2(vswp, "%d switching modes specified", vswp->smode_num);
1059 
1060 	if (vswp->smode_num > 0)
1061 		vswp->mdprops |= VSW_MD_SMODE;
1062 
1063 md_prop_exit:
1064 	(void) md_fini_handle(mdp);
1065 
1066 	kmem_free(listp, listsz);
1067 
1068 	D1(vswp, "%s: exit", __func__);
1069 }
1070 
1071 /*
1072  * Get the mac address of the physical device.
1073  *
1074  * Returns 0 on success, 1 on failure.
1075  */
1076 static int
1077 vsw_get_physaddr(vsw_t *vswp)
1078 {
1079 	mac_handle_t	mh;
1080 	char		drv[LIFNAMSIZ];
1081 	uint_t		ddi_instance;
1082 
1083 	D1(vswp, "%s: enter", __func__);
1084 
1085 	if (ddi_parse(vswp->physname, drv, &ddi_instance) != DDI_SUCCESS)
1086 		return (1);
1087 
1088 	if (mac_open(vswp->physname, ddi_instance, &mh) != 0) {
1089 		cmn_err(CE_WARN, "!mac_open %s failed", vswp->physname);
1090 		return (1);
1091 	}
1092 
1093 	READ_ENTER(&vswp->if_lockrw);
1094 	mac_unicst_get(mh, vswp->if_addr.ether_addr_octet);
1095 	RW_EXIT(&vswp->if_lockrw);
1096 
1097 	mac_close(mh);
1098 
1099 	vswp->mdprops |= VSW_DEV_MACADDR;
1100 
1101 	D1(vswp, "%s: exit", __func__);
1102 
1103 	return (0);
1104 }
1105 
1106 /*
1107  * Check to see if the card supports the setting of multiple unicst
1108  * addresses.
1109  *
1110  * Returns 0 if card supports the programming of multiple unicast addresses
1111  * and there are free address slots available, otherwise returns 1.
1112  */
1113 static int
1114 vsw_get_hw_maddr(vsw_t *vswp)
1115 {
1116 	D1(vswp, "%s: enter", __func__);
1117 
1118 	if (vswp->mh == NULL) {
1119 		return (1);
1120 	}
1121 
1122 	if (!mac_capab_get(vswp->mh, MAC_CAPAB_MULTIADDRESS, &vswp->maddr)) {
1123 		DWARN(vswp, "Unable to get capabilities of"
1124 			" underlying device (%s)", vswp->physname);
1125 		return (1);
1126 	}
1127 
1128 	if (vswp->maddr.maddr_naddrfree == 0) {
1129 		cmn_err(CE_WARN, "!device %s has no free unicast address slots",
1130 			vswp->physname);
1131 		return (1);
1132 	}
1133 
1134 	D2(vswp, "%s: %d addrs : %d free", __func__,
1135 		vswp->maddr.maddr_naddr, vswp->maddr.maddr_naddrfree);
1136 
1137 	D1(vswp, "%s: exit", __func__);
1138 
1139 	return (0);
1140 }
1141 
1142 /*
1143  * Setup for layer 2 switching.
1144  *
1145  * Returns 0 on success, 1 on failure.
1146  */
1147 static int
1148 vsw_setup_layer2(vsw_t *vswp)
1149 {
1150 	D1(vswp, "%s: enter", __func__);
1151 
1152 	vsw_switch_frame = vsw_switch_l2_frame;
1153 
1154 	/*
1155 	 * Attempt to link into the MAC layer so we can get
1156 	 * and send packets out over the physical adapter.
1157 	 */
1158 	if (vswp->mdprops & VSW_MD_PHYSNAME) {
1159 		if (vsw_mac_attach(vswp) != 0) {
1160 			/*
1161 			 * Registration with the MAC layer has failed,
1162 			 * so return 1 so that can fall back to next
1163 			 * prefered switching method.
1164 			 */
1165 			cmn_err(CE_WARN, "!Unable to join as MAC layer "
1166 				"client");
1167 			return (1);
1168 		}
1169 
1170 		if (vswp->smode[vswp->smode_idx] == VSW_LAYER2) {
1171 			/*
1172 			 * Verify that underlying device can support multiple
1173 			 * unicast mac addresses, and has free capacity.
1174 			 */
1175 			if (vsw_get_hw_maddr(vswp) != 0) {
1176 				cmn_err(CE_WARN, "!unable to setup switching");
1177 				vsw_mac_detach(vswp);
1178 				return (1);
1179 			}
1180 		}
1181 
1182 	} else {
1183 		/*
1184 		 * No physical device name found in MD which is
1185 		 * required for layer 2.
1186 		 */
1187 		cmn_err(CE_WARN, "!no physical device name specified");
1188 		return (1);
1189 	}
1190 
1191 	D1(vswp, "%s: exit", __func__);
1192 
1193 	return (0);
1194 }
1195 
1196 static int
1197 vsw_setup_layer3(vsw_t *vswp)
1198 {
1199 	D1(vswp, "%s: enter", __func__);
1200 
1201 	D2(vswp, "%s: operating in layer 3 mode", __func__);
1202 	vsw_switch_frame = vsw_switch_l3_frame;
1203 
1204 	D1(vswp, "%s: exit", __func__);
1205 
1206 	return (0);
1207 }
1208 
1209 /*
1210  * Link into the MAC layer to gain access to the services provided by
1211  * the underlying physical device driver (which should also have
1212  * registered with the MAC layer).
1213  *
1214  * Only when in layer 2 mode.
1215  */
1216 static int
1217 vsw_mac_attach(vsw_t *vswp)
1218 {
1219 	char	drv[LIFNAMSIZ];
1220 	uint_t	ddi_instance;
1221 
1222 	D1(vswp, "vsw_mac_attach: enter");
1223 
1224 	vswp->mh = NULL;
1225 	vswp->mrh = NULL;
1226 
1227 	ASSERT(vswp->mdprops & VSW_MD_PHYSNAME);
1228 
1229 	if (ddi_parse(vswp->physname, drv, &ddi_instance) != DDI_SUCCESS) {
1230 		cmn_err(CE_WARN, "invalid device name: %s", vswp->physname);
1231 		goto mac_fail_exit;
1232 	}
1233 	if ((mac_open(vswp->physname, ddi_instance, &vswp->mh)) != 0) {
1234 		cmn_err(CE_WARN, "mac_open %s failed", vswp->physname);
1235 		goto mac_fail_exit;
1236 	}
1237 
1238 	D2(vswp, "vsw_mac_attach: using device %s", vswp->physname);
1239 
1240 	/* register our rx callback function */
1241 	vswp->mrh = mac_rx_add(vswp->mh, vsw_rx_cb, (void *)vswp);
1242 
1243 	/* get the MAC tx fn */
1244 	vswp->txinfo = mac_tx_get(vswp->mh);
1245 
1246 	/* start the interface */
1247 	if (mac_start(vswp->mh) != 0) {
1248 		cmn_err(CE_WARN, "could not start mac interface");
1249 		goto mac_fail_exit;
1250 	}
1251 
1252 	D1(vswp, "vsw_mac_attach: exit");
1253 	return (0);
1254 
1255 mac_fail_exit:
1256 	if (vswp->mh != NULL) {
1257 		if (vswp->mrh != NULL)
1258 			mac_rx_remove(vswp->mh, vswp->mrh);
1259 
1260 		mac_close(vswp->mh);
1261 	}
1262 
1263 	vswp->mrh = NULL;
1264 	vswp->mh = NULL;
1265 	vswp->txinfo = NULL;
1266 
1267 	D1(vswp, "vsw_mac_attach: fail exit");
1268 	return (1);
1269 }
1270 
1271 static void
1272 vsw_mac_detach(vsw_t *vswp)
1273 {
1274 	D1(vswp, "vsw_mac_detach: enter");
1275 
1276 	if (vswp->mh != NULL) {
1277 		if (vswp->mrh != NULL)
1278 			mac_rx_remove(vswp->mh, vswp->mrh);
1279 
1280 		mac_stop(vswp->mh);
1281 		mac_close(vswp->mh);
1282 	}
1283 
1284 	vswp->mrh = NULL;
1285 	vswp->mh = NULL;
1286 	vswp->txinfo = NULL;
1287 
1288 	D1(vswp, "vsw_mac_detach: exit");
1289 }
1290 
1291 /*
1292  * Depending on the mode specified, the capabilites and capacity
1293  * of the underlying device setup the physical device.
1294  *
1295  * If in layer 3 mode, then do nothing.
1296  *
1297  * If in layer 2 programmed mode attempt to program the unicast address
1298  * associated with the port into the physical device. If this is not
1299  * possible due to resource exhaustion or simply because the device does
1300  * not support multiple unicast addresses then if required fallback onto
1301  * putting the card into promisc mode.
1302  *
1303  * If in promisc mode then simply set the card into promisc mode.
1304  *
1305  * Returns 0 success, 1 on failure.
1306  */
1307 static int
1308 vsw_set_hw(vsw_t *vswp, vsw_port_t *port)
1309 {
1310 	mac_multi_addr_t	mac_addr;
1311 	void			*mah;
1312 	int			err;
1313 
1314 	D1(vswp, "%s: enter", __func__);
1315 
1316 	if (vswp->smode[vswp->smode_idx] == VSW_LAYER3)
1317 		return (0);
1318 
1319 	if (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC) {
1320 		return (vsw_set_hw_promisc(vswp, port));
1321 	}
1322 
1323 	if (vswp->maddr.maddr_handle == NULL)
1324 		return (1);
1325 
1326 	mah = vswp->maddr.maddr_handle;
1327 
1328 	/*
1329 	 * Attempt to program the unicast address into the HW.
1330 	 */
1331 	mac_addr.mma_addrlen = ETHERADDRL;
1332 	ether_copy(&port->p_macaddr, &mac_addr.mma_addr);
1333 
1334 	err = vswp->maddr.maddr_add(mah, &mac_addr);
1335 	if (err != 0) {
1336 		cmn_err(CE_WARN, "!failed to program addr "
1337 			"%x:%x:%x:%x:%x:%x for port %d into device %s "
1338 			": err %d", port->p_macaddr.ether_addr_octet[0],
1339 			port->p_macaddr.ether_addr_octet[1],
1340 			port->p_macaddr.ether_addr_octet[2],
1341 			port->p_macaddr.ether_addr_octet[3],
1342 			port->p_macaddr.ether_addr_octet[4],
1343 			port->p_macaddr.ether_addr_octet[5],
1344 			port->p_instance, vswp->physname, err);
1345 
1346 		/*
1347 		 * Mark that attempt should be made to re-config sometime
1348 		 * in future if a port is deleted.
1349 		 */
1350 		vswp->recfg_reqd = B_TRUE;
1351 
1352 		/*
1353 		 * Only 1 mode specified, nothing more to do.
1354 		 */
1355 		if (vswp->smode_num == 1)
1356 			return (err);
1357 
1358 		/*
1359 		 * If promiscuous was next mode specified try to
1360 		 * set the card into that mode.
1361 		 */
1362 		if ((vswp->smode_idx <= (vswp->smode_num - 2)) &&
1363 			(vswp->smode[vswp->smode_idx + 1]
1364 					== VSW_LAYER2_PROMISC)) {
1365 			vswp->smode_idx += 1;
1366 			return (vsw_set_hw_promisc(vswp, port));
1367 		}
1368 		return (err);
1369 	}
1370 
1371 	port->addr_slot = mac_addr.mma_slot;
1372 	port->addr_set = VSW_ADDR_HW;
1373 
1374 	D2(vswp, "programmed addr %x:%x:%x:%x:%x:%x for port %d "
1375 		"into slot %d of device %s",
1376 		port->p_macaddr.ether_addr_octet[0],
1377 		port->p_macaddr.ether_addr_octet[1],
1378 		port->p_macaddr.ether_addr_octet[2],
1379 		port->p_macaddr.ether_addr_octet[3],
1380 		port->p_macaddr.ether_addr_octet[4],
1381 		port->p_macaddr.ether_addr_octet[5],
1382 		port->p_instance, port->addr_slot, vswp->physname);
1383 
1384 	D1(vswp, "%s: exit", __func__);
1385 
1386 	return (0);
1387 }
1388 
1389 /*
1390  * If in layer 3 mode do nothing.
1391  *
1392  * If in layer 2 switched mode remove the address from the physical
1393  * device.
1394  *
1395  * If in layer 2 promiscuous mode disable promisc mode.
1396  *
1397  * Returns 0 on success.
1398  */
1399 static int
1400 vsw_unset_hw(vsw_t *vswp, vsw_port_t *port)
1401 {
1402 	int		err;
1403 	void		*mah;
1404 
1405 	D1(vswp, "%s: enter", __func__);
1406 
1407 	if (vswp->smode[vswp->smode_idx] == VSW_LAYER3)
1408 		return (0);
1409 
1410 	if (port->addr_set == VSW_ADDR_PROMISC) {
1411 		return (vsw_unset_hw_promisc(vswp, port));
1412 	}
1413 
1414 	if (port->addr_set == VSW_ADDR_HW) {
1415 		if (vswp->mh == NULL)
1416 			return (1);
1417 
1418 		if (vswp->maddr.maddr_handle == NULL)
1419 			return (1);
1420 
1421 		mah = vswp->maddr.maddr_handle;
1422 
1423 		err = vswp->maddr.maddr_remove(mah, port->addr_slot);
1424 		if (err != 0) {
1425 			cmn_err(CE_WARN, "!Unable to remove addr "
1426 				"%x:%x:%x:%x:%x:%x for port %d from device %s"
1427 				" : (err %d)",
1428 				port->p_macaddr.ether_addr_octet[0],
1429 				port->p_macaddr.ether_addr_octet[1],
1430 				port->p_macaddr.ether_addr_octet[2],
1431 				port->p_macaddr.ether_addr_octet[3],
1432 				port->p_macaddr.ether_addr_octet[4],
1433 				port->p_macaddr.ether_addr_octet[5],
1434 				port->p_instance, vswp->physname, err);
1435 			return (err);
1436 		}
1437 
1438 		port->addr_set = VSW_ADDR_UNSET;
1439 
1440 		D2(vswp, "removed addr %x:%x:%x:%x:%x:%x for "
1441 			"port %d from device %s",
1442 			port->p_macaddr.ether_addr_octet[0],
1443 			port->p_macaddr.ether_addr_octet[1],
1444 			port->p_macaddr.ether_addr_octet[2],
1445 			port->p_macaddr.ether_addr_octet[3],
1446 			port->p_macaddr.ether_addr_octet[4],
1447 			port->p_macaddr.ether_addr_octet[5],
1448 			port->p_instance, vswp->physname);
1449 	}
1450 
1451 	D1(vswp, "%s: exit", __func__);
1452 	return (0);
1453 }
1454 
1455 /*
1456  * Set network card into promisc mode.
1457  *
1458  * Returns 0 on success, 1 on failure.
1459  */
1460 static int
1461 vsw_set_hw_promisc(vsw_t *vswp, vsw_port_t *port)
1462 {
1463 	D1(vswp, "%s: enter", __func__);
1464 
1465 	if (vswp->mh == NULL)
1466 		return (1);
1467 
1468 	if (vswp->promisc_cnt++ == 0) {
1469 		if (mac_promisc_set(vswp->mh, B_TRUE, MAC_DEVPROMISC) != 0) {
1470 			vswp->promisc_cnt--;
1471 			return (1);
1472 		}
1473 		cmn_err(CE_NOTE, "!switching device %s into promiscuous mode",
1474 				vswp->physname);
1475 	}
1476 	port->addr_set = VSW_ADDR_PROMISC;
1477 
1478 	D1(vswp, "%s: exit", __func__);
1479 
1480 	return (0);
1481 }
1482 
1483 /*
1484  * Turn off promiscuous mode on network card.
1485  *
1486  * Returns 0 on success, 1 on failure.
1487  */
1488 static int
1489 vsw_unset_hw_promisc(vsw_t *vswp, vsw_port_t *port)
1490 {
1491 	vsw_port_list_t 	*plist = &vswp->plist;
1492 
1493 	D1(vswp, "%s: enter", __func__);
1494 
1495 	if (vswp->mh == NULL)
1496 		return (1);
1497 
1498 	ASSERT(port->addr_set == VSW_ADDR_PROMISC);
1499 
1500 	if (--vswp->promisc_cnt == 0) {
1501 		if (mac_promisc_set(vswp->mh, B_FALSE, MAC_DEVPROMISC) != 0) {
1502 			vswp->promisc_cnt++;
1503 			return (1);
1504 		}
1505 
1506 		/*
1507 		 * We are exiting promisc mode either because we were
1508 		 * only in promisc mode because we had failed over from
1509 		 * switched mode due to HW resource issues, or the user
1510 		 * wanted the card in promisc mode for all the ports and
1511 		 * the last port is now being deleted. Tweak the message
1512 		 * accordingly.
1513 		 */
1514 		if (plist->num_ports != 0) {
1515 			cmn_err(CE_NOTE, "!switching device %s back to "
1516 				"programmed mode", vswp->physname);
1517 		} else {
1518 			cmn_err(CE_NOTE, "!switching device %s out of "
1519 				"promiscuous mode", vswp->physname);
1520 		}
1521 	}
1522 	port->addr_set = VSW_ADDR_UNSET;
1523 
1524 	D1(vswp, "%s: exit", __func__);
1525 	return (0);
1526 }
1527 
1528 /*
1529  * Determine whether or not we are operating in our prefered
1530  * mode and if not whether the physical resources now allow us
1531  * to operate in it.
1532  *
1533  * Should only be invoked after port which is being deleted has been
1534  * removed from the port list.
1535  */
1536 static int
1537 vsw_reconfig_hw(vsw_t *vswp)
1538 {
1539 	vsw_port_list_t 	*plist = &vswp->plist;
1540 	mac_multi_addr_t	mac_addr;
1541 	vsw_port_t		*tp;
1542 	void			*mah;
1543 	int			rv = 0;
1544 	int			s_idx;
1545 
1546 	D1(vswp, "%s: enter", __func__);
1547 
1548 	if (vswp->maddr.maddr_handle == NULL)
1549 		return (1);
1550 
1551 	/*
1552 	 * Check if there are now sufficient HW resources to
1553 	 * attempt a re-config.
1554 	 */
1555 	if (plist->num_ports > vswp->maddr.maddr_naddrfree)
1556 		return (1);
1557 
1558 	/*
1559 	 * If we are in layer 2 (i.e. switched) or would like to be
1560 	 * in layer 2 then check if any ports need to be programmed
1561 	 * into the HW.
1562 	 *
1563 	 * This can happen in two cases - switched was specified as
1564 	 * the prefered mode of operation but we exhausted the HW
1565 	 * resources and so failed over to the next specifed mode,
1566 	 * or switched was the only mode specified so after HW
1567 	 * resources were exhausted there was nothing more we
1568 	 * could do.
1569 	 */
1570 	if (vswp->smode_idx > 0)
1571 		s_idx = vswp->smode_idx - 1;
1572 	else
1573 		s_idx = vswp->smode_idx;
1574 
1575 	if (vswp->smode[s_idx] == VSW_LAYER2) {
1576 		mah = vswp->maddr.maddr_handle;
1577 
1578 		D2(vswp, "%s: attempting reconfig..", __func__);
1579 
1580 		/*
1581 		 * Scan the port list for any port whose address has not
1582 		 * be programmed in HW - there should be a max of one.
1583 		 */
1584 		for (tp = plist->head; tp != NULL; tp = tp->p_next) {
1585 			if (tp->addr_set != VSW_ADDR_HW) {
1586 				mac_addr.mma_addrlen = ETHERADDRL;
1587 				ether_copy(&tp->p_macaddr, &mac_addr.mma_addr);
1588 
1589 				rv = vswp->maddr.maddr_add(mah, &mac_addr);
1590 				if (rv != 0) {
1591 					DWARN(vswp, "Error setting addr in "
1592 						"HW for port %d err %d",
1593 						tp->p_instance, rv);
1594 					goto reconfig_err_exit;
1595 				}
1596 				tp->addr_slot = mac_addr.mma_slot;
1597 
1598 				D2(vswp, "re-programmed port %d "
1599 					"addr %x:%x:%x:%x:%x:%x into slot %d"
1600 					" of device %s", tp->p_instance,
1601 					tp->p_macaddr.ether_addr_octet[0],
1602 					tp->p_macaddr.ether_addr_octet[1],
1603 					tp->p_macaddr.ether_addr_octet[2],
1604 					tp->p_macaddr.ether_addr_octet[3],
1605 					tp->p_macaddr.ether_addr_octet[4],
1606 					tp->p_macaddr.ether_addr_octet[5],
1607 					tp->addr_slot, vswp->physname);
1608 
1609 				/*
1610 				 * If up to now we had to put the card into
1611 				 * promisc mode to see this address, we
1612 				 * can now safely disable promisc mode.
1613 				 */
1614 				if (tp->addr_set == VSW_ADDR_PROMISC)
1615 					(void) vsw_unset_hw_promisc(vswp, tp);
1616 
1617 				tp->addr_set = VSW_ADDR_HW;
1618 			}
1619 		}
1620 
1621 		/* no further re-config needed */
1622 		vswp->recfg_reqd = B_FALSE;
1623 
1624 		vswp->smode_idx = s_idx;
1625 
1626 		return (0);
1627 	}
1628 
1629 reconfig_err_exit:
1630 	return (rv);
1631 }
1632 
1633 /*
1634  * receive callback routine. Invoked by MAC layer when there
1635  * are pkts being passed up from physical device.
1636  *
1637  * PERF: It may be more efficient when the card is in promisc
1638  * mode to check the dest address of the pkts here (against
1639  * the FDB) rather than checking later. Needs to be investigated.
1640  */
1641 static void
1642 vsw_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
1643 {
1644 	_NOTE(ARGUNUSED(mrh))
1645 
1646 	vsw_t		*vswp = (vsw_t *)arg;
1647 
1648 	ASSERT(vswp != NULL);
1649 
1650 	D1(vswp, "vsw_rx_cb: enter");
1651 
1652 	/* switch the chain of packets received */
1653 	vsw_switch_frame(vswp, mp, VSW_PHYSDEV, NULL, NULL);
1654 
1655 	D1(vswp, "vsw_rx_cb: exit");
1656 }
1657 
1658 /*
1659  * Send a message out over the physical device via the MAC layer.
1660  *
1661  * Returns any mblks that it was unable to transmit.
1662  */
1663 static mblk_t *
1664 vsw_tx_msg(vsw_t *vswp, mblk_t *mp)
1665 {
1666 	const mac_txinfo_t	*mtp;
1667 	mblk_t			*nextp;
1668 
1669 	if (vswp->mh == NULL) {
1670 		DERR(vswp, "vsw_tx_msg: dropping pkts: no tx routine avail");
1671 		return (mp);
1672 	} else {
1673 		for (;;) {
1674 			nextp = mp->b_next;
1675 			mp->b_next = NULL;
1676 
1677 			mtp = vswp->txinfo;
1678 			if ((mp = mtp->mt_fn(mtp->mt_arg, mp)) != NULL) {
1679 				mp->b_next = nextp;
1680 				break;
1681 			}
1682 
1683 			if ((mp = nextp) == NULL)
1684 				break;
1685 
1686 		}
1687 
1688 	}
1689 
1690 	return (mp);
1691 }
1692 
1693 /*
1694  * Register with the MAC layer as a network device, so we
1695  * can be plumbed if necessary.
1696  */
1697 static int
1698 vsw_mac_register(vsw_t *vswp)
1699 {
1700 	mac_register_t	*macp;
1701 	int		rv;
1702 
1703 	D1(vswp, "%s: enter", __func__);
1704 
1705 	if ((macp = mac_alloc(MAC_VERSION)) == NULL)
1706 		return (EINVAL);
1707 	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1708 	macp->m_driver = vswp;
1709 	macp->m_dip = vswp->dip;
1710 	macp->m_src_addr = (uint8_t *)&vswp->if_addr;
1711 	macp->m_callbacks = &vsw_m_callbacks;
1712 	macp->m_min_sdu = 0;
1713 	macp->m_max_sdu = ETHERMTU;
1714 	rv = mac_register(macp, &vswp->if_mh);
1715 	mac_free(macp);
1716 	if (rv == 0)
1717 		vswp->if_state |= VSW_IF_REG;
1718 
1719 	D1(vswp, "%s: exit", __func__);
1720 
1721 	return (rv);
1722 }
1723 
1724 static int
1725 vsw_mac_unregister(vsw_t *vswp)
1726 {
1727 	int		rv = 0;
1728 
1729 	D1(vswp, "%s: enter", __func__);
1730 
1731 	WRITE_ENTER(&vswp->if_lockrw);
1732 
1733 	if (vswp->if_state & VSW_IF_REG) {
1734 		rv = mac_unregister(vswp->if_mh);
1735 		if (rv != 0) {
1736 			DWARN(vswp, "%s: unable to unregister from MAC "
1737 				"framework", __func__);
1738 
1739 			RW_EXIT(&vswp->if_lockrw);
1740 			D1(vswp, "%s: fail exit", __func__);
1741 			return (rv);
1742 		}
1743 
1744 		/* mark i/f as down and unregistered */
1745 		vswp->if_state &= ~(VSW_IF_UP | VSW_IF_REG);
1746 	}
1747 	RW_EXIT(&vswp->if_lockrw);
1748 
1749 	vswp->mdprops &= ~(VSW_MD_MACADDR | VSW_DEV_MACADDR);
1750 
1751 	D1(vswp, "%s: exit", __func__);
1752 
1753 	return (rv);
1754 }
1755 
1756 static int
1757 vsw_m_stat(void *arg, uint_t stat, uint64_t *val)
1758 {
1759 	vsw_t			*vswp = (vsw_t *)arg;
1760 
1761 	D1(vswp, "%s: enter", __func__);
1762 
1763 	if (vswp->mh == NULL)
1764 		return (EINVAL);
1765 
1766 	/* return stats from underlying device */
1767 	*val = mac_stat_get(vswp->mh, stat);
1768 	return (0);
1769 }
1770 
1771 static void
1772 vsw_m_stop(void *arg)
1773 {
1774 	vsw_t		*vswp = (vsw_t *)arg;
1775 
1776 	D1(vswp, "%s: enter", __func__);
1777 
1778 	WRITE_ENTER(&vswp->if_lockrw);
1779 	vswp->if_state &= ~VSW_IF_UP;
1780 	RW_EXIT(&vswp->if_lockrw);
1781 
1782 	D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state);
1783 }
1784 
1785 static int
1786 vsw_m_start(void *arg)
1787 {
1788 	vsw_t		*vswp = (vsw_t *)arg;
1789 
1790 	D1(vswp, "%s: enter", __func__);
1791 
1792 	WRITE_ENTER(&vswp->if_lockrw);
1793 	vswp->if_state |= VSW_IF_UP;
1794 	RW_EXIT(&vswp->if_lockrw);
1795 
1796 	D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state);
1797 	return (0);
1798 }
1799 
1800 /*
1801  * Change the local interface address.
1802  */
1803 static int
1804 vsw_m_unicst(void *arg, const uint8_t *macaddr)
1805 {
1806 	vsw_t		*vswp = (vsw_t *)arg;
1807 
1808 	D1(vswp, "%s: enter", __func__);
1809 
1810 	WRITE_ENTER(&vswp->if_lockrw);
1811 	ether_copy(macaddr, &vswp->if_addr);
1812 	RW_EXIT(&vswp->if_lockrw);
1813 
1814 	D1(vswp, "%s: exit", __func__);
1815 
1816 	return (0);
1817 }
1818 
1819 static int
1820 vsw_m_multicst(void *arg, boolean_t add, const uint8_t *mca)
1821 {
1822 	vsw_t		*vswp = (vsw_t *)arg;
1823 	mcst_addr_t	*mcst_p = NULL;
1824 	uint64_t	addr = 0x0;
1825 	int		i, ret = 0;
1826 
1827 	D1(vswp, "%s: enter", __func__);
1828 
1829 	/*
1830 	 * Convert address into form that can be used
1831 	 * as hash table key.
1832 	 */
1833 	for (i = 0; i < ETHERADDRL; i++) {
1834 		addr = (addr << 8) | mca[i];
1835 	}
1836 
1837 	D2(vswp, "%s: addr = 0x%llx", __func__, addr);
1838 
1839 	if (add) {
1840 		D2(vswp, "%s: adding multicast", __func__);
1841 		if (vsw_add_mcst(vswp, VSW_LOCALDEV, addr, NULL) == 0) {
1842 			/*
1843 			 * Update the list of multicast addresses
1844 			 * contained within the vsw_t structure to
1845 			 * include this new one.
1846 			 */
1847 			mcst_p = kmem_zalloc(sizeof (mcst_addr_t), KM_NOSLEEP);
1848 			if (mcst_p == NULL) {
1849 				DERR(vswp, "%s unable to alloc mem", __func__);
1850 				return (1);
1851 			}
1852 			mcst_p->addr = addr;
1853 
1854 			mutex_enter(&vswp->mca_lock);
1855 			mcst_p->nextp = vswp->mcap;
1856 			vswp->mcap = mcst_p;
1857 			mutex_exit(&vswp->mca_lock);
1858 
1859 			/*
1860 			 * Call into the underlying driver to program the
1861 			 * address into HW.
1862 			 */
1863 			if (vswp->mh != NULL) {
1864 				ret = mac_multicst_add(vswp->mh, mca);
1865 				if (ret != 0) {
1866 					cmn_err(CE_WARN, "!unable to add "
1867 						"multicast address");
1868 					goto vsw_remove_addr;
1869 				}
1870 			}
1871 		} else {
1872 			cmn_err(CE_WARN, "!unable to add multicast address");
1873 		}
1874 		return (ret);
1875 	}
1876 
1877 vsw_remove_addr:
1878 
1879 	D2(vswp, "%s: removing multicast", __func__);
1880 	/*
1881 	 * Remove the address from the hash table..
1882 	 */
1883 	if (vsw_del_mcst(vswp, VSW_LOCALDEV, addr, NULL) == 0) {
1884 
1885 		/*
1886 		 * ..and then from the list maintained in the
1887 		 * vsw_t structure.
1888 		 */
1889 		vsw_del_addr(VSW_LOCALDEV, vswp, addr);
1890 
1891 		if (vswp->mh != NULL)
1892 			(void) mac_multicst_remove(vswp->mh, mca);
1893 	}
1894 
1895 	D1(vswp, "%s: exit", __func__);
1896 
1897 	return (0);
1898 }
1899 
1900 static int
1901 vsw_m_promisc(void *arg, boolean_t on)
1902 {
1903 	vsw_t		*vswp = (vsw_t *)arg;
1904 
1905 	D1(vswp, "%s: enter", __func__);
1906 
1907 	WRITE_ENTER(&vswp->if_lockrw);
1908 	if (on)
1909 		vswp->if_state |= VSW_IF_PROMISC;
1910 	else
1911 		vswp->if_state &= ~VSW_IF_PROMISC;
1912 	RW_EXIT(&vswp->if_lockrw);
1913 
1914 	D1(vswp, "%s: exit", __func__);
1915 
1916 	return (0);
1917 }
1918 
1919 static mblk_t *
1920 vsw_m_tx(void *arg, mblk_t *mp)
1921 {
1922 	vsw_t		*vswp = (vsw_t *)arg;
1923 
1924 	D1(vswp, "%s: enter", __func__);
1925 
1926 	vsw_switch_frame(vswp, mp, VSW_LOCALDEV, NULL, NULL);
1927 
1928 	D1(vswp, "%s: exit", __func__);
1929 
1930 	return (NULL);
1931 }
1932 
1933 /*
1934  * Register for machine description (MD) updates.
1935  */
1936 static void
1937 vsw_mdeg_register(vsw_t *vswp)
1938 {
1939 	mdeg_prop_spec_t	*pspecp;
1940 	mdeg_node_spec_t	*inst_specp;
1941 	mdeg_handle_t		mdeg_hdl;
1942 	size_t			templatesz;
1943 	int			inst, rv;
1944 
1945 	D1(vswp, "%s: enter", __func__);
1946 
1947 	inst = ddi_prop_get_int(DDI_DEV_T_ANY, vswp->dip,
1948 		DDI_PROP_DONTPASS, reg_propname, -1);
1949 	if (inst == -1) {
1950 		DERR(vswp, "%s: unable to get %s property",
1951 						__func__, reg_propname);
1952 		return;
1953 	}
1954 
1955 	D2(vswp, "%s: instance %d registering with mdeg", __func__, inst);
1956 
1957 	/*
1958 	 * Allocate and initialize a per-instance copy
1959 	 * of the global property spec array that will
1960 	 * uniquely identify this vsw instance.
1961 	 */
1962 	templatesz = sizeof (vsw_prop_template);
1963 	pspecp = kmem_zalloc(templatesz, KM_SLEEP);
1964 
1965 	bcopy(vsw_prop_template, pspecp, templatesz);
1966 
1967 	VSW_SET_MDEG_PROP_INST(pspecp, inst);
1968 
1969 	/* initialize the complete prop spec structure */
1970 	inst_specp = kmem_zalloc(sizeof (mdeg_node_spec_t), KM_SLEEP);
1971 	inst_specp->namep = "virtual-device";
1972 	inst_specp->specp = pspecp;
1973 
1974 	/* perform the registration */
1975 	rv = mdeg_register(inst_specp, &vport_match, vsw_mdeg_cb,
1976 	    (void *)vswp, &mdeg_hdl);
1977 
1978 	if (rv != MDEG_SUCCESS) {
1979 		DERR(vswp, "%s: mdeg_register failed (%d)\n", __func__, rv);
1980 		kmem_free(inst_specp, sizeof (mdeg_node_spec_t));
1981 		kmem_free(pspecp, templatesz);
1982 		return;
1983 	}
1984 
1985 	/* save off data that will be needed later */
1986 	vswp->inst_spec = inst_specp;
1987 	vswp->mdeg_hdl = mdeg_hdl;
1988 
1989 	D1(vswp, "%s: exit", __func__);
1990 }
1991 
1992 static void
1993 vsw_mdeg_unregister(vsw_t *vswp)
1994 {
1995 	D1(vswp, "vsw_mdeg_unregister: enter");
1996 
1997 	(void) mdeg_unregister(vswp->mdeg_hdl);
1998 
1999 	if (vswp->inst_spec->specp != NULL) {
2000 		(void) kmem_free(vswp->inst_spec->specp,
2001 			sizeof (vsw_prop_template));
2002 		vswp->inst_spec->specp = NULL;
2003 	}
2004 
2005 	if (vswp->inst_spec != NULL) {
2006 		(void) kmem_free(vswp->inst_spec,
2007 			sizeof (mdeg_node_spec_t));
2008 		vswp->inst_spec = NULL;
2009 	}
2010 
2011 	D1(vswp, "vsw_mdeg_unregister: exit");
2012 }
2013 
2014 static int
2015 vsw_mdeg_cb(void *cb_argp, mdeg_result_t *resp)
2016 {
2017 	vsw_t		*vswp;
2018 	int		idx;
2019 	md_t		*mdp;
2020 	mde_cookie_t	node;
2021 	uint64_t	inst;
2022 
2023 	if (resp == NULL)
2024 		return (MDEG_FAILURE);
2025 
2026 	vswp = (vsw_t *)cb_argp;
2027 
2028 	D1(vswp, "%s: added %d : removed %d : matched %d",
2029 		__func__, resp->added.nelem, resp->removed.nelem,
2030 		resp->match_prev.nelem);
2031 
2032 	/* process added ports */
2033 	for (idx = 0; idx < resp->added.nelem; idx++) {
2034 		mdp = resp->added.mdp;
2035 		node = resp->added.mdep[idx];
2036 
2037 		D2(vswp, "%s: adding node(%d) 0x%lx", __func__, idx, node);
2038 
2039 		if (vsw_port_add(vswp, mdp, &node) != 0) {
2040 			cmn_err(CE_WARN, "Unable to add new port (0x%lx)",
2041 					node);
2042 		}
2043 	}
2044 
2045 	/* process removed ports */
2046 	for (idx = 0; idx < resp->removed.nelem; idx++) {
2047 		mdp = resp->removed.mdp;
2048 		node = resp->removed.mdep[idx];
2049 
2050 		if (md_get_prop_val(mdp, node, id_propname, &inst)) {
2051 			DERR(vswp, "%s: prop(%s) not found port(%d)",
2052 				__func__, id_propname, idx);
2053 			continue;
2054 		}
2055 
2056 		D2(vswp, "%s: removing node(%d) 0x%lx", __func__, idx, node);
2057 
2058 		if (vsw_port_detach(vswp, inst) != 0) {
2059 			cmn_err(CE_WARN, "Unable to remove port %ld", inst);
2060 		}
2061 	}
2062 
2063 	/*
2064 	 * Currently no support for updating already active ports.
2065 	 * So, ignore the match_curr and match_priv arrays for now.
2066 	 */
2067 
2068 	D1(vswp, "%s: exit", __func__);
2069 
2070 	return (MDEG_SUCCESS);
2071 }
2072 
2073 /*
2074  * Add a new port to the system.
2075  *
2076  * Returns 0 on success, 1 on failure.
2077  */
2078 int
2079 vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node)
2080 {
2081 	uint64_t		ldc_id;
2082 	uint8_t			*addrp;
2083 	int			i, addrsz;
2084 	int			num_nodes = 0, nchan = 0;
2085 	int			listsz = 0;
2086 	mde_cookie_t		*listp = NULL;
2087 	struct ether_addr	ea;
2088 	uint64_t		macaddr;
2089 	uint64_t		inst = 0;
2090 	vsw_port_t		*port;
2091 
2092 	if (md_get_prop_val(mdp, *node, id_propname, &inst)) {
2093 		DWARN(vswp, "%s: prop(%s) not found", __func__,
2094 			id_propname);
2095 		return (1);
2096 	}
2097 
2098 	/*
2099 	 * Find the channel endpoint node(s) (which should be under this
2100 	 * port node) which contain the channel id(s).
2101 	 */
2102 	if ((num_nodes = md_node_count(mdp)) <= 0) {
2103 		DERR(vswp, "%s: invalid number of nodes found (%d)",
2104 			__func__, num_nodes);
2105 		return (1);
2106 	}
2107 
2108 	/* allocate enough space for node list */
2109 	listsz = num_nodes * sizeof (mde_cookie_t);
2110 	listp = kmem_zalloc(listsz, KM_SLEEP);
2111 
2112 	nchan = md_scan_dag(mdp, *node,
2113 		md_find_name(mdp, chan_propname),
2114 		md_find_name(mdp, "fwd"), listp);
2115 
2116 	if (nchan <= 0) {
2117 		DWARN(vswp, "%s: no %s nodes found", __func__, chan_propname);
2118 		kmem_free(listp, listsz);
2119 		return (1);
2120 	}
2121 
2122 	D2(vswp, "%s: %d %s nodes found", __func__, nchan, chan_propname);
2123 
2124 	/* use property from first node found */
2125 	if (md_get_prop_val(mdp, listp[0], id_propname, &ldc_id)) {
2126 		DWARN(vswp, "%s: prop(%s) not found\n", __func__,
2127 			id_propname);
2128 		kmem_free(listp, listsz);
2129 		return (1);
2130 	}
2131 
2132 	/* don't need list any more */
2133 	kmem_free(listp, listsz);
2134 
2135 	D2(vswp, "%s: ldc_id 0x%llx", __func__, ldc_id);
2136 
2137 	/* read mac-address property */
2138 	if (md_get_prop_data(mdp, *node, remaddr_propname,
2139 					&addrp, &addrsz)) {
2140 		DWARN(vswp, "%s: prop(%s) not found",
2141 				__func__, remaddr_propname);
2142 		return (1);
2143 	}
2144 
2145 	if (addrsz < ETHERADDRL) {
2146 		DWARN(vswp, "%s: invalid address size", __func__);
2147 		return (1);
2148 	}
2149 
2150 	macaddr = *((uint64_t *)addrp);
2151 	D2(vswp, "%s: remote mac address 0x%llx", __func__, macaddr);
2152 
2153 	for (i = ETHERADDRL - 1; i >= 0; i--) {
2154 		ea.ether_addr_octet[i] = macaddr & 0xFF;
2155 		macaddr >>= 8;
2156 	}
2157 
2158 	if (vsw_port_attach(vswp, (int)inst, &ldc_id, 1, &ea) != 0) {
2159 		DERR(vswp, "%s: failed to attach port", __func__);
2160 		return (1);
2161 	}
2162 
2163 	port = vsw_lookup_port(vswp, (int)inst);
2164 
2165 	/* just successfuly created the port, so it should exist */
2166 	ASSERT(port != NULL);
2167 
2168 	return (0);
2169 }
2170 
2171 /*
2172  * Attach the specified port.
2173  *
2174  * Returns 0 on success, 1 on failure.
2175  */
2176 static int
2177 vsw_port_attach(vsw_t *vswp, int p_instance, uint64_t *ldcids, int nids,
2178 struct ether_addr *macaddr)
2179 {
2180 	vsw_port_list_t		*plist = &vswp->plist;
2181 	vsw_port_t		*port, **prev_port;
2182 	int			i;
2183 
2184 	D1(vswp, "%s: enter : port %d", __func__, p_instance);
2185 
2186 	/* port already exists? */
2187 	READ_ENTER(&plist->lockrw);
2188 	for (port = plist->head; port != NULL; port = port->p_next) {
2189 		if (port->p_instance == p_instance) {
2190 			DWARN(vswp, "%s: port instance %d already attached",
2191 				__func__, p_instance);
2192 			RW_EXIT(&plist->lockrw);
2193 			return (1);
2194 		}
2195 	}
2196 	RW_EXIT(&plist->lockrw);
2197 
2198 	port = kmem_zalloc(sizeof (vsw_port_t), KM_SLEEP);
2199 	port->p_vswp = vswp;
2200 	port->p_instance = p_instance;
2201 	port->p_ldclist.num_ldcs = 0;
2202 	port->p_ldclist.head = NULL;
2203 	port->addr_set = VSW_ADDR_UNSET;
2204 
2205 	rw_init(&port->p_ldclist.lockrw, NULL, RW_DRIVER, NULL);
2206 
2207 	mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL);
2208 	mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL);
2209 
2210 	mutex_init(&port->ref_lock, NULL, MUTEX_DRIVER, NULL);
2211 	cv_init(&port->ref_cv, NULL, CV_DRIVER, NULL);
2212 
2213 	mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL);
2214 	cv_init(&port->state_cv, NULL, CV_DRIVER, NULL);
2215 	port->state = VSW_PORT_INIT;
2216 
2217 	if (nids > VSW_PORT_MAX_LDCS) {
2218 		D2(vswp, "%s: using first of %d ldc ids",
2219 			__func__, nids);
2220 		nids = VSW_PORT_MAX_LDCS;
2221 	}
2222 
2223 	D2(vswp, "%s: %d nids", __func__, nids);
2224 	for (i = 0; i < nids; i++) {
2225 		D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[i]);
2226 		if (vsw_ldc_attach(port, (uint64_t)ldcids[i]) != 0) {
2227 			DERR(vswp, "%s: ldc_attach failed", __func__);
2228 
2229 			rw_destroy(&port->p_ldclist.lockrw);
2230 
2231 			cv_destroy(&port->ref_cv);
2232 			mutex_destroy(&port->ref_lock);
2233 
2234 			cv_destroy(&port->state_cv);
2235 			mutex_destroy(&port->state_lock);
2236 
2237 			mutex_destroy(&port->tx_lock);
2238 			mutex_destroy(&port->mca_lock);
2239 			kmem_free(port, sizeof (vsw_port_t));
2240 			return (1);
2241 		}
2242 	}
2243 
2244 	ether_copy(macaddr, &port->p_macaddr);
2245 
2246 	WRITE_ENTER(&plist->lockrw);
2247 
2248 	/* create the fdb entry for this port/mac address */
2249 	(void) vsw_add_fdb(vswp, port);
2250 
2251 	(void) vsw_set_hw(vswp, port);
2252 
2253 	/* link it into the list of ports for this vsw instance */
2254 	prev_port = (vsw_port_t **)(&plist->head);
2255 	port->p_next = *prev_port;
2256 	*prev_port = port;
2257 	plist->num_ports++;
2258 	RW_EXIT(&plist->lockrw);
2259 
2260 	/*
2261 	 * Initialise the port and any ldc's under it.
2262 	 */
2263 	(void) vsw_init_ldcs(port);
2264 
2265 	D1(vswp, "%s: exit", __func__);
2266 	return (0);
2267 }
2268 
2269 /*
2270  * Detach the specified port.
2271  *
2272  * Returns 0 on success, 1 on failure.
2273  */
2274 static int
2275 vsw_port_detach(vsw_t *vswp, int p_instance)
2276 {
2277 	vsw_port_t	*port = NULL;
2278 	vsw_port_list_t	*plist = &vswp->plist;
2279 
2280 	D1(vswp, "%s: enter: port id %d", __func__, p_instance);
2281 
2282 	WRITE_ENTER(&plist->lockrw);
2283 
2284 	if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) {
2285 		RW_EXIT(&plist->lockrw);
2286 		return (1);
2287 	}
2288 
2289 	if (vsw_plist_del_node(vswp, port)) {
2290 		RW_EXIT(&plist->lockrw);
2291 		return (1);
2292 	}
2293 
2294 	/* Remove address if was programmed into HW. */
2295 	(void) vsw_unset_hw(vswp, port);
2296 
2297 	/* Remove the fdb entry for this port/mac address */
2298 	(void) vsw_del_fdb(vswp, port);
2299 
2300 	/* Remove any multicast addresses.. */
2301 	vsw_del_mcst_port(port);
2302 
2303 	/*
2304 	 * No longer need to hold writer lock on port list now
2305 	 * that we have unlinked the target port from the list.
2306 	 */
2307 	RW_EXIT(&plist->lockrw);
2308 
2309 	READ_ENTER(&plist->lockrw);
2310 
2311 	if (vswp->recfg_reqd)
2312 		(void) vsw_reconfig_hw(vswp);
2313 
2314 	RW_EXIT(&plist->lockrw);
2315 
2316 	if (vsw_port_delete(port)) {
2317 		return (1);
2318 	}
2319 
2320 	D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance);
2321 	return (0);
2322 }
2323 
2324 /*
2325  * Detach all active ports.
2326  *
2327  * Returns 0 on success, 1 on failure.
2328  */
2329 static int
2330 vsw_detach_ports(vsw_t *vswp)
2331 {
2332 	vsw_port_list_t 	*plist = &vswp->plist;
2333 	vsw_port_t		*port = NULL;
2334 
2335 	D1(vswp, "%s: enter", __func__);
2336 
2337 	WRITE_ENTER(&plist->lockrw);
2338 
2339 	while ((port = plist->head) != NULL) {
2340 		if (vsw_plist_del_node(vswp, port)) {
2341 			DERR(vswp, "%s: Error deleting port %d"
2342 				" from port list", __func__,
2343 				port->p_instance);
2344 			RW_EXIT(&plist->lockrw);
2345 			return (1);
2346 		}
2347 
2348 		/* Remove address if was programmed into HW. */
2349 		(void) vsw_unset_hw(vswp, port);
2350 
2351 		/* Remove the fdb entry for this port/mac address */
2352 		(void) vsw_del_fdb(vswp, port);
2353 
2354 		/* Remove any multicast addresses.. */
2355 		vsw_del_mcst_port(port);
2356 
2357 		/*
2358 		 * No longer need to hold the lock on the port list
2359 		 * now that we have unlinked the target port from the
2360 		 * list.
2361 		 */
2362 		RW_EXIT(&plist->lockrw);
2363 		if (vsw_port_delete(port)) {
2364 			DERR(vswp, "%s: Error deleting port %d",
2365 				__func__, port->p_instance);
2366 			return (1);
2367 		}
2368 		WRITE_ENTER(&plist->lockrw);
2369 	}
2370 	RW_EXIT(&plist->lockrw);
2371 
2372 	D1(vswp, "%s: exit", __func__);
2373 
2374 	return (0);
2375 }
2376 
2377 /*
2378  * Delete the specified port.
2379  *
2380  * Returns 0 on success, 1 on failure.
2381  */
2382 static int
2383 vsw_port_delete(vsw_port_t *port)
2384 {
2385 	vsw_ldc_list_t 		*ldcl;
2386 	vsw_t			*vswp = port->p_vswp;
2387 
2388 	D1(vswp, "%s: enter : port id %d", __func__, port->p_instance);
2389 
2390 	(void) vsw_uninit_ldcs(port);
2391 
2392 	/*
2393 	 * Wait for any pending ctrl msg tasks which reference this
2394 	 * port to finish.
2395 	 */
2396 	if (vsw_drain_port_taskq(port))
2397 		return (1);
2398 
2399 	/*
2400 	 * Wait for port reference count to hit zero.
2401 	 */
2402 	mutex_enter(&port->ref_lock);
2403 	while (port->ref_cnt != 0)
2404 		cv_wait(&port->ref_cv, &port->ref_lock);
2405 	mutex_exit(&port->ref_lock);
2406 
2407 	/*
2408 	 * Wait for any active callbacks to finish
2409 	 */
2410 	if (vsw_drain_ldcs(port))
2411 		return (1);
2412 
2413 	ldcl = &port->p_ldclist;
2414 	WRITE_ENTER(&ldcl->lockrw);
2415 	while (ldcl->num_ldcs > 0) {
2416 		if (vsw_ldc_detach(port, ldcl->head->ldc_id) != 0) {;
2417 			cmn_err(CE_WARN, "unable to detach ldc %ld",
2418 					ldcl->head->ldc_id);
2419 			RW_EXIT(&ldcl->lockrw);
2420 			return (1);
2421 		}
2422 	}
2423 	RW_EXIT(&ldcl->lockrw);
2424 
2425 	rw_destroy(&port->p_ldclist.lockrw);
2426 
2427 	mutex_destroy(&port->mca_lock);
2428 	mutex_destroy(&port->tx_lock);
2429 	cv_destroy(&port->ref_cv);
2430 	mutex_destroy(&port->ref_lock);
2431 
2432 	cv_destroy(&port->state_cv);
2433 	mutex_destroy(&port->state_lock);
2434 
2435 	kmem_free(port, sizeof (vsw_port_t));
2436 
2437 	D1(vswp, "%s: exit", __func__);
2438 
2439 	return (0);
2440 }
2441 
2442 /*
2443  * Attach a logical domain channel (ldc) under a specified port.
2444  *
2445  * Returns 0 on success, 1 on failure.
2446  */
2447 static int
2448 vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id)
2449 {
2450 	vsw_t 		*vswp = port->p_vswp;
2451 	vsw_ldc_list_t *ldcl = &port->p_ldclist;
2452 	vsw_ldc_t 	*ldcp = NULL;
2453 	ldc_attr_t 	attr;
2454 	ldc_status_t	istatus;
2455 	int 		status = DDI_FAILURE;
2456 	int		rv;
2457 
2458 	D1(vswp, "%s: enter", __func__);
2459 
2460 	ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP);
2461 	if (ldcp == NULL) {
2462 		DERR(vswp, "%s: kmem_zalloc failed", __func__);
2463 		return (1);
2464 	}
2465 	ldcp->ldc_id = ldc_id;
2466 
2467 	/* allocate pool of receive mblks */
2468 	rv = vio_create_mblks(vsw_num_mblks, vsw_mblk_size, &(ldcp->rxh));
2469 	if (rv) {
2470 		DWARN(vswp, "%s: unable to create free mblk pool for"
2471 			" channel %ld (rv %d)", __func__, ldc_id, rv);
2472 		kmem_free(ldcp, sizeof (vsw_ldc_t));
2473 		return (1);
2474 	}
2475 
2476 	mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL);
2477 	mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL);
2478 	mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL);
2479 	cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL);
2480 
2481 	/* required for handshake with peer */
2482 	ldcp->local_session = (uint64_t)ddi_get_lbolt();
2483 	ldcp->peer_session = 0;
2484 	ldcp->session_status = 0;
2485 
2486 	mutex_init(&ldcp->hss_lock, NULL, MUTEX_DRIVER, NULL);
2487 	ldcp->hss_id = 1;	/* Initial handshake session id */
2488 
2489 	/* only set for outbound lane, inbound set by peer */
2490 	mutex_init(&ldcp->lane_in.seq_lock, NULL, MUTEX_DRIVER, NULL);
2491 	mutex_init(&ldcp->lane_out.seq_lock, NULL, MUTEX_DRIVER, NULL);
2492 	vsw_set_lane_attr(vswp, &ldcp->lane_out);
2493 
2494 	attr.devclass = LDC_DEV_NT_SVC;
2495 	attr.instance = ddi_get_instance(vswp->dip);
2496 	attr.mode = LDC_MODE_UNRELIABLE;
2497 	attr.mtu = VSW_LDC_MTU;
2498 	status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle);
2499 	if (status != 0) {
2500 		DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)",
2501 		    __func__, ldc_id, status);
2502 		goto ldc_attach_fail;
2503 	}
2504 
2505 	status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp);
2506 	if (status != 0) {
2507 		DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)",
2508 		    __func__, ldc_id, status);
2509 		(void) ldc_fini(ldcp->ldc_handle);
2510 		goto ldc_attach_fail;
2511 	}
2512 
2513 
2514 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
2515 		DERR(vswp, "%s: ldc_status failed", __func__);
2516 		return (1);
2517 	}
2518 
2519 	ldcp->ldc_status = istatus;
2520 	ldcp->ldc_port = port;
2521 	ldcp->ldc_vswp = vswp;
2522 
2523 	/* link it into the list of channels for this port */
2524 	WRITE_ENTER(&ldcl->lockrw);
2525 	ldcp->ldc_next = ldcl->head;
2526 	ldcl->head = ldcp;
2527 	ldcl->num_ldcs++;
2528 	RW_EXIT(&ldcl->lockrw);
2529 
2530 	D1(vswp, "%s: exit", __func__);
2531 	return (0);
2532 
2533 ldc_attach_fail:
2534 	mutex_destroy(&ldcp->ldc_txlock);
2535 	mutex_destroy(&ldcp->ldc_cblock);
2536 
2537 	cv_destroy(&ldcp->drain_cv);
2538 
2539 	if (ldcp->rxh != NULL) {
2540 		if (vio_destroy_mblks(ldcp->rxh) != 0) {
2541 			/*
2542 			 * Something odd has happened, as the destroy
2543 			 * will only fail if some mblks have been allocated
2544 			 * from the pool already (which shouldn't happen)
2545 			 * and have not been returned.
2546 			 *
2547 			 * Add the pool pointer to a list maintained in
2548 			 * the device instance. Another attempt will be made
2549 			 * to free the pool when the device itself detaches.
2550 			 */
2551 			cmn_err(CE_WARN, "Creation of ldc channel %ld failed"
2552 				" and cannot destroy associated mblk pool",
2553 				ldc_id);
2554 			ldcp->rxh->nextp =  vswp->rxh;
2555 			vswp->rxh = ldcp->rxh;
2556 		}
2557 	}
2558 	mutex_destroy(&ldcp->drain_cv_lock);
2559 	mutex_destroy(&ldcp->hss_lock);
2560 
2561 	mutex_destroy(&ldcp->lane_in.seq_lock);
2562 	mutex_destroy(&ldcp->lane_out.seq_lock);
2563 	kmem_free(ldcp, sizeof (vsw_ldc_t));
2564 
2565 	return (1);
2566 }
2567 
2568 /*
2569  * Detach a logical domain channel (ldc) belonging to a
2570  * particular port.
2571  *
2572  * Returns 0 on success, 1 on failure.
2573  */
2574 static int
2575 vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id)
2576 {
2577 	vsw_t 		*vswp = port->p_vswp;
2578 	vsw_ldc_t 	*ldcp, *prev_ldcp;
2579 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
2580 	int 		rv;
2581 
2582 	prev_ldcp = ldcl->head;
2583 	for (; (ldcp = prev_ldcp) != NULL; prev_ldcp = ldcp->ldc_next) {
2584 		if (ldcp->ldc_id == ldc_id) {
2585 			break;
2586 		}
2587 	}
2588 
2589 	/* specified ldc id not found */
2590 	if (ldcp == NULL) {
2591 		DERR(vswp, "%s: ldcp = NULL", __func__);
2592 		return (1);
2593 	}
2594 
2595 	D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id);
2596 
2597 	/*
2598 	 * Before we can close the channel we must release any mapped
2599 	 * resources (e.g. drings).
2600 	 */
2601 	vsw_free_lane_resources(ldcp, INBOUND);
2602 	vsw_free_lane_resources(ldcp, OUTBOUND);
2603 
2604 	/*
2605 	 * If the close fails we are in serious trouble, as won't
2606 	 * be able to delete the parent port.
2607 	 */
2608 	if ((rv = ldc_close(ldcp->ldc_handle)) != 0) {
2609 		DERR(vswp, "%s: error %d closing channel %lld",
2610 			__func__, rv, ldcp->ldc_id);
2611 		return (1);
2612 	}
2613 
2614 	(void) ldc_fini(ldcp->ldc_handle);
2615 
2616 	ldcp->ldc_status = LDC_INIT;
2617 	ldcp->ldc_handle = NULL;
2618 	ldcp->ldc_vswp = NULL;
2619 
2620 	if (ldcp->rxh != NULL) {
2621 		if (vio_destroy_mblks(ldcp->rxh)) {
2622 			/*
2623 			 * Mostly likely some mblks are still in use and
2624 			 * have not been returned to the pool. Add the pool
2625 			 * to the list maintained in the device instance.
2626 			 * Another attempt will be made to destroy the pool
2627 			 * when the device detaches.
2628 			 */
2629 			ldcp->rxh->nextp =  vswp->rxh;
2630 			vswp->rxh = ldcp->rxh;
2631 		}
2632 	}
2633 
2634 	mutex_destroy(&ldcp->ldc_txlock);
2635 	mutex_destroy(&ldcp->ldc_cblock);
2636 	cv_destroy(&ldcp->drain_cv);
2637 	mutex_destroy(&ldcp->drain_cv_lock);
2638 	mutex_destroy(&ldcp->hss_lock);
2639 	mutex_destroy(&ldcp->lane_in.seq_lock);
2640 	mutex_destroy(&ldcp->lane_out.seq_lock);
2641 
2642 	/* unlink it from the list */
2643 	prev_ldcp = ldcp->ldc_next;
2644 	ldcl->num_ldcs--;
2645 	kmem_free(ldcp, sizeof (vsw_ldc_t));
2646 
2647 	return (0);
2648 }
2649 
2650 /*
2651  * Open and attempt to bring up the channel. Note that channel
2652  * can only be brought up if peer has also opened channel.
2653  *
2654  * Returns 0 if can open and bring up channel, otherwise
2655  * returns 1.
2656  */
2657 static int
2658 vsw_ldc_init(vsw_ldc_t *ldcp)
2659 {
2660 	vsw_t 		*vswp = ldcp->ldc_vswp;
2661 	ldc_status_t	istatus = 0;
2662 	int		rv;
2663 
2664 	D1(vswp, "%s: enter", __func__);
2665 
2666 	LDC_ENTER_LOCK(ldcp);
2667 
2668 	/* don't start at 0 in case clients don't like that */
2669 	ldcp->next_ident = 1;
2670 
2671 	rv = ldc_open(ldcp->ldc_handle);
2672 	if (rv != 0) {
2673 		DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)",
2674 		    __func__, ldcp->ldc_id, rv);
2675 		LDC_EXIT_LOCK(ldcp);
2676 		return (1);
2677 	}
2678 
2679 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
2680 		DERR(vswp, "%s: unable to get status", __func__);
2681 		LDC_EXIT_LOCK(ldcp);
2682 		return (1);
2683 
2684 	} else if (istatus != LDC_OPEN && istatus != LDC_READY) {
2685 		DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY",
2686 		    __func__, ldcp->ldc_id, istatus);
2687 		LDC_EXIT_LOCK(ldcp);
2688 		return (1);
2689 	}
2690 
2691 	ldcp->ldc_status = istatus;
2692 	rv = ldc_up(ldcp->ldc_handle);
2693 	if (rv != 0) {
2694 		/*
2695 		 * Not a fatal error for ldc_up() to fail, as peer
2696 		 * end point may simply not be ready yet.
2697 		 */
2698 		D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__,
2699 			ldcp->ldc_id, rv);
2700 		LDC_EXIT_LOCK(ldcp);
2701 		return (1);
2702 	}
2703 
2704 	/*
2705 	 * ldc_up() call is non-blocking so need to explicitly
2706 	 * check channel status to see if in fact the channel
2707 	 * is UP.
2708 	 */
2709 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
2710 		DERR(vswp, "%s: unable to get status", __func__);
2711 		LDC_EXIT_LOCK(ldcp);
2712 		return (1);
2713 
2714 	} else if (istatus != LDC_UP) {
2715 		DERR(vswp, "%s: id(%lld) status(%d) is not UP",
2716 		    __func__, ldcp->ldc_id, istatus);
2717 	} else {
2718 		ldcp->ldc_status = istatus;
2719 	}
2720 
2721 	LDC_EXIT_LOCK(ldcp);
2722 
2723 	D1(vswp, "%s: exit", __func__);
2724 	return (0);
2725 }
2726 
2727 /* disable callbacks on the channel */
2728 static int
2729 vsw_ldc_uninit(vsw_ldc_t *ldcp)
2730 {
2731 	vsw_t	*vswp = ldcp->ldc_vswp;
2732 	int	rv;
2733 
2734 	D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id);
2735 
2736 	LDC_ENTER_LOCK(ldcp);
2737 
2738 	rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE);
2739 	if (rv != 0) {
2740 		DERR(vswp, "vsw_ldc_uninit(%lld): error disabling "
2741 			"interrupts (rv = %d)\n", ldcp->ldc_id, rv);
2742 		LDC_EXIT_LOCK(ldcp);
2743 		return (1);
2744 	}
2745 
2746 	ldcp->ldc_status = LDC_INIT;
2747 
2748 	LDC_EXIT_LOCK(ldcp);
2749 
2750 	D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id);
2751 
2752 	return (0);
2753 }
2754 
2755 static int
2756 vsw_init_ldcs(vsw_port_t *port)
2757 {
2758 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
2759 	vsw_ldc_t	*ldcp;
2760 
2761 	READ_ENTER(&ldcl->lockrw);
2762 	ldcp =  ldcl->head;
2763 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
2764 		(void) vsw_ldc_init(ldcp);
2765 	}
2766 	RW_EXIT(&ldcl->lockrw);
2767 
2768 	return (0);
2769 }
2770 
2771 static int
2772 vsw_uninit_ldcs(vsw_port_t *port)
2773 {
2774 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
2775 	vsw_ldc_t	*ldcp;
2776 
2777 	D1(NULL, "vsw_uninit_ldcs: enter\n");
2778 
2779 	READ_ENTER(&ldcl->lockrw);
2780 	ldcp =  ldcl->head;
2781 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
2782 		(void) vsw_ldc_uninit(ldcp);
2783 	}
2784 	RW_EXIT(&ldcl->lockrw);
2785 
2786 	D1(NULL, "vsw_uninit_ldcs: exit\n");
2787 
2788 	return (0);
2789 }
2790 
2791 /*
2792  * Wait until the callback(s) associated with the ldcs under the specified
2793  * port have completed.
2794  *
2795  * Prior to this function being invoked each channel under this port
2796  * should have been quiesced via ldc_set_cb_mode(DISABLE).
2797  *
2798  * A short explaination of what we are doing below..
2799  *
2800  * The simplest approach would be to have a reference counter in
2801  * the ldc structure which is increment/decremented by the callbacks as
2802  * they use the channel. The drain function could then simply disable any
2803  * further callbacks and do a cv_wait for the ref to hit zero. Unfortunately
2804  * there is a tiny window here - before the callback is able to get the lock
2805  * on the channel it is interrupted and this function gets to execute. It
2806  * sees that the ref count is zero and believes its free to delete the
2807  * associated data structures.
2808  *
2809  * We get around this by taking advantage of the fact that before the ldc
2810  * framework invokes a callback it sets a flag to indicate that there is a
2811  * callback active (or about to become active). If when we attempt to
2812  * unregister a callback when this active flag is set then the unregister
2813  * will fail with EWOULDBLOCK.
2814  *
2815  * If the unregister fails we do a cv_timedwait. We will either be signaled
2816  * by the callback as it is exiting (note we have to wait a short period to
2817  * allow the callback to return fully to the ldc framework and it to clear
2818  * the active flag), or by the timer expiring. In either case we again attempt
2819  * the unregister. We repeat this until we can succesfully unregister the
2820  * callback.
2821  *
2822  * The reason we use a cv_timedwait rather than a simple cv_wait is to catch
2823  * the case where the callback has finished but the ldc framework has not yet
2824  * cleared the active flag. In this case we would never get a cv_signal.
2825  */
2826 static int
2827 vsw_drain_ldcs(vsw_port_t *port)
2828 {
2829 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
2830 	vsw_ldc_t	*ldcp;
2831 	vsw_t		*vswp = port->p_vswp;
2832 
2833 	D1(vswp, "%s: enter", __func__);
2834 
2835 	READ_ENTER(&ldcl->lockrw);
2836 
2837 	ldcp = ldcl->head;
2838 
2839 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
2840 		/*
2841 		 * If we can unregister the channel callback then we
2842 		 * know that there is no callback either running or
2843 		 * scheduled to run for this channel so move on to next
2844 		 * channel in the list.
2845 		 */
2846 		mutex_enter(&ldcp->drain_cv_lock);
2847 
2848 		/* prompt active callbacks to quit */
2849 		ldcp->drain_state = VSW_LDC_DRAINING;
2850 
2851 		if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) {
2852 			D2(vswp, "%s: unreg callback for chan %ld", __func__,
2853 				ldcp->ldc_id);
2854 			mutex_exit(&ldcp->drain_cv_lock);
2855 			continue;
2856 		} else {
2857 			/*
2858 			 * If we end up here we know that either 1) a callback
2859 			 * is currently executing, 2) is about to start (i.e.
2860 			 * the ldc framework has set the active flag but
2861 			 * has not actually invoked the callback yet, or 3)
2862 			 * has finished and has returned to the ldc framework
2863 			 * but the ldc framework has not yet cleared the
2864 			 * active bit.
2865 			 *
2866 			 * Wait for it to finish.
2867 			 */
2868 			while (ldc_unreg_callback(ldcp->ldc_handle)
2869 								== EWOULDBLOCK)
2870 				(void) cv_timedwait(&ldcp->drain_cv,
2871 					&ldcp->drain_cv_lock, lbolt + hz);
2872 
2873 			mutex_exit(&ldcp->drain_cv_lock);
2874 			D2(vswp, "%s: unreg callback for chan %ld after "
2875 				"timeout", __func__, ldcp->ldc_id);
2876 		}
2877 	}
2878 	RW_EXIT(&ldcl->lockrw);
2879 
2880 	D1(vswp, "%s: exit", __func__);
2881 	return (0);
2882 }
2883 
2884 /*
2885  * Wait until all tasks which reference this port have completed.
2886  *
2887  * Prior to this function being invoked each channel under this port
2888  * should have been quiesced via ldc_set_cb_mode(DISABLE).
2889  */
2890 static int
2891 vsw_drain_port_taskq(vsw_port_t *port)
2892 {
2893 	vsw_t		*vswp = port->p_vswp;
2894 
2895 	D1(vswp, "%s: enter", __func__);
2896 
2897 	/*
2898 	 * Mark the port as in the process of being detached, and
2899 	 * dispatch a marker task to the queue so we know when all
2900 	 * relevant tasks have completed.
2901 	 */
2902 	mutex_enter(&port->state_lock);
2903 	port->state = VSW_PORT_DETACHING;
2904 
2905 	if ((vswp->taskq_p == NULL) ||
2906 		(ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task,
2907 			port, DDI_NOSLEEP) != DDI_SUCCESS)) {
2908 		DERR(vswp, "%s: unable to dispatch marker task",
2909 			__func__);
2910 		mutex_exit(&port->state_lock);
2911 		return (1);
2912 	}
2913 
2914 	/*
2915 	 * Wait for the marker task to finish.
2916 	 */
2917 	while (port->state != VSW_PORT_DETACHABLE)
2918 		cv_wait(&port->state_cv, &port->state_lock);
2919 
2920 	mutex_exit(&port->state_lock);
2921 
2922 	D1(vswp, "%s: exit", __func__);
2923 
2924 	return (0);
2925 }
2926 
2927 static void
2928 vsw_marker_task(void *arg)
2929 {
2930 	vsw_port_t	*port = arg;
2931 	vsw_t		*vswp = port->p_vswp;
2932 
2933 	D1(vswp, "%s: enter", __func__);
2934 
2935 	mutex_enter(&port->state_lock);
2936 
2937 	/*
2938 	 * No further tasks should be dispatched which reference
2939 	 * this port so ok to mark it as safe to detach.
2940 	 */
2941 	port->state = VSW_PORT_DETACHABLE;
2942 
2943 	cv_signal(&port->state_cv);
2944 
2945 	mutex_exit(&port->state_lock);
2946 
2947 	D1(vswp, "%s: exit", __func__);
2948 }
2949 
2950 static vsw_port_t *
2951 vsw_lookup_port(vsw_t *vswp, int p_instance)
2952 {
2953 	vsw_port_list_t *plist = &vswp->plist;
2954 	vsw_port_t	*port;
2955 
2956 	for (port = plist->head; port != NULL; port = port->p_next) {
2957 		if (port->p_instance == p_instance) {
2958 			D2(vswp, "vsw_lookup_port: found p_instance\n");
2959 			return (port);
2960 		}
2961 	}
2962 
2963 	return (NULL);
2964 }
2965 
2966 /*
2967  * Search for and remove the specified port from the port
2968  * list. Returns 0 if able to locate and remove port, otherwise
2969  * returns 1.
2970  */
2971 static int
2972 vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port)
2973 {
2974 	vsw_port_list_t *plist = &vswp->plist;
2975 	vsw_port_t	*curr_p, *prev_p;
2976 
2977 	if (plist->head == NULL)
2978 		return (1);
2979 
2980 	curr_p = prev_p = plist->head;
2981 
2982 	while (curr_p != NULL) {
2983 		if (curr_p == port) {
2984 			if (prev_p == curr_p) {
2985 				plist->head = curr_p->p_next;
2986 			} else {
2987 				prev_p->p_next = curr_p->p_next;
2988 			}
2989 			plist->num_ports--;
2990 			break;
2991 		} else {
2992 			prev_p = curr_p;
2993 			curr_p = curr_p->p_next;
2994 		}
2995 	}
2996 	return (0);
2997 }
2998 
2999 /*
3000  * Interrupt handler for ldc messages.
3001  */
3002 static uint_t
3003 vsw_ldc_cb(uint64_t event, caddr_t arg)
3004 {
3005 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
3006 	vsw_t 		*vswp = ldcp->ldc_vswp;
3007 	ldc_status_t	lstatus;
3008 	int		rv;
3009 
3010 	D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
3011 
3012 	mutex_enter(&ldcp->ldc_cblock);
3013 
3014 	if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) {
3015 		mutex_exit(&ldcp->ldc_cblock);
3016 		return (LDC_SUCCESS);
3017 	}
3018 
3019 	if (event & LDC_EVT_UP) {
3020 		/*
3021 		 * Channel has come up, get the state and then start
3022 		 * the handshake.
3023 		 */
3024 		rv = ldc_status(ldcp->ldc_handle, &lstatus);
3025 		if (rv != 0) {
3026 			cmn_err(CE_WARN, "Unable to read channel state");
3027 		}
3028 		ldcp->ldc_status = lstatus;
3029 
3030 		D2(vswp, "%s: id(%ld) event(%llx) UP:  status(%ld)",
3031 			__func__, ldcp->ldc_id, event, ldcp->ldc_status);
3032 
3033 		vsw_restart_handshake(ldcp);
3034 
3035 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
3036 	}
3037 
3038 	if (event & LDC_EVT_READ) {
3039 		/*
3040 		 * Data available for reading.
3041 		 */
3042 		D2(vswp, "%s: id(ld) event(%llx) data READ",
3043 				__func__, ldcp->ldc_id, event);
3044 
3045 		vsw_process_pkt(ldcp);
3046 
3047 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
3048 
3049 		goto vsw_cb_exit;
3050 	}
3051 
3052 	if (event & LDC_EVT_RESET) {
3053 		rv = ldc_status(ldcp->ldc_handle, &lstatus);
3054 		if (rv != 0) {
3055 			cmn_err(CE_WARN, "Unable to read channel state");
3056 		} else {
3057 			ldcp->ldc_status = lstatus;
3058 		}
3059 		D2(vswp, "%s: id(%ld) event(%llx) RESET:  status (%ld)",
3060 			__func__, ldcp->ldc_id, event, ldcp->ldc_status);
3061 	}
3062 
3063 	if (event & LDC_EVT_DOWN) {
3064 		rv = ldc_status(ldcp->ldc_handle, &lstatus);
3065 		if (rv != 0) {
3066 			cmn_err(CE_WARN, "Unable to read channel state");
3067 		} else {
3068 			ldcp->ldc_status = lstatus;
3069 		}
3070 
3071 		D2(vswp, "%s: id(%ld) event(%llx) DOWN:  status (%ld)",
3072 			__func__, ldcp->ldc_id, event, ldcp->ldc_status);
3073 
3074 	}
3075 
3076 	/*
3077 	 * Catch either LDC_EVT_WRITE which we don't support or any
3078 	 * unknown event.
3079 	 */
3080 	if (event & ~(LDC_EVT_UP | LDC_EVT_RESET
3081 					| LDC_EVT_DOWN | LDC_EVT_READ)) {
3082 
3083 		DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)",
3084 			__func__, ldcp->ldc_id, event, ldcp->ldc_status);
3085 	}
3086 
3087 vsw_cb_exit:
3088 	mutex_exit(&ldcp->ldc_cblock);
3089 
3090 	/*
3091 	 * Let the drain function know we are finishing if it
3092 	 * is waiting.
3093 	 */
3094 	mutex_enter(&ldcp->drain_cv_lock);
3095 	if (ldcp->drain_state == VSW_LDC_DRAINING)
3096 		cv_signal(&ldcp->drain_cv);
3097 	mutex_exit(&ldcp->drain_cv_lock);
3098 
3099 	return (LDC_SUCCESS);
3100 }
3101 
3102 /*
3103  * (Re)start a handshake with our peer by sending them
3104  * our version info.
3105  */
3106 static void
3107 vsw_restart_handshake(vsw_ldc_t *ldcp)
3108 {
3109 	vsw_t		*vswp = ldcp->ldc_vswp;
3110 	vsw_port_t	*port;
3111 	vsw_ldc_list_t	*ldcl;
3112 
3113 	D1(vswp, "vsw_restart_handshake: enter");
3114 
3115 	port = ldcp->ldc_port;
3116 	ldcl = &port->p_ldclist;
3117 
3118 	WRITE_ENTER(&ldcl->lockrw);
3119 
3120 	D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__,
3121 		ldcp->lane_in.lstate, ldcp->lane_out.lstate);
3122 
3123 	vsw_free_lane_resources(ldcp, INBOUND);
3124 	vsw_free_lane_resources(ldcp, OUTBOUND);
3125 	RW_EXIT(&ldcl->lockrw);
3126 
3127 	ldcp->lane_in.lstate = 0;
3128 	ldcp->lane_out.lstate = 0;
3129 
3130 	/*
3131 	 * Remove parent port from any multicast groups
3132 	 * it may have registered with. Client must resend
3133 	 * multicast add command after handshake completes.
3134 	 */
3135 	(void) vsw_del_fdb(vswp, port);
3136 
3137 	vsw_del_mcst_port(port);
3138 
3139 	ldcp->hphase = VSW_MILESTONE0;
3140 
3141 	ldcp->peer_session = 0;
3142 	ldcp->session_status = 0;
3143 
3144 	/*
3145 	 * We now increment the transaction group id. This allows
3146 	 * us to identify and disard any tasks which are still pending
3147 	 * on the taskq and refer to the handshake session we are about
3148 	 * to restart. These stale messages no longer have any real
3149 	 * meaning.
3150 	 */
3151 	mutex_enter(&ldcp->hss_lock);
3152 	ldcp->hss_id++;
3153 	mutex_exit(&ldcp->hss_lock);
3154 
3155 	if (ldcp->hcnt++ > vsw_num_handshakes) {
3156 		cmn_err(CE_WARN, "exceeded number of permitted "
3157 			"handshake attempts (%d) on channel %ld",
3158 			ldcp->hcnt, ldcp->ldc_id);
3159 		return;
3160 	}
3161 
3162 	vsw_send_ver(ldcp);
3163 
3164 	D1(vswp, "vsw_restart_handshake: exit");
3165 }
3166 
3167 /*
3168  * returns 0 if legal for event signified by flag to have
3169  * occured at the time it did. Otherwise returns 1.
3170  */
3171 int
3172 vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag)
3173 {
3174 	vsw_t		*vswp = ldcp->ldc_vswp;
3175 	uint64_t	state;
3176 	uint64_t	phase;
3177 
3178 	if (dir == INBOUND)
3179 		state = ldcp->lane_in.lstate;
3180 	else
3181 		state = ldcp->lane_out.lstate;
3182 
3183 	phase = ldcp->hphase;
3184 
3185 	switch (flag) {
3186 	case VSW_VER_INFO_RECV:
3187 		if (phase > VSW_MILESTONE0) {
3188 			DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV"
3189 				" when in state %d\n", ldcp->ldc_id, phase);
3190 			vsw_restart_handshake(ldcp);
3191 			return (1);
3192 		}
3193 		break;
3194 
3195 	case VSW_VER_ACK_RECV:
3196 	case VSW_VER_NACK_RECV:
3197 		if (!(state & VSW_VER_INFO_SENT)) {
3198 			DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK"
3199 				" or VER_NACK when in state %d\n",
3200 				ldcp->ldc_id, phase);
3201 			vsw_restart_handshake(ldcp);
3202 			return (1);
3203 		} else
3204 			state &= ~VSW_VER_INFO_SENT;
3205 		break;
3206 
3207 	case VSW_ATTR_INFO_RECV:
3208 		if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) {
3209 			DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV"
3210 				" when in state %d\n", ldcp->ldc_id, phase);
3211 			vsw_restart_handshake(ldcp);
3212 			return (1);
3213 		}
3214 		break;
3215 
3216 	case VSW_ATTR_ACK_RECV:
3217 	case VSW_ATTR_NACK_RECV:
3218 		if (!(state & VSW_ATTR_INFO_SENT)) {
3219 			DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK"
3220 				" or ATTR_NACK when in state %d\n",
3221 				ldcp->ldc_id, phase);
3222 			vsw_restart_handshake(ldcp);
3223 			return (1);
3224 		} else
3225 			state &= ~VSW_ATTR_INFO_SENT;
3226 		break;
3227 
3228 	case VSW_DRING_INFO_RECV:
3229 		if (phase < VSW_MILESTONE1) {
3230 			DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV"
3231 				" when in state %d\n", ldcp->ldc_id, phase);
3232 			vsw_restart_handshake(ldcp);
3233 			return (1);
3234 		}
3235 		break;
3236 
3237 	case VSW_DRING_ACK_RECV:
3238 	case VSW_DRING_NACK_RECV:
3239 		if (!(state & VSW_DRING_INFO_SENT)) {
3240 			DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK"
3241 				" or DRING_NACK when in state %d\n",
3242 				ldcp->ldc_id, phase);
3243 			vsw_restart_handshake(ldcp);
3244 			return (1);
3245 		} else
3246 			state &= ~VSW_DRING_INFO_SENT;
3247 		break;
3248 
3249 	case VSW_RDX_INFO_RECV:
3250 		if (phase < VSW_MILESTONE3) {
3251 			DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV"
3252 				" when in state %d\n", ldcp->ldc_id, phase);
3253 			vsw_restart_handshake(ldcp);
3254 			return (1);
3255 		}
3256 		break;
3257 
3258 	case VSW_RDX_ACK_RECV:
3259 	case VSW_RDX_NACK_RECV:
3260 		if (!(state & VSW_RDX_INFO_SENT)) {
3261 			DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK"
3262 				" or RDX_NACK when in state %d\n",
3263 				ldcp->ldc_id, phase);
3264 			vsw_restart_handshake(ldcp);
3265 			return (1);
3266 		} else
3267 			state &= ~VSW_RDX_INFO_SENT;
3268 		break;
3269 
3270 	case VSW_MCST_INFO_RECV:
3271 		if (phase < VSW_MILESTONE3) {
3272 			DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV"
3273 				" when in state %d\n", ldcp->ldc_id, phase);
3274 			vsw_restart_handshake(ldcp);
3275 			return (1);
3276 		}
3277 		break;
3278 
3279 	default:
3280 		DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)",
3281 				ldcp->ldc_id, flag);
3282 		return (1);
3283 	}
3284 
3285 	if (dir == INBOUND)
3286 		ldcp->lane_in.lstate = state;
3287 	else
3288 		ldcp->lane_out.lstate = state;
3289 
3290 	D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id);
3291 
3292 	return (0);
3293 }
3294 
3295 void
3296 vsw_next_milestone(vsw_ldc_t *ldcp)
3297 {
3298 	vsw_t		*vswp = ldcp->ldc_vswp;
3299 
3300 	D1(vswp, "%s (chan %lld): enter (phase %ld)", __func__,
3301 		ldcp->ldc_id, ldcp->hphase);
3302 
3303 	DUMP_FLAGS(ldcp->lane_in.lstate);
3304 	DUMP_FLAGS(ldcp->lane_out.lstate);
3305 
3306 	switch (ldcp->hphase) {
3307 
3308 	case VSW_MILESTONE0:
3309 		/*
3310 		 * If we haven't started to handshake with our peer,
3311 		 * start to do so now.
3312 		 */
3313 		if (ldcp->lane_out.lstate == 0) {
3314 			D2(vswp, "%s: (chan %lld) starting handshake "
3315 				"with peer", __func__, ldcp->ldc_id);
3316 			vsw_restart_handshake(ldcp);
3317 		}
3318 
3319 		/*
3320 		 * Only way to pass this milestone is to have successfully
3321 		 * negotiated version info.
3322 		 */
3323 		if ((ldcp->lane_in.lstate & VSW_VER_ACK_SENT) &&
3324 			(ldcp->lane_out.lstate & VSW_VER_ACK_RECV)) {
3325 
3326 			D2(vswp, "%s: (chan %lld) leaving milestone 0",
3327 				__func__, ldcp->ldc_id);
3328 
3329 			/*
3330 			 * Next milestone is passed when attribute
3331 			 * information has been successfully exchanged.
3332 			 */
3333 			ldcp->hphase = VSW_MILESTONE1;
3334 			vsw_send_attr(ldcp);
3335 
3336 		}
3337 		break;
3338 
3339 	case VSW_MILESTONE1:
3340 		/*
3341 		 * Only way to pass this milestone is to have successfully
3342 		 * negotiated attribute information.
3343 		 */
3344 		if (ldcp->lane_in.lstate & VSW_ATTR_ACK_SENT) {
3345 
3346 			ldcp->hphase = VSW_MILESTONE2;
3347 
3348 			/*
3349 			 * If the peer device has said it wishes to
3350 			 * use descriptor rings then we send it our ring
3351 			 * info, otherwise we just set up a private ring
3352 			 * which we use an internal buffer
3353 			 */
3354 			if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE)
3355 				vsw_send_dring_info(ldcp);
3356 		}
3357 		break;
3358 
3359 
3360 	case VSW_MILESTONE2:
3361 		/*
3362 		 * If peer has indicated in its attribute message that
3363 		 * it wishes to use descriptor rings then the only way
3364 		 * to pass this milestone is for us to have received
3365 		 * valid dring info.
3366 		 *
3367 		 * If peer is not using descriptor rings then just fall
3368 		 * through.
3369 		 */
3370 		if ((ldcp->lane_in.xfer_mode == VIO_DRING_MODE) &&
3371 			(!(ldcp->lane_in.lstate & VSW_DRING_ACK_SENT)))
3372 			break;
3373 
3374 		D2(vswp, "%s: (chan %lld) leaving milestone 2",
3375 				__func__, ldcp->ldc_id);
3376 
3377 		ldcp->hphase = VSW_MILESTONE3;
3378 		vsw_send_rdx(ldcp);
3379 		break;
3380 
3381 	case VSW_MILESTONE3:
3382 		/*
3383 		 * Pass this milestone when all paramaters have been
3384 		 * successfully exchanged and RDX sent in both directions.
3385 		 *
3386 		 * Mark outbound lane as available to transmit data.
3387 		 */
3388 		if ((ldcp->lane_in.lstate & VSW_RDX_ACK_SENT) &&
3389 			(ldcp->lane_out.lstate & VSW_RDX_ACK_RECV)) {
3390 
3391 			D2(vswp, "%s: (chan %lld) leaving milestone 3",
3392 				__func__, ldcp->ldc_id);
3393 			D2(vswp, "%s: ** handshake complete **", __func__);
3394 			ldcp->lane_out.lstate |= VSW_LANE_ACTIVE;
3395 			ldcp->hphase = VSW_MILESTONE4;
3396 			ldcp->hcnt = 0;
3397 			DISPLAY_STATE();
3398 		}
3399 		break;
3400 
3401 	case VSW_MILESTONE4:
3402 		D2(vswp, "%s: (chan %lld) in milestone 4", __func__,
3403 							ldcp->ldc_id);
3404 		break;
3405 
3406 	default:
3407 		DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__,
3408 			ldcp->ldc_id, ldcp->hphase);
3409 	}
3410 
3411 	D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id,
3412 		ldcp->hphase);
3413 }
3414 
3415 /*
3416  * Check if major version is supported.
3417  *
3418  * Returns 0 if finds supported major number, and if necessary
3419  * adjusts the minor field.
3420  *
3421  * Returns 1 if can't match major number exactly. Sets mjor/minor
3422  * to next lowest support values, or to zero if no other values possible.
3423  */
3424 static int
3425 vsw_supported_version(vio_ver_msg_t *vp)
3426 {
3427 	int	i;
3428 
3429 	D1(NULL, "vsw_supported_version: enter");
3430 
3431 	for (i = 0; i < VSW_NUM_VER; i++) {
3432 		if (vsw_versions[i].ver_major == vp->ver_major) {
3433 			/*
3434 			 * Matching or lower major version found. Update
3435 			 * minor number if necessary.
3436 			 */
3437 			if (vp->ver_minor > vsw_versions[i].ver_minor) {
3438 				D2(NULL, "%s: adjusting minor value"
3439 					" from %d to %d", __func__,
3440 					vp->ver_minor,
3441 					vsw_versions[i].ver_minor);
3442 				vp->ver_minor = vsw_versions[i].ver_minor;
3443 			}
3444 
3445 			return (0);
3446 		}
3447 
3448 		if (vsw_versions[i].ver_major < vp->ver_major) {
3449 			if (vp->ver_minor > vsw_versions[i].ver_minor) {
3450 				D2(NULL, "%s: adjusting minor value"
3451 					" from %d to %d", __func__,
3452 					vp->ver_minor,
3453 					vsw_versions[i].ver_minor);
3454 				vp->ver_minor = vsw_versions[i].ver_minor;
3455 			}
3456 			return (1);
3457 		}
3458 	}
3459 
3460 	/* No match was possible, zero out fields */
3461 	vp->ver_major = 0;
3462 	vp->ver_minor = 0;
3463 
3464 	D1(NULL, "vsw_supported_version: exit");
3465 
3466 	return (1);
3467 }
3468 
3469 /*
3470  * Main routine for processing messages received over LDC.
3471  */
3472 static void
3473 vsw_process_pkt(void *arg)
3474 {
3475 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
3476 	vsw_t 		*vswp = ldcp->ldc_vswp;
3477 	size_t		msglen;
3478 	vio_msg_tag_t	tag;
3479 	def_msg_t	dmsg;
3480 	int 		rv = 0;
3481 
3482 	D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
3483 
3484 	/*
3485 	 * If channel is up read messages until channel is empty.
3486 	 */
3487 	do {
3488 		msglen = sizeof (dmsg);
3489 		rv = ldc_read(ldcp->ldc_handle, (caddr_t)&dmsg, &msglen);
3490 
3491 		if (rv != 0) {
3492 			DERR(vswp, "%s :ldc_read err id(%lld) rv(%d) "
3493 				"len(%d)\n", __func__, ldcp->ldc_id,
3494 							rv, msglen);
3495 			break;
3496 		}
3497 
3498 		if (msglen == 0) {
3499 			D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__,
3500 			ldcp->ldc_id);
3501 			break;
3502 		}
3503 
3504 		D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__,
3505 		    ldcp->ldc_id, msglen);
3506 
3507 		/*
3508 		 * Figure out what sort of packet we have gotten by
3509 		 * examining the msg tag, and then switch it appropriately.
3510 		 */
3511 		bcopy(&dmsg, &tag, sizeof (vio_msg_tag_t));
3512 
3513 		switch (tag.vio_msgtype) {
3514 		case VIO_TYPE_CTRL:
3515 			vsw_dispatch_ctrl_task(ldcp, &dmsg, tag);
3516 			break;
3517 		case VIO_TYPE_DATA:
3518 			vsw_process_data_pkt(ldcp, &dmsg, tag);
3519 			break;
3520 		case VIO_TYPE_ERR:
3521 			vsw_process_err_pkt(ldcp, &dmsg, tag);
3522 			break;
3523 		default:
3524 			DERR(vswp, "%s: Unknown tag(%lx) ", __func__,
3525 				"id(%lx)\n", tag.vio_msgtype, ldcp->ldc_id);
3526 			break;
3527 		}
3528 	} while (msglen);
3529 
3530 	D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id);
3531 }
3532 
3533 /*
3534  * Dispatch a task to process a VIO control message.
3535  */
3536 static void
3537 vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t tag)
3538 {
3539 	vsw_ctrl_task_t		*ctaskp = NULL;
3540 	vsw_port_t		*port = ldcp->ldc_port;
3541 	vsw_t			*vswp = port->p_vswp;
3542 
3543 	D1(vswp, "%s: enter", __func__);
3544 
3545 	/*
3546 	 * We need to handle RDX ACK messages in-band as once they
3547 	 * are exchanged it is possible that we will get an
3548 	 * immediate (legitimate) data packet.
3549 	 */
3550 	if ((tag.vio_subtype_env == VIO_RDX) &&
3551 		(tag.vio_subtype == VIO_SUBTYPE_ACK)) {
3552 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_ACK_RECV))
3553 			return;
3554 
3555 		ldcp->lane_out.lstate |= VSW_RDX_ACK_RECV;
3556 		vsw_next_milestone(ldcp);
3557 		D2(vswp, "%s (%ld) handling RDX_ACK in place", __func__,
3558 			ldcp->ldc_id);
3559 		return;
3560 	}
3561 
3562 	ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP);
3563 
3564 	if (ctaskp == NULL) {
3565 		DERR(vswp, "%s: unable to alloc space for ctrl"
3566 			" msg", __func__);
3567 		vsw_restart_handshake(ldcp);
3568 		return;
3569 	}
3570 
3571 	ctaskp->ldcp = ldcp;
3572 	bcopy((def_msg_t *)cpkt, &ctaskp->pktp, sizeof (def_msg_t));
3573 	mutex_enter(&ldcp->hss_lock);
3574 	ctaskp->hss_id = ldcp->hss_id;
3575 	mutex_exit(&ldcp->hss_lock);
3576 
3577 	/*
3578 	 * Dispatch task to processing taskq if port is not in
3579 	 * the process of being detached.
3580 	 */
3581 	mutex_enter(&port->state_lock);
3582 	if (port->state == VSW_PORT_INIT) {
3583 		if ((vswp->taskq_p == NULL) ||
3584 			(ddi_taskq_dispatch(vswp->taskq_p,
3585 			vsw_process_ctrl_pkt, ctaskp, DDI_NOSLEEP)
3586 							!= DDI_SUCCESS)) {
3587 			DERR(vswp, "%s: unable to dispatch task to taskq",
3588 				__func__);
3589 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
3590 			mutex_exit(&port->state_lock);
3591 			vsw_restart_handshake(ldcp);
3592 			return;
3593 		}
3594 	} else {
3595 		DWARN(vswp, "%s: port %d detaching, not dispatching "
3596 			"task", __func__, port->p_instance);
3597 	}
3598 
3599 	mutex_exit(&port->state_lock);
3600 
3601 	D2(vswp, "%s: dispatched task to taskq for chan %d", __func__,
3602 			ldcp->ldc_id);
3603 	D1(vswp, "%s: exit", __func__);
3604 }
3605 
3606 /*
3607  * Process a VIO ctrl message. Invoked from taskq.
3608  */
3609 static void
3610 vsw_process_ctrl_pkt(void *arg)
3611 {
3612 	vsw_ctrl_task_t	*ctaskp = (vsw_ctrl_task_t *)arg;
3613 	vsw_ldc_t	*ldcp = ctaskp->ldcp;
3614 	vsw_t 		*vswp = ldcp->ldc_vswp;
3615 	vio_msg_tag_t	tag;
3616 	uint16_t	env;
3617 
3618 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3619 
3620 	bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t));
3621 	env = tag.vio_subtype_env;
3622 
3623 	/* stale pkt check */
3624 	mutex_enter(&ldcp->hss_lock);
3625 	if (ctaskp->hss_id < ldcp->hss_id) {
3626 		DWARN(vswp, "%s: discarding stale packet belonging to"
3627 			" earlier (%ld) handshake session", __func__,
3628 			ctaskp->hss_id);
3629 		mutex_exit(&ldcp->hss_lock);
3630 		return;
3631 	}
3632 	mutex_exit(&ldcp->hss_lock);
3633 
3634 	/* session id check */
3635 	if (ldcp->session_status & VSW_PEER_SESSION) {
3636 		if (ldcp->peer_session != tag.vio_sid) {
3637 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
3638 				__func__, ldcp->ldc_id, tag.vio_sid);
3639 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
3640 			vsw_restart_handshake(ldcp);
3641 			return;
3642 		}
3643 	}
3644 
3645 	/*
3646 	 * Switch on vio_subtype envelope, then let lower routines
3647 	 * decide if its an INFO, ACK or NACK packet.
3648 	 */
3649 	switch (env) {
3650 	case VIO_VER_INFO:
3651 		vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp);
3652 		break;
3653 	case VIO_DRING_REG:
3654 		vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp);
3655 		break;
3656 	case VIO_DRING_UNREG:
3657 		vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp);
3658 		break;
3659 	case VIO_ATTR_INFO:
3660 		vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp);
3661 		break;
3662 	case VNET_MCAST_INFO:
3663 		vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp);
3664 		break;
3665 	case VIO_RDX:
3666 		vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp);
3667 		break;
3668 	default:
3669 		DERR(vswp, "%s : unknown vio_subtype_env (%x)\n",
3670 							__func__, env);
3671 	}
3672 
3673 	kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
3674 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3675 }
3676 
3677 /*
3678  * Version negotiation. We can end up here either because our peer
3679  * has responded to a handshake message we have sent it, or our peer
3680  * has initiated a handshake with us. If its the former then can only
3681  * be ACK or NACK, if its the later can only be INFO.
3682  *
3683  * If its an ACK we move to the next stage of the handshake, namely
3684  * attribute exchange. If its a NACK we see if we can specify another
3685  * version, if we can't we stop.
3686  *
3687  * If it is an INFO we reset all params associated with communication
3688  * in that direction over this channel (remember connection is
3689  * essentially 2 independent simplex channels).
3690  */
3691 void
3692 vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt)
3693 {
3694 	vio_ver_msg_t	*ver_pkt;
3695 	vsw_t 		*vswp = ldcp->ldc_vswp;
3696 
3697 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3698 
3699 	/*
3700 	 * We know this is a ctrl/version packet so
3701 	 * cast it into the correct structure.
3702 	 */
3703 	ver_pkt = (vio_ver_msg_t *)pkt;
3704 
3705 	switch (ver_pkt->tag.vio_subtype) {
3706 	case VIO_SUBTYPE_INFO:
3707 		D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n");
3708 
3709 		/*
3710 		 * Record the session id, which we will use from now
3711 		 * until we see another VER_INFO msg. Even then the
3712 		 * session id in most cases will be unchanged, execpt
3713 		 * if channel was reset.
3714 		 */
3715 		if ((ldcp->session_status & VSW_PEER_SESSION) &&
3716 			(ldcp->peer_session != ver_pkt->tag.vio_sid)) {
3717 			DERR(vswp, "%s: updating session id for chan %lld "
3718 				"from %llx to %llx", __func__, ldcp->ldc_id,
3719 				ldcp->peer_session, ver_pkt->tag.vio_sid);
3720 		}
3721 
3722 		ldcp->peer_session = ver_pkt->tag.vio_sid;
3723 		ldcp->session_status |= VSW_PEER_SESSION;
3724 
3725 		/* Legal message at this time ? */
3726 		if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV))
3727 			return;
3728 
3729 		/*
3730 		 * First check the device class. Currently only expect
3731 		 * to be talking to a network device. In the future may
3732 		 * also talk to another switch.
3733 		 */
3734 		if (ver_pkt->dev_class != VDEV_NETWORK) {
3735 			DERR(vswp, "%s: illegal device class %d", __func__,
3736 				ver_pkt->dev_class);
3737 
3738 			ver_pkt->tag.vio_sid = ldcp->local_session;
3739 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
3740 
3741 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
3742 
3743 			vsw_send_msg(ldcp, (void *)ver_pkt,
3744 					sizeof (vio_ver_msg_t));
3745 
3746 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
3747 			vsw_next_milestone(ldcp);
3748 			return;
3749 		} else {
3750 			ldcp->dev_class = ver_pkt->dev_class;
3751 		}
3752 
3753 		/*
3754 		 * Now check the version.
3755 		 */
3756 		if (vsw_supported_version(ver_pkt) == 0) {
3757 			/*
3758 			 * Support this major version and possibly
3759 			 * adjusted minor version.
3760 			 */
3761 
3762 			D2(vswp, "%s: accepted ver %d:%d", __func__,
3763 				ver_pkt->ver_major, ver_pkt->ver_minor);
3764 
3765 			/* Store accepted values */
3766 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
3767 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
3768 
3769 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3770 
3771 			ldcp->lane_in.lstate |= VSW_VER_ACK_SENT;
3772 		} else {
3773 			/*
3774 			 * NACK back with the next lower major/minor
3775 			 * pairing we support (if don't suuport any more
3776 			 * versions then they will be set to zero.
3777 			 */
3778 
3779 			D2(vswp, "%s: replying with ver %d:%d", __func__,
3780 				ver_pkt->ver_major, ver_pkt->ver_minor);
3781 
3782 			/* Store updated values */
3783 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
3784 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
3785 
3786 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
3787 
3788 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
3789 		}
3790 
3791 		DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
3792 		ver_pkt->tag.vio_sid = ldcp->local_session;
3793 		vsw_send_msg(ldcp, (void *)ver_pkt, sizeof (vio_ver_msg_t));
3794 
3795 		vsw_next_milestone(ldcp);
3796 		break;
3797 
3798 	case VIO_SUBTYPE_ACK:
3799 		D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__);
3800 
3801 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV))
3802 			return;
3803 
3804 		/* Store updated values */
3805 		ldcp->lane_in.ver_major = ver_pkt->ver_major;
3806 		ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
3807 
3808 
3809 		ldcp->lane_out.lstate |= VSW_VER_ACK_RECV;
3810 		vsw_next_milestone(ldcp);
3811 
3812 		break;
3813 
3814 	case VIO_SUBTYPE_NACK:
3815 		D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__);
3816 
3817 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV))
3818 			return;
3819 
3820 		/*
3821 		 * If our peer sent us a NACK with the ver fields set to
3822 		 * zero then there is nothing more we can do. Otherwise see
3823 		 * if we support either the version suggested, or a lesser
3824 		 * one.
3825 		 */
3826 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
3827 			DERR(vswp, "%s: peer unable to negotiate any "
3828 				"further.", __func__);
3829 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
3830 			vsw_next_milestone(ldcp);
3831 			return;
3832 		}
3833 
3834 		/*
3835 		 * Check to see if we support this major version or
3836 		 * a lower one. If we don't then maj/min will be set
3837 		 * to zero.
3838 		 */
3839 		(void) vsw_supported_version(ver_pkt);
3840 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
3841 			/* Nothing more we can do */
3842 			DERR(vswp, "%s: version negotiation failed.\n",
3843 								__func__);
3844 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
3845 			vsw_next_milestone(ldcp);
3846 		} else {
3847 			/* found a supported major version */
3848 			ldcp->lane_out.ver_major = ver_pkt->ver_major;
3849 			ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
3850 
3851 			D2(vswp, "%s: resending with updated values (%x, %x)",
3852 				__func__, ver_pkt->ver_major,
3853 				ver_pkt->ver_minor);
3854 
3855 			ldcp->lane_out.lstate |= VSW_VER_INFO_SENT;
3856 			ver_pkt->tag.vio_sid = ldcp->local_session;
3857 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
3858 
3859 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
3860 
3861 			vsw_send_msg(ldcp, (void *)ver_pkt,
3862 					sizeof (vio_ver_msg_t));
3863 
3864 			vsw_next_milestone(ldcp);
3865 
3866 		}
3867 		break;
3868 
3869 	default:
3870 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
3871 			ver_pkt->tag.vio_subtype);
3872 	}
3873 
3874 	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
3875 }
3876 
3877 /*
3878  * Process an attribute packet. We can end up here either because our peer
3879  * has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or our
3880  * peer has sent us an attribute INFO message
3881  *
3882  * If its an ACK we then move to the next stage of the handshake which
3883  * is to send our descriptor ring info to our peer. If its a NACK then
3884  * there is nothing more we can (currently) do.
3885  *
3886  * If we get a valid/acceptable INFO packet (and we have already negotiated
3887  * a version) we ACK back and set channel state to ATTR_RECV, otherwise we
3888  * NACK back and reset channel state to INACTIV.
3889  *
3890  * FUTURE: in time we will probably negotiate over attributes, but for
3891  * the moment unacceptable attributes are regarded as a fatal error.
3892  *
3893  */
3894 void
3895 vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt)
3896 {
3897 	vnet_attr_msg_t		*attr_pkt;
3898 	vsw_t			*vswp = ldcp->ldc_vswp;
3899 	vsw_port_t		*port = ldcp->ldc_port;
3900 	uint64_t		macaddr = 0;
3901 	int			i;
3902 
3903 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
3904 
3905 	/*
3906 	 * We know this is a ctrl/attr packet so
3907 	 * cast it into the correct structure.
3908 	 */
3909 	attr_pkt = (vnet_attr_msg_t *)pkt;
3910 
3911 	switch (attr_pkt->tag.vio_subtype) {
3912 	case VIO_SUBTYPE_INFO:
3913 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3914 
3915 		if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV))
3916 			return;
3917 
3918 		/*
3919 		 * If the attributes are unacceptable then we NACK back.
3920 		 */
3921 		if (vsw_check_attr(attr_pkt, ldcp->ldc_port)) {
3922 
3923 			DERR(vswp, "%s (chan %d): invalid attributes",
3924 				__func__, ldcp->ldc_id);
3925 
3926 			vsw_free_lane_resources(ldcp, INBOUND);
3927 
3928 			attr_pkt->tag.vio_sid = ldcp->local_session;
3929 			attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
3930 
3931 			DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
3932 			ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT;
3933 			vsw_send_msg(ldcp, (void *)attr_pkt,
3934 					sizeof (vnet_attr_msg_t));
3935 
3936 			vsw_next_milestone(ldcp);
3937 			return;
3938 		}
3939 
3940 		/*
3941 		 * Otherwise store attributes for this lane and update
3942 		 * lane state.
3943 		 */
3944 		ldcp->lane_in.mtu = attr_pkt->mtu;
3945 		ldcp->lane_in.addr = attr_pkt->addr;
3946 		ldcp->lane_in.addr_type = attr_pkt->addr_type;
3947 		ldcp->lane_in.xfer_mode = attr_pkt->xfer_mode;
3948 		ldcp->lane_in.ack_freq = attr_pkt->ack_freq;
3949 
3950 		macaddr = ldcp->lane_in.addr;
3951 		for (i = ETHERADDRL - 1; i >= 0; i--) {
3952 			port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF;
3953 			macaddr >>= 8;
3954 		}
3955 
3956 		/* create the fdb entry for this port/mac address */
3957 		(void) vsw_add_fdb(vswp, port);
3958 
3959 		/* setup device specifc xmit routines */
3960 		mutex_enter(&port->tx_lock);
3961 		if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE) {
3962 			D2(vswp, "%s: mode = VIO_DRING_MODE", __func__);
3963 			port->transmit = vsw_dringsend;
3964 		} else if (ldcp->lane_in.xfer_mode == VIO_DESC_MODE) {
3965 			D2(vswp, "%s: mode = VIO_DESC_MODE", __func__);
3966 			vsw_create_privring(ldcp);
3967 			port->transmit = vsw_descrsend;
3968 		}
3969 		mutex_exit(&port->tx_lock);
3970 
3971 		attr_pkt->tag.vio_sid = ldcp->local_session;
3972 		attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3973 
3974 		DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
3975 
3976 		ldcp->lane_in.lstate |= VSW_ATTR_ACK_SENT;
3977 
3978 		vsw_send_msg(ldcp, (void *)attr_pkt,
3979 					sizeof (vnet_attr_msg_t));
3980 
3981 		vsw_next_milestone(ldcp);
3982 		break;
3983 
3984 	case VIO_SUBTYPE_ACK:
3985 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3986 
3987 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV))
3988 			return;
3989 
3990 		ldcp->lane_out.lstate |= VSW_ATTR_ACK_RECV;
3991 		vsw_next_milestone(ldcp);
3992 		break;
3993 
3994 	case VIO_SUBTYPE_NACK:
3995 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3996 
3997 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV))
3998 			return;
3999 
4000 		ldcp->lane_out.lstate |= VSW_ATTR_NACK_RECV;
4001 		vsw_next_milestone(ldcp);
4002 		break;
4003 
4004 	default:
4005 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
4006 			attr_pkt->tag.vio_subtype);
4007 	}
4008 
4009 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
4010 }
4011 
4012 /*
4013  * Process a dring info packet. We can end up here either because our peer
4014  * has ACK/NACK'ed back to an earlier DRING msg we had sent it, or our
4015  * peer has sent us a dring INFO message.
4016  *
4017  * If we get a valid/acceptable INFO packet (and we have already negotiated
4018  * a version) we ACK back and update the lane state, otherwise we NACK back.
4019  *
4020  * FUTURE: nothing to stop client from sending us info on multiple dring's
4021  * but for the moment we will just use the first one we are given.
4022  *
4023  */
4024 void
4025 vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt)
4026 {
4027 	vio_dring_reg_msg_t	*dring_pkt;
4028 	vsw_t			*vswp = ldcp->ldc_vswp;
4029 	ldc_mem_info_t		minfo;
4030 	dring_info_t		*dp, *dbp;
4031 	int			dring_found = 0;
4032 
4033 	/*
4034 	 * We know this is a ctrl/dring packet so
4035 	 * cast it into the correct structure.
4036 	 */
4037 	dring_pkt = (vio_dring_reg_msg_t *)pkt;
4038 
4039 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
4040 
4041 	switch (dring_pkt->tag.vio_subtype) {
4042 	case VIO_SUBTYPE_INFO:
4043 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
4044 
4045 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
4046 			return;
4047 
4048 		/*
4049 		 * If the dring params are unacceptable then we NACK back.
4050 		 */
4051 		if (vsw_check_dring_info(dring_pkt)) {
4052 
4053 			DERR(vswp, "%s (%lld): invalid dring info",
4054 				__func__, ldcp->ldc_id);
4055 
4056 			vsw_free_lane_resources(ldcp, INBOUND);
4057 
4058 			dring_pkt->tag.vio_sid = ldcp->local_session;
4059 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
4060 
4061 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
4062 
4063 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
4064 
4065 			vsw_send_msg(ldcp, (void *)dring_pkt,
4066 					sizeof (vio_dring_reg_msg_t));
4067 
4068 			vsw_next_milestone(ldcp);
4069 			return;
4070 		}
4071 
4072 		/*
4073 		 * Otherwise, attempt to map in the dring using the
4074 		 * cookie. If that succeeds we send back a unique dring
4075 		 * identifier that the sending side will use in future
4076 		 * to refer to this descriptor ring.
4077 		 */
4078 		dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
4079 
4080 		dp->num_descriptors = dring_pkt->num_descriptors;
4081 		dp->descriptor_size = dring_pkt->descriptor_size;
4082 		dp->options = dring_pkt->options;
4083 		dp->ncookies = dring_pkt->ncookies;
4084 
4085 		/*
4086 		 * Note: should only get one cookie. Enforced in
4087 		 * the ldc layer.
4088 		 */
4089 		bcopy(&dring_pkt->cookie[0], &dp->cookie[0],
4090 			sizeof (ldc_mem_cookie_t));
4091 
4092 		D2(vswp, "%s: num_desc %ld : desc_size %ld", __func__,
4093 			dp->num_descriptors, dp->descriptor_size);
4094 		D2(vswp, "%s: options 0x%lx: ncookies %ld", __func__,
4095 			dp->options, dp->ncookies);
4096 
4097 		if ((ldc_mem_dring_map(ldcp->ldc_handle, &dp->cookie[0],
4098 			dp->ncookies, dp->num_descriptors,
4099 			dp->descriptor_size, LDC_SHADOW_MAP,
4100 			&(dp->handle))) != 0) {
4101 
4102 			DERR(vswp, "%s: dring_map failed\n", __func__);
4103 
4104 			kmem_free(dp, sizeof (dring_info_t));
4105 			vsw_free_lane_resources(ldcp, INBOUND);
4106 
4107 			dring_pkt->tag.vio_sid = ldcp->local_session;
4108 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
4109 
4110 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
4111 
4112 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
4113 			vsw_send_msg(ldcp, (void *)dring_pkt,
4114 				sizeof (vio_dring_reg_msg_t));
4115 
4116 			vsw_next_milestone(ldcp);
4117 			return;
4118 		}
4119 
4120 		if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
4121 
4122 			DERR(vswp, "%s: dring_addr failed\n", __func__);
4123 
4124 			kmem_free(dp, sizeof (dring_info_t));
4125 			vsw_free_lane_resources(ldcp, INBOUND);
4126 
4127 			dring_pkt->tag.vio_sid = ldcp->local_session;
4128 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
4129 
4130 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
4131 
4132 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
4133 			vsw_send_msg(ldcp, (void *)dring_pkt,
4134 				sizeof (vio_dring_reg_msg_t));
4135 
4136 			vsw_next_milestone(ldcp);
4137 			return;
4138 		} else {
4139 			/* store the address of the pub part of ring */
4140 			dp->pub_addr = minfo.vaddr;
4141 		}
4142 
4143 		/* no private section as we are importing */
4144 		dp->priv_addr = NULL;
4145 
4146 		/*
4147 		 * Using simple mono increasing int for ident at
4148 		 * the moment.
4149 		 */
4150 		dp->ident = ldcp->next_ident;
4151 		ldcp->next_ident++;
4152 
4153 		dp->end_idx = 0;
4154 		dp->next = NULL;
4155 
4156 		/*
4157 		 * Link it onto the end of the list of drings
4158 		 * for this lane.
4159 		 */
4160 		if (ldcp->lane_in.dringp == NULL) {
4161 			D2(vswp, "%s: adding first INBOUND dring", __func__);
4162 			ldcp->lane_in.dringp = dp;
4163 		} else {
4164 			dbp = ldcp->lane_in.dringp;
4165 
4166 			while (dbp->next != NULL)
4167 				dbp = dbp->next;
4168 
4169 			dbp->next = dp;
4170 		}
4171 
4172 		/* acknowledge it */
4173 		dring_pkt->tag.vio_sid = ldcp->local_session;
4174 		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
4175 		dring_pkt->dring_ident = dp->ident;
4176 
4177 		vsw_send_msg(ldcp, (void *)dring_pkt,
4178 				sizeof (vio_dring_reg_msg_t));
4179 
4180 		ldcp->lane_in.lstate |= VSW_DRING_ACK_SENT;
4181 		vsw_next_milestone(ldcp);
4182 		break;
4183 
4184 	case VIO_SUBTYPE_ACK:
4185 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
4186 
4187 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV))
4188 			return;
4189 
4190 		/*
4191 		 * Peer is acknowledging our dring info and will have
4192 		 * sent us a dring identifier which we will use to
4193 		 * refer to this ring w.r.t. our peer.
4194 		 */
4195 		dp = ldcp->lane_out.dringp;
4196 		if (dp != NULL) {
4197 			/*
4198 			 * Find the ring this ident should be associated
4199 			 * with.
4200 			 */
4201 			if (vsw_dring_match(dp, dring_pkt)) {
4202 				dring_found = 1;
4203 
4204 			} else while (dp != NULL) {
4205 				if (vsw_dring_match(dp, dring_pkt)) {
4206 					dring_found = 1;
4207 					break;
4208 				}
4209 				dp = dp->next;
4210 			}
4211 
4212 			if (dring_found == 0) {
4213 				DERR(NULL, "%s: unrecognised ring cookie",
4214 					__func__);
4215 				vsw_restart_handshake(ldcp);
4216 				return;
4217 			}
4218 
4219 		} else {
4220 			DERR(vswp, "%s: DRING ACK received but no drings "
4221 				"allocated", __func__);
4222 			vsw_restart_handshake(ldcp);
4223 			return;
4224 		}
4225 
4226 		/* store ident */
4227 		dp->ident = dring_pkt->dring_ident;
4228 		ldcp->lane_out.lstate |= VSW_DRING_ACK_RECV;
4229 		vsw_next_milestone(ldcp);
4230 		break;
4231 
4232 	case VIO_SUBTYPE_NACK:
4233 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
4234 
4235 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV))
4236 			return;
4237 
4238 		ldcp->lane_out.lstate |= VSW_DRING_NACK_RECV;
4239 		vsw_next_milestone(ldcp);
4240 		break;
4241 
4242 	default:
4243 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
4244 			dring_pkt->tag.vio_subtype);
4245 	}
4246 
4247 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
4248 }
4249 
4250 /*
4251  * Process a request from peer to unregister a dring.
4252  *
4253  * For the moment we just restart the handshake if our
4254  * peer endpoint attempts to unregister a dring.
4255  */
4256 void
4257 vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt)
4258 {
4259 	vsw_t			*vswp = ldcp->ldc_vswp;
4260 	vio_dring_unreg_msg_t	*dring_pkt;
4261 
4262 	/*
4263 	 * We know this is a ctrl/dring packet so
4264 	 * cast it into the correct structure.
4265 	 */
4266 	dring_pkt = (vio_dring_unreg_msg_t *)pkt;
4267 
4268 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4269 
4270 	switch (dring_pkt->tag.vio_subtype) {
4271 	case VIO_SUBTYPE_INFO:
4272 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
4273 
4274 		DWARN(vswp, "%s: restarting handshake..", __func__);
4275 		vsw_restart_handshake(ldcp);
4276 		break;
4277 
4278 	case VIO_SUBTYPE_ACK:
4279 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
4280 
4281 		DWARN(vswp, "%s: restarting handshake..", __func__);
4282 		vsw_restart_handshake(ldcp);
4283 		break;
4284 
4285 	case VIO_SUBTYPE_NACK:
4286 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
4287 
4288 		DWARN(vswp, "%s: restarting handshake..", __func__);
4289 		vsw_restart_handshake(ldcp);
4290 		break;
4291 
4292 	default:
4293 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
4294 			dring_pkt->tag.vio_subtype);
4295 		vsw_restart_handshake(ldcp);
4296 	}
4297 
4298 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4299 }
4300 
/*
 * NACK a multicast message back to the peer: the message is marked as a
 * NACK, stamped with our session id and sent over the channel.
 *
 * ldcp - channel (vsw_ldc_t *) to send the reply on
 * pkt  - the multicast message (vnet_mcast_msg_t *) being rejected
 *
 * The body is wrapped in do/while (0) so that the macro expands to a
 * single statement and may be used safely as the unbraced body of an
 * if/else. The invocation must supply the terminating semicolon.
 */
#define	SND_MCST_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		vsw_send_msg((ldcp), (void *)(pkt), \
			sizeof (vnet_mcast_msg_t)); \
	} while (0)
4305 
4306 /*
4307  * Process a multicast request from a vnet.
4308  *
4309  * Vnet's specify a multicast address that they are interested in. This
4310  * address is used as a key into the hash table which forms the multicast
4311  * forwarding database (mFDB).
4312  *
4313  * The table keys are the multicast addresses, while the table entries
4314  * are pointers to lists of ports which wish to receive packets for the
4315  * specified multicast address.
4316  *
4317  * When a multicast packet is being switched we use the address as a key
4318  * into the hash table, and then walk the appropriate port list forwarding
4319  * the pkt to each port in turn.
4320  *
4321  * If a vnet is no longer interested in a particular multicast grouping
4322  * we simply find the correct location in the hash table and then delete
4323  * the relevant port from the port list.
4324  *
4325  * To deal with the case whereby a port is being deleted without first
4326  * removing itself from the lists in the hash table, we maintain a list
4327  * of multicast addresses the port has registered an interest in, within
4328  * the port structure itself. We then simply walk that list of addresses
4329  * using them as keys into the hash table and remove the port from the
4330  * appropriate lists.
4331  */
4332 static void
4333 vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt)
4334 {
4335 	vnet_mcast_msg_t	*mcst_pkt;
4336 	vsw_port_t		*port = ldcp->ldc_port;
4337 	vsw_t			*vswp = ldcp->ldc_vswp;
4338 	int			i;
4339 
4340 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4341 
4342 	/*
4343 	 * We know this is a ctrl/mcast packet so
4344 	 * cast it into the correct structure.
4345 	 */
4346 	mcst_pkt = (vnet_mcast_msg_t *)pkt;
4347 
4348 	switch (mcst_pkt->tag.vio_subtype) {
4349 	case VIO_SUBTYPE_INFO:
4350 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
4351 
4352 		/*
4353 		 * Check if in correct state to receive a multicast
4354 		 * message (i.e. handshake complete). If not reset
4355 		 * the handshake.
4356 		 */
4357 		if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV))
4358 			return;
4359 
4360 		/*
4361 		 * Before attempting to add or remove address check
4362 		 * that they are valid multicast addresses.
4363 		 * If not, then NACK back.
4364 		 */
4365 		for (i = 0; i < mcst_pkt->count; i++) {
4366 			if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) {
4367 				DERR(vswp, "%s: invalid multicast address",
4368 								__func__);
4369 				SND_MCST_NACK(ldcp, mcst_pkt);
4370 				return;
4371 			}
4372 		}
4373 
4374 		/*
4375 		 * Now add/remove the addresses. If this fails we
4376 		 * NACK back.
4377 		 */
4378 		if (vsw_add_rem_mcst(mcst_pkt, port) != 0) {
4379 			SND_MCST_NACK(ldcp, mcst_pkt);
4380 			return;
4381 		}
4382 
4383 		mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
4384 		mcst_pkt->tag.vio_sid = ldcp->local_session;
4385 
4386 		DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt);
4387 
4388 		vsw_send_msg(ldcp, (void *)mcst_pkt,
4389 					sizeof (vnet_mcast_msg_t));
4390 		break;
4391 
4392 	case VIO_SUBTYPE_ACK:
4393 		DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
4394 
4395 		/*
4396 		 * We shouldn't ever get a multicast ACK message as
4397 		 * at the moment we never request multicast addresses
4398 		 * to be set on some other device. This may change in
4399 		 * the future if we have cascading switches.
4400 		 */
4401 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV))
4402 			return;
4403 
4404 				/* Do nothing */
4405 		break;
4406 
4407 	case VIO_SUBTYPE_NACK:
4408 		DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
4409 
4410 		/*
4411 		 * We shouldn't get a multicast NACK packet for the
4412 		 * same reasons as we shouldn't get a ACK packet.
4413 		 */
4414 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV))
4415 			return;
4416 
4417 				/* Do nothing */
4418 		break;
4419 
4420 	default:
4421 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
4422 			mcst_pkt->tag.vio_subtype);
4423 	}
4424 
4425 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4426 }
4427 
4428 static void
4429 vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt)
4430 {
4431 	vio_rdx_msg_t	*rdx_pkt;
4432 	vsw_t		*vswp = ldcp->ldc_vswp;
4433 
4434 	/*
4435 	 * We know this is a ctrl/rdx packet so
4436 	 * cast it into the correct structure.
4437 	 */
4438 	rdx_pkt = (vio_rdx_msg_t *)pkt;
4439 
4440 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
4441 
4442 	switch (rdx_pkt->tag.vio_subtype) {
4443 	case VIO_SUBTYPE_INFO:
4444 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
4445 
4446 		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_INFO_RECV))
4447 			return;
4448 
4449 		rdx_pkt->tag.vio_sid = ldcp->local_session;
4450 		rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
4451 
4452 		DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt);
4453 
4454 		ldcp->lane_in.lstate |= VSW_RDX_ACK_SENT;
4455 
4456 		vsw_send_msg(ldcp, (void *)rdx_pkt,
4457 				sizeof (vio_rdx_msg_t));
4458 
4459 		vsw_next_milestone(ldcp);
4460 		break;
4461 
4462 	case VIO_SUBTYPE_ACK:
4463 		/*
4464 		 * Should be handled in-band by callback handler.
4465 		 */
4466 		DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__);
4467 		vsw_restart_handshake(ldcp);
4468 		break;
4469 
4470 	case VIO_SUBTYPE_NACK:
4471 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
4472 
4473 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_NACK_RECV))
4474 			return;
4475 
4476 		ldcp->lane_out.lstate |= VSW_RDX_NACK_RECV;
4477 		vsw_next_milestone(ldcp);
4478 		break;
4479 
4480 	default:
4481 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
4482 			rdx_pkt->tag.vio_subtype);
4483 	}
4484 
4485 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4486 }
4487 
4488 static void
4489 vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t tag)
4490 {
4491 	uint16_t	env = tag.vio_subtype_env;
4492 	vsw_t		*vswp = ldcp->ldc_vswp;
4493 
4494 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4495 
4496 	/* session id check */
4497 	if (ldcp->session_status & VSW_PEER_SESSION) {
4498 		if (ldcp->peer_session != tag.vio_sid) {
4499 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
4500 				__func__, ldcp->ldc_id, tag.vio_sid);
4501 			vsw_restart_handshake(ldcp);
4502 			return;
4503 		}
4504 	}
4505 
4506 	/*
4507 	 * It is an error for us to be getting data packets
4508 	 * before the handshake has completed.
4509 	 */
4510 	if (ldcp->hphase != VSW_MILESTONE4) {
4511 		DERR(vswp, "%s: got data packet before handshake complete "
4512 			"hphase %d (%x: %x)", __func__, ldcp->hphase,
4513 			ldcp->lane_in.lstate, ldcp->lane_out.lstate);
4514 		DUMP_FLAGS(ldcp->lane_in.lstate);
4515 		DUMP_FLAGS(ldcp->lane_out.lstate);
4516 		vsw_restart_handshake(ldcp);
4517 		return;
4518 	}
4519 
4520 	/*
4521 	 * Switch on vio_subtype envelope, then let lower routines
4522 	 * decide if its an INFO, ACK or NACK packet.
4523 	 */
4524 	if (env == VIO_DRING_DATA) {
4525 		vsw_process_data_dring_pkt(ldcp, dpkt);
4526 	} else if (env == VIO_PKT_DATA) {
4527 		vsw_process_data_raw_pkt(ldcp, dpkt);
4528 	} else if (env == VIO_DESC_DATA) {
4529 		vsw_process_data_ibnd_pkt(ldcp, dpkt);
4530 	} else {
4531 		DERR(vswp, "%s : unknown vio_subtype_env (%x)\n",
4532 							__func__, env);
4533 	}
4534 
4535 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4536 }
4537 
/*
 * NACK a descriptor ring data message back to the peer: mark the message
 * as a NACK, stamp it with our local session id and send it over the
 * channel.
 *
 * The body is wrapped in do/while (0) so that the macro expands to a
 * single statement and behaves correctly as the body of an unbraced
 * if/else. Note that both arguments are evaluated more than once.
 */
#define	SND_DRING_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		vsw_send_msg((ldcp), (void *)(pkt), \
			sizeof (vio_dring_msg_t)); \
	} while (0)
4542 
4543 static void
4544 vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt)
4545 {
4546 	vio_dring_msg_t		*dring_pkt;
4547 	vnet_public_desc_t	*pub_addr = NULL;
4548 	vsw_private_desc_t	*priv_addr = NULL;
4549 	dring_info_t		*dp = NULL;
4550 	vsw_t			*vswp = ldcp->ldc_vswp;
4551 	mblk_t			*mp = NULL;
4552 	mblk_t			*bp = NULL;
4553 	mblk_t			*bpt = NULL;
4554 	size_t			nbytes = 0;
4555 	size_t			off = 0;
4556 	uint64_t		ncookies = 0;
4557 	uint64_t		chain = 0;
4558 	uint64_t		j, len;
4559 	uint32_t		pos, start, datalen;
4560 	uint32_t		range_start, range_end;
4561 	int32_t			end, num, cnt = 0;
4562 	int			i, rv;
4563 	boolean_t		ack_needed = B_FALSE;
4564 	boolean_t		prev_desc_ack = B_FALSE;
4565 	int			read_attempts = 0;
4566 
4567 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4568 
4569 	/*
4570 	 * We know this is a data/dring packet so
4571 	 * cast it into the correct structure.
4572 	 */
4573 	dring_pkt = (vio_dring_msg_t *)dpkt;
4574 
4575 	/*
4576 	 * Switch on the vio_subtype. If its INFO then we need to
4577 	 * process the data. If its an ACK we need to make sure
4578 	 * it makes sense (i.e did we send an earlier data/info),
4579 	 * and if its a NACK then we maybe attempt a retry.
4580 	 */
4581 	switch (dring_pkt->tag.vio_subtype) {
4582 	case VIO_SUBTYPE_INFO:
4583 		D2(vswp, "%s(%lld): VIO_SUBTYPE_INFO", __func__, ldcp->ldc_id);
4584 
4585 		if ((dp = vsw_ident2dring(&ldcp->lane_in,
4586 				dring_pkt->dring_ident)) == NULL) {
4587 
4588 			DERR(vswp, "%s(%lld): unable to find dring from "
4589 				"ident 0x%llx", __func__, ldcp->ldc_id,
4590 				dring_pkt->dring_ident);
4591 
4592 			SND_DRING_NACK(ldcp, dring_pkt);
4593 			return;
4594 		}
4595 
4596 		start = pos = dring_pkt->start_idx;
4597 		end = dring_pkt->end_idx;
4598 		len = dp->num_descriptors;
4599 
4600 		range_start = range_end = pos;
4601 
4602 		D2(vswp, "%s(%lld): start index %ld : end %ld\n",
4603 			__func__, ldcp->ldc_id, start, end);
4604 
4605 		if (end == -1) {
4606 			num = -1;
4607 		} else if (end >= 0) {
4608 			num = end >= pos ?
4609 				end - pos + 1: (len - pos + 1) + end;
4610 
4611 			/* basic sanity check */
4612 			if (end > len) {
4613 				DERR(vswp, "%s(%lld): endpoint %lld outside "
4614 					"ring length %lld", __func__,
4615 					ldcp->ldc_id, end, len);
4616 
4617 				SND_DRING_NACK(ldcp, dring_pkt);
4618 				return;
4619 			}
4620 		} else {
4621 			DERR(vswp, "%s(%lld): invalid endpoint %lld",
4622 				__func__, ldcp->ldc_id, end);
4623 			SND_DRING_NACK(ldcp, dring_pkt);
4624 			return;
4625 		}
4626 
4627 		while (cnt != num) {
4628 vsw_recheck_desc:
4629 			if ((rv = ldc_mem_dring_acquire(dp->handle,
4630 							pos, pos)) != 0) {
4631 				DERR(vswp, "%s(%lld): unable to acquire "
4632 					"descriptor at pos %d: err %d",
4633 					__func__, pos, ldcp->ldc_id, rv);
4634 				SND_DRING_NACK(ldcp, dring_pkt);
4635 				return;
4636 			}
4637 
4638 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + pos;
4639 
4640 			/*
4641 			 * When given a bounded range of descriptors
4642 			 * to process, its an error to hit a descriptor
4643 			 * which is not ready. In the non-bounded case
4644 			 * (end_idx == -1) this simply indicates we have
4645 			 * reached the end of the current active range.
4646 			 */
4647 			if (pub_addr->hdr.dstate != VIO_DESC_READY) {
4648 				/* unbound - no error */
4649 				if (end == -1) {
4650 					if (read_attempts == vsw_read_attempts)
4651 						break;
4652 
4653 					delay(drv_usectohz(vsw_desc_delay));
4654 					read_attempts++;
4655 					goto vsw_recheck_desc;
4656 				}
4657 
4658 				/* bounded - error - so NACK back */
4659 				DERR(vswp, "%s(%lld): descriptor not READY "
4660 					"(%d)", __func__, ldcp->ldc_id,
4661 					pub_addr->hdr.dstate);
4662 				SND_DRING_NACK(ldcp, dring_pkt);
4663 				return;
4664 			}
4665 
4666 			DTRACE_PROBE1(read_attempts, int, read_attempts);
4667 
4668 			range_end = pos;
4669 
4670 			/*
4671 			 * If we ACK'd the previous descriptor then now
4672 			 * record the new range start position for later
4673 			 * ACK's.
4674 			 */
4675 			if (prev_desc_ack) {
4676 				range_start = pos;
4677 
4678 				D2(vswp, "%s(%lld): updating range start "
4679 					"to be %d", __func__, ldcp->ldc_id,
4680 					range_start);
4681 
4682 				prev_desc_ack = B_FALSE;
4683 			}
4684 
4685 			/*
4686 			 * Data is padded to align on 8 byte boundary,
4687 			 * datalen is actual data length, i.e. minus that
4688 			 * padding.
4689 			 */
4690 			datalen = pub_addr->nbytes;
4691 
4692 			/*
4693 			 * Does peer wish us to ACK when we have finished
4694 			 * with this descriptor ?
4695 			 */
4696 			if (pub_addr->hdr.ack)
4697 				ack_needed = B_TRUE;
4698 
4699 			D2(vswp, "%s(%lld): processing desc %lld at pos"
4700 				" 0x%llx : dstate 0x%lx : datalen 0x%lx",
4701 				__func__, ldcp->ldc_id, pos, pub_addr,
4702 				pub_addr->hdr.dstate, datalen);
4703 
4704 			/*
4705 			 * Mark that we are starting to process descriptor.
4706 			 */
4707 			pub_addr->hdr.dstate = VIO_DESC_ACCEPTED;
4708 
4709 			mp = vio_allocb(ldcp->rxh);
4710 			if (mp == NULL) {
4711 				/*
4712 				 * No free receive buffers available, so
4713 				 * fallback onto allocb(9F). Make sure that
4714 				 * we get a data buffer which is a multiple
4715 				 * of 8 as this is required by ldc_mem_copy.
4716 				 */
4717 				DTRACE_PROBE(allocb);
4718 				mp = allocb(datalen + VNET_IPALIGN + 8,
4719 								BPRI_MED);
4720 			}
4721 
4722 			/*
4723 			 * Ensure that we ask ldc for an aligned
4724 			 * number of bytes.
4725 			 */
4726 			nbytes = datalen + VNET_IPALIGN;
4727 			if (nbytes & 0x7) {
4728 				off = 8 - (nbytes & 0x7);
4729 				nbytes += off;
4730 			}
4731 
4732 			ncookies = pub_addr->ncookies;
4733 			rv = ldc_mem_copy(ldcp->ldc_handle,
4734 				(caddr_t)mp->b_rptr, 0, &nbytes,
4735 				pub_addr->memcookie, ncookies,
4736 				LDC_COPY_IN);
4737 
4738 			if (rv != 0) {
4739 				DERR(vswp, "%s(%d): unable to copy in "
4740 					"data from %d cookies in desc %d"
4741 					" (rv %d)", __func__, ldcp->ldc_id,
4742 					ncookies, pos, rv);
4743 				freemsg(mp);
4744 
4745 				pub_addr->hdr.dstate = VIO_DESC_DONE;
4746 				(void) ldc_mem_dring_release(dp->handle,
4747 								pos, pos);
4748 				break;
4749 			} else {
4750 				D2(vswp, "%s(%d): copied in %ld bytes"
4751 					" using %d cookies", __func__,
4752 					ldcp->ldc_id, nbytes, ncookies);
4753 			}
4754 
4755 			/* adjust the read pointer to skip over the padding */
4756 			mp->b_rptr += VNET_IPALIGN;
4757 
4758 			/* point to the actual end of data */
4759 			mp->b_wptr = mp->b_rptr + datalen;
4760 
4761 			/* build a chain of received packets */
4762 			if (bp == NULL) {
4763 				/* first pkt */
4764 				bp = mp;
4765 				bp->b_next = bp->b_prev = NULL;
4766 				bpt = bp;
4767 				chain = 1;
4768 			} else {
4769 				mp->b_next = NULL;
4770 				mp->b_prev = bpt;
4771 				bpt->b_next = mp;
4772 				bpt = mp;
4773 				chain++;
4774 			}
4775 
4776 			/* mark we are finished with this descriptor */
4777 			pub_addr->hdr.dstate = VIO_DESC_DONE;
4778 
4779 			(void) ldc_mem_dring_release(dp->handle, pos, pos);
4780 
4781 			/*
4782 			 * Send an ACK back to peer if requested.
4783 			 */
4784 			if (ack_needed) {
4785 				ack_needed = B_FALSE;
4786 
4787 				dring_pkt->start_idx = range_start;
4788 				dring_pkt->end_idx = range_end;
4789 
4790 				DERR(vswp, "%s(%lld): processed %d %d, ACK"
4791 					" requested", __func__, ldcp->ldc_id,
4792 					dring_pkt->start_idx,
4793 					dring_pkt->end_idx);
4794 
4795 				dring_pkt->dring_process_state = VIO_DP_ACTIVE;
4796 				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
4797 				dring_pkt->tag.vio_sid = ldcp->local_session;
4798 				vsw_send_msg(ldcp, (void *)dring_pkt,
4799 					sizeof (vio_dring_msg_t));
4800 
4801 				prev_desc_ack = B_TRUE;
4802 				range_start = pos;
4803 			}
4804 
4805 			/* next descriptor */
4806 			pos = (pos + 1) % len;
4807 			cnt++;
4808 
4809 			/*
4810 			 * Break out of loop here and stop processing to
4811 			 * allow some other network device (or disk) to
4812 			 * get access to the cpu.
4813 			 */
4814 			/* send the chain of packets to be switched */
4815 			if (chain > vsw_chain_len) {
4816 				D3(vswp, "%s(%lld): switching chain of %d "
4817 					"msgs", __func__, ldcp->ldc_id, chain);
4818 				vsw_switch_frame(vswp, bp, VSW_VNETPORT,
4819 							ldcp->ldc_port, NULL);
4820 				bp = NULL;
4821 				break;
4822 			}
4823 		}
4824 
4825 		/* send the chain of packets to be switched */
4826 		if (bp != NULL) {
4827 			D3(vswp, "%s(%lld): switching chain of %d msgs",
4828 					__func__, ldcp->ldc_id, chain);
4829 			vsw_switch_frame(vswp, bp, VSW_VNETPORT,
4830 							ldcp->ldc_port, NULL);
4831 		}
4832 
4833 		DTRACE_PROBE1(msg_cnt, int, cnt);
4834 
4835 		/*
4836 		 * We are now finished so ACK back with the state
4837 		 * set to STOPPING so our peer knows we are finished
4838 		 */
4839 		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
4840 		dring_pkt->tag.vio_sid = ldcp->local_session;
4841 
4842 		dring_pkt->dring_process_state = VIO_DP_STOPPED;
4843 
4844 		DTRACE_PROBE(stop_process_sent);
4845 
4846 		/*
4847 		 * We have not processed any more descriptors beyond
4848 		 * the last one we ACK'd.
4849 		 */
4850 		if (prev_desc_ack)
4851 			range_start = range_end;
4852 
4853 		dring_pkt->start_idx = range_start;
4854 		dring_pkt->end_idx = range_end;
4855 
4856 		D2(vswp, "%s(%lld) processed : %d : %d, now stopping",
4857 			__func__, ldcp->ldc_id, dring_pkt->start_idx,
4858 			dring_pkt->end_idx);
4859 
4860 		vsw_send_msg(ldcp, (void *)dring_pkt,
4861 					sizeof (vio_dring_msg_t));
4862 		break;
4863 
4864 	case VIO_SUBTYPE_ACK:
4865 		D2(vswp, "%s(%lld): VIO_SUBTYPE_ACK", __func__, ldcp->ldc_id);
4866 		/*
4867 		 * Verify that the relevant descriptors are all
4868 		 * marked as DONE
4869 		 */
4870 		if ((dp = vsw_ident2dring(&ldcp->lane_out,
4871 			dring_pkt->dring_ident)) == NULL) {
4872 			DERR(vswp, "%s: unknown ident in ACK", __func__);
4873 			return;
4874 		}
4875 
4876 		pub_addr = (vnet_public_desc_t *)dp->pub_addr;
4877 		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
4878 
4879 		start = end = 0;
4880 		start = dring_pkt->start_idx;
4881 		end = dring_pkt->end_idx;
4882 		len = dp->num_descriptors;
4883 
4884 		j = num = 0;
4885 		/* calculate # descriptors taking into a/c wrap around */
4886 		num = end >= start ? end - start + 1: (len - start + 1) + end;
4887 
4888 		D2(vswp, "%s(%lld): start index %ld : end %ld : num %ld\n",
4889 			__func__, ldcp->ldc_id, start, end, num);
4890 
4891 		mutex_enter(&dp->dlock);
4892 		dp->last_ack_recv = end;
4893 		mutex_exit(&dp->dlock);
4894 
4895 		for (i = start; j < num; i = (i + 1) % len, j++) {
4896 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
4897 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
4898 
4899 			/*
4900 			 * If the last descriptor in a range has the ACK
4901 			 * bit set then we will get two messages from our
4902 			 * peer relating to it. The normal ACK msg and then
4903 			 * a subsequent STOP msg. The first message will have
4904 			 * resulted in the descriptor being reclaimed and
4905 			 * its state set to FREE so when we encounter a non
4906 			 * DONE descriptor we need to check to see if its
4907 			 * because we have just reclaimed it.
4908 			 */
4909 			mutex_enter(&priv_addr->dstate_lock);
4910 			if (pub_addr->hdr.dstate == VIO_DESC_DONE) {
4911 				/* clear all the fields */
4912 				bzero(priv_addr->datap, priv_addr->datalen);
4913 				priv_addr->datalen = 0;
4914 
4915 				pub_addr->hdr.dstate = VIO_DESC_FREE;
4916 				pub_addr->hdr.ack = 0;
4917 
4918 				priv_addr->dstate = VIO_DESC_FREE;
4919 				mutex_exit(&priv_addr->dstate_lock);
4920 
4921 				D3(vswp, "clearing descp %d : pub state "
4922 					"0x%llx : priv state 0x%llx", i,
4923 					pub_addr->hdr.dstate,
4924 					priv_addr->dstate);
4925 
4926 			} else {
4927 				mutex_exit(&priv_addr->dstate_lock);
4928 
4929 				if (dring_pkt->dring_process_state !=
4930 							VIO_DP_STOPPED) {
4931 					DERR(vswp, "%s: descriptor %lld at pos "
4932 						" 0x%llx not DONE (0x%lx)\n",
4933 						__func__, i, pub_addr,
4934 						pub_addr->hdr.dstate);
4935 					return;
4936 				}
4937 			}
4938 		}
4939 
4940 		/*
4941 		 * If our peer is stopping processing descriptors then
4942 		 * we check to make sure it has processed all the descriptors
4943 		 * we have updated. If not then we send it a new message
4944 		 * to prompt it to restart.
4945 		 */
4946 		if (dring_pkt->dring_process_state == VIO_DP_STOPPED) {
4947 			DTRACE_PROBE(stop_process_recv);
4948 			D2(vswp, "%s(%lld): got stopping msg : %d : %d",
4949 				__func__, ldcp->ldc_id, dring_pkt->start_idx,
4950 				dring_pkt->end_idx);
4951 
4952 			/*
4953 			 * Check next descriptor in public section of ring.
4954 			 * If its marked as READY then we need to prompt our
4955 			 * peer to start processing the ring again.
4956 			 */
4957 			i = (end + 1) % len;
4958 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
4959 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
4960 
4961 			/*
4962 			 * Hold the restart lock across all of this to
4963 			 * make sure that its not possible for us to
4964 			 * decide that a msg needs to be sent in the future
4965 			 * but the sending code having already checked is
4966 			 * about to exit.
4967 			 */
4968 			mutex_enter(&dp->restart_lock);
4969 			mutex_enter(&priv_addr->dstate_lock);
4970 			if (pub_addr->hdr.dstate == VIO_DESC_READY) {
4971 
4972 				mutex_exit(&priv_addr->dstate_lock);
4973 
4974 				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
4975 				dring_pkt->tag.vio_sid = ldcp->local_session;
4976 
4977 				mutex_enter(&ldcp->lane_out.seq_lock);
4978 				dring_pkt->seq_num = ldcp->lane_out.seq_num++;
4979 				mutex_exit(&ldcp->lane_out.seq_lock);
4980 
4981 				dring_pkt->start_idx = (end + 1) % len;
4982 				dring_pkt->end_idx = -1;
4983 
4984 				D2(vswp, "%s(%lld) : sending restart msg:"
4985 					" %d : %d", __func__, ldcp->ldc_id,
4986 					dring_pkt->start_idx,
4987 					dring_pkt->end_idx);
4988 
4989 				vsw_send_msg(ldcp, (void *)dring_pkt,
4990 						sizeof (vio_dring_msg_t));
4991 			} else {
4992 				mutex_exit(&priv_addr->dstate_lock);
4993 				dp->restart_reqd = B_TRUE;
4994 			}
4995 			mutex_exit(&dp->restart_lock);
4996 		}
4997 		break;
4998 
4999 	case VIO_SUBTYPE_NACK:
5000 		DWARN(vswp, "%s(%lld): VIO_SUBTYPE_NACK",
5001 						__func__, ldcp->ldc_id);
5002 		/*
5003 		 * Something is badly wrong if we are getting NACK's
5004 		 * for our data pkts. So reset the channel.
5005 		 */
5006 		vsw_restart_handshake(ldcp);
5007 
5008 		break;
5009 
5010 	default:
5011 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
5012 			ldcp->ldc_id, dring_pkt->tag.vio_subtype);
5013 	}
5014 
5015 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
5016 }
5017 
5018 /*
5019  * VIO_PKT_DATA (a.k.a raw data mode )
5020  *
5021  * Note - currently not supported. Do nothing.
5022  */
5023 static void
5024 vsw_process_data_raw_pkt(vsw_ldc_t *ldcp, void *dpkt)
5025 {
5026 	_NOTE(ARGUNUSED(dpkt))
5027 
5028 	D1(NULL, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
5029 
5030 	DERR(NULL, "%s (%lld): currently  not supported",
5031 						__func__, ldcp->ldc_id);
5032 
5033 	D1(NULL, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
5034 }
5035 
/*
 * NACK an in-band descriptor message back to the peer: mark the message
 * as a NACK, stamp it with our local session id and send it over the
 * channel.
 *
 * Unlike the other message types handled here, the tag of an in-band
 * descriptor message lives inside its hdr member, so it has to be
 * reached via hdr.tag.
 *
 * The body is wrapped in do/while (0) so that the macro expands to a
 * single statement and behaves correctly as the body of an unbraced
 * if/else. Note that both arguments are evaluated more than once.
 */
#define	SND_IBND_DESC_NACK(ldcp, pkt) \
	do { \
		(pkt)->hdr.tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->hdr.tag.vio_sid = (ldcp)->local_session; \
		vsw_send_msg((ldcp), (void *)(pkt), \
			sizeof (vio_ibnd_desc_t)); \
	} while (0)
5040 
5041 /*
5042  * Process an in-band descriptor message (most likely from
5043  * OBP).
5044  */
5045 static void
5046 vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt)
5047 {
5048 	vio_ibnd_desc_t		*ibnd_desc;
5049 	dring_info_t		*dp = NULL;
5050 	vsw_private_desc_t	*priv_addr = NULL;
5051 	vsw_t			*vswp = ldcp->ldc_vswp;
5052 	mblk_t			*mp = NULL;
5053 	size_t			nbytes = 0;
5054 	size_t			off = 0;
5055 	uint64_t		idx = 0;
5056 	uint32_t		num = 1, len, datalen = 0;
5057 	uint64_t		ncookies = 0;
5058 	int			i, rv;
5059 	int			j = 0;
5060 
5061 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
5062 
5063 	ibnd_desc = (vio_ibnd_desc_t *)pkt;
5064 
5065 	switch (ibnd_desc->hdr.tag.vio_subtype) {
5066 	case VIO_SUBTYPE_INFO:
5067 		D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
5068 
5069 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
5070 			return;
5071 
5072 		/*
5073 		 * Data is padded to align on a 8 byte boundary,
5074 		 * nbytes is actual data length, i.e. minus that
5075 		 * padding.
5076 		 */
5077 		datalen = ibnd_desc->nbytes;
5078 
5079 		D2(vswp, "%s(%lld): processing inband desc : "
5080 			": datalen 0x%lx", __func__, ldcp->ldc_id, datalen);
5081 
5082 		ncookies = ibnd_desc->ncookies;
5083 
5084 		/*
5085 		 * allocb(9F) returns an aligned data block. We
5086 		 * need to ensure that we ask ldc for an aligned
5087 		 * number of bytes also.
5088 		 */
5089 		nbytes = datalen;
5090 		if (nbytes & 0x7) {
5091 			off = 8 - (nbytes & 0x7);
5092 			nbytes += off;
5093 		}
5094 
5095 		mp = allocb(datalen, BPRI_MED);
5096 		if (mp == NULL) {
5097 			DERR(vswp, "%s(%lld): allocb failed",
5098 					__func__, ldcp->ldc_id);
5099 			return;
5100 		}
5101 
5102 		rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr,
5103 			0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies,
5104 			LDC_COPY_IN);
5105 
5106 		if (rv != 0) {
5107 			DERR(vswp, "%s(%d): unable to copy in data from "
5108 				"%d cookie(s)", __func__,
5109 				ldcp->ldc_id, ncookies);
5110 			freemsg(mp);
5111 			return;
5112 		} else {
5113 			D2(vswp, "%s(%d): copied in %ld bytes using %d "
5114 				"cookies", __func__, ldcp->ldc_id, nbytes,
5115 				ncookies);
5116 		}
5117 
5118 		/* point to the actual end of data */
5119 		mp->b_wptr = mp->b_rptr + datalen;
5120 
5121 		/*
5122 		 * We ACK back every in-band descriptor message we process
5123 		 */
5124 		ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK;
5125 		ibnd_desc->hdr.tag.vio_sid = ldcp->local_session;
5126 		vsw_send_msg(ldcp, (void *)ibnd_desc,
5127 				sizeof (vio_ibnd_desc_t));
5128 
5129 		/* send the packet to be switched */
5130 		vsw_switch_frame(vswp, mp, VSW_VNETPORT,
5131 					ldcp->ldc_port, NULL);
5132 
5133 		break;
5134 
5135 	case VIO_SUBTYPE_ACK:
5136 		D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
5137 
5138 		/* Verify the ACK is valid */
5139 		idx = ibnd_desc->hdr.desc_handle;
5140 
5141 		if (idx >= VSW_RING_NUM_EL) {
5142 			cmn_err(CE_WARN, "%s: corrupted ACK received "
5143 				"(idx %ld)", __func__, idx);
5144 			return;
5145 		}
5146 
5147 		if ((dp = ldcp->lane_out.dringp) == NULL) {
5148 			DERR(vswp, "%s: no dring found", __func__);
5149 			return;
5150 		}
5151 
5152 		len = dp->num_descriptors;
5153 		/*
5154 		 * If the descriptor we are being ACK'ed for is not the
5155 		 * one we expected, then pkts were lost somwhere, either
5156 		 * when we tried to send a msg, or a previous ACK msg from
5157 		 * our peer. In either case we now reclaim the descriptors
5158 		 * in the range from the last ACK we received up to the
5159 		 * current ACK.
5160 		 */
5161 		if (idx != dp->last_ack_recv) {
5162 			DWARN(vswp, "%s: dropped pkts detected, (%ld, %ld)",
5163 				__func__, dp->last_ack_recv, idx);
5164 			num = idx >= dp->last_ack_recv ?
5165 				idx - dp->last_ack_recv + 1:
5166 				(len - dp->last_ack_recv + 1) + idx;
5167 		}
5168 
5169 		/*
5170 		 * When we sent the in-band message to our peer we
5171 		 * marked the copy in our private ring as READY. We now
5172 		 * check that the descriptor we are being ACK'ed for is in
5173 		 * fact READY, i.e. it is one we have shared with our peer.
5174 		 *
5175 		 * If its not we flag an error, but still reset the descr
5176 		 * back to FREE.
5177 		 */
5178 		for (i = dp->last_ack_recv; j < num; i = (i + 1) % len, j++) {
5179 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
5180 			mutex_enter(&priv_addr->dstate_lock);
5181 			if (priv_addr->dstate != VIO_DESC_READY) {
5182 				DERR(vswp, "%s: (%ld) desc at index %ld not "
5183 					"READY (0x%lx)", __func__,
5184 					ldcp->ldc_id, idx, priv_addr->dstate);
5185 				DERR(vswp, "%s: bound %d: ncookies %ld : "
5186 					"datalen %ld", __func__,
5187 					priv_addr->bound, priv_addr->ncookies,
5188 					priv_addr->datalen);
5189 			}
5190 			D2(vswp, "%s: (%lld) freeing descp at %lld", __func__,
5191 				ldcp->ldc_id, idx);
5192 			/* release resources associated with sent msg */
5193 			bzero(priv_addr->datap, priv_addr->datalen);
5194 			priv_addr->datalen = 0;
5195 			priv_addr->dstate = VIO_DESC_FREE;
5196 			mutex_exit(&priv_addr->dstate_lock);
5197 		}
5198 		/* update to next expected value */
5199 		dp->last_ack_recv = (idx + 1) % dp->num_descriptors;
5200 
5201 		break;
5202 
5203 	case VIO_SUBTYPE_NACK:
5204 		DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
5205 
5206 		/*
5207 		 * We should only get a NACK if our peer doesn't like
5208 		 * something about a message we have sent it. If this
5209 		 * happens we just release the resources associated with
5210 		 * the message. (We are relying on higher layers to decide
5211 		 * whether or not to resend.
5212 		 */
5213 
5214 		/* limit check */
5215 		idx = ibnd_desc->hdr.desc_handle;
5216 
5217 		if (idx >= VSW_RING_NUM_EL) {
5218 			DERR(vswp, "%s: corrupted NACK received (idx %lld)",
5219 				__func__, idx);
5220 			return;
5221 		}
5222 
5223 		if ((dp = ldcp->lane_out.dringp) == NULL) {
5224 			DERR(vswp, "%s: no dring found", __func__);
5225 			return;
5226 		}
5227 
5228 		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
5229 
5230 		/* move to correct location in ring */
5231 		priv_addr += idx;
5232 
5233 		/* release resources associated with sent msg */
5234 		mutex_enter(&priv_addr->dstate_lock);
5235 		bzero(priv_addr->datap, priv_addr->datalen);
5236 		priv_addr->datalen = 0;
5237 		priv_addr->dstate = VIO_DESC_FREE;
5238 		mutex_exit(&priv_addr->dstate_lock);
5239 
5240 		break;
5241 
5242 	default:
5243 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
5244 			ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype);
5245 	}
5246 
5247 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
5248 }
5249 
5250 static void
5251 vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t tag)
5252 {
5253 	_NOTE(ARGUNUSED(epkt))
5254 
5255 	vsw_t		*vswp = ldcp->ldc_vswp;
5256 	uint16_t	env = tag.vio_subtype_env;
5257 
5258 	D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
5259 
5260 	/*
5261 	 * Error vio_subtypes have yet to be defined. So for
5262 	 * the moment we can't do anything.
5263 	 */
5264 	D2(vswp, "%s: (%x) vio_subtype env", __func__, env);
5265 
5266 	D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
5267 }
5268 
5269 /*
5270  * Switch the given ethernet frame when operating in layer 2 mode.
5271  *
5272  * vswp: pointer to the vsw instance
5273  * mp: pointer to chain of ethernet frame(s) to be switched
5274  * caller: identifies the source of this frame as:
5275  * 		1. VSW_VNETPORT - a vsw port (connected to a vnet).
5276  *		2. VSW_PHYSDEV - the physical ethernet device
5277  *		3. VSW_LOCALDEV - vsw configured as a virtual interface
5278  * arg: argument provided by the caller.
5279  *		1. for VNETPORT - pointer to the corresponding vsw_port_t.
5280  *		2. for PHYSDEV - NULL
5281  *		3. for LOCALDEV - pointer to this vsw_t (self)
5282  */
5283 void
5284 vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
5285 			vsw_port_t *arg, mac_resource_handle_t mrh)
5286 {
5287 	struct ether_header	*ehp;
5288 	vsw_port_t		*port = NULL;
5289 	mblk_t			*bp, *ret_m;
5290 	mblk_t			*nmp = NULL;
5291 	vsw_port_list_t		*plist = &vswp->plist;
5292 
5293 	D1(vswp, "%s: enter (caller %d)", __func__, caller);
5294 
5295 	/*
5296 	 * PERF: rather than breaking up the chain here, scan it
5297 	 * to find all mblks heading to same destination and then
5298 	 * pass that sub-chain to the lower transmit functions.
5299 	 */
5300 
5301 	/* process the chain of packets */
5302 	bp = mp;
5303 	while (bp) {
5304 		mp = bp;
5305 		bp = bp->b_next;
5306 		mp->b_next = mp->b_prev = NULL;
5307 		ehp = (struct ether_header *)mp->b_rptr;
5308 
5309 		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
5310 			__func__, MBLKSIZE(mp), MBLKL(mp));
5311 
5312 		READ_ENTER(&vswp->if_lockrw);
5313 		if (ether_cmp(&ehp->ether_dhost, &vswp->if_addr) == 0) {
5314 			/*
5315 			 * If destination is VSW_LOCALDEV (vsw as an eth
5316 			 * interface) and if the device is up & running,
5317 			 * send the packet up the stack on this host.
5318 			 * If the virtual interface is down, drop the packet.
5319 			 */
5320 			if (caller != VSW_LOCALDEV) {
5321 				if (vswp->if_state & VSW_IF_UP) {
5322 					RW_EXIT(&vswp->if_lockrw);
5323 					mac_rx(vswp->if_mh, mrh, mp);
5324 				} else {
5325 					RW_EXIT(&vswp->if_lockrw);
5326 					/* Interface down, drop pkt */
5327 					freemsg(mp);
5328 				}
5329 			} else {
5330 				RW_EXIT(&vswp->if_lockrw);
5331 				freemsg(mp);
5332 			}
5333 			continue;
5334 		}
5335 		RW_EXIT(&vswp->if_lockrw);
5336 
5337 		READ_ENTER(&plist->lockrw);
5338 		port = vsw_lookup_fdb(vswp, ehp);
5339 		if (port) {
5340 			/*
5341 			 * Mark the port as in-use.
5342 			 */
5343 			mutex_enter(&port->ref_lock);
5344 			port->ref_cnt++;
5345 			mutex_exit(&port->ref_lock);
5346 			RW_EXIT(&plist->lockrw);
5347 
5348 			/*
5349 			 * If plumbed and in promisc mode then copy msg
5350 			 * and send up the stack.
5351 			 */
5352 			READ_ENTER(&vswp->if_lockrw);
5353 			if (VSW_U_P(vswp->if_state)) {
5354 				RW_EXIT(&vswp->if_lockrw);
5355 				nmp = copymsg(mp);
5356 				if (nmp)
5357 					mac_rx(vswp->if_mh, mrh, nmp);
5358 			} else {
5359 				RW_EXIT(&vswp->if_lockrw);
5360 			}
5361 
5362 			/*
5363 			 * If the destination is in FDB, the packet
5364 			 * should be forwarded to the correponding
5365 			 * vsw_port (connected to a vnet device -
5366 			 * VSW_VNETPORT)
5367 			 */
5368 			(void) vsw_portsend(port, mp);
5369 
5370 			/*
5371 			 * Decrement use count in port and check if
5372 			 * should wake delete thread.
5373 			 */
5374 			mutex_enter(&port->ref_lock);
5375 			port->ref_cnt--;
5376 			if (port->ref_cnt == 0)
5377 				cv_signal(&port->ref_cv);
5378 			mutex_exit(&port->ref_lock);
5379 		} else {
5380 			RW_EXIT(&plist->lockrw);
5381 			/*
5382 			 * Destination not in FDB.
5383 			 *
5384 			 * If the destination is broadcast or
5385 			 * multicast forward the packet to all
5386 			 * (VNETPORTs, PHYSDEV, LOCALDEV),
5387 			 * except the caller.
5388 			 */
5389 			if (IS_BROADCAST(ehp)) {
5390 				D3(vswp, "%s: BROADCAST pkt", __func__);
5391 				(void) vsw_forward_all(vswp, mp,
5392 								caller, arg);
5393 			} else if (IS_MULTICAST(ehp)) {
5394 				D3(vswp, "%s: MULTICAST pkt", __func__);
5395 				(void) vsw_forward_grp(vswp, mp,
5396 							caller, arg);
5397 			} else {
5398 				/*
5399 				 * If the destination is unicast, and came
5400 				 * from either a logical network device or
5401 				 * the switch itself when it is plumbed, then
5402 				 * send it out on the physical device and also
5403 				 * up the stack if the logical interface is
5404 				 * in promiscious mode.
5405 				 *
5406 				 * NOTE:  The assumption here is that if we
5407 				 * cannot find the destination in our fdb, its
5408 				 * a unicast address, and came from either a
5409 				 * vnet or down the stack (when plumbed) it
5410 				 * must be destinded for an ethernet device
5411 				 * outside our ldoms.
5412 				 */
5413 				if (caller == VSW_VNETPORT) {
5414 					READ_ENTER(&vswp->if_lockrw);
5415 					if (VSW_U_P(vswp->if_state)) {
5416 						RW_EXIT(&vswp->if_lockrw);
5417 						nmp = copymsg(mp);
5418 						if (nmp)
5419 							mac_rx(vswp->if_mh,
5420 								mrh, nmp);
5421 					} else {
5422 						RW_EXIT(&vswp->if_lockrw);
5423 					}
5424 					if ((ret_m = vsw_tx_msg(vswp, mp))
5425 								!= NULL) {
5426 						DERR(vswp, "%s: drop mblks to "
5427 							"phys dev", __func__);
5428 						freemsg(ret_m);
5429 					}
5430 
5431 				} else if (caller == VSW_PHYSDEV) {
5432 					/*
5433 					 * Pkt seen because card in promisc
5434 					 * mode. Send up stack if plumbed in
5435 					 * promisc mode, else drop it.
5436 					 */
5437 					READ_ENTER(&vswp->if_lockrw);
5438 					if (VSW_U_P(vswp->if_state)) {
5439 						RW_EXIT(&vswp->if_lockrw);
5440 						mac_rx(vswp->if_mh, mrh, mp);
5441 					} else {
5442 						RW_EXIT(&vswp->if_lockrw);
5443 						freemsg(mp);
5444 					}
5445 
5446 				} else if (caller == VSW_LOCALDEV) {
5447 					/*
5448 					 * Pkt came down the stack, send out
5449 					 * over physical device.
5450 					 */
5451 					if ((ret_m = vsw_tx_msg(vswp, mp))
5452 								!= NULL) {
5453 						DERR(vswp, "%s: drop mblks to "
5454 							"phys dev", __func__);
5455 						freemsg(ret_m);
5456 					}
5457 				}
5458 			}
5459 		}
5460 	}
5461 	D1(vswp, "%s: exit\n", __func__);
5462 }
5463 
5464 /*
5465  * Switch ethernet frame when in layer 3 mode (i.e. using IP
5466  * layer to do the routing).
5467  *
5468  * There is a large amount of overlap between this function and
5469  * vsw_switch_l2_frame. At some stage we need to revisit and refactor
5470  * both these functions.
5471  */
5472 void
5473 vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
5474 			vsw_port_t *arg, mac_resource_handle_t mrh)
5475 {
5476 	struct ether_header	*ehp;
5477 	vsw_port_t		*port = NULL;
5478 	mblk_t			*bp = NULL;
5479 	vsw_port_list_t		*plist = &vswp->plist;
5480 
5481 	D1(vswp, "%s: enter (caller %d)", __func__, caller);
5482 
5483 	/*
5484 	 * In layer 3 mode should only ever be switching packets
5485 	 * between IP layer and vnet devices. So make sure thats
5486 	 * who is invoking us.
5487 	 */
5488 	if ((caller != VSW_LOCALDEV) && (caller != VSW_VNETPORT)) {
5489 		DERR(vswp, "%s: unexpected caller (%d)", __func__, caller);
5490 		freemsgchain(mp);
5491 		return;
5492 	}
5493 
5494 	/* process the chain of packets */
5495 	bp = mp;
5496 	while (bp) {
5497 		mp = bp;
5498 		bp = bp->b_next;
5499 		mp->b_next = mp->b_prev = NULL;
5500 		ehp = (struct ether_header *)mp->b_rptr;
5501 
5502 		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
5503 			__func__, MBLKSIZE(mp), MBLKL(mp));
5504 
5505 		READ_ENTER(&plist->lockrw);
5506 		port = vsw_lookup_fdb(vswp, ehp);
5507 		if (port) {
5508 			/*
5509 			 * Mark port as in-use.
5510 			 */
5511 			mutex_enter(&port->ref_lock);
5512 			port->ref_cnt++;
5513 			mutex_exit(&port->ref_lock);
5514 			RW_EXIT(&plist->lockrw);
5515 
5516 			D2(vswp, "%s: sending to target port", __func__);
5517 			(void) vsw_portsend(port, mp);
5518 
5519 			/*
5520 			 * Finished with port so decrement ref count and
5521 			 * check if should wake delete thread.
5522 			 */
5523 			mutex_enter(&port->ref_lock);
5524 			port->ref_cnt--;
5525 			if (port->ref_cnt == 0)
5526 				cv_signal(&port->ref_cv);
5527 			mutex_exit(&port->ref_lock);
5528 		} else {
5529 			RW_EXIT(&plist->lockrw);
5530 			/*
5531 			 * Destination not in FDB
5532 			 *
5533 			 * If the destination is broadcast or
5534 			 * multicast forward the packet to all
5535 			 * (VNETPORTs, PHYSDEV, LOCALDEV),
5536 			 * except the caller.
5537 			 */
5538 			if (IS_BROADCAST(ehp)) {
5539 				D2(vswp, "%s: BROADCAST pkt", __func__);
5540 				(void) vsw_forward_all(vswp, mp,
5541 								caller, arg);
5542 			} else if (IS_MULTICAST(ehp)) {
5543 				D2(vswp, "%s: MULTICAST pkt", __func__);
5544 				(void) vsw_forward_grp(vswp, mp,
5545 							caller, arg);
5546 			} else {
5547 				/*
5548 				 * Unicast pkt from vnet that we don't have
5549 				 * an FDB entry for, so must be destinded for
5550 				 * the outside world. Attempt to send up to the
5551 				 * IP layer to allow it to deal with it.
5552 				 */
5553 				if (caller == VSW_VNETPORT) {
5554 					READ_ENTER(&vswp->if_lockrw);
5555 					if (vswp->if_state & VSW_IF_UP) {
5556 						RW_EXIT(&vswp->if_lockrw);
5557 						D2(vswp, "%s: sending up",
5558 							__func__);
5559 						mac_rx(vswp->if_mh, mrh, mp);
5560 					} else {
5561 						RW_EXIT(&vswp->if_lockrw);
5562 						/* Interface down, drop pkt */
5563 						D2(vswp, "%s I/F down",
5564 								__func__);
5565 						freemsg(mp);
5566 					}
5567 				}
5568 			}
5569 		}
5570 	}
5571 
5572 	D1(vswp, "%s: exit", __func__);
5573 }
5574 
5575 /*
5576  * Forward the ethernet frame to all ports (VNETPORTs, PHYSDEV, LOCALDEV),
5577  * except the caller (port on which frame arrived).
5578  */
5579 static int
5580 vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
5581 {
5582 	vsw_port_list_t	*plist = &vswp->plist;
5583 	vsw_port_t	*portp;
5584 	mblk_t		*nmp = NULL;
5585 	mblk_t		*ret_m = NULL;
5586 	int		skip_port = 0;
5587 
5588 	D1(vswp, "vsw_forward_all: enter\n");
5589 
5590 	/*
5591 	 * Broadcast message from inside ldoms so send to outside
5592 	 * world if in either of layer 2 modes.
5593 	 */
5594 	if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) ||
5595 		(vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) &&
5596 		((caller == VSW_LOCALDEV) || (caller == VSW_VNETPORT))) {
5597 
5598 		nmp = dupmsg(mp);
5599 		if (nmp) {
5600 			if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) {
5601 				DERR(vswp, "%s: dropping pkt(s) "
5602 				"consisting of %ld bytes of data for"
5603 				" physical device", __func__, MBLKL(ret_m));
5604 			freemsg(ret_m);
5605 			}
5606 		}
5607 	}
5608 
5609 	if (caller == VSW_VNETPORT)
5610 		skip_port = 1;
5611 
5612 	/*
5613 	 * Broadcast message from other vnet (layer 2 or 3) or outside
5614 	 * world (layer 2 only), send up stack if plumbed.
5615 	 */
5616 	if ((caller == VSW_PHYSDEV) || (caller == VSW_VNETPORT)) {
5617 		READ_ENTER(&vswp->if_lockrw);
5618 		if (vswp->if_state & VSW_IF_UP) {
5619 			RW_EXIT(&vswp->if_lockrw);
5620 			nmp = copymsg(mp);
5621 			if (nmp)
5622 				mac_rx(vswp->if_mh, NULL, nmp);
5623 		} else {
5624 			RW_EXIT(&vswp->if_lockrw);
5625 		}
5626 	}
5627 
5628 	/* send it to all VNETPORTs */
5629 	READ_ENTER(&plist->lockrw);
5630 	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
5631 		D2(vswp, "vsw_forward_all: port %d", portp->p_instance);
5632 		/*
5633 		 * Caution ! - don't reorder these two checks as arg
5634 		 * will be NULL if the caller is PHYSDEV. skip_port is
5635 		 * only set if caller is VNETPORT.
5636 		 */
5637 		if ((skip_port) && (portp == arg))
5638 			continue;
5639 		else {
5640 			nmp = dupmsg(mp);
5641 			if (nmp) {
5642 				(void) vsw_portsend(portp, nmp);
5643 			} else {
5644 				DERR(vswp, "vsw_forward_all: nmp NULL");
5645 			}
5646 		}
5647 	}
5648 	RW_EXIT(&plist->lockrw);
5649 
5650 	freemsg(mp);
5651 
5652 	D1(vswp, "vsw_forward_all: exit\n");
5653 	return (0);
5654 }
5655 
5656 /*
5657  * Forward pkts to any devices or interfaces which have registered
5658  * an interest in them (i.e. multicast groups).
5659  */
5660 static int
5661 vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
5662 {
5663 	struct ether_header	*ehp = (struct ether_header *)mp->b_rptr;
5664 	mfdb_ent_t		*entp = NULL;
5665 	mfdb_ent_t		*tpp = NULL;
5666 	vsw_port_t 		*port;
5667 	uint64_t		key = 0;
5668 	mblk_t			*nmp = NULL;
5669 	mblk_t			*ret_m = NULL;
5670 	boolean_t		check_if = B_TRUE;
5671 
5672 	/*
5673 	 * Convert address to hash table key
5674 	 */
5675 	KEY_HASH(key, ehp->ether_dhost);
5676 
5677 	D1(vswp, "%s: key 0x%llx", __func__, key);
5678 
5679 	/*
5680 	 * If pkt came from either a vnet or down the stack (if we are
5681 	 * plumbed) and we are in layer 2 mode, then we send the pkt out
5682 	 * over the physical adapter, and then check to see if any other
5683 	 * vnets are interested in it.
5684 	 */
5685 	if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) ||
5686 		(vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) &&
5687 		((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV))) {
5688 		nmp = dupmsg(mp);
5689 		if (nmp) {
5690 			if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) {
5691 				DERR(vswp, "%s: dropping pkt(s) "
5692 					"consisting of %ld bytes of "
5693 					"data for physical device",
5694 					__func__, MBLKL(ret_m));
5695 				freemsg(ret_m);
5696 			}
5697 		}
5698 	}
5699 
5700 	READ_ENTER(&vswp->mfdbrw);
5701 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)key,
5702 				(mod_hash_val_t *)&entp) != 0) {
5703 		D3(vswp, "%s: no table entry found for addr 0x%llx",
5704 								__func__, key);
5705 	} else {
5706 		/*
5707 		 * Send to list of devices associated with this address...
5708 		 */
5709 		for (tpp = entp; tpp != NULL; tpp = tpp->nextp) {
5710 
5711 			/* dont send to ourselves */
5712 			if ((caller == VSW_VNETPORT) &&
5713 				(tpp->d_addr == (void *)arg)) {
5714 				port = (vsw_port_t *)tpp->d_addr;
5715 				D3(vswp, "%s: not sending to ourselves"
5716 					" : port %d", __func__,
5717 					port->p_instance);
5718 				continue;
5719 
5720 			} else if ((caller == VSW_LOCALDEV) &&
5721 				(tpp->d_type == VSW_LOCALDEV)) {
5722 				D3(vswp, "%s: not sending back up stack",
5723 					__func__);
5724 				continue;
5725 			}
5726 
5727 			if (tpp->d_type == VSW_VNETPORT) {
5728 				port = (vsw_port_t *)tpp->d_addr;
5729 				D3(vswp, "%s: sending to port %ld for "
5730 					" addr 0x%llx", __func__,
5731 					port->p_instance, key);
5732 
5733 				nmp = dupmsg(mp);
5734 				if (nmp)
5735 					(void) vsw_portsend(port, nmp);
5736 			} else {
5737 				if (vswp->if_state & VSW_IF_UP) {
5738 					nmp = copymsg(mp);
5739 					if (nmp)
5740 						mac_rx(vswp->if_mh, NULL, nmp);
5741 					check_if = B_FALSE;
5742 					D3(vswp, "%s: sending up stack"
5743 						" for addr 0x%llx", __func__,
5744 						key);
5745 				}
5746 			}
5747 		}
5748 	}
5749 
5750 	RW_EXIT(&vswp->mfdbrw);
5751 
5752 	/*
5753 	 * If the pkt came from either a vnet or from physical device,
5754 	 * and if we havent already sent the pkt up the stack then we
5755 	 * check now if we can/should (i.e. the interface is plumbed
5756 	 * and in promisc mode).
5757 	 */
5758 	if ((check_if) &&
5759 		((caller == VSW_VNETPORT) || (caller == VSW_PHYSDEV))) {
5760 		READ_ENTER(&vswp->if_lockrw);
5761 		if (VSW_U_P(vswp->if_state)) {
5762 			RW_EXIT(&vswp->if_lockrw);
5763 			D3(vswp, "%s: (caller %d) finally sending up stack"
5764 				" for addr 0x%llx", __func__, caller, key);
5765 			nmp = copymsg(mp);
5766 			if (nmp)
5767 				mac_rx(vswp->if_mh, NULL, nmp);
5768 		} else {
5769 			RW_EXIT(&vswp->if_lockrw);
5770 		}
5771 	}
5772 
5773 	freemsg(mp);
5774 
5775 	D1(vswp, "%s: exit", __func__);
5776 
5777 	return (0);
5778 }
5779 
5780 /* transmit the packet over the given port */
5781 static int
5782 vsw_portsend(vsw_port_t *port, mblk_t *mp)
5783 {
5784 	vsw_ldc_list_t 	*ldcl = &port->p_ldclist;
5785 	vsw_ldc_t 	*ldcp;
5786 	int		status = 0;
5787 
5788 
5789 	READ_ENTER(&ldcl->lockrw);
5790 	/*
5791 	 * Note for now, we have a single channel.
5792 	 */
5793 	ldcp = ldcl->head;
5794 	if (ldcp == NULL) {
5795 		DERR(port->p_vswp, "vsw_portsend: no ldc: dropping packet\n");
5796 		freemsg(mp);
5797 		RW_EXIT(&ldcl->lockrw);
5798 		return (1);
5799 	}
5800 
5801 	/*
5802 	 * Send the message out using the appropriate
5803 	 * transmit function which will free mblock when it
5804 	 * is finished with it.
5805 	 */
5806 	mutex_enter(&port->tx_lock);
5807 	if (port->transmit != NULL)
5808 		status = (*port->transmit)(ldcp, mp);
5809 	else {
5810 		freemsg(mp);
5811 	}
5812 	mutex_exit(&port->tx_lock);
5813 
5814 	RW_EXIT(&ldcl->lockrw);
5815 
5816 	return (status);
5817 }
5818 
5819 /*
5820  * Send packet out via descriptor ring to a logical device.
5821  */
5822 static int
5823 vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp)
5824 {
5825 	vio_dring_msg_t		dring_pkt;
5826 	dring_info_t		*dp = NULL;
5827 	vsw_private_desc_t	*priv_desc = NULL;
5828 	vnet_public_desc_t	*pub = NULL;
5829 	vsw_t			*vswp = ldcp->ldc_vswp;
5830 	mblk_t			*bp;
5831 	size_t			n, size;
5832 	caddr_t			bufp;
5833 	int			idx;
5834 	int			status = LDC_TX_SUCCESS;
5835 
5836 	D1(vswp, "%s(%lld): enter\n", __func__, ldcp->ldc_id);
5837 
5838 	/* TODO: make test a macro */
5839 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
5840 		(ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
5841 		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
5842 			"packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
5843 			ldcp->lane_out.lstate);
5844 		freemsg(mp);
5845 		return (LDC_TX_FAILURE);
5846 	}
5847 
5848 	/*
5849 	 * Note - using first ring only, this may change
5850 	 * in the future.
5851 	 */
5852 	if ((dp = ldcp->lane_out.dringp) == NULL) {
5853 		DERR(vswp, "%s(%lld): no dring for outbound lane on"
5854 			" channel %d", __func__, ldcp->ldc_id, ldcp->ldc_id);
5855 		freemsg(mp);
5856 		return (LDC_TX_FAILURE);
5857 	}
5858 
5859 	size = msgsize(mp);
5860 	if (size > (size_t)ETHERMAX) {
5861 		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
5862 		    ldcp->ldc_id, size);
5863 		freemsg(mp);
5864 		return (LDC_TX_FAILURE);
5865 	}
5866 
5867 	/*
5868 	 * Find a free descriptor
5869 	 *
5870 	 * Note: for the moment we are assuming that we will only
5871 	 * have one dring going from the switch to each of its
5872 	 * peers. This may change in the future.
5873 	 */
5874 	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
5875 		D2(vswp, "%s(%lld): no descriptor available for ring "
5876 			"at 0x%llx", __func__, ldcp->ldc_id, dp);
5877 
5878 		/* nothing more we can do */
5879 		status = LDC_TX_NORESOURCES;
5880 		goto vsw_dringsend_free_exit;
5881 	} else {
5882 		D2(vswp, "%s(%lld): free private descriptor found at pos "
5883 			"%ld addr 0x%llx\n", __func__, ldcp->ldc_id, idx,
5884 			priv_desc);
5885 	}
5886 
5887 	/* copy data into the descriptor */
5888 	bufp = priv_desc->datap;
5889 	bufp += VNET_IPALIGN;
5890 	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
5891 		n = MBLKL(bp);
5892 		bcopy(bp->b_rptr, bufp, n);
5893 		bufp += n;
5894 	}
5895 
5896 	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
5897 
5898 	pub = priv_desc->descp;
5899 	pub->nbytes = priv_desc->datalen;
5900 
5901 	mutex_enter(&priv_desc->dstate_lock);
5902 	pub->hdr.dstate = VIO_DESC_READY;
5903 	mutex_exit(&priv_desc->dstate_lock);
5904 
5905 	/*
5906 	 * Determine whether or not we need to send a message to our
5907 	 * peer prompting them to read our newly updated descriptor(s).
5908 	 */
5909 	mutex_enter(&dp->restart_lock);
5910 	if (dp->restart_reqd) {
5911 		dp->restart_reqd = B_FALSE;
5912 		mutex_exit(&dp->restart_lock);
5913 
5914 		/*
5915 		 * Send a vio_dring_msg to peer to prompt them to read
5916 		 * the updated descriptor ring.
5917 		 */
5918 		dring_pkt.tag.vio_msgtype = VIO_TYPE_DATA;
5919 		dring_pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
5920 		dring_pkt.tag.vio_subtype_env = VIO_DRING_DATA;
5921 		dring_pkt.tag.vio_sid = ldcp->local_session;
5922 
5923 		/* Note - for now using first ring */
5924 		dring_pkt.dring_ident = dp->ident;
5925 
5926 		mutex_enter(&ldcp->lane_out.seq_lock);
5927 		dring_pkt.seq_num = ldcp->lane_out.seq_num++;
5928 		mutex_exit(&ldcp->lane_out.seq_lock);
5929 
5930 		/*
5931 		 * If last_ack_recv is -1 then we know we've not
5932 		 * received any ack's yet, so this must be the first
5933 		 * msg sent, so set the start to the begining of the ring.
5934 		 */
5935 		mutex_enter(&dp->dlock);
5936 		if (dp->last_ack_recv == -1) {
5937 			dring_pkt.start_idx = 0;
5938 		} else {
5939 			dring_pkt.start_idx = (dp->last_ack_recv + 1) %
5940 						dp->num_descriptors;
5941 		}
5942 		dring_pkt.end_idx = -1;
5943 		mutex_exit(&dp->dlock);
5944 
5945 		D3(vswp, "%s(%lld): dring 0x%llx : ident 0x%llx\n", __func__,
5946 			ldcp->ldc_id, dp, dring_pkt.dring_ident);
5947 		D3(vswp, "%s(%lld): start %lld : end %lld : seq %lld\n",
5948 			__func__, ldcp->ldc_id, dring_pkt.start_idx,
5949 			dring_pkt.end_idx, dring_pkt.seq_num);
5950 
5951 		vsw_send_msg(ldcp, (void *)&dring_pkt,
5952 						sizeof (vio_dring_msg_t));
5953 	} else {
5954 		mutex_exit(&dp->restart_lock);
5955 		D2(vswp, "%s(%lld): updating descp %d", __func__,
5956 			ldcp->ldc_id, idx);
5957 	}
5958 
5959 vsw_dringsend_free_exit:
5960 
5961 	/* free the message block */
5962 	freemsg(mp);
5963 
5964 	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
5965 	return (status);
5966 }
5967 
5968 /*
5969  * Send an in-band descriptor message over ldc.
5970  */
5971 static int
5972 vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp)
5973 {
5974 	vsw_t			*vswp = ldcp->ldc_vswp;
5975 	vio_ibnd_desc_t		ibnd_msg;
5976 	vsw_private_desc_t	*priv_desc = NULL;
5977 	dring_info_t		*dp = NULL;
5978 	size_t			n, size = 0;
5979 	caddr_t			bufp;
5980 	mblk_t			*bp;
5981 	int			idx, i;
5982 	int			status = LDC_TX_SUCCESS;
5983 	static int		warn_msg = 1;
5984 
5985 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
5986 
5987 	ASSERT(mp != NULL);
5988 
5989 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
5990 		(ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
5991 		DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt",
5992 			__func__, ldcp->ldc_id, ldcp->ldc_status,
5993 			ldcp->lane_out.lstate);
5994 		freemsg(mp);
5995 		return (LDC_TX_FAILURE);
5996 	}
5997 
5998 	/*
5999 	 * only expect single dring to exist, which we use
6000 	 * as an internal buffer, rather than a transfer channel.
6001 	 */
6002 	if ((dp = ldcp->lane_out.dringp) == NULL) {
6003 		DERR(vswp, "%s(%lld): no dring for outbound lane",
6004 			__func__, ldcp->ldc_id);
6005 		DERR(vswp, "%s(%lld) status(%d) state (0x%llx)",
6006 			__func__, ldcp->ldc_id, ldcp->ldc_status,
6007 			ldcp->lane_out.lstate);
6008 		freemsg(mp);
6009 		return (LDC_TX_FAILURE);
6010 	}
6011 
6012 	size = msgsize(mp);
6013 	if (size > (size_t)ETHERMAX) {
6014 		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
6015 		    ldcp->ldc_id, size);
6016 		freemsg(mp);
6017 		return (LDC_TX_FAILURE);
6018 	}
6019 
6020 	/*
6021 	 * Find a free descriptor in our buffer ring
6022 	 */
6023 	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
6024 		if (warn_msg) {
6025 			DERR(vswp, "%s(%lld): no descriptor available for ring "
6026 			"at 0x%llx", __func__, ldcp->ldc_id, dp);
6027 			warn_msg = 0;
6028 		}
6029 
6030 		/* nothing more we can do */
6031 		status = LDC_TX_NORESOURCES;
6032 		goto vsw_descrsend_free_exit;
6033 	} else {
6034 		D2(vswp, "%s(%lld): free private descriptor found at pos "
6035 			"%ld addr 0x%x\n", __func__, ldcp->ldc_id, idx,
6036 			priv_desc);
6037 		warn_msg = 1;
6038 	}
6039 
6040 	/* copy data into the descriptor */
6041 	bufp = priv_desc->datap;
6042 	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
6043 		n = MBLKL(bp);
6044 		bcopy(bp->b_rptr, bufp, n);
6045 		bufp += n;
6046 	}
6047 
6048 	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
6049 
6050 	/* create and send the in-band descp msg */
6051 	ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA;
6052 	ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO;
6053 	ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA;
6054 	ibnd_msg.hdr.tag.vio_sid = ldcp->local_session;
6055 
6056 	mutex_enter(&ldcp->lane_out.seq_lock);
6057 	ibnd_msg.hdr.seq_num = ldcp->lane_out.seq_num++;
6058 	mutex_exit(&ldcp->lane_out.seq_lock);
6059 
6060 	/*
6061 	 * Copy the mem cookies describing the data from the
6062 	 * private region of the descriptor ring into the inband
6063 	 * descriptor.
6064 	 */
6065 	for (i = 0; i < priv_desc->ncookies; i++) {
6066 		bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i],
6067 			sizeof (ldc_mem_cookie_t));
6068 	}
6069 
6070 	ibnd_msg.hdr.desc_handle = idx;
6071 	ibnd_msg.ncookies = priv_desc->ncookies;
6072 	ibnd_msg.nbytes = size;
6073 
6074 	vsw_send_msg(ldcp, (void *)&ibnd_msg, sizeof (vio_ibnd_desc_t));
6075 
6076 vsw_descrsend_free_exit:
6077 
6078 	/* free the allocated message blocks */
6079 	freemsg(mp);
6080 
6081 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
6082 	return (status);
6083 }
6084 
6085 static void
6086 vsw_send_ver(vsw_ldc_t *ldcp)
6087 {
6088 	vsw_t		*vswp = ldcp->ldc_vswp;
6089 	lane_t		*lp = &ldcp->lane_out;
6090 	vio_ver_msg_t	ver_msg;
6091 
6092 	D1(vswp, "%s enter", __func__);
6093 
6094 	ver_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
6095 	ver_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
6096 	ver_msg.tag.vio_subtype_env = VIO_VER_INFO;
6097 	ver_msg.tag.vio_sid = ldcp->local_session;
6098 
6099 	ver_msg.ver_major = vsw_versions[0].ver_major;
6100 	ver_msg.ver_minor = vsw_versions[0].ver_minor;
6101 	ver_msg.dev_class = VDEV_NETWORK_SWITCH;
6102 
6103 	lp->lstate |= VSW_VER_INFO_SENT;
6104 	lp->ver_major = ver_msg.ver_major;
6105 	lp->ver_minor = ver_msg.ver_minor;
6106 
6107 	DUMP_TAG(ver_msg.tag);
6108 
6109 	vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t));
6110 
6111 	D1(vswp, "%s (%d): exit", __func__, ldcp->ldc_id);
6112 }
6113 
6114 static void
6115 vsw_send_attr(vsw_ldc_t *ldcp)
6116 {
6117 	vsw_t			*vswp = ldcp->ldc_vswp;
6118 	lane_t			*lp = &ldcp->lane_out;
6119 	vnet_attr_msg_t		attr_msg;
6120 
6121 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
6122 
6123 	/*
6124 	 * Subtype is set to INFO by default
6125 	 */
6126 	attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
6127 	attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
6128 	attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO;
6129 	attr_msg.tag.vio_sid = ldcp->local_session;
6130 
6131 	/* payload copied from default settings for lane */
6132 	attr_msg.mtu = lp->mtu;
6133 	attr_msg.addr_type = lp->addr_type;
6134 	attr_msg.xfer_mode = lp->xfer_mode;
6135 	attr_msg.ack_freq = lp->xfer_mode;
6136 
6137 	READ_ENTER(&vswp->if_lockrw);
6138 	bcopy(&(vswp->if_addr), &(attr_msg.addr), ETHERADDRL);
6139 	RW_EXIT(&vswp->if_lockrw);
6140 
6141 	ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT;
6142 
6143 	DUMP_TAG(attr_msg.tag);
6144 
6145 	vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t));
6146 
6147 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
6148 }
6149 
6150 /*
6151  * Create dring info msg (which also results in the creation of
6152  * a dring).
6153  */
6154 static vio_dring_reg_msg_t *
6155 vsw_create_dring_info_pkt(vsw_ldc_t *ldcp)
6156 {
6157 	vio_dring_reg_msg_t	*mp;
6158 	dring_info_t		*dp;
6159 	vsw_t			*vswp = ldcp->ldc_vswp;
6160 
6161 	D1(vswp, "vsw_create_dring_info_pkt enter\n");
6162 
6163 	/*
6164 	 * If we can't create a dring, obviously no point sending
6165 	 * a message.
6166 	 */
6167 	if ((dp = vsw_create_dring(ldcp)) == NULL)
6168 		return (NULL);
6169 
6170 	mp = kmem_zalloc(sizeof (vio_dring_reg_msg_t), KM_SLEEP);
6171 
6172 	mp->tag.vio_msgtype = VIO_TYPE_CTRL;
6173 	mp->tag.vio_subtype = VIO_SUBTYPE_INFO;
6174 	mp->tag.vio_subtype_env = VIO_DRING_REG;
6175 	mp->tag.vio_sid = ldcp->local_session;
6176 
6177 	/* payload */
6178 	mp->num_descriptors = dp->num_descriptors;
6179 	mp->descriptor_size = dp->descriptor_size;
6180 	mp->options = dp->options;
6181 	mp->ncookies = dp->ncookies;
6182 	bcopy(&dp->cookie[0], &mp->cookie[0], sizeof (ldc_mem_cookie_t));
6183 
6184 	mp->dring_ident = 0;
6185 
6186 	D1(vswp, "vsw_create_dring_info_pkt exit\n");
6187 
6188 	return (mp);
6189 }
6190 
6191 static void
6192 vsw_send_dring_info(vsw_ldc_t *ldcp)
6193 {
6194 	vio_dring_reg_msg_t	*dring_msg;
6195 	vsw_t			*vswp = ldcp->ldc_vswp;
6196 
6197 	D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id);
6198 
6199 	dring_msg = vsw_create_dring_info_pkt(ldcp);
6200 	if (dring_msg == NULL) {
6201 		cmn_err(CE_WARN, "vsw_send_dring_info: error creating msg");
6202 		return;
6203 	}
6204 
6205 	ldcp->lane_out.lstate |= VSW_DRING_INFO_SENT;
6206 
6207 	DUMP_TAG_PTR((vio_msg_tag_t *)dring_msg);
6208 
6209 	vsw_send_msg(ldcp, dring_msg,
6210 		sizeof (vio_dring_reg_msg_t));
6211 
6212 	kmem_free(dring_msg, sizeof (vio_dring_reg_msg_t));
6213 
6214 	D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id);
6215 }
6216 
6217 static void
6218 vsw_send_rdx(vsw_ldc_t *ldcp)
6219 {
6220 	vsw_t		*vswp = ldcp->ldc_vswp;
6221 	vio_rdx_msg_t	rdx_msg;
6222 
6223 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
6224 
6225 	rdx_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
6226 	rdx_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
6227 	rdx_msg.tag.vio_subtype_env = VIO_RDX;
6228 	rdx_msg.tag.vio_sid = ldcp->local_session;
6229 
6230 	ldcp->lane_out.lstate |= VSW_RDX_INFO_SENT;
6231 
6232 	DUMP_TAG(rdx_msg.tag);
6233 
6234 	vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t));
6235 
6236 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
6237 }
6238 
6239 /*
6240  * Generic routine to send message out over ldc channel.
6241  */
6242 static void
6243 vsw_send_msg(vsw_ldc_t *ldcp, void *msgp, int size)
6244 {
6245 	int		rv;
6246 	size_t		msglen = size;
6247 	vio_msg_tag_t	*tag = (vio_msg_tag_t *)msgp;
6248 	vsw_t		*vswp = ldcp->ldc_vswp;
6249 
6250 	D1(vswp, "vsw_send_msg (%lld) enter : sending %d bytes",
6251 			ldcp->ldc_id, size);
6252 
6253 	D2(vswp, "send_msg: type 0x%llx", tag->vio_msgtype);
6254 	D2(vswp, "send_msg: stype 0x%llx", tag->vio_subtype);
6255 	D2(vswp, "send_msg: senv 0x%llx", tag->vio_subtype_env);
6256 
6257 	mutex_enter(&ldcp->ldc_txlock);
6258 	do {
6259 		msglen = size;
6260 		rv = ldc_write(ldcp->ldc_handle, (caddr_t)msgp, &msglen);
6261 	} while (rv == EWOULDBLOCK && --vsw_wretries > 0);
6262 
6263 	mutex_exit(&ldcp->ldc_txlock);
6264 
6265 	if ((rv != 0) || (msglen != size)) {
6266 		DERR(vswp, "vsw_send_msg:ldc_write failed: chan(%lld) "
6267 			"rv(%d) size (%d) msglen(%d)\n", ldcp->ldc_id,
6268 			rv, size, msglen);
6269 	}
6270 
6271 	D1(vswp, "vsw_send_msg (%lld) exit : sent %d bytes",
6272 			ldcp->ldc_id, msglen);
6273 }
6274 
6275 /*
6276  * Add an entry into FDB, for the given mac address and port_id.
6277  * Returns 0 on success, 1 on failure.
6278  *
6279  * Lock protecting FDB must be held by calling process.
6280  */
6281 static int
6282 vsw_add_fdb(vsw_t *vswp, vsw_port_t *port)
6283 {
6284 	uint64_t	addr = 0;
6285 
6286 	D1(vswp, "%s: enter", __func__);
6287 
6288 	KEY_HASH(addr, port->p_macaddr);
6289 
6290 	D2(vswp, "%s: key = 0x%llx", __func__, addr);
6291 
6292 	/*
6293 	 * Note: duplicate keys will be rejected by mod_hash.
6294 	 */
6295 	if (mod_hash_insert(vswp->fdb, (mod_hash_key_t)addr,
6296 				(mod_hash_val_t)port) != 0) {
6297 		DERR(vswp, "%s: unable to add entry into fdb.", __func__);
6298 		return (1);
6299 	}
6300 
6301 	D1(vswp, "%s: exit", __func__);
6302 	return (0);
6303 }
6304 
6305 /*
6306  * Remove an entry from FDB.
6307  * Returns 0 on success, 1 on failure.
6308  */
6309 static int
6310 vsw_del_fdb(vsw_t *vswp, vsw_port_t *port)
6311 {
6312 	uint64_t	addr = 0;
6313 
6314 	D1(vswp, "%s: enter", __func__);
6315 
6316 	KEY_HASH(addr, port->p_macaddr);
6317 
6318 	D2(vswp, "%s: key = 0x%llx", __func__, addr);
6319 
6320 	(void) mod_hash_destroy(vswp->fdb, (mod_hash_val_t)addr);
6321 
6322 	D1(vswp, "%s: enter", __func__);
6323 
6324 	return (0);
6325 }
6326 
6327 /*
6328  * Search fdb for a given mac address.
6329  * Returns pointer to the entry if found, else returns NULL.
6330  */
6331 static vsw_port_t *
6332 vsw_lookup_fdb(vsw_t *vswp, struct ether_header *ehp)
6333 {
6334 	uint64_t	key = 0;
6335 	vsw_port_t	*port = NULL;
6336 
6337 	D1(vswp, "%s: enter", __func__);
6338 
6339 	KEY_HASH(key, ehp->ether_dhost);
6340 
6341 	D2(vswp, "%s: key = 0x%llx", __func__, key);
6342 
6343 	if (mod_hash_find(vswp->fdb, (mod_hash_key_t)key,
6344 				(mod_hash_val_t *)&port) != 0) {
6345 		return (NULL);
6346 	}
6347 
6348 	D1(vswp, "%s: exit", __func__);
6349 
6350 	return (port);
6351 }
6352 
6353 /*
6354  * Add or remove multicast address(es).
6355  *
6356  * Returns 0 on success, 1 on failure.
6357  */
6358 static int
6359 vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port)
6360 {
6361 	mcst_addr_t		*mcst_p = NULL;
6362 	vsw_t			*vswp = port->p_vswp;
6363 	uint64_t		addr = 0x0;
6364 	int			i, ret;
6365 
6366 	D1(vswp, "%s: enter", __func__);
6367 
6368 	D2(vswp, "%s: %d addresses", __func__, mcst_pkt->count);
6369 
6370 	if (vswp->mh == NULL)
6371 		return (1);
6372 
6373 	for (i = 0; i < mcst_pkt->count; i++) {
6374 		/*
6375 		 * Convert address into form that can be used
6376 		 * as hash table key.
6377 		 */
6378 		KEY_HASH(addr, mcst_pkt->mca[i]);
6379 
6380 		/*
6381 		 * Add or delete the specified address/port combination.
6382 		 */
6383 		if (mcst_pkt->set == 0x1) {
6384 			D3(vswp, "%s: adding multicast address 0x%llx for "
6385 				"port %ld", __func__, addr, port->p_instance);
6386 			if (vsw_add_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
6387 				/*
6388 				 * Update the list of multicast
6389 				 * addresses contained within the
6390 				 * port structure to include this new
6391 				 * one.
6392 				 */
6393 				mcst_p = kmem_alloc(sizeof (mcst_addr_t),
6394 								KM_NOSLEEP);
6395 				if (mcst_p == NULL) {
6396 					DERR(vswp, "%s: unable to alloc mem",
6397 						__func__);
6398 					return (1);
6399 				}
6400 
6401 				mcst_p->nextp = NULL;
6402 				mcst_p->addr = addr;
6403 
6404 				mutex_enter(&port->mca_lock);
6405 				mcst_p->nextp = port->mcap;
6406 				port->mcap = mcst_p;
6407 				mutex_exit(&port->mca_lock);
6408 
6409 				/*
6410 				 * Program the address into HW. If the addr
6411 				 * has already been programmed then the MAC
6412 				 * just increments a ref counter (which is
6413 				 * used when the address is being deleted)
6414 				 */
6415 				ret = mac_multicst_add(vswp->mh,
6416 						(uchar_t *)&mcst_pkt->mca[i]);
6417 				if (ret) {
6418 					cmn_err(CE_WARN, "!unable to add "
6419 						"multicast address");
6420 					(void) vsw_del_mcst(vswp, VSW_VNETPORT,
6421 						addr, port);
6422 					vsw_del_addr(VSW_VNETPORT, port, addr);
6423 					return (ret);
6424 				}
6425 
6426 			} else {
6427 				DERR(vswp, "%s: error adding multicast "
6428 					"address 0x%llx for port %ld",
6429 					__func__, addr, port->p_instance);
6430 				return (1);
6431 			}
6432 		} else {
6433 			/*
6434 			 * Delete an entry from the multicast hash
6435 			 * table and update the address list
6436 			 * appropriately.
6437 			 */
6438 			if (vsw_del_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
6439 				D3(vswp, "%s: deleting multicast address "
6440 					"0x%llx for port %ld", __func__, addr,
6441 					port->p_instance);
6442 
6443 				vsw_del_addr(VSW_VNETPORT, port, addr);
6444 
6445 				/*
6446 				 * Remove the address from HW. The address
6447 				 * will actually only be removed once the ref
6448 				 * count within the MAC layer has dropped to
6449 				 * zero. I.e. we can safely call this fn even
6450 				 * if other ports are interested in this
6451 				 * address.
6452 				 */
6453 				(void) mac_multicst_remove(vswp->mh,
6454 						(uchar_t *)&mcst_pkt->mca[i]);
6455 
6456 			} else {
6457 				DERR(vswp, "%s: error deleting multicast "
6458 					"addr 0x%llx for port %ld",
6459 					__func__, addr, port->p_instance);
6460 				return (1);
6461 			}
6462 		}
6463 	}
6464 	D1(vswp, "%s: exit", __func__);
6465 	return (0);
6466 }
6467 
6468 /*
6469  * Add a new multicast entry.
6470  *
6471  * Search hash table based on address. If match found then
6472  * update associated val (which is chain of ports), otherwise
6473  * create new key/val (addr/port) pair and insert into table.
6474  */
6475 static int
6476 vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
6477 {
6478 	int		dup = 0;
6479 	int		rv = 0;
6480 	mfdb_ent_t	*ment = NULL;
6481 	mfdb_ent_t	*tmp_ent = NULL;
6482 	mfdb_ent_t	*new_ent = NULL;
6483 	void		*tgt = NULL;
6484 
6485 	if (devtype == VSW_VNETPORT) {
6486 		/*
6487 		 * Being invoked from a vnet.
6488 		 */
6489 		ASSERT(arg != NULL);
6490 		tgt = arg;
6491 		D2(NULL, "%s: port %d : address 0x%llx", __func__,
6492 			((vsw_port_t *)arg)->p_instance, addr);
6493 	} else {
6494 		/*
6495 		 * We are being invoked via the m_multicst mac entry
6496 		 * point.
6497 		 */
6498 		D2(NULL, "%s: address 0x%llx", __func__, addr);
6499 		tgt = (void *)vswp;
6500 	}
6501 
6502 	WRITE_ENTER(&vswp->mfdbrw);
6503 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
6504 				(mod_hash_val_t *)&ment) != 0) {
6505 
6506 		/* address not currently in table */
6507 		ment = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
6508 		ment->d_addr = (void *)tgt;
6509 		ment->d_type = devtype;
6510 		ment->nextp = NULL;
6511 
6512 		if (mod_hash_insert(vswp->mfdb, (mod_hash_key_t)addr,
6513 			(mod_hash_val_t)ment) != 0) {
6514 			DERR(vswp, "%s: hash table insertion failed", __func__);
6515 			kmem_free(ment, sizeof (mfdb_ent_t));
6516 			rv = 1;
6517 		} else {
6518 			D2(vswp, "%s: added initial entry for 0x%llx to "
6519 				"table", __func__, addr);
6520 		}
6521 	} else {
6522 		/*
6523 		 * Address in table. Check to see if specified port
6524 		 * is already associated with the address. If not add
6525 		 * it now.
6526 		 */
6527 		tmp_ent = ment;
6528 		while (tmp_ent != NULL) {
6529 			if (tmp_ent->d_addr == (void *)tgt) {
6530 				if (devtype == VSW_VNETPORT) {
6531 					DERR(vswp, "%s: duplicate port entry "
6532 						"found for portid %ld and key "
6533 						"0x%llx", __func__,
6534 						((vsw_port_t *)arg)->p_instance,
6535 						addr);
6536 				} else {
6537 					DERR(vswp, "%s: duplicate entry found"
6538 						"for key 0x%llx",
6539 						__func__, addr);
6540 				}
6541 				rv = 1;
6542 				dup = 1;
6543 				break;
6544 			}
6545 			tmp_ent = tmp_ent->nextp;
6546 		}
6547 
6548 		/*
6549 		 * Port not on list so add it to end now.
6550 		 */
6551 		if (0 == dup) {
6552 			D2(vswp, "%s: added entry for 0x%llx to table",
6553 				__func__, addr);
6554 			new_ent = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
6555 			new_ent->d_addr = (void *)tgt;
6556 			new_ent->d_type = devtype;
6557 			new_ent->nextp = NULL;
6558 
6559 			tmp_ent = ment;
6560 			while (tmp_ent->nextp != NULL)
6561 				tmp_ent = tmp_ent->nextp;
6562 
6563 			tmp_ent->nextp = new_ent;
6564 		}
6565 	}
6566 
6567 	RW_EXIT(&vswp->mfdbrw);
6568 	return (rv);
6569 }
6570 
6571 /*
6572  * Remove a multicast entry from the hashtable.
6573  *
6574  * Search hash table based on address. If match found, scan
6575  * list of ports associated with address. If specified port
6576  * found remove it from list.
6577  */
6578 static int
6579 vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
6580 {
6581 	mfdb_ent_t	*ment = NULL;
6582 	mfdb_ent_t	*curr_p, *prev_p;
6583 	void		*tgt = NULL;
6584 
6585 	D1(vswp, "%s: enter", __func__);
6586 
6587 	if (devtype == VSW_VNETPORT) {
6588 		tgt = (vsw_port_t *)arg;
6589 		D2(vswp, "%s: removing port %d from mFDB for address"
6590 			" 0x%llx", __func__, ((vsw_port_t *)tgt)->p_instance,
6591 			addr);
6592 	} else {
6593 		D2(vswp, "%s: removing entry", __func__);
6594 		tgt = (void *)vswp;
6595 	}
6596 
6597 	WRITE_ENTER(&vswp->mfdbrw);
6598 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
6599 				(mod_hash_val_t *)&ment) != 0) {
6600 		D2(vswp, "%s: address 0x%llx not in table", __func__, addr);
6601 		RW_EXIT(&vswp->mfdbrw);
6602 		return (1);
6603 	}
6604 
6605 	prev_p = curr_p = ment;
6606 
6607 	while (curr_p != NULL) {
6608 		if (curr_p->d_addr == (void *)tgt) {
6609 			if (devtype == VSW_VNETPORT) {
6610 				D2(vswp, "%s: port %d found", __func__,
6611 					((vsw_port_t *)tgt)->p_instance);
6612 			} else {
6613 				D2(vswp, "%s: instance found", __func__);
6614 			}
6615 
6616 			if (prev_p == curr_p) {
6617 				/*
6618 				 * head of list, if no other element is in
6619 				 * list then destroy this entry, otherwise
6620 				 * just replace it with updated value.
6621 				 */
6622 				ment = curr_p->nextp;
6623 				kmem_free(curr_p, sizeof (mfdb_ent_t));
6624 				if (ment == NULL) {
6625 					(void) mod_hash_destroy(vswp->mfdb,
6626 							(mod_hash_val_t)addr);
6627 				} else {
6628 					(void) mod_hash_replace(vswp->mfdb,
6629 							(mod_hash_key_t)addr,
6630 							(mod_hash_val_t)ment);
6631 				}
6632 			} else {
6633 				/*
6634 				 * Not head of list, no need to do
6635 				 * replacement, just adjust list pointers.
6636 				 */
6637 				prev_p->nextp = curr_p->nextp;
6638 				kmem_free(curr_p, sizeof (mfdb_ent_t));
6639 			}
6640 			break;
6641 		}
6642 
6643 		prev_p = curr_p;
6644 		curr_p = curr_p->nextp;
6645 	}
6646 
6647 	RW_EXIT(&vswp->mfdbrw);
6648 
6649 	D1(vswp, "%s: exit", __func__);
6650 
6651 	return (0);
6652 }
6653 
6654 /*
6655  * Port is being deleted, but has registered an interest in one
6656  * or more multicast groups. Using the list of addresses maintained
6657  * within the port structure find the appropriate entry in the hash
6658  * table and remove this port from the list of interested ports.
6659  */
6660 static void
6661 vsw_del_mcst_port(vsw_port_t *port)
6662 {
6663 	mcst_addr_t	*mcst_p = NULL;
6664 	vsw_t		*vswp = port->p_vswp;
6665 
6666 	D1(vswp, "%s: enter", __func__);
6667 
6668 	mutex_enter(&port->mca_lock);
6669 	while (port->mcap != NULL) {
6670 		(void) vsw_del_mcst(vswp, VSW_VNETPORT,
6671 					port->mcap->addr, port);
6672 
6673 		mcst_p = port->mcap->nextp;
6674 		kmem_free(port->mcap, sizeof (mcst_addr_t));
6675 		port->mcap = mcst_p;
6676 	}
6677 	mutex_exit(&port->mca_lock);
6678 
6679 	D1(vswp, "%s: exit", __func__);
6680 }
6681 
6682 /*
6683  * This vsw instance is detaching, but has registered an interest in one
6684  * or more multicast groups. Using the list of addresses maintained
6685  * within the vsw structure find the appropriate entry in the hash
6686  * table and remove this instance from the list of interested ports.
6687  */
6688 static void
6689 vsw_del_mcst_vsw(vsw_t *vswp)
6690 {
6691 	mcst_addr_t	*next_p = NULL;
6692 
6693 	D1(vswp, "%s: enter", __func__);
6694 
6695 	mutex_enter(&vswp->mca_lock);
6696 
6697 	while (vswp->mcap != NULL) {
6698 		DERR(vswp, "%s: deleting addr 0x%llx",
6699 			__func__, vswp->mcap->addr);
6700 		(void) vsw_del_mcst(vswp, VSW_LOCALDEV,
6701 				vswp->mcap->addr, NULL);
6702 
6703 		next_p = vswp->mcap->nextp;
6704 		kmem_free(vswp->mcap, sizeof (mcst_addr_t));
6705 		vswp->mcap = next_p;
6706 	}
6707 
6708 	vswp->mcap = NULL;
6709 	mutex_exit(&vswp->mca_lock);
6710 
6711 	D1(vswp, "%s: exit", __func__);
6712 }
6713 
6714 
6715 /*
6716  * Remove the specified address from the list of address maintained
6717  * in this port node.
6718  */
6719 static void
6720 vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr)
6721 {
6722 	vsw_t		*vswp = NULL;
6723 	vsw_port_t	*port = NULL;
6724 	mcst_addr_t	*prev_p = NULL;
6725 	mcst_addr_t	*curr_p = NULL;
6726 
6727 	D1(NULL, "%s: enter : devtype %d : addr 0x%llx",
6728 		__func__, devtype, addr);
6729 
6730 	if (devtype == VSW_VNETPORT) {
6731 		port = (vsw_port_t *)arg;
6732 		mutex_enter(&port->mca_lock);
6733 		prev_p = curr_p = port->mcap;
6734 	} else {
6735 		vswp = (vsw_t *)arg;
6736 		mutex_enter(&vswp->mca_lock);
6737 		prev_p = curr_p = vswp->mcap;
6738 	}
6739 
6740 	while (curr_p != NULL) {
6741 		if (curr_p->addr == addr) {
6742 			D2(NULL, "%s: address found", __func__);
6743 			/* match found */
6744 			if (prev_p == curr_p) {
6745 				/* list head */
6746 				if (devtype == VSW_VNETPORT)
6747 					port->mcap = curr_p->nextp;
6748 				else
6749 					vswp->mcap = curr_p->nextp;
6750 			} else {
6751 				prev_p->nextp = curr_p->nextp;
6752 			}
6753 			kmem_free(curr_p, sizeof (mcst_addr_t));
6754 			break;
6755 		} else {
6756 			prev_p = curr_p;
6757 			curr_p = curr_p->nextp;
6758 		}
6759 	}
6760 
6761 	if (devtype == VSW_VNETPORT)
6762 		mutex_exit(&port->mca_lock);
6763 	else
6764 		mutex_exit(&vswp->mca_lock);
6765 
6766 	D1(NULL, "%s: exit", __func__);
6767 }
6768 
6769 /*
6770  * Creates a descriptor ring (dring) and links it into the
6771  * link of outbound drings for this channel.
6772  *
6773  * Returns NULL if creation failed.
6774  */
6775 static dring_info_t *
6776 vsw_create_dring(vsw_ldc_t *ldcp)
6777 {
6778 	vsw_private_desc_t	*priv_addr = NULL;
6779 	vsw_t			*vswp = ldcp->ldc_vswp;
6780 	ldc_mem_info_t		minfo;
6781 	dring_info_t		*dp, *tp;
6782 	int			i;
6783 
6784 	dp = (dring_info_t *)kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
6785 
6786 	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
6787 
6788 	/* create public section of ring */
6789 	if ((ldc_mem_dring_create(VSW_RING_NUM_EL,
6790 			VSW_PUB_SIZE, &dp->handle)) != 0) {
6791 
6792 		DERR(vswp, "vsw_create_dring(%lld): ldc dring create "
6793 			"failed", ldcp->ldc_id);
6794 		goto create_fail_exit;
6795 	}
6796 
6797 	ASSERT(dp->handle != NULL);
6798 
6799 	/*
6800 	 * Get the base address of the public section of the ring.
6801 	 */
6802 	if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
6803 		DERR(vswp, "vsw_create_dring(%lld): dring info failed\n",
6804 			ldcp->ldc_id);
6805 		goto dring_fail_exit;
6806 	} else {
6807 		ASSERT(minfo.vaddr != 0);
6808 		dp->pub_addr = minfo.vaddr;
6809 	}
6810 
6811 	dp->num_descriptors = VSW_RING_NUM_EL;
6812 	dp->descriptor_size = VSW_PUB_SIZE;
6813 	dp->options = VIO_TX_DRING;
6814 	dp->ncookies = 1;	/* guaranteed by ldc */
6815 
6816 	/*
6817 	 * create private portion of ring
6818 	 */
6819 	dp->priv_addr = (vsw_private_desc_t *)kmem_zalloc(
6820 		(sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL), KM_SLEEP);
6821 
6822 	if (vsw_setup_ring(ldcp, dp)) {
6823 		DERR(vswp, "%s: unable to setup ring", __func__);
6824 		goto dring_fail_exit;
6825 	}
6826 
6827 	/* haven't used any descriptors yet */
6828 	dp->end_idx = 0;
6829 	dp->last_ack_recv = -1;
6830 
6831 	/* bind dring to the channel */
6832 	if ((ldc_mem_dring_bind(ldcp->ldc_handle, dp->handle,
6833 		LDC_SHADOW_MAP, LDC_MEM_RW,
6834 		&dp->cookie[0], &dp->ncookies)) != 0) {
6835 		DERR(vswp, "vsw_create_dring: unable to bind to channel "
6836 			"%lld", ldcp->ldc_id);
6837 		goto dring_fail_exit;
6838 	}
6839 
6840 	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
6841 	dp->restart_reqd = B_TRUE;
6842 
6843 	/*
6844 	 * Only ever create rings for outgoing lane. Link it onto
6845 	 * end of list.
6846 	 */
6847 	if (ldcp->lane_out.dringp == NULL) {
6848 		D2(vswp, "vsw_create_dring: adding first outbound ring");
6849 		ldcp->lane_out.dringp = dp;
6850 	} else {
6851 		tp = ldcp->lane_out.dringp;
6852 		while (tp->next != NULL)
6853 			tp = tp->next;
6854 
6855 		tp->next = dp;
6856 	}
6857 
6858 	return (dp);
6859 
6860 dring_fail_exit:
6861 	(void) ldc_mem_dring_destroy(dp->handle);
6862 
6863 create_fail_exit:
6864 	if (dp->priv_addr != NULL) {
6865 		priv_addr = dp->priv_addr;
6866 		for (i = 0; i < VSW_RING_NUM_EL; i++) {
6867 			if (priv_addr->memhandle != NULL)
6868 				(void) ldc_mem_free_handle(
6869 						priv_addr->memhandle);
6870 			priv_addr++;
6871 		}
6872 		kmem_free(dp->priv_addr,
6873 			(sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL));
6874 	}
6875 	mutex_destroy(&dp->dlock);
6876 
6877 	kmem_free(dp, sizeof (dring_info_t));
6878 	return (NULL);
6879 }
6880 
6881 /*
6882  * Create a ring consisting of just a private portion and link
6883  * it into the list of rings for the outbound lane.
6884  *
6885  * These type of rings are used primarily for temporary data
6886  * storage (i.e. as data buffers).
6887  */
6888 void
6889 vsw_create_privring(vsw_ldc_t *ldcp)
6890 {
6891 	dring_info_t		*dp, *tp;
6892 	vsw_t			*vswp = ldcp->ldc_vswp;
6893 
6894 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
6895 
6896 	dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
6897 
6898 	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
6899 
6900 	/* no public section */
6901 	dp->pub_addr = NULL;
6902 
6903 	dp->priv_addr = kmem_zalloc((sizeof (vsw_private_desc_t) *
6904 					VSW_RING_NUM_EL), KM_SLEEP);
6905 
6906 	dp->num_descriptors = VSW_RING_NUM_EL;
6907 
6908 	if (vsw_setup_ring(ldcp, dp)) {
6909 		DERR(vswp, "%s: setup of ring failed", __func__);
6910 		kmem_free(dp->priv_addr,
6911 			(sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL));
6912 		mutex_destroy(&dp->dlock);
6913 		kmem_free(dp, sizeof (dring_info_t));
6914 		return;
6915 	}
6916 
6917 	/* haven't used any descriptors yet */
6918 	dp->end_idx = 0;
6919 
6920 	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
6921 	dp->restart_reqd = B_TRUE;
6922 
6923 	/*
6924 	 * Only ever create rings for outgoing lane. Link it onto
6925 	 * end of list.
6926 	 */
6927 	if (ldcp->lane_out.dringp == NULL) {
6928 		D2(vswp, "%s: adding first outbound privring", __func__);
6929 		ldcp->lane_out.dringp = dp;
6930 	} else {
6931 		tp = ldcp->lane_out.dringp;
6932 		while (tp->next != NULL)
6933 			tp = tp->next;
6934 
6935 		tp->next = dp;
6936 	}
6937 
6938 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
6939 }
6940 
6941 /*
6942  * Setup the descriptors in the dring. Returns 0 on success, 1 on
6943  * failure.
6944  */
6945 int
6946 vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp)
6947 {
6948 	vnet_public_desc_t	*pub_addr = NULL;
6949 	vsw_private_desc_t	*priv_addr = NULL;
6950 	vsw_t			*vswp = ldcp->ldc_vswp;
6951 	uint64_t		*tmpp;
6952 	uint64_t		offset = 0;
6953 	uint32_t		ncookies = 0;
6954 	static char		*name = "vsw_setup_ring";
6955 	int			i, j, nc, rv;
6956 
6957 	priv_addr = dp->priv_addr;
6958 	pub_addr = dp->pub_addr;
6959 
6960 	/* public section may be null but private should never be */
6961 	ASSERT(priv_addr != NULL);
6962 
6963 	/*
6964 	 * Allocate the region of memory which will be used to hold
6965 	 * the data the descriptors will refer to.
6966 	 */
6967 	dp->data_sz = (VSW_RING_NUM_EL * VSW_RING_EL_DATA_SZ);
6968 	dp->data_addr = kmem_alloc(dp->data_sz, KM_SLEEP);
6969 
6970 	D2(vswp, "%s: allocated %lld bytes at 0x%llx\n", name,
6971 		dp->data_sz, dp->data_addr);
6972 
6973 	tmpp = (uint64_t *)dp->data_addr;
6974 	offset = VSW_RING_EL_DATA_SZ / sizeof (tmpp);
6975 
6976 	/*
6977 	 * Initialise some of the private and public (if they exist)
6978 	 * descriptor fields.
6979 	 */
6980 	for (i = 0; i < VSW_RING_NUM_EL; i++) {
6981 		mutex_init(&priv_addr->dstate_lock, NULL, MUTEX_DRIVER, NULL);
6982 
6983 		if ((ldc_mem_alloc_handle(ldcp->ldc_handle,
6984 			&priv_addr->memhandle)) != 0) {
6985 			DERR(vswp, "%s: alloc mem handle failed", name);
6986 			goto setup_ring_cleanup;
6987 		}
6988 
6989 		priv_addr->datap = (void *)tmpp;
6990 
6991 		rv = ldc_mem_bind_handle(priv_addr->memhandle,
6992 			(caddr_t)priv_addr->datap, VSW_RING_EL_DATA_SZ,
6993 			LDC_SHADOW_MAP, LDC_MEM_R|LDC_MEM_W,
6994 			&(priv_addr->memcookie[0]), &ncookies);
6995 		if (rv != 0) {
6996 			DERR(vswp, "%s(%lld): ldc_mem_bind_handle failed "
6997 				"(rv %d)", name, ldcp->ldc_id, rv);
6998 			goto setup_ring_cleanup;
6999 		}
7000 		priv_addr->bound = 1;
7001 
7002 		D2(vswp, "%s: %d: memcookie 0 : addr 0x%llx : size 0x%llx",
7003 			name, i, priv_addr->memcookie[0].addr,
7004 			priv_addr->memcookie[0].size);
7005 
7006 		if (ncookies >= (uint32_t)(VSW_MAX_COOKIES + 1)) {
7007 			DERR(vswp, "%s(%lld) ldc_mem_bind_handle returned "
7008 				"invalid num of cookies (%d) for size 0x%llx",
7009 				name, ldcp->ldc_id, ncookies,
7010 				VSW_RING_EL_DATA_SZ);
7011 
7012 			goto setup_ring_cleanup;
7013 		} else {
7014 			for (j = 1; j < ncookies; j++) {
7015 				rv = ldc_mem_nextcookie(priv_addr->memhandle,
7016 					&(priv_addr->memcookie[j]));
7017 				if (rv != 0) {
7018 					DERR(vswp, "%s: ldc_mem_nextcookie "
7019 						"failed rv (%d)", name, rv);
7020 					goto setup_ring_cleanup;
7021 				}
7022 				D3(vswp, "%s: memcookie %d : addr 0x%llx : "
7023 					"size 0x%llx", name, j,
7024 					priv_addr->memcookie[j].addr,
7025 					priv_addr->memcookie[j].size);
7026 			}
7027 
7028 		}
7029 		priv_addr->ncookies = ncookies;
7030 		priv_addr->dstate = VIO_DESC_FREE;
7031 
7032 		if (pub_addr != NULL) {
7033 
7034 			/* link pub and private sides */
7035 			priv_addr->descp = pub_addr;
7036 
7037 			pub_addr->ncookies = priv_addr->ncookies;
7038 
7039 			for (nc = 0; nc < pub_addr->ncookies; nc++) {
7040 				bcopy(&priv_addr->memcookie[nc],
7041 					&pub_addr->memcookie[nc],
7042 					sizeof (ldc_mem_cookie_t));
7043 			}
7044 
7045 			pub_addr->hdr.dstate = VIO_DESC_FREE;
7046 			pub_addr++;
7047 		}
7048 
7049 		/*
7050 		 * move to next element in the dring and the next
7051 		 * position in the data buffer.
7052 		 */
7053 		priv_addr++;
7054 		tmpp += offset;
7055 	}
7056 
7057 	return (0);
7058 
7059 setup_ring_cleanup:
7060 	priv_addr = dp->priv_addr;
7061 
7062 	for (j = 0; j < i; j++) {
7063 		(void) ldc_mem_unbind_handle(priv_addr->memhandle);
7064 		(void) ldc_mem_free_handle(priv_addr->memhandle);
7065 
7066 		mutex_destroy(&priv_addr->dstate_lock);
7067 
7068 		priv_addr++;
7069 	}
7070 	kmem_free(dp->data_addr, dp->data_sz);
7071 
7072 	return (1);
7073 }
7074 
7075 /*
7076  * Searches the private section of a ring for a free descriptor,
7077  * starting at the location of the last free descriptor found
7078  * previously.
7079  *
7080  * Returns 0 if free descriptor is available, and updates state
7081  * of private descriptor to VIO_DESC_READY,  otherwise returns 1.
7082  *
7083  * FUTURE: might need to return contiguous range of descriptors
7084  * as dring info msg assumes all will be contiguous.
7085  */
7086 static int
7087 vsw_dring_find_free_desc(dring_info_t *dringp,
7088 		vsw_private_desc_t **priv_p, int *idx)
7089 {
7090 	vsw_private_desc_t	*addr = NULL;
7091 	int			num = VSW_RING_NUM_EL;
7092 	int			ret = 1;
7093 
7094 	D1(NULL, "%s enter\n", __func__);
7095 
7096 	ASSERT(dringp->priv_addr != NULL);
7097 
7098 	D2(NULL, "%s: searching ring, dringp 0x%llx : start pos %lld",
7099 			__func__, dringp, dringp->end_idx);
7100 
7101 	addr = (vsw_private_desc_t *)dringp->priv_addr + dringp->end_idx;
7102 
7103 	mutex_enter(&addr->dstate_lock);
7104 	if (addr->dstate == VIO_DESC_FREE) {
7105 		addr->dstate = VIO_DESC_READY;
7106 		*priv_p = addr;
7107 		*idx = dringp->end_idx;
7108 		dringp->end_idx = (dringp->end_idx + 1) % num;
7109 		ret = 0;
7110 
7111 	}
7112 	mutex_exit(&addr->dstate_lock);
7113 
7114 	/* ring full */
7115 	if (ret == 1) {
7116 		D2(NULL, "%s: no desp free: started at %d", __func__,
7117 			dringp->end_idx);
7118 	}
7119 
7120 	D1(NULL, "%s: exit\n", __func__);
7121 
7122 	return (ret);
7123 }
7124 
7125 /*
7126  * Map from a dring identifier to the ring itself. Returns
7127  * pointer to ring or NULL if no match found.
7128  */
7129 static dring_info_t *
7130 vsw_ident2dring(lane_t *lane, uint64_t ident)
7131 {
7132 	dring_info_t	*dp = NULL;
7133 
7134 	if ((dp = lane->dringp) == NULL) {
7135 		return (NULL);
7136 	} else {
7137 		if (dp->ident == ident)
7138 			return (dp);
7139 
7140 		while (dp != NULL) {
7141 			if (dp->ident == ident)
7142 				break;
7143 			dp = dp->next;
7144 		}
7145 	}
7146 
7147 	return (dp);
7148 }
7149 
7150 /*
7151  * Set the default lane attributes. These are copied into
7152  * the attr msg we send to our peer. If they are not acceptable
7153  * then (currently) the handshake ends.
7154  */
7155 static void
7156 vsw_set_lane_attr(vsw_t *vswp, lane_t *lp)
7157 {
7158 	bzero(lp, sizeof (lane_t));
7159 
7160 	READ_ENTER(&vswp->if_lockrw);
7161 	ether_copy(&(vswp->if_addr), &(lp->addr));
7162 	RW_EXIT(&vswp->if_lockrw);
7163 
7164 	lp->mtu = VSW_MTU;
7165 	lp->addr_type = ADDR_TYPE_MAC;
7166 	lp->xfer_mode = VIO_DRING_MODE;
7167 	lp->ack_freq = 0;	/* for shared mode */
7168 
7169 	mutex_enter(&lp->seq_lock);
7170 	lp->seq_num = VNET_ISS;
7171 	mutex_exit(&lp->seq_lock);
7172 }
7173 
7174 /*
7175  * Verify that the attributes are acceptable.
7176  *
7177  * FUTURE: If some attributes are not acceptable, change them
7178  * our desired values.
7179  */
7180 static int
7181 vsw_check_attr(vnet_attr_msg_t *pkt, vsw_port_t *port)
7182 {
7183 	int	ret = 0;
7184 
7185 	D1(NULL, "vsw_check_attr enter\n");
7186 
7187 	/*
7188 	 * Note we currently only support in-band descriptors
7189 	 * and descriptor rings, not packet based transfer (VIO_PKT_MODE)
7190 	 */
7191 	if ((pkt->xfer_mode != VIO_DESC_MODE) &&
7192 			(pkt->xfer_mode != VIO_DRING_MODE)) {
7193 		D2(NULL, "vsw_check_attr: unknown mode %x\n",
7194 			pkt->xfer_mode);
7195 		ret = 1;
7196 	}
7197 
7198 	/* Only support MAC addresses at moment. */
7199 	if ((pkt->addr_type != ADDR_TYPE_MAC) || (pkt->addr == 0)) {
7200 		D2(NULL, "vsw_check_attr: invalid addr_type %x, "
7201 			"or address 0x%llx\n", pkt->addr_type,
7202 			pkt->addr);
7203 		ret = 1;
7204 	}
7205 
7206 	/*
7207 	 * MAC address supplied by device should match that stored
7208 	 * in the vsw-port OBP node. Need to decide what to do if they
7209 	 * don't match, for the moment just warn but don't fail.
7210 	 */
7211 	if (bcmp(&pkt->addr, &port->p_macaddr, ETHERADDRL) != 0) {
7212 		DERR(NULL, "vsw_check_attr: device supplied address "
7213 			"0x%llx doesn't match node address 0x%llx\n",
7214 			pkt->addr, port->p_macaddr);
7215 	}
7216 
7217 	/*
7218 	 * Ack freq only makes sense in pkt mode, in shared
7219 	 * mode the ring descriptors say whether or not to
7220 	 * send back an ACK.
7221 	 */
7222 	if ((pkt->xfer_mode == VIO_DRING_MODE) &&
7223 				(pkt->ack_freq > 0)) {
7224 		D2(NULL, "vsw_check_attr: non zero ack freq "
7225 			" in SHM mode\n");
7226 		ret = 1;
7227 	}
7228 
7229 	/*
7230 	 * Note: for the moment we only support ETHER
7231 	 * frames. This may change in the future.
7232 	 */
7233 	if ((pkt->mtu > VSW_MTU) || (pkt->mtu <= 0)) {
7234 		D2(NULL, "vsw_check_attr: invalid MTU (0x%llx)\n",
7235 			pkt->mtu);
7236 		ret = 1;
7237 	}
7238 
7239 	D1(NULL, "vsw_check_attr exit\n");
7240 
7241 	return (ret);
7242 }
7243 
7244 /*
7245  * Returns 1 if there is a problem, 0 otherwise.
7246  */
7247 static int
7248 vsw_check_dring_info(vio_dring_reg_msg_t *pkt)
7249 {
7250 	_NOTE(ARGUNUSED(pkt))
7251 
7252 	int	ret = 0;
7253 
7254 	D1(NULL, "vsw_check_dring_info enter\n");
7255 
7256 	if ((pkt->num_descriptors == 0) ||
7257 		(pkt->descriptor_size == 0) ||
7258 		(pkt->ncookies != 1)) {
7259 		DERR(NULL, "vsw_check_dring_info: invalid dring msg");
7260 		ret = 1;
7261 	}
7262 
7263 	D1(NULL, "vsw_check_dring_info exit\n");
7264 
7265 	return (ret);
7266 }
7267 
7268 /*
7269  * Returns 1 if two memory cookies match. Otherwise returns 0.
7270  */
7271 static int
7272 vsw_mem_cookie_match(ldc_mem_cookie_t *m1, ldc_mem_cookie_t *m2)
7273 {
7274 	if ((m1->addr != m2->addr) ||
7275 		(m2->size != m2->size)) {
7276 		return (0);
7277 	} else {
7278 		return (1);
7279 	}
7280 }
7281 
7282 /*
7283  * Returns 1 if ring described in reg message matches that
7284  * described by dring_info structure. Otherwise returns 0.
7285  */
7286 static int
7287 vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg)
7288 {
7289 	if ((msg->descriptor_size != dp->descriptor_size) ||
7290 		(msg->num_descriptors != dp->num_descriptors) ||
7291 		(msg->ncookies != dp->ncookies) ||
7292 		!(vsw_mem_cookie_match(&msg->cookie[0], &dp->cookie[0]))) {
7293 		return (0);
7294 	} else {
7295 		return (1);
7296 	}
7297 
7298 }
7299 
/*
 * Format the six octets at `a' into `ebuf' as colon separated hex
 * (no zero padding, lower case) and return ebuf.
 *
 * The longest possible result is "ff:ff:ff:ff:ff:ff", so ebuf must
 * have room for at least 18 bytes including the terminating NUL; the
 * size is not checked here.
 */
static caddr_t
vsw_print_ethaddr(uint8_t *a, char *ebuf)
{
	const uint8_t	*octet = a;

	(void) sprintf(ebuf, "%x:%x:%x:%x:%x:%x",
	    octet[0], octet[1], octet[2],
	    octet[3], octet[4], octet[5]);

	return ((caddr_t)ebuf);
}
7307 
7308 /*
7309  * Reset and free all the resources associated with
7310  * the channel.
7311  */
7312 static void
7313 vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir)
7314 {
7315 	dring_info_t		*dp, *dpp;
7316 	lane_t			*lp = NULL;
7317 	int			rv = 0;
7318 
7319 	ASSERT(ldcp != NULL);
7320 
7321 	D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id);
7322 
7323 	if (dir == INBOUND) {
7324 		D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane"
7325 			" of channel %lld", __func__, ldcp->ldc_id);
7326 		lp = &ldcp->lane_in;
7327 	} else {
7328 		D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane"
7329 			" of channel %lld", __func__, ldcp->ldc_id);
7330 		lp = &ldcp->lane_out;
7331 	}
7332 
7333 	lp->lstate = VSW_LANE_INACTIV;
7334 	mutex_enter(&lp->seq_lock);
7335 	lp->seq_num = VNET_ISS;
7336 	mutex_exit(&lp->seq_lock);
7337 	if (lp->dringp) {
7338 		if (dir == INBOUND) {
7339 			dp = lp->dringp;
7340 			while (dp != NULL) {
7341 				dpp = dp->next;
7342 				if (dp->handle != NULL)
7343 					(void) ldc_mem_dring_unmap(dp->handle);
7344 				kmem_free(dp, sizeof (dring_info_t));
7345 				dp = dpp;
7346 			}
7347 		} else {
7348 			/*
7349 			 * unbind, destroy exported dring, free dring struct
7350 			 */
7351 			dp = lp->dringp;
7352 			rv = vsw_free_ring(dp);
7353 		}
7354 		if (rv == 0) {
7355 			lp->dringp = NULL;
7356 		}
7357 	}
7358 
7359 	D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id);
7360 }
7361 
7362 /*
7363  * Free ring and all associated resources.
7364  */
7365 static int
7366 vsw_free_ring(dring_info_t *dp)
7367 {
7368 	vsw_private_desc_t	*paddr = NULL;
7369 	dring_info_t		*dpp;
7370 	int			i, rv = 1;
7371 
7372 	while (dp != NULL) {
7373 		mutex_enter(&dp->dlock);
7374 		dpp = dp->next;
7375 		if (dp->priv_addr != NULL) {
7376 			/*
7377 			 * First unbind and free the memory handles
7378 			 * stored in each descriptor within the ring.
7379 			 */
7380 			for (i = 0; i < VSW_RING_NUM_EL; i++) {
7381 				paddr = (vsw_private_desc_t *)
7382 						dp->priv_addr + i;
7383 				if (paddr->memhandle != NULL) {
7384 					if (paddr->bound == 1) {
7385 						rv = ldc_mem_unbind_handle(
7386 							paddr->memhandle);
7387 
7388 						if (rv != 0) {
7389 							DERR(NULL, "error "
7390 							"unbinding handle for "
7391 							"ring 0x%llx at pos %d",
7392 							dp, i);
7393 							mutex_exit(&dp->dlock);
7394 							return (rv);
7395 						}
7396 						paddr->bound = 0;
7397 					}
7398 
7399 					rv = ldc_mem_free_handle(
7400 							paddr->memhandle);
7401 					if (rv != 0) {
7402 						DERR(NULL, "error freeing "
7403 							"handle for ring "
7404 							"0x%llx at pos %d",
7405 							dp, i);
7406 						mutex_exit(&dp->dlock);
7407 						return (rv);
7408 					}
7409 					paddr->memhandle = NULL;
7410 				}
7411 				mutex_destroy(&paddr->dstate_lock);
7412 			}
7413 			kmem_free(dp->priv_addr, (sizeof (vsw_private_desc_t)
7414 					* VSW_RING_NUM_EL));
7415 		}
7416 
7417 		/*
7418 		 * Now unbind and destroy the ring itself.
7419 		 */
7420 		if (dp->handle != NULL) {
7421 			(void) ldc_mem_dring_unbind(dp->handle);
7422 			(void) ldc_mem_dring_destroy(dp->handle);
7423 		}
7424 
7425 		if (dp->data_addr != NULL) {
7426 			kmem_free(dp->data_addr, dp->data_sz);
7427 		}
7428 
7429 		mutex_exit(&dp->dlock);
7430 		mutex_destroy(&dp->dlock);
7431 		mutex_destroy(&dp->restart_lock);
7432 		kmem_free(dp, sizeof (dring_info_t));
7433 
7434 		dp = dpp;
7435 	}
7436 	return (0);
7437 }
7438 
7439 /*
7440  * Debugging routines
7441  */
7442 static void
7443 display_state(void)
7444 {
7445 	vsw_t		*vswp;
7446 	vsw_port_list_t	*plist;
7447 	vsw_port_t 	*port;
7448 	vsw_ldc_list_t	*ldcl;
7449 	vsw_ldc_t 	*ldcp;
7450 
7451 	cmn_err(CE_NOTE, "***** system state *****");
7452 
7453 	for (vswp = vsw_head; vswp; vswp = vswp->next) {
7454 		plist = &vswp->plist;
7455 		READ_ENTER(&plist->lockrw);
7456 		cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n",
7457 			vswp->instance, plist->num_ports);
7458 
7459 		for (port = plist->head; port != NULL; port = port->p_next) {
7460 			ldcl = &port->p_ldclist;
7461 			cmn_err(CE_CONT, "port %d : %d ldcs attached\n",
7462 				port->p_instance, ldcl->num_ldcs);
7463 			READ_ENTER(&ldcl->lockrw);
7464 			ldcp = ldcl->head;
7465 			for (; ldcp != NULL; ldcp = ldcp->ldc_next) {
7466 				cmn_err(CE_CONT, "chan %lu : dev %d : "
7467 					"status %d : phase %u\n",
7468 					ldcp->ldc_id, ldcp->dev_class,
7469 					ldcp->ldc_status, ldcp->hphase);
7470 				cmn_err(CE_CONT, "chan %lu : lsession %lu : "
7471 					"psession %lu\n",
7472 					ldcp->ldc_id,
7473 					ldcp->local_session,
7474 					ldcp->peer_session);
7475 
7476 				cmn_err(CE_CONT, "Inbound lane:\n");
7477 				display_lane(&ldcp->lane_in);
7478 				cmn_err(CE_CONT, "Outbound lane:\n");
7479 				display_lane(&ldcp->lane_out);
7480 			}
7481 			RW_EXIT(&ldcl->lockrw);
7482 		}
7483 		RW_EXIT(&plist->lockrw);
7484 	}
7485 	cmn_err(CE_NOTE, "***** system state *****");
7486 }
7487 
7488 static void
7489 display_lane(lane_t *lp)
7490 {
7491 	dring_info_t	*drp;
7492 
7493 	cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n",
7494 		lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu);
7495 	cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n",
7496 		lp->addr_type, lp->addr, lp->xfer_mode);
7497 	cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp);
7498 
7499 	cmn_err(CE_CONT, "Dring info:\n");
7500 	for (drp = lp->dringp; drp != NULL; drp = drp->next) {
7501 		cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n",
7502 			drp->num_descriptors, drp->descriptor_size);
7503 		cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->handle);
7504 		cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n",
7505 			(uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr);
7506 		cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n",
7507 			drp->ident, drp->end_idx);
7508 		display_ring(drp);
7509 	}
7510 }
7511 
7512 static void
7513 display_ring(dring_info_t *dringp)
7514 {
7515 	uint64_t		i;
7516 	uint64_t		priv_count = 0;
7517 	uint64_t		pub_count = 0;
7518 	vnet_public_desc_t	*pub_addr = NULL;
7519 	vsw_private_desc_t	*priv_addr = NULL;
7520 
7521 	for (i = 0; i < VSW_RING_NUM_EL; i++) {
7522 		if (dringp->pub_addr != NULL) {
7523 			pub_addr = (vnet_public_desc_t *)dringp->pub_addr + i;
7524 
7525 			if (pub_addr->hdr.dstate == VIO_DESC_FREE)
7526 				pub_count++;
7527 		}
7528 
7529 		if (dringp->priv_addr != NULL) {
7530 			priv_addr =
7531 				(vsw_private_desc_t *)dringp->priv_addr + i;
7532 
7533 			if (priv_addr->dstate == VIO_DESC_FREE)
7534 				priv_count++;
7535 		}
7536 	}
7537 	cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n",
7538 			i, priv_count, pub_count);
7539 }
7540 
7541 static void
7542 dump_flags(uint64_t state)
7543 {
7544 	int	i;
7545 
7546 	typedef struct flag_name {
7547 		int	flag_val;
7548 		char	*flag_name;
7549 	} flag_name_t;
7550 
7551 	flag_name_t	flags[] = {
7552 		VSW_VER_INFO_SENT, "VSW_VER_INFO_SENT",
7553 		VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV",
7554 		VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV",
7555 		VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT",
7556 		VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV",
7557 		VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT",
7558 		VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT",
7559 		VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV",
7560 		VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT",
7561 		VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV",
7562 		VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT",
7563 		VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV",
7564 		VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT",
7565 		VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV",
7566 		VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT",
7567 		VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV",
7568 		VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT",
7569 		VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV",
7570 		VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT",
7571 		VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV",
7572 		VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT",
7573 		VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV",
7574 		VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT",
7575 		VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV",
7576 		VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT",
7577 		VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV",
7578 		VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT",
7579 		VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV",
7580 		VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT",
7581 		VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV",
7582 		VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE"};
7583 
7584 	DERR(NULL, "DUMP_FLAGS: %llx\n", state);
7585 	for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) {
7586 		if (state & flags[i].flag_val)
7587 			DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name);
7588 	}
7589 }
7590