xref: /titanic_50/usr/src/uts/sun4v/io/vsw.c (revision d6b3210d4ed626e58d72b0c439ecba06617f963e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/errno.h>
31 #include <sys/debug.h>
32 #include <sys/time.h>
33 #include <sys/sysmacros.h>
34 #include <sys/systm.h>
35 #include <sys/user.h>
36 #include <sys/stropts.h>
37 #include <sys/stream.h>
38 #include <sys/strlog.h>
39 #include <sys/strsubr.h>
40 #include <sys/cmn_err.h>
41 #include <sys/cpu.h>
42 #include <sys/kmem.h>
43 #include <sys/conf.h>
44 #include <sys/ddi.h>
45 #include <sys/sunddi.h>
46 #include <sys/ksynch.h>
47 #include <sys/stat.h>
48 #include <sys/kstat.h>
49 #include <sys/vtrace.h>
50 #include <sys/strsun.h>
51 #include <sys/dlpi.h>
52 #include <sys/ethernet.h>
53 #include <net/if.h>
54 #include <sys/varargs.h>
55 #include <sys/machsystm.h>
56 #include <sys/modctl.h>
57 #include <sys/modhash.h>
58 #include <sys/mac.h>
59 #include <sys/mac_ether.h>
60 #include <sys/taskq.h>
61 #include <sys/note.h>
62 #include <sys/mach_descrip.h>
63 #include <sys/mac.h>
64 #include <sys/mdeg.h>
65 #include <sys/ldc.h>
66 #include <sys/vsw_fdb.h>
67 #include <sys/vsw.h>
68 #include <sys/vio_mailbox.h>
69 #include <sys/vnet_mailbox.h>
70 #include <sys/vnet_common.h>
71 #include <sys/vio_util.h>
72 #include <sys/sdt.h>
73 
74 /*
75  * Function prototypes.
76  */
77 static	int vsw_attach(dev_info_t *, ddi_attach_cmd_t);
78 static	int vsw_detach(dev_info_t *, ddi_detach_cmd_t);
79 static	int vsw_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
80 static	int vsw_get_md_physname(vsw_t *, md_t *, mde_cookie_t, char *);
81 static	int vsw_get_md_smodes(vsw_t *, md_t *, mde_cookie_t, uint8_t *, int *);
82 static	int vsw_get_physaddr(vsw_t *);
83 static	int vsw_setup_switching(vsw_t *);
84 static	int vsw_setup_layer2(vsw_t *);
85 static	int vsw_setup_layer3(vsw_t *);
86 
87 /* MAC Ring table functions. */
88 static void vsw_mac_ring_tbl_init(vsw_t *vswp);
89 static void vsw_mac_ring_tbl_destroy(vsw_t *vswp);
90 static void vsw_queue_worker(vsw_mac_ring_t *rrp);
91 static void vsw_queue_stop(vsw_queue_t *vqp);
92 static vsw_queue_t *vsw_queue_create();
93 static void vsw_queue_destroy(vsw_queue_t *vqp);
94 
95 /* MAC layer routines */
96 static mac_resource_handle_t vsw_mac_ring_add_cb(void *arg,
97 		mac_resource_t *mrp);
98 static	int vsw_get_hw_maddr(vsw_t *);
99 static	int vsw_set_hw(vsw_t *, vsw_port_t *);
100 static	int vsw_set_hw_promisc(vsw_t *, vsw_port_t *);
101 static	int vsw_unset_hw(vsw_t *, vsw_port_t *);
102 static	int vsw_unset_hw_promisc(vsw_t *, vsw_port_t *);
103 static	int vsw_reconfig_hw(vsw_t *);
104 static int vsw_mac_attach(vsw_t *vswp);
105 static void vsw_mac_detach(vsw_t *vswp);
106 
107 static void vsw_rx_queue_cb(void *, mac_resource_handle_t, mblk_t *);
108 static void vsw_rx_cb(void *, mac_resource_handle_t, mblk_t *);
109 static mblk_t *vsw_tx_msg(vsw_t *, mblk_t *);
110 static int vsw_mac_register(vsw_t *);
111 static int vsw_mac_unregister(vsw_t *);
112 static int vsw_m_stat(void *, uint_t, uint64_t *);
113 static void vsw_m_stop(void *arg);
114 static int vsw_m_start(void *arg);
115 static int vsw_m_unicst(void *arg, const uint8_t *);
116 static int vsw_m_multicst(void *arg, boolean_t, const uint8_t *);
117 static int vsw_m_promisc(void *arg, boolean_t);
118 static mblk_t *vsw_m_tx(void *arg, mblk_t *);
119 
120 /* MDEG routines */
121 static	int vsw_mdeg_register(vsw_t *vswp);
122 static	void vsw_mdeg_unregister(vsw_t *vswp);
123 static	int vsw_mdeg_cb(void *cb_argp, mdeg_result_t *);
124 static	int vsw_port_mdeg_cb(void *cb_argp, mdeg_result_t *);
125 static	void vsw_get_initial_md_properties(vsw_t *vswp, md_t *, mde_cookie_t);
126 static	void vsw_update_md_prop(vsw_t *, md_t *, mde_cookie_t);
127 
128 /* Port add/deletion routines */
129 static	int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
130 static	int vsw_port_attach(vsw_t *vswp, int p_instance,
131 	uint64_t *ldcids, int nids, struct ether_addr *macaddr);
132 static	int vsw_detach_ports(vsw_t *vswp);
133 static	int vsw_port_detach(vsw_t *vswp, int p_instance);
134 static	int vsw_port_delete(vsw_port_t *port);
135 static	int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id);
136 static	int vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id);
137 static	int vsw_init_ldcs(vsw_port_t *port);
138 static	int vsw_uninit_ldcs(vsw_port_t *port);
139 static	int vsw_ldc_init(vsw_ldc_t *ldcp);
140 static	int vsw_ldc_uninit(vsw_ldc_t *ldcp);
141 static	int vsw_drain_ldcs(vsw_port_t *port);
142 static	int vsw_drain_port_taskq(vsw_port_t *port);
143 static	void vsw_marker_task(void *);
144 static	vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);
145 static	int vsw_plist_del_node(vsw_t *, vsw_port_t *port);
146 
147 /* Interrupt routines */
148 static	uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg);
149 
150 /* Handshake routines */
151 static	void vsw_ldc_reinit(vsw_ldc_t *);
152 static	void vsw_process_conn_evt(vsw_ldc_t *, uint16_t);
153 static	void vsw_conn_task(void *);
154 static	int vsw_check_flag(vsw_ldc_t *, int, uint64_t);
155 static	void vsw_next_milestone(vsw_ldc_t *);
156 static	int vsw_supported_version(vio_ver_msg_t *);
157 
158 /* Data processing routines */
159 static void vsw_process_pkt(void *);
160 static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t);
161 static void vsw_process_ctrl_pkt(void *);
162 static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *);
163 static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *);
164 static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *);
165 static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *);
166 static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *);
167 static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *);
168 static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t);
169 static void vsw_process_data_dring_pkt(vsw_ldc_t *, void *);
170 static void vsw_process_data_raw_pkt(vsw_ldc_t *, void *);
171 static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *);
172 static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t);
173 
174 /* Switching/data transmit routines */
175 static	void vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
176 	    vsw_port_t *port, mac_resource_handle_t);
177 static	void vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
178 	    vsw_port_t *port, mac_resource_handle_t);
179 static	int vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller,
180 	    vsw_port_t *port);
181 static	int vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller,
182 	    vsw_port_t *port);
183 static	int vsw_portsend(vsw_port_t *, mblk_t *);
184 static	int vsw_dringsend(vsw_ldc_t *, mblk_t *);
185 static	int vsw_descrsend(vsw_ldc_t *, mblk_t *);
186 
187 /* Packet creation routines */
188 static void vsw_send_ver(void *);
189 static void vsw_send_attr(vsw_ldc_t *);
190 static vio_dring_reg_msg_t *vsw_create_dring_info_pkt(vsw_ldc_t *);
191 static void vsw_send_dring_info(vsw_ldc_t *);
192 static void vsw_send_rdx(vsw_ldc_t *);
193 
194 static int vsw_send_msg(vsw_ldc_t *, void *, int, boolean_t);
195 
196 /* Forwarding database (FDB) routines */
197 static	int vsw_add_fdb(vsw_t *vswp, vsw_port_t *port);
198 static	int vsw_del_fdb(vsw_t *vswp, vsw_port_t *port);
199 static	vsw_port_t *vsw_lookup_fdb(vsw_t *vswp, struct ether_header *);
200 static	int vsw_add_rem_mcst(vnet_mcast_msg_t *, vsw_port_t *);
201 static	int vsw_add_mcst(vsw_t *, uint8_t, uint64_t, void *);
202 static	int vsw_del_mcst(vsw_t *, uint8_t, uint64_t, void *);
203 static	void vsw_del_addr(uint8_t, void *, uint64_t);
204 static	void vsw_del_mcst_port(vsw_port_t *);
205 static	void vsw_del_mcst_vsw(vsw_t *);
206 
207 /* Dring routines */
208 static dring_info_t *vsw_create_dring(vsw_ldc_t *);
209 static void vsw_create_privring(vsw_ldc_t *);
210 static int vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp);
211 static int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **,
212     int *);
213 static dring_info_t *vsw_ident2dring(lane_t *, uint64_t);
214 
215 static void vsw_set_lane_attr(vsw_t *, lane_t *);
216 static int vsw_check_attr(vnet_attr_msg_t *, vsw_port_t *);
217 static int vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg);
218 static int vsw_mem_cookie_match(ldc_mem_cookie_t *, ldc_mem_cookie_t *);
219 static int vsw_check_dring_info(vio_dring_reg_msg_t *);
220 
221 /* Misc support routines */
222 static	caddr_t vsw_print_ethaddr(uint8_t *addr, char *ebuf);
223 static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t);
224 static int vsw_free_ring(dring_info_t *);
225 
226 /* Debugging routines */
227 static void dump_flags(uint64_t);
228 static void display_state(void);
229 static void display_lane(lane_t *);
230 static void display_ring(dring_info_t *);
231 
232 int	vsw_num_handshakes = VNET_NUM_HANDSHAKES; /* # of handshake attempts */
233 int	vsw_wretries = 100;		/* # of write attempts */
234 int	vsw_chain_len = 150;		/* max # of mblks in msg chain */
235 int	vsw_desc_delay = 0;		/* delay in us */
236 int	vsw_read_attempts = 5;		/* # of reads of descriptor */
237 
238 uint32_t	vsw_mblk_size = VSW_MBLK_SIZE;
239 uint32_t	vsw_num_mblks = VSW_NUM_MBLKS;
240 
241 static	mac_callbacks_t	vsw_m_callbacks = {
242 	0,
243 	vsw_m_stat,
244 	vsw_m_start,
245 	vsw_m_stop,
246 	vsw_m_promisc,
247 	vsw_m_multicst,
248 	vsw_m_unicst,
249 	vsw_m_tx,
250 	NULL,
251 	NULL,
252 	NULL
253 };
254 
255 static	struct	cb_ops	vsw_cb_ops = {
256 	nulldev,			/* cb_open */
257 	nulldev,			/* cb_close */
258 	nodev,				/* cb_strategy */
259 	nodev,				/* cb_print */
260 	nodev,				/* cb_dump */
261 	nodev,				/* cb_read */
262 	nodev,				/* cb_write */
263 	nodev,				/* cb_ioctl */
264 	nodev,				/* cb_devmap */
265 	nodev,				/* cb_mmap */
266 	nodev,				/* cb_segmap */
267 	nochpoll,			/* cb_chpoll */
268 	ddi_prop_op,			/* cb_prop_op */
269 	NULL,				/* cb_stream */
270 	D_MP,				/* cb_flag */
271 	CB_REV,				/* rev */
272 	nodev,				/* int (*cb_aread)() */
273 	nodev				/* int (*cb_awrite)() */
274 };
275 
276 static	struct	dev_ops	vsw_ops = {
277 	DEVO_REV,		/* devo_rev */
278 	0,			/* devo_refcnt */
279 	vsw_getinfo,		/* devo_getinfo */
280 	nulldev,		/* devo_identify */
281 	nulldev,		/* devo_probe */
282 	vsw_attach,		/* devo_attach */
283 	vsw_detach,		/* devo_detach */
284 	nodev,			/* devo_reset */
285 	&vsw_cb_ops,		/* devo_cb_ops */
286 	(struct bus_ops *)NULL,	/* devo_bus_ops */
287 	ddi_power		/* devo_power */
288 };
289 
290 extern	struct	mod_ops	mod_driverops;
291 static struct modldrv vswmodldrv = {
292 	&mod_driverops,
293 	"sun4v Virtual Switch %I%",
294 	&vsw_ops,
295 };
296 
297 #define	LDC_ENTER_LOCK(ldcp)	\
298 				mutex_enter(&((ldcp)->ldc_cblock));\
299 				mutex_enter(&((ldcp)->ldc_txlock));
300 #define	LDC_EXIT_LOCK(ldcp)	\
301 				mutex_exit(&((ldcp)->ldc_txlock));\
302 				mutex_exit(&((ldcp)->ldc_cblock));
303 
304 /* Driver soft state ptr  */
305 static void	*vsw_state;
306 
307 /*
308  * Linked list of "vsw_t" structures - one per instance.
309  */
310 vsw_t		*vsw_head = NULL;
311 krwlock_t	vsw_rw;
312 
313 /*
314  * Property names
315  */
316 static char vdev_propname[] = "virtual-device";
317 static char vsw_propname[] = "virtual-network-switch";
318 static char physdev_propname[] = "vsw-phys-dev";
319 static char smode_propname[] = "vsw-switch-mode";
320 static char macaddr_propname[] = "local-mac-address";
321 static char remaddr_propname[] = "remote-mac-address";
322 static char ldcids_propname[] = "ldc-ids";
323 static char chan_propname[] = "channel-endpoint";
324 static char id_propname[] = "id";
325 static char reg_propname[] = "reg";
326 
327 /* supported versions */
328 static	ver_sup_t	vsw_versions[] = { {1, 0} };
329 
330 /*
331  * Matching criteria passed to the MDEG to register interest
332  * in changes to 'virtual-device-port' nodes identified by their
333  * 'id' property.
334  */
335 static md_prop_match_t vport_prop_match[] = {
336 	{ MDET_PROP_VAL,    "id"   },
337 	{ MDET_LIST_END,    NULL    }
338 };
339 
340 static mdeg_node_match_t vport_match = { "virtual-device-port",
341 						vport_prop_match };
342 
343 /*
344  * Matching criteria passed to the MDEG to register interest
345  * in changes to 'virtual-device' nodes (i.e. vsw nodes) identified
346  * by their 'name' and 'cfg-handle' properties.
347  */
348 static md_prop_match_t vdev_prop_match[] = {
349 	{ MDET_PROP_STR,    "name"   },
350 	{ MDET_PROP_VAL,    "cfg-handle" },
351 	{ MDET_LIST_END,    NULL    }
352 };
353 
354 static mdeg_node_match_t vdev_match = { "virtual-device",
355 						vdev_prop_match };
356 
357 
358 /*
359  * Specification of an MD node passed to the MDEG to filter any
360  * 'vport' nodes that do not belong to the specified node. This
361  * template is copied for each vsw instance and filled in with
362  * the appropriate 'cfg-handle' value before being passed to the MDEG.
363  */
364 static mdeg_prop_spec_t vsw_prop_template[] = {
365 	{ MDET_PROP_STR,    "name",		vsw_propname },
366 	{ MDET_PROP_VAL,    "cfg-handle",	NULL	},
367 	{ MDET_LIST_END,    NULL,		NULL	}
368 };
369 
370 #define	VSW_SET_MDEG_PROP_INST(specp, val)	(specp)[1].ps_val = (val);
371 
/*
 * Enable/disable thread-per-ring mode from /etc/system. This is a mode
 * selection that is made at vsw driver attach time.
 */
376 boolean_t vsw_multi_ring_enable = B_FALSE;
377 int vsw_mac_rx_rings = VSW_MAC_RX_RINGS;
378 
379 /*
380  * Print debug messages - set to 0x1f to enable all msgs
381  * or 0x0 to turn all off.
382  */
383 int vswdbg = 0x0;
384 
385 /*
386  * debug levels:
387  * 0x01:	Function entry/exit tracing
388  * 0x02:	Internal function messages
389  * 0x04:	Verbose internal messages
390  * 0x08:	Warning messages
391  * 0x10:	Error messages
392  */
393 
394 static void
395 vswdebug(vsw_t *vswp, const char *fmt, ...)
396 {
397 	char buf[512];
398 	va_list ap;
399 
400 	va_start(ap, fmt);
401 	(void) vsprintf(buf, fmt, ap);
402 	va_end(ap);
403 
404 	if (vswp == NULL)
405 		cmn_err(CE_CONT, "%s\n", buf);
406 	else
407 		cmn_err(CE_CONT, "vsw%d: %s\n", vswp->instance, buf);
408 }
409 
410 /*
411  * For the moment the state dump routines have their own
412  * private flag.
413  */
414 #define	DUMP_STATE	0
415 
416 #if DUMP_STATE
417 
418 #define	DUMP_TAG(tag) \
419 {			\
420 	D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype); \
421 	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype);	\
422 	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env);	\
423 }
424 
425 #define	DUMP_TAG_PTR(tag) \
426 {			\
427 	D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype); \
428 	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype);	\
429 	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env);	\
430 }
431 
432 #define	DUMP_FLAGS(flags) dump_flags(flags);
433 #define	DISPLAY_STATE()	display_state()
434 
435 #else
436 
437 #define	DUMP_TAG(tag)
438 #define	DUMP_TAG_PTR(tag)
439 #define	DUMP_FLAGS(state)
440 #define	DISPLAY_STATE()
441 
442 #endif	/* DUMP_STATE */
443 
444 #ifdef DEBUG
445 
446 #define	D1		\
447 if (vswdbg & 0x01)	\
448 	vswdebug
449 
450 #define	D2		\
451 if (vswdbg & 0x02)	\
452 	vswdebug
453 
454 #define	D3		\
455 if (vswdbg & 0x04)	\
456 	vswdebug
457 
458 #define	DWARN		\
459 if (vswdbg & 0x08)	\
460 	vswdebug
461 
462 #define	DERR		\
463 if (vswdbg & 0x10)	\
464 	vswdebug
465 
466 #else
467 
468 #define	DERR		if (0)	vswdebug
469 #define	DWARN		if (0)	vswdebug
470 #define	D1		if (0)	vswdebug
471 #define	D2		if (0)	vswdebug
472 #define	D3		if (0)	vswdebug
473 
474 #endif	/* DEBUG */
475 
476 static struct modlinkage modlinkage = {
477 	MODREV_1,
478 	&vswmodldrv,
479 	NULL
480 };
481 
482 int
483 _init(void)
484 {
485 	int status;
486 
487 	rw_init(&vsw_rw, NULL, RW_DRIVER, NULL);
488 
489 	status = ddi_soft_state_init(&vsw_state, sizeof (vsw_t), 1);
490 	if (status != 0) {
491 		return (status);
492 	}
493 
494 	mac_init_ops(&vsw_ops, "vsw");
495 	status = mod_install(&modlinkage);
496 	if (status != 0) {
497 		ddi_soft_state_fini(&vsw_state);
498 	}
499 	return (status);
500 }
501 
502 int
503 _fini(void)
504 {
505 	int status;
506 
507 	status = mod_remove(&modlinkage);
508 	if (status != 0)
509 		return (status);
510 	mac_fini_ops(&vsw_ops);
511 	ddi_soft_state_fini(&vsw_state);
512 
513 	rw_destroy(&vsw_rw);
514 
515 	return (status);
516 }
517 
518 int
519 _info(struct modinfo *modinfop)
520 {
521 	return (mod_info(&modlinkage, modinfop));
522 }
523 
524 static int
525 vsw_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
526 {
527 	vsw_t		*vswp;
528 	int		instance;
529 	char		hashname[MAXNAMELEN];
530 	char		qname[TASKQ_NAMELEN];
531 	enum		{ PROG_init = 0x00,
532 				PROG_if_lock = 0x01,
533 				PROG_fdb = 0x02,
534 				PROG_mfdb = 0x04,
535 				PROG_report_dev = 0x08,
536 				PROG_plist = 0x10,
537 				PROG_taskq = 0x20}
538 			progress;
539 
540 	progress = PROG_init;
541 
542 	switch (cmd) {
543 	case DDI_ATTACH:
544 		break;
545 	case DDI_RESUME:
546 		/* nothing to do for this non-device */
547 		return (DDI_SUCCESS);
548 	case DDI_PM_RESUME:
549 	default:
550 		return (DDI_FAILURE);
551 	}
552 
553 	instance = ddi_get_instance(dip);
554 	if (ddi_soft_state_zalloc(vsw_state, instance) != DDI_SUCCESS) {
555 		DERR(NULL, "vsw%d: ddi_soft_state_zalloc failed", instance);
556 		return (DDI_FAILURE);
557 	}
558 	vswp = ddi_get_soft_state(vsw_state, instance);
559 
560 	if (vswp == NULL) {
561 		DERR(NULL, "vsw%d: ddi_get_soft_state failed", instance);
562 		goto vsw_attach_fail;
563 	}
564 
565 	vswp->dip = dip;
566 	vswp->instance = instance;
567 	ddi_set_driver_private(dip, (caddr_t)vswp);
568 
569 	mutex_init(&vswp->mac_lock, NULL, MUTEX_DRIVER, NULL);
570 	rw_init(&vswp->if_lockrw, NULL, RW_DRIVER, NULL);
571 	progress |= PROG_if_lock;
572 
573 	/* setup the unicast forwarding database  */
574 	(void) snprintf(hashname, MAXNAMELEN, "vsw_unicst_table-%d",
575 							vswp->instance);
576 	D2(vswp, "creating unicast hash table (%s)...", hashname);
577 	vswp->fdb = mod_hash_create_ptrhash(hashname, VSW_NCHAINS,
578 		mod_hash_null_valdtor, sizeof (void *));
579 
580 	progress |= PROG_fdb;
581 
582 	/* setup the multicast fowarding database */
583 	(void) snprintf(hashname, MAXNAMELEN, "vsw_mcst_table-%d",
584 							vswp->instance);
585 	D2(vswp, "creating multicast hash table %s)...", hashname);
586 	rw_init(&vswp->mfdbrw, NULL, RW_DRIVER, NULL);
587 	vswp->mfdb = mod_hash_create_ptrhash(hashname, VSW_NCHAINS,
588 			mod_hash_null_valdtor, sizeof (void *));
589 
590 	progress |= PROG_mfdb;
591 
592 	/*
593 	 * create lock protecting list of multicast addresses
594 	 * which could come via m_multicst() entry point when plumbed.
595 	 */
596 	mutex_init(&vswp->mca_lock, NULL, MUTEX_DRIVER, NULL);
597 	vswp->mcap = NULL;
598 
599 	ddi_report_dev(vswp->dip);
600 
601 	progress |= PROG_report_dev;
602 
603 	WRITE_ENTER(&vsw_rw);
604 	vswp->next = vsw_head;
605 	vsw_head = vswp;
606 	RW_EXIT(&vsw_rw);
607 
608 	/* setup the port list */
609 	rw_init(&vswp->plist.lockrw, NULL, RW_DRIVER, NULL);
610 	vswp->plist.head = NULL;
611 
612 	progress |= PROG_plist;
613 
614 	/*
615 	 * Create the taskq which will process all the VIO
616 	 * control messages.
617 	 */
618 	(void) snprintf(qname, TASKQ_NAMELEN, "vsw_taskq%d", vswp->instance);
619 	if ((vswp->taskq_p = ddi_taskq_create(vswp->dip, qname, 1,
620 					TASKQ_DEFAULTPRI, 0)) == NULL) {
621 		cmn_err(CE_WARN, "!vsw%d: Unable to create task queue",
622 			vswp->instance);
623 		goto vsw_attach_fail;
624 	}
625 
626 	progress |= PROG_taskq;
627 
628 	/* prevent auto-detaching */
629 	if (ddi_prop_update_int(DDI_DEV_T_NONE, vswp->dip,
630 				DDI_NO_AUTODETACH, 1) != DDI_SUCCESS) {
631 		cmn_err(CE_NOTE, "!Unable to set \"%s\" property for "
632 			"instance %u", DDI_NO_AUTODETACH, instance);
633 	}
634 
635 	/*
636 	 * Now we have everything setup, register an interest in
637 	 * specific MD nodes.
638 	 *
639 	 * The callback is invoked in 2 cases, firstly if upon mdeg
640 	 * registration there are existing nodes which match our specified
641 	 * criteria, and secondly if the MD is changed (and again, there
642 	 * are nodes which we are interested in present within it. Note
643 	 * that our callback will be invoked even if our specified nodes
644 	 * have not actually changed).
645 	 *
646 	 * Until the callback is invoked we cannot switch any pkts as
647 	 * we don't know basic information such as what mode we are
648 	 * operating in. However we expect the callback to be invoked
649 	 * immediately upon registration as this driver should only
650 	 * be attaching if there are vsw nodes in the MD.
651 	 */
652 	if (vsw_mdeg_register(vswp))
653 		goto vsw_attach_fail;
654 
655 	return (DDI_SUCCESS);
656 
657 vsw_attach_fail:
658 	DERR(NULL, "vsw_attach: failed");
659 
660 	if (progress & PROG_taskq)
661 		ddi_taskq_destroy(vswp->taskq_p);
662 
663 	if (progress & PROG_plist)
664 		rw_destroy(&vswp->plist.lockrw);
665 
666 	if (progress & PROG_report_dev) {
667 		ddi_remove_minor_node(dip, NULL);
668 		mutex_destroy(&vswp->mca_lock);
669 	}
670 
671 	if (progress & PROG_mfdb) {
672 		mod_hash_destroy_hash(vswp->mfdb);
673 		vswp->mfdb = NULL;
674 		rw_destroy(&vswp->mfdbrw);
675 	}
676 
677 	if (progress & PROG_fdb) {
678 		mod_hash_destroy_hash(vswp->fdb);
679 		vswp->fdb = NULL;
680 	}
681 
682 	if (progress & PROG_if_lock) {
683 		rw_destroy(&vswp->if_lockrw);
684 		mutex_destroy(&vswp->mac_lock);
685 	}
686 
687 	ddi_soft_state_free(vsw_state, instance);
688 	return (DDI_FAILURE);
689 }
690 
691 static int
692 vsw_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
693 {
694 	vio_mblk_pool_t		*poolp, *npoolp;
695 	vsw_t			**vswpp, *vswp;
696 	int 			instance;
697 
698 	instance = ddi_get_instance(dip);
699 	vswp = ddi_get_soft_state(vsw_state, instance);
700 
701 	if (vswp == NULL) {
702 		return (DDI_FAILURE);
703 	}
704 
705 	switch (cmd) {
706 	case DDI_DETACH:
707 		break;
708 	case DDI_SUSPEND:
709 	case DDI_PM_SUSPEND:
710 	default:
711 		return (DDI_FAILURE);
712 	}
713 
714 	D2(vswp, "detaching instance %d", instance);
715 
716 	if (vswp->if_state & VSW_IF_REG) {
717 		if (vsw_mac_unregister(vswp) != 0) {
718 			cmn_err(CE_WARN, "!vsw%d: Unable to detach from "
719 				"MAC layer", vswp->instance);
720 			return (DDI_FAILURE);
721 		}
722 	}
723 
724 	vsw_mdeg_unregister(vswp);
725 
726 	/* remove mac layer callback */
727 	mutex_enter(&vswp->mac_lock);
728 	if ((vswp->mh != NULL) && (vswp->mrh != NULL)) {
729 		mac_rx_remove(vswp->mh, vswp->mrh);
730 		vswp->mrh = NULL;
731 	}
732 	mutex_exit(&vswp->mac_lock);
733 
734 	if (vsw_detach_ports(vswp) != 0) {
735 		cmn_err(CE_WARN, "!vsw%d: Unable to detach ports",
736 							vswp->instance);
737 		return (DDI_FAILURE);
738 	}
739 
740 	rw_destroy(&vswp->if_lockrw);
741 
742 	/*
743 	 * Now that the ports have been deleted, stop and close
744 	 * the physical device.
745 	 */
746 	mutex_enter(&vswp->mac_lock);
747 	if (vswp->mh != NULL) {
748 		if (vswp->mstarted)
749 			mac_stop(vswp->mh);
750 		if (vswp->mresources)
751 			mac_resource_set(vswp->mh, NULL, NULL);
752 		mac_close(vswp->mh);
753 
754 		vswp->mh = NULL;
755 		vswp->txinfo = NULL;
756 	}
757 	mutex_exit(&vswp->mac_lock);
758 	mutex_destroy(&vswp->mac_lock);
759 
760 	/*
761 	 * Destroy any free pools that may still exist.
762 	 */
763 	poolp = vswp->rxh;
764 	while (poolp != NULL) {
765 		npoolp = vswp->rxh = poolp->nextp;
766 		if (vio_destroy_mblks(poolp) != 0) {
767 			vswp->rxh = poolp;
768 			return (DDI_FAILURE);
769 		}
770 		poolp = npoolp;
771 	}
772 
773 	/*
774 	 * Remove this instance from any entries it may be on in
775 	 * the hash table by using the list of addresses maintained
776 	 * in the vsw_t structure.
777 	 */
778 	vsw_del_mcst_vsw(vswp);
779 
780 	vswp->mcap = NULL;
781 	mutex_destroy(&vswp->mca_lock);
782 
783 	/*
784 	 * By now any pending tasks have finished and the underlying
785 	 * ldc's have been destroyed, so its safe to delete the control
786 	 * message taskq.
787 	 */
788 	if (vswp->taskq_p != NULL)
789 		ddi_taskq_destroy(vswp->taskq_p);
790 
791 	/*
792 	 * At this stage all the data pointers in the hash table
793 	 * should be NULL, as all the ports have been removed and will
794 	 * have deleted themselves from the port lists which the data
795 	 * pointers point to. Hence we can destroy the table using the
796 	 * default destructors.
797 	 */
798 	D2(vswp, "vsw_detach: destroying hash tables..");
799 	mod_hash_destroy_hash(vswp->fdb);
800 	vswp->fdb = NULL;
801 
802 	WRITE_ENTER(&vswp->mfdbrw);
803 	mod_hash_destroy_hash(vswp->mfdb);
804 	vswp->mfdb = NULL;
805 	RW_EXIT(&vswp->mfdbrw);
806 	rw_destroy(&vswp->mfdbrw);
807 
808 	ddi_remove_minor_node(dip, NULL);
809 
810 	rw_destroy(&vswp->plist.lockrw);
811 	WRITE_ENTER(&vsw_rw);
812 	for (vswpp = &vsw_head; *vswpp; vswpp = &(*vswpp)->next) {
813 		if (*vswpp == vswp) {
814 			*vswpp = vswp->next;
815 			break;
816 		}
817 	}
818 	RW_EXIT(&vsw_rw);
819 	ddi_soft_state_free(vsw_state, instance);
820 
821 	return (DDI_SUCCESS);
822 }
823 
824 static int
825 vsw_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
826 {
827 	_NOTE(ARGUNUSED(dip))
828 
829 	vsw_t	*vswp = NULL;
830 	dev_t	dev = (dev_t)arg;
831 	int	instance;
832 
833 	instance = getminor(dev);
834 
835 	switch (infocmd) {
836 	case DDI_INFO_DEVT2DEVINFO:
837 		if ((vswp = ddi_get_soft_state(vsw_state, instance)) == NULL) {
838 			*result = NULL;
839 			return (DDI_FAILURE);
840 		}
841 		*result = vswp->dip;
842 		return (DDI_SUCCESS);
843 
844 	case DDI_INFO_DEVT2INSTANCE:
845 		*result = (void *)(uintptr_t)instance;
846 		return (DDI_SUCCESS);
847 
848 	default:
849 		*result = NULL;
850 		return (DDI_FAILURE);
851 	}
852 }
853 
854 /*
855  * Get the value of the "vsw-phys-dev" property in the specified
856  * node. This property is the name of the physical device that
857  * the virtual switch will use to talk to the outside world.
858  *
859  * Note it is valid for this property to be NULL (but the property
860  * itself must exist). Callers of this routine should verify that
861  * the value returned is what they expected (i.e. either NULL or non NULL).
862  *
863  * On success returns value of the property in region pointed to by
864  * the 'name' argument, and with return value of 0. Otherwise returns 1.
865  */
866 static int
867 vsw_get_md_physname(vsw_t *vswp, md_t *mdp, mde_cookie_t node, char *name)
868 {
869 	int	len = 0;
870 	char	*physname = NULL;
871 	char	*dev;
872 
873 	if (md_get_prop_data(mdp, node, physdev_propname,
874 				(uint8_t **)(&physname), &len) != 0) {
875 		cmn_err(CE_WARN, "!vsw%d: Unable to get name(s) of physical "
876 				"device(s) from MD", vswp->instance);
877 		return (1);
878 	} else if ((strlen(physname) + 1) > LIFNAMSIZ) {
879 		cmn_err(CE_WARN, "!vsw%d: %s is too long a device name",
880 			vswp->instance, physname);
881 		return (1);
882 	} else {
883 		(void) strncpy(name, physname, strlen(physname) + 1);
884 		D2(vswp, "%s: using first device specified (%s)",
885 			__func__, physname);
886 	}
887 
888 #ifdef DEBUG
889 	/*
890 	 * As a temporary measure to aid testing we check to see if there
891 	 * is a vsw.conf file present. If there is we use the value of the
892 	 * vsw_physname property in the file as the name of the physical
893 	 * device, overriding the value from the MD.
894 	 *
895 	 * There may be multiple devices listed, but for the moment
896 	 * we just use the first one.
897 	 */
898 	if (ddi_prop_lookup_string(DDI_DEV_T_ANY, vswp->dip, 0,
899 		"vsw_physname", &dev) == DDI_PROP_SUCCESS) {
900 		if ((strlen(dev) + 1) > LIFNAMSIZ) {
901 			cmn_err(CE_WARN, "vsw%d: %s is too long a device name",
902 				vswp->instance, dev);
903 			ddi_prop_free(dev);
904 			return (1);
905 		} else {
906 			cmn_err(CE_NOTE, "vsw%d: Using device name (%s) from "
907 				"config file", vswp->instance, dev);
908 
909 			(void) strncpy(name, dev, strlen(dev) + 1);
910 		}
911 
912 		ddi_prop_free(dev);
913 	}
914 #endif
915 
916 	return (0);
917 }
918 
919 /*
920  * Read the 'vsw-switch-mode' property from the specified MD node.
921  *
922  * Returns 0 on success and the number of modes found in 'found',
923  * otherwise returns 1.
924  */
925 static int
926 vsw_get_md_smodes(vsw_t *vswp, md_t *mdp, mde_cookie_t node,
927 						uint8_t *modes, int *found)
928 {
929 	int		len = 0;
930 	int		smode_num = 0;
931 	char		*smode = NULL;
932 	char		*curr_mode = NULL;
933 
934 	D1(vswp, "%s: enter", __func__);
935 
936 	/*
937 	 * Get the switch-mode property. The modes are listed in
938 	 * decreasing order of preference, i.e. prefered mode is
939 	 * first item in list.
940 	 */
941 	len = 0;
942 	smode_num = 0;
943 	if (md_get_prop_data(mdp, node, smode_propname,
944 				(uint8_t **)(&smode), &len) != 0) {
945 		/*
946 		 * Unable to get switch-mode property from MD, nothing
947 		 * more we can do.
948 		 */
949 		cmn_err(CE_WARN, "!vsw%d: Unable to get switch mode property"
950 			" from the MD", vswp->instance);
951 		*found = 0;
952 		return (1);
953 	}
954 
955 	curr_mode = smode;
956 	/*
957 	 * Modes of operation:
958 	 * 'switched'	 - layer 2 switching, underlying HW in
959 	 *			programmed mode.
960 	 * 'promiscuous' - layer 2 switching, underlying HW in
961 	 *			promiscuous mode.
962 	 * 'routed'	 - layer 3 (i.e. IP) routing, underlying HW
963 	 *			in non-promiscuous mode.
964 	 */
965 	while ((curr_mode < (smode + len)) && (smode_num < NUM_SMODES)) {
966 		D2(vswp, "%s: curr_mode = [%s]", __func__, curr_mode);
967 		if (strcmp(curr_mode, "switched") == 0) {
968 			modes[smode_num++] = VSW_LAYER2;
969 		} else if (strcmp(curr_mode, "promiscuous") == 0) {
970 			modes[smode_num++] = VSW_LAYER2_PROMISC;
971 		} else if (strcmp(curr_mode, "routed") == 0) {
972 			modes[smode_num++] = VSW_LAYER3;
973 		} else {
974 			cmn_err(CE_WARN, "!vsw%d: Unknown switch mode %s, "
975 				"setting to default switched mode",
976 				vswp->instance, curr_mode);
977 			modes[smode_num++] = VSW_LAYER2;
978 		}
979 		curr_mode += strlen(curr_mode) + 1;
980 	}
981 	*found = smode_num;
982 
983 	D2(vswp, "%s: %d modes found", __func__, smode_num);
984 
985 	D1(vswp, "%s: exit", __func__);
986 
987 	return (0);
988 }
989 
990 /*
991  * Get the mac address of the physical device.
992  *
993  * Returns 0 on success, 1 on failure.
994  */
995 static int
996 vsw_get_physaddr(vsw_t *vswp)
997 {
998 	mac_handle_t	mh;
999 	char		drv[LIFNAMSIZ];
1000 	uint_t		ddi_instance;
1001 
1002 	D1(vswp, "%s: enter", __func__);
1003 
1004 	if (ddi_parse(vswp->physname, drv, &ddi_instance) != DDI_SUCCESS)
1005 		return (1);
1006 
1007 	if (mac_open(vswp->physname, ddi_instance, &mh) != 0) {
1008 		cmn_err(CE_WARN, "!vsw%d: mac_open %s failed",
1009 				vswp->instance, vswp->physname);
1010 		return (1);
1011 	}
1012 
1013 	READ_ENTER(&vswp->if_lockrw);
1014 	mac_unicst_get(mh, vswp->if_addr.ether_addr_octet);
1015 	RW_EXIT(&vswp->if_lockrw);
1016 
1017 	mac_close(mh);
1018 
1019 	vswp->mdprops |= VSW_DEV_MACADDR;
1020 
1021 	D1(vswp, "%s: exit", __func__);
1022 
1023 	return (0);
1024 }
1025 
1026 /*
1027  * Check to see if the card supports the setting of multiple unicst
1028  * addresses.
1029  *
1030  * Returns 0 if card supports the programming of multiple unicast addresses
1031  * and there are free address slots available, otherwise returns 1.
1032  */
1033 static int
1034 vsw_get_hw_maddr(vsw_t *vswp)
1035 {
1036 	D1(vswp, "%s: enter", __func__);
1037 
1038 	mutex_enter(&vswp->mac_lock);
1039 	if (vswp->mh == NULL) {
1040 		mutex_exit(&vswp->mac_lock);
1041 		return (1);
1042 	}
1043 
1044 	if (!mac_capab_get(vswp->mh, MAC_CAPAB_MULTIADDRESS, &vswp->maddr)) {
1045 		DWARN(vswp, "Unable to get capabilities of"
1046 			" underlying device (%s)", vswp->physname);
1047 		mutex_exit(&vswp->mac_lock);
1048 		return (1);
1049 	}
1050 	mutex_exit(&vswp->mac_lock);
1051 
1052 	if (vswp->maddr.maddr_naddrfree == 0) {
1053 		cmn_err(CE_WARN,
1054 			"!vsw%d: device %s has no free unicast address slots",
1055 				vswp->instance, vswp->physname);
1056 		return (1);
1057 	}
1058 
1059 	D2(vswp, "%s: %d addrs : %d free", __func__,
1060 		vswp->maddr.maddr_naddr, vswp->maddr.maddr_naddrfree);
1061 
1062 	D1(vswp, "%s: exit", __func__);
1063 
1064 	return (0);
1065 }
1066 
1067 /*
1068  * Setup the required switching mode.
1069  *
1070  * Returns 0 on success, 1 on failure.
1071  */
1072 static int
1073 vsw_setup_switching(vsw_t *vswp)
1074 {
1075 	int	i, rv = 1;
1076 
1077 	D1(vswp, "%s: enter", __func__);
1078 
1079 	/* select best switching mode */
1080 	for (i = 0; i < vswp->smode_num; i++) {
1081 		vswp->smode_idx = i;
1082 		switch (vswp->smode[i]) {
1083 		case VSW_LAYER2:
1084 		case VSW_LAYER2_PROMISC:
1085 			rv = vsw_setup_layer2(vswp);
1086 			break;
1087 
1088 		case VSW_LAYER3:
1089 			rv = vsw_setup_layer3(vswp);
1090 			break;
1091 
1092 		default:
1093 			DERR(vswp, "unknown switch mode");
1094 			rv = 1;
1095 			break;
1096 		}
1097 
1098 		if (rv == 0)
1099 			break;
1100 	}
1101 
1102 	if (rv == 1) {
1103 		cmn_err(CE_WARN, "!vsw%d: Unable to setup specified "
1104 			"switching mode", vswp->instance);
1105 		return (rv);
1106 	}
1107 
1108 	D2(vswp, "%s: Operating in mode %d", __func__,
1109 					vswp->smode[vswp->smode_idx]);
1110 
1111 	D1(vswp, "%s: exit", __func__);
1112 
1113 	return (0);
1114 }
1115 
1116 /*
1117  * Setup for layer 2 switching.
1118  *
1119  * Returns 0 on success, 1 on failure.
1120  */
1121 static int
1122 vsw_setup_layer2(vsw_t *vswp)
1123 {
1124 	D1(vswp, "%s: enter", __func__);
1125 
1126 	vswp->vsw_switch_frame = vsw_switch_l2_frame;
1127 
1128 	/*
1129 	 * Attempt to link into the MAC layer so we can get
1130 	 * and send packets out over the physical adapter.
1131 	 */
1132 	if (vswp->mdprops & VSW_MD_PHYSNAME) {
1133 		if (vsw_mac_attach(vswp) != 0) {
1134 			/*
1135 			 * Registration with the MAC layer has failed,
1136 			 * so return 1 so that can fall back to next
1137 			 * prefered switching method.
1138 			 */
1139 			cmn_err(CE_WARN, "!vsw%d: Unable to join as MAC layer "
1140 				"client", vswp->instance);
1141 			return (1);
1142 		}
1143 
1144 		if (vswp->smode[vswp->smode_idx] == VSW_LAYER2) {
1145 			/*
1146 			 * Verify that underlying device can support multiple
1147 			 * unicast mac addresses, and has free capacity.
1148 			 */
1149 			if (vsw_get_hw_maddr(vswp) != 0) {
1150 				cmn_err(CE_WARN, "!vsw%d: Unable to setup "
1151 					"switching", vswp->instance);
1152 				vsw_mac_detach(vswp);
1153 				return (1);
1154 			}
1155 		}
1156 
1157 	} else {
1158 		/*
1159 		 * No physical device name found in MD which is
1160 		 * required for layer 2.
1161 		 */
1162 		cmn_err(CE_WARN, "!vsw%d: no physical device name specified",
1163 			vswp->instance);
1164 		return (1);
1165 	}
1166 
1167 	D1(vswp, "%s: exit", __func__);
1168 
1169 	return (0);
1170 }
1171 
1172 static int
1173 vsw_setup_layer3(vsw_t *vswp)
1174 {
1175 	D1(vswp, "%s: enter", __func__);
1176 
1177 	D2(vswp, "%s: operating in layer 3 mode", __func__);
1178 	vswp->vsw_switch_frame = vsw_switch_l3_frame;
1179 
1180 	D1(vswp, "%s: exit", __func__);
1181 
1182 	return (0);
1183 }
1184 
1185 /*
1186  * Link into the MAC layer to gain access to the services provided by
1187  * the underlying physical device driver (which should also have
1188  * registered with the MAC layer).
1189  *
1190  * Only when in layer 2 mode.
1191  */
1192 static int
1193 vsw_mac_attach(vsw_t *vswp)
1194 {
1195 	char	drv[LIFNAMSIZ];
1196 	uint_t	ddi_instance;
1197 
1198 	D1(vswp, "%s: enter", __func__);
1199 
1200 	ASSERT(vswp->mh == NULL);
1201 	ASSERT(vswp->mrh == NULL);
1202 	ASSERT(vswp->mstarted == B_FALSE);
1203 	ASSERT(vswp->mresources == B_FALSE);
1204 
1205 	ASSERT(vswp->mdprops & VSW_MD_PHYSNAME);
1206 
1207 	mutex_enter(&vswp->mac_lock);
1208 	if (ddi_parse(vswp->physname, drv, &ddi_instance) != DDI_SUCCESS) {
1209 		cmn_err(CE_WARN, "!vsw%d: invalid device name: %s",
1210 			vswp->instance, vswp->physname);
1211 		goto mac_fail_exit;
1212 	}
1213 
1214 	if ((mac_open(vswp->physname, ddi_instance, &vswp->mh)) != 0) {
1215 		cmn_err(CE_WARN, "!vsw%d: mac_open %s failed",
1216 			vswp->instance, vswp->physname);
1217 		goto mac_fail_exit;
1218 	}
1219 
1220 	ASSERT(vswp->mh != NULL);
1221 
1222 	D2(vswp, "vsw_mac_attach: using device %s", vswp->physname);
1223 
1224 	if (vsw_multi_ring_enable) {
1225 		/*
1226 		 * Initialize the ring table.
1227 		 */
1228 		vsw_mac_ring_tbl_init(vswp);
1229 
1230 		/*
1231 		 * Register our rx callback function.
1232 		 */
1233 		vswp->mrh = mac_rx_add(vswp->mh,
1234 			vsw_rx_queue_cb, (void *)vswp);
1235 		ASSERT(vswp->mrh != NULL);
1236 
1237 		/*
1238 		 * Register our mac resource callback.
1239 		 */
1240 		mac_resource_set(vswp->mh, vsw_mac_ring_add_cb, (void *)vswp);
1241 		vswp->mresources = B_TRUE;
1242 
1243 		/*
1244 		 * Get the ring resources available to us from
1245 		 * the mac below us.
1246 		 */
1247 		mac_resources(vswp->mh);
1248 	} else {
1249 		/*
1250 		 * Just register our rx callback function
1251 		 */
1252 		vswp->mrh = mac_rx_add(vswp->mh, vsw_rx_cb, (void *)vswp);
1253 		ASSERT(vswp->mrh != NULL);
1254 	}
1255 
1256 	/* Get the MAC tx fn */
1257 	vswp->txinfo = mac_tx_get(vswp->mh);
1258 
1259 	/* start the interface */
1260 	if (mac_start(vswp->mh) != 0) {
1261 		cmn_err(CE_WARN, "!vsw%d: Could not start mac interface",
1262 			vswp->instance);
1263 		goto mac_fail_exit;
1264 	}
1265 
1266 	mutex_exit(&vswp->mac_lock);
1267 
1268 	vswp->mstarted = B_TRUE;
1269 
1270 	D1(vswp, "%s: exit", __func__);
1271 	return (0);
1272 
1273 mac_fail_exit:
1274 	mutex_exit(&vswp->mac_lock);
1275 	vsw_mac_detach(vswp);
1276 
1277 	D1(vswp, "%s: exit", __func__);
1278 	return (1);
1279 }
1280 
1281 static void
1282 vsw_mac_detach(vsw_t *vswp)
1283 {
1284 	D1(vswp, "vsw_mac_detach: enter");
1285 
1286 	ASSERT(vswp != NULL);
1287 
1288 	if (vsw_multi_ring_enable) {
1289 		vsw_mac_ring_tbl_destroy(vswp);
1290 	}
1291 
1292 	mutex_enter(&vswp->mac_lock);
1293 
1294 	if (vswp->mh != NULL) {
1295 		if (vswp->mstarted)
1296 			mac_stop(vswp->mh);
1297 		if (vswp->mrh != NULL)
1298 			mac_rx_remove(vswp->mh, vswp->mrh);
1299 		if (vswp->mresources)
1300 			mac_resource_set(vswp->mh, NULL, NULL);
1301 		mac_close(vswp->mh);
1302 	}
1303 
1304 	vswp->mrh = NULL;
1305 	vswp->mh = NULL;
1306 	vswp->txinfo = NULL;
1307 	vswp->mstarted = B_FALSE;
1308 
1309 	mutex_exit(&vswp->mac_lock);
1310 
1311 	D1(vswp, "vsw_mac_detach: exit");
1312 }
1313 
1314 /*
1315  * Depending on the mode specified, the capabilites and capacity
1316  * of the underlying device setup the physical device.
1317  *
1318  * If in layer 3 mode, then do nothing.
1319  *
1320  * If in layer 2 programmed mode attempt to program the unicast address
1321  * associated with the port into the physical device. If this is not
1322  * possible due to resource exhaustion or simply because the device does
1323  * not support multiple unicast addresses then if required fallback onto
1324  * putting the card into promisc mode.
1325  *
1326  * If in promisc mode then simply set the card into promisc mode.
1327  *
1328  * Returns 0 success, 1 on failure.
1329  */
1330 static int
1331 vsw_set_hw(vsw_t *vswp, vsw_port_t *port)
1332 {
1333 	mac_multi_addr_t	mac_addr;
1334 	void			*mah;
1335 	int			err;
1336 
1337 	D1(vswp, "%s: enter", __func__);
1338 
1339 	if (vswp->smode[vswp->smode_idx] == VSW_LAYER3)
1340 		return (0);
1341 
1342 	if (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC) {
1343 		return (vsw_set_hw_promisc(vswp, port));
1344 	}
1345 
1346 	if (vswp->maddr.maddr_handle == NULL)
1347 		return (1);
1348 
1349 	mah = vswp->maddr.maddr_handle;
1350 
1351 	/*
1352 	 * Attempt to program the unicast address into the HW.
1353 	 */
1354 	mac_addr.mma_addrlen = ETHERADDRL;
1355 	ether_copy(&port->p_macaddr, &mac_addr.mma_addr);
1356 
1357 	err = vswp->maddr.maddr_add(mah, &mac_addr);
1358 	if (err != 0) {
1359 		cmn_err(CE_WARN, "!vsw%d: failed to program addr "
1360 			"%x:%x:%x:%x:%x:%x for port %d into device %s "
1361 			": err %d", vswp->instance,
1362 			port->p_macaddr.ether_addr_octet[0],
1363 			port->p_macaddr.ether_addr_octet[1],
1364 			port->p_macaddr.ether_addr_octet[2],
1365 			port->p_macaddr.ether_addr_octet[3],
1366 			port->p_macaddr.ether_addr_octet[4],
1367 			port->p_macaddr.ether_addr_octet[5],
1368 			port->p_instance, vswp->physname, err);
1369 
1370 		/*
1371 		 * Mark that attempt should be made to re-config sometime
1372 		 * in future if a port is deleted.
1373 		 */
1374 		vswp->recfg_reqd = B_TRUE;
1375 
1376 		/*
1377 		 * Only 1 mode specified, nothing more to do.
1378 		 */
1379 		if (vswp->smode_num == 1)
1380 			return (err);
1381 
1382 		/*
1383 		 * If promiscuous was next mode specified try to
1384 		 * set the card into that mode.
1385 		 */
1386 		if ((vswp->smode_idx <= (vswp->smode_num - 2)) &&
1387 			(vswp->smode[vswp->smode_idx + 1]
1388 					== VSW_LAYER2_PROMISC)) {
1389 			vswp->smode_idx += 1;
1390 			return (vsw_set_hw_promisc(vswp, port));
1391 		}
1392 		return (err);
1393 	}
1394 
1395 	port->addr_slot = mac_addr.mma_slot;
1396 	port->addr_set = VSW_ADDR_HW;
1397 
1398 	D2(vswp, "programmed addr %x:%x:%x:%x:%x:%x for port %d "
1399 		"into slot %d of device %s",
1400 		port->p_macaddr.ether_addr_octet[0],
1401 		port->p_macaddr.ether_addr_octet[1],
1402 		port->p_macaddr.ether_addr_octet[2],
1403 		port->p_macaddr.ether_addr_octet[3],
1404 		port->p_macaddr.ether_addr_octet[4],
1405 		port->p_macaddr.ether_addr_octet[5],
1406 		port->p_instance, port->addr_slot, vswp->physname);
1407 
1408 	D1(vswp, "%s: exit", __func__);
1409 
1410 	return (0);
1411 }
1412 
1413 /*
1414  * If in layer 3 mode do nothing.
1415  *
1416  * If in layer 2 switched mode remove the address from the physical
1417  * device.
1418  *
1419  * If in layer 2 promiscuous mode disable promisc mode.
1420  *
1421  * Returns 0 on success.
1422  */
1423 static int
1424 vsw_unset_hw(vsw_t *vswp, vsw_port_t *port)
1425 {
1426 	int		err;
1427 	void		*mah;
1428 
1429 	D1(vswp, "%s: enter", __func__);
1430 
1431 	if (vswp->smode[vswp->smode_idx] == VSW_LAYER3)
1432 		return (0);
1433 
1434 	if (port->addr_set == VSW_ADDR_PROMISC) {
1435 		return (vsw_unset_hw_promisc(vswp, port));
1436 	}
1437 
1438 	if (port->addr_set == VSW_ADDR_HW) {
1439 		if (vswp->maddr.maddr_handle == NULL)
1440 			return (1);
1441 
1442 		mah = vswp->maddr.maddr_handle;
1443 
1444 		err = vswp->maddr.maddr_remove(mah, port->addr_slot);
1445 		if (err != 0) {
1446 			cmn_err(CE_WARN, "!vsw%d: Unable to remove addr "
1447 				"%x:%x:%x:%x:%x:%x for port %d from device %s"
1448 				" : (err %d)", vswp->instance,
1449 				port->p_macaddr.ether_addr_octet[0],
1450 				port->p_macaddr.ether_addr_octet[1],
1451 				port->p_macaddr.ether_addr_octet[2],
1452 				port->p_macaddr.ether_addr_octet[3],
1453 				port->p_macaddr.ether_addr_octet[4],
1454 				port->p_macaddr.ether_addr_octet[5],
1455 				port->p_instance, vswp->physname, err);
1456 			return (err);
1457 		}
1458 
1459 		port->addr_set = VSW_ADDR_UNSET;
1460 
1461 		D2(vswp, "removed addr %x:%x:%x:%x:%x:%x for "
1462 			"port %d from device %s",
1463 			port->p_macaddr.ether_addr_octet[0],
1464 			port->p_macaddr.ether_addr_octet[1],
1465 			port->p_macaddr.ether_addr_octet[2],
1466 			port->p_macaddr.ether_addr_octet[3],
1467 			port->p_macaddr.ether_addr_octet[4],
1468 			port->p_macaddr.ether_addr_octet[5],
1469 			port->p_instance, vswp->physname);
1470 	}
1471 
1472 	D1(vswp, "%s: exit", __func__);
1473 	return (0);
1474 }
1475 
1476 /*
1477  * Set network card into promisc mode.
1478  *
1479  * Returns 0 on success, 1 on failure.
1480  */
1481 static int
1482 vsw_set_hw_promisc(vsw_t *vswp, vsw_port_t *port)
1483 {
1484 	D1(vswp, "%s: enter", __func__);
1485 
1486 	mutex_enter(&vswp->mac_lock);
1487 	if (vswp->mh == NULL) {
1488 		mutex_exit(&vswp->mac_lock);
1489 		return (1);
1490 	}
1491 
1492 	if (vswp->promisc_cnt++ == 0) {
1493 		if (mac_promisc_set(vswp->mh, B_TRUE, MAC_DEVPROMISC) != 0) {
1494 			vswp->promisc_cnt--;
1495 			mutex_exit(&vswp->mac_lock);
1496 			return (1);
1497 		}
1498 		cmn_err(CE_NOTE, "!vsw%d: switching device %s into "
1499 			"promiscuous mode", vswp->instance, vswp->physname);
1500 	}
1501 	mutex_exit(&vswp->mac_lock);
1502 	port->addr_set = VSW_ADDR_PROMISC;
1503 
1504 	D1(vswp, "%s: exit", __func__);
1505 
1506 	return (0);
1507 }
1508 
1509 /*
1510  * Turn off promiscuous mode on network card.
1511  *
1512  * Returns 0 on success, 1 on failure.
1513  */
1514 static int
1515 vsw_unset_hw_promisc(vsw_t *vswp, vsw_port_t *port)
1516 {
1517 	vsw_port_list_t 	*plist = &vswp->plist;
1518 
1519 	D2(vswp, "%s: enter", __func__);
1520 
1521 	mutex_enter(&vswp->mac_lock);
1522 	if (vswp->mh == NULL) {
1523 		mutex_exit(&vswp->mac_lock);
1524 		return (1);
1525 	}
1526 
1527 	ASSERT(port->addr_set == VSW_ADDR_PROMISC);
1528 
1529 	if (--vswp->promisc_cnt == 0) {
1530 		if (mac_promisc_set(vswp->mh, B_FALSE, MAC_DEVPROMISC) != 0) {
1531 			vswp->promisc_cnt++;
1532 			mutex_exit(&vswp->mac_lock);
1533 			return (1);
1534 		}
1535 
1536 		/*
1537 		 * We are exiting promisc mode either because we were
1538 		 * only in promisc mode because we had failed over from
1539 		 * switched mode due to HW resource issues, or the user
1540 		 * wanted the card in promisc mode for all the ports and
1541 		 * the last port is now being deleted. Tweak the message
1542 		 * accordingly.
1543 		 */
1544 		if (plist->num_ports != 0) {
1545 			cmn_err(CE_NOTE, "!vsw%d: switching device %s back to "
1546 				"programmed mode", vswp->instance,
1547 				vswp->physname);
1548 		} else {
1549 			cmn_err(CE_NOTE, "!vsw%d: switching device %s out of "
1550 				"promiscuous mode", vswp->instance,
1551 				vswp->physname);
1552 		}
1553 	}
1554 	mutex_exit(&vswp->mac_lock);
1555 	port->addr_set = VSW_ADDR_UNSET;
1556 
1557 	D1(vswp, "%s: exit", __func__);
1558 	return (0);
1559 }
1560 
1561 /*
1562  * Determine whether or not we are operating in our prefered
1563  * mode and if not whether the physical resources now allow us
1564  * to operate in it.
1565  *
1566  * Should only be invoked after port which is being deleted has been
1567  * removed from the port list.
1568  */
1569 static int
1570 vsw_reconfig_hw(vsw_t *vswp)
1571 {
1572 	vsw_port_list_t 	*plist = &vswp->plist;
1573 	mac_multi_addr_t	mac_addr;
1574 	vsw_port_t		*tp;
1575 	void			*mah;
1576 	int			rv = 0;
1577 	int			s_idx;
1578 
1579 	D1(vswp, "%s: enter", __func__);
1580 
1581 	if (vswp->maddr.maddr_handle == NULL)
1582 		return (1);
1583 
1584 	/*
1585 	 * Check if there are now sufficient HW resources to
1586 	 * attempt a re-config.
1587 	 */
1588 	if (plist->num_ports > vswp->maddr.maddr_naddrfree)
1589 		return (1);
1590 
1591 	/*
1592 	 * If we are in layer 2 (i.e. switched) or would like to be
1593 	 * in layer 2 then check if any ports need to be programmed
1594 	 * into the HW.
1595 	 *
1596 	 * This can happen in two cases - switched was specified as
1597 	 * the prefered mode of operation but we exhausted the HW
1598 	 * resources and so failed over to the next specifed mode,
1599 	 * or switched was the only mode specified so after HW
1600 	 * resources were exhausted there was nothing more we
1601 	 * could do.
1602 	 */
1603 	if (vswp->smode_idx > 0)
1604 		s_idx = vswp->smode_idx - 1;
1605 	else
1606 		s_idx = vswp->smode_idx;
1607 
1608 	if (vswp->smode[s_idx] == VSW_LAYER2) {
1609 		mah = vswp->maddr.maddr_handle;
1610 
1611 		D2(vswp, "%s: attempting reconfig..", __func__);
1612 
1613 		/*
1614 		 * Scan the port list for any port whose address has not
1615 		 * be programmed in HW - there should be a max of one.
1616 		 */
1617 		for (tp = plist->head; tp != NULL; tp = tp->p_next) {
1618 			if (tp->addr_set != VSW_ADDR_HW) {
1619 				mac_addr.mma_addrlen = ETHERADDRL;
1620 				ether_copy(&tp->p_macaddr, &mac_addr.mma_addr);
1621 
1622 				rv = vswp->maddr.maddr_add(mah, &mac_addr);
1623 				if (rv != 0) {
1624 					DWARN(vswp, "Error setting addr in "
1625 						"HW for port %d err %d",
1626 						tp->p_instance, rv);
1627 					goto reconfig_err_exit;
1628 				}
1629 				tp->addr_slot = mac_addr.mma_slot;
1630 
1631 				D2(vswp, "re-programmed port %d "
1632 					"addr %x:%x:%x:%x:%x:%x into slot %d"
1633 					" of device %s", tp->p_instance,
1634 					tp->p_macaddr.ether_addr_octet[0],
1635 					tp->p_macaddr.ether_addr_octet[1],
1636 					tp->p_macaddr.ether_addr_octet[2],
1637 					tp->p_macaddr.ether_addr_octet[3],
1638 					tp->p_macaddr.ether_addr_octet[4],
1639 					tp->p_macaddr.ether_addr_octet[5],
1640 					tp->addr_slot, vswp->physname);
1641 
1642 				/*
1643 				 * If up to now we had to put the card into
1644 				 * promisc mode to see this address, we
1645 				 * can now safely disable promisc mode.
1646 				 */
1647 				if (tp->addr_set == VSW_ADDR_PROMISC)
1648 					(void) vsw_unset_hw_promisc(vswp, tp);
1649 
1650 				tp->addr_set = VSW_ADDR_HW;
1651 			}
1652 		}
1653 
1654 		/* no further re-config needed */
1655 		vswp->recfg_reqd = B_FALSE;
1656 
1657 		vswp->smode_idx = s_idx;
1658 
1659 		return (0);
1660 	}
1661 
1662 reconfig_err_exit:
1663 	return (rv);
1664 }
1665 
1666 static void
1667 vsw_mac_ring_tbl_entry_init(vsw_t *vswp, vsw_mac_ring_t *ringp)
1668 {
1669 	ringp->ring_state = VSW_MAC_RING_FREE;
1670 	ringp->ring_arg = NULL;
1671 	ringp->ring_blank = NULL;
1672 	ringp->ring_vqp = NULL;
1673 	ringp->ring_vswp = vswp;
1674 }
1675 
1676 static void
1677 vsw_mac_ring_tbl_init(vsw_t *vswp)
1678 {
1679 	int		i;
1680 
1681 	mutex_init(&vswp->mac_ring_lock, NULL, MUTEX_DRIVER, NULL);
1682 
1683 	vswp->mac_ring_tbl_sz = vsw_mac_rx_rings;
1684 	vswp->mac_ring_tbl  =
1685 		kmem_alloc(vsw_mac_rx_rings * sizeof (vsw_mac_ring_t),
1686 		KM_SLEEP);
1687 
1688 	for (i = 0; i < vswp->mac_ring_tbl_sz; i++)
1689 		vsw_mac_ring_tbl_entry_init(vswp, &vswp->mac_ring_tbl[i]);
1690 }
1691 
1692 static void
1693 vsw_mac_ring_tbl_destroy(vsw_t *vswp)
1694 {
1695 	int		i;
1696 	vsw_mac_ring_t	*ringp;
1697 
1698 	mutex_enter(&vswp->mac_ring_lock);
1699 	for (i = 0; i < vswp->mac_ring_tbl_sz; i++) {
1700 		ringp = &vswp->mac_ring_tbl[i];
1701 
1702 		if (ringp->ring_state != VSW_MAC_RING_FREE) {
1703 			/*
1704 			 * Destroy the queue.
1705 			 */
1706 			vsw_queue_stop(ringp->ring_vqp);
1707 			vsw_queue_destroy(ringp->ring_vqp);
1708 
1709 			/*
1710 			 * Re-initialize the structure.
1711 			 */
1712 			vsw_mac_ring_tbl_entry_init(vswp, ringp);
1713 		}
1714 	}
1715 	mutex_exit(&vswp->mac_ring_lock);
1716 
1717 	mutex_destroy(&vswp->mac_ring_lock);
1718 	kmem_free(vswp->mac_ring_tbl,
1719 		vswp->mac_ring_tbl_sz * sizeof (vsw_mac_ring_t));
1720 	vswp->mac_ring_tbl_sz = 0;
1721 }
1722 
1723 /*
1724  * Handle resource add callbacks from the driver below.
1725  */
1726 static mac_resource_handle_t
1727 vsw_mac_ring_add_cb(void *arg, mac_resource_t *mrp)
1728 {
1729 	vsw_t		*vswp = (vsw_t *)arg;
1730 	mac_rx_fifo_t	*mrfp = (mac_rx_fifo_t *)mrp;
1731 	vsw_mac_ring_t	*ringp;
1732 	vsw_queue_t	*vqp;
1733 	int		i;
1734 
1735 	ASSERT(vswp != NULL);
1736 	ASSERT(mrp != NULL);
1737 	ASSERT(vswp->mac_ring_tbl != NULL);
1738 
1739 	D1(vswp, "%s: enter", __func__);
1740 
1741 	/*
1742 	 * Check to make sure we have the correct resource type.
1743 	 */
1744 	if (mrp->mr_type != MAC_RX_FIFO)
1745 		return (NULL);
1746 
1747 	/*
1748 	 * Find a open entry in the ring table.
1749 	 */
1750 	mutex_enter(&vswp->mac_ring_lock);
1751 	for (i = 0; i < vswp->mac_ring_tbl_sz; i++) {
1752 		ringp = &vswp->mac_ring_tbl[i];
1753 
1754 		/*
1755 		 * Check for an empty slot, if found, then setup queue
1756 		 * and thread.
1757 		 */
1758 		if (ringp->ring_state == VSW_MAC_RING_FREE) {
1759 			/*
1760 			 * Create the queue for this ring.
1761 			 */
1762 			vqp = vsw_queue_create();
1763 
1764 			/*
1765 			 * Initialize the ring data structure.
1766 			 */
1767 			ringp->ring_vqp = vqp;
1768 			ringp->ring_arg = mrfp->mrf_arg;
1769 			ringp->ring_blank = mrfp->mrf_blank;
1770 			ringp->ring_state = VSW_MAC_RING_INUSE;
1771 
1772 			/*
1773 			 * Create the worker thread.
1774 			 */
1775 			vqp->vq_worker = thread_create(NULL, 0,
1776 				vsw_queue_worker, ringp, 0, &p0,
1777 				TS_RUN, minclsyspri);
1778 			if (vqp->vq_worker == NULL) {
1779 				vsw_queue_destroy(vqp);
1780 				vsw_mac_ring_tbl_entry_init(vswp, ringp);
1781 				ringp = NULL;
1782 			}
1783 
1784 			if (ringp != NULL) {
1785 				/*
1786 				 * Make sure thread get's running state for
1787 				 * this ring.
1788 				 */
1789 				mutex_enter(&vqp->vq_lock);
1790 				while ((vqp->vq_state != VSW_QUEUE_RUNNING) &&
1791 					(vqp->vq_state != VSW_QUEUE_DRAINED)) {
1792 					cv_wait(&vqp->vq_cv, &vqp->vq_lock);
1793 				}
1794 
1795 				/*
1796 				 * If the thread is not running, cleanup.
1797 				 */
1798 				if (vqp->vq_state == VSW_QUEUE_DRAINED) {
1799 					vsw_queue_destroy(vqp);
1800 					vsw_mac_ring_tbl_entry_init(vswp,
1801 						ringp);
1802 					ringp = NULL;
1803 				}
1804 				mutex_exit(&vqp->vq_lock);
1805 			}
1806 
1807 			mutex_exit(&vswp->mac_ring_lock);
1808 			D1(vswp, "%s: exit", __func__);
1809 			return ((mac_resource_handle_t)ringp);
1810 		}
1811 	}
1812 	mutex_exit(&vswp->mac_ring_lock);
1813 
1814 	/*
1815 	 * No slots in the ring table available.
1816 	 */
1817 	D1(vswp, "%s: exit", __func__);
1818 	return (NULL);
1819 }
1820 
1821 static void
1822 vsw_queue_stop(vsw_queue_t *vqp)
1823 {
1824 	mutex_enter(&vqp->vq_lock);
1825 
1826 	if (vqp->vq_state == VSW_QUEUE_RUNNING) {
1827 		vqp->vq_state = VSW_QUEUE_STOP;
1828 		cv_signal(&vqp->vq_cv);
1829 
1830 		while (vqp->vq_state != VSW_QUEUE_DRAINED)
1831 			cv_wait(&vqp->vq_cv, &vqp->vq_lock);
1832 	}
1833 
1834 	vqp->vq_state = VSW_QUEUE_STOPPED;
1835 
1836 	mutex_exit(&vqp->vq_lock);
1837 }
1838 
1839 static vsw_queue_t *
1840 vsw_queue_create()
1841 {
1842 	vsw_queue_t *vqp;
1843 
1844 	vqp = kmem_zalloc(sizeof (vsw_queue_t), KM_SLEEP);
1845 
1846 	mutex_init(&vqp->vq_lock, NULL, MUTEX_DRIVER, NULL);
1847 	cv_init(&vqp->vq_cv, NULL, CV_DRIVER, NULL);
1848 	vqp->vq_first = NULL;
1849 	vqp->vq_last = NULL;
1850 	vqp->vq_state = VSW_QUEUE_STOPPED;
1851 
1852 	return (vqp);
1853 }
1854 
1855 static void
1856 vsw_queue_destroy(vsw_queue_t *vqp)
1857 {
1858 	cv_destroy(&vqp->vq_cv);
1859 	mutex_destroy(&vqp->vq_lock);
1860 	kmem_free(vqp, sizeof (vsw_queue_t));
1861 }
1862 
1863 static void
1864 vsw_queue_worker(vsw_mac_ring_t *rrp)
1865 {
1866 	mblk_t		*mp;
1867 	vsw_queue_t	*vqp = rrp->ring_vqp;
1868 	vsw_t		*vswp = rrp->ring_vswp;
1869 
1870 	mutex_enter(&vqp->vq_lock);
1871 
1872 	ASSERT(vqp->vq_state == VSW_QUEUE_STOPPED);
1873 
1874 	/*
1875 	 * Set the state to running, since the thread is now active.
1876 	 */
1877 	vqp->vq_state = VSW_QUEUE_RUNNING;
1878 	cv_signal(&vqp->vq_cv);
1879 
1880 	while (vqp->vq_state == VSW_QUEUE_RUNNING) {
1881 		/*
1882 		 * Wait for work to do or the state has changed
1883 		 * to not running.
1884 		 */
1885 		while ((vqp->vq_state == VSW_QUEUE_RUNNING) &&
1886 				(vqp->vq_first == NULL)) {
1887 			cv_wait(&vqp->vq_cv, &vqp->vq_lock);
1888 		}
1889 
1890 		/*
1891 		 * Process packets that we received from the interface.
1892 		 */
1893 		if (vqp->vq_first != NULL) {
1894 			mp = vqp->vq_first;
1895 
1896 			vqp->vq_first = NULL;
1897 			vqp->vq_last = NULL;
1898 
1899 			mutex_exit(&vqp->vq_lock);
1900 
1901 			/* switch the chain of packets received */
1902 			vswp->vsw_switch_frame(vswp, mp,
1903 						VSW_PHYSDEV, NULL, NULL);
1904 
1905 			mutex_enter(&vqp->vq_lock);
1906 		}
1907 	}
1908 
1909 	/*
1910 	 * We are drained and signal we are done.
1911 	 */
1912 	vqp->vq_state = VSW_QUEUE_DRAINED;
1913 	cv_signal(&vqp->vq_cv);
1914 
1915 	/*
1916 	 * Exit lock and drain the remaining packets.
1917 	 */
1918 	mutex_exit(&vqp->vq_lock);
1919 
1920 	/*
1921 	 * Exit the thread
1922 	 */
1923 	thread_exit();
1924 }
1925 
1926 /*
1927  * static void
1928  * vsw_rx_queue_cb() - Receive callback routine when
1929  *	vsw_multi_ring_enable is non-zero.  Queue the packets
1930  *	to a packet queue for a worker thread to process.
1931  */
1932 static void
1933 vsw_rx_queue_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
1934 {
1935 	vsw_mac_ring_t	*ringp = (vsw_mac_ring_t *)mrh;
1936 	vsw_t		*vswp = (vsw_t *)arg;
1937 	vsw_queue_t	*vqp;
1938 	mblk_t		*bp, *last;
1939 
1940 	ASSERT(mrh != NULL);
1941 	ASSERT(vswp != NULL);
1942 	ASSERT(mp != NULL);
1943 
1944 	D1(vswp, "%s: enter", __func__);
1945 
1946 	/*
1947 	 * Find the last element in the mblk chain.
1948 	 */
1949 	bp = mp;
1950 	do {
1951 		last = bp;
1952 		bp = bp->b_next;
1953 	} while (bp != NULL);
1954 
1955 	/* Get the queue for the packets */
1956 	vqp = ringp->ring_vqp;
1957 
1958 	/*
1959 	 * Grab the lock such we can queue the packets.
1960 	 */
1961 	mutex_enter(&vqp->vq_lock);
1962 
1963 	if (vqp->vq_state != VSW_QUEUE_RUNNING) {
1964 		freemsg(mp);
1965 		mutex_exit(&vqp->vq_lock);
1966 		goto vsw_rx_queue_cb_exit;
1967 	}
1968 
1969 	/*
1970 	 * Add the mblk chain to the queue.  If there
1971 	 * is some mblks in the queue, then add the new
1972 	 * chain to the end.
1973 	 */
1974 	if (vqp->vq_first == NULL)
1975 		vqp->vq_first = mp;
1976 	else
1977 		vqp->vq_last->b_next = mp;
1978 
1979 	vqp->vq_last = last;
1980 
1981 	/*
1982 	 * Signal the worker thread that there is work to
1983 	 * do.
1984 	 */
1985 	cv_signal(&vqp->vq_cv);
1986 
1987 	/*
1988 	 * Let go of the lock and exit.
1989 	 */
1990 	mutex_exit(&vqp->vq_lock);
1991 
1992 vsw_rx_queue_cb_exit:
1993 	D1(vswp, "%s: exit", __func__);
1994 }
1995 
1996 /*
1997  * receive callback routine. Invoked by MAC layer when there
1998  * are pkts being passed up from physical device.
1999  *
2000  * PERF: It may be more efficient when the card is in promisc
2001  * mode to check the dest address of the pkts here (against
2002  * the FDB) rather than checking later. Needs to be investigated.
2003  */
2004 static void
2005 vsw_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
2006 {
2007 	_NOTE(ARGUNUSED(mrh))
2008 
2009 	vsw_t		*vswp = (vsw_t *)arg;
2010 
2011 	ASSERT(vswp != NULL);
2012 
2013 	D1(vswp, "vsw_rx_cb: enter");
2014 
2015 	/* switch the chain of packets received */
2016 	vswp->vsw_switch_frame(vswp, mp, VSW_PHYSDEV, NULL, NULL);
2017 
2018 	D1(vswp, "vsw_rx_cb: exit");
2019 }
2020 
2021 /*
2022  * Send a message out over the physical device via the MAC layer.
2023  *
2024  * Returns any mblks that it was unable to transmit.
2025  */
2026 static mblk_t *
2027 vsw_tx_msg(vsw_t *vswp, mblk_t *mp)
2028 {
2029 	const mac_txinfo_t	*mtp;
2030 	mblk_t			*nextp;
2031 
2032 	mutex_enter(&vswp->mac_lock);
2033 	if (vswp->mh == NULL) {
2034 		DERR(vswp, "vsw_tx_msg: dropping pkts: no tx routine avail");
2035 		mutex_exit(&vswp->mac_lock);
2036 		return (mp);
2037 	} else {
2038 		for (;;) {
2039 			nextp = mp->b_next;
2040 			mp->b_next = NULL;
2041 
2042 			mtp = vswp->txinfo;
2043 
2044 			if ((mp = mtp->mt_fn(mtp->mt_arg, mp)) != NULL) {
2045 				mp->b_next = nextp;
2046 				break;
2047 			}
2048 
2049 			if ((mp = nextp) == NULL)
2050 				break;
2051 		}
2052 	}
2053 	mutex_exit(&vswp->mac_lock);
2054 
2055 	return (mp);
2056 }
2057 
2058 /*
2059  * Register with the MAC layer as a network device, so we
2060  * can be plumbed if necessary.
2061  */
2062 static int
2063 vsw_mac_register(vsw_t *vswp)
2064 {
2065 	mac_register_t	*macp;
2066 	int		rv;
2067 
2068 	D1(vswp, "%s: enter", __func__);
2069 
2070 	if ((macp = mac_alloc(MAC_VERSION)) == NULL)
2071 		return (EINVAL);
2072 	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
2073 	macp->m_driver = vswp;
2074 	macp->m_dip = vswp->dip;
2075 	macp->m_src_addr = (uint8_t *)&vswp->if_addr;
2076 	macp->m_callbacks = &vsw_m_callbacks;
2077 	macp->m_min_sdu = 0;
2078 	macp->m_max_sdu = ETHERMTU;
2079 	rv = mac_register(macp, &vswp->if_mh);
2080 	mac_free(macp);
2081 	if (rv == 0)
2082 		vswp->if_state |= VSW_IF_REG;
2083 
2084 	D1(vswp, "%s: exit", __func__);
2085 
2086 	return (rv);
2087 }
2088 
2089 static int
2090 vsw_mac_unregister(vsw_t *vswp)
2091 {
2092 	int		rv = 0;
2093 
2094 	D1(vswp, "%s: enter", __func__);
2095 
2096 	WRITE_ENTER(&vswp->if_lockrw);
2097 
2098 	if (vswp->if_state & VSW_IF_REG) {
2099 		rv = mac_unregister(vswp->if_mh);
2100 		if (rv != 0) {
2101 			DWARN(vswp, "%s: unable to unregister from MAC "
2102 				"framework", __func__);
2103 
2104 			RW_EXIT(&vswp->if_lockrw);
2105 			D1(vswp, "%s: fail exit", __func__);
2106 			return (rv);
2107 		}
2108 
2109 		/* mark i/f as down and unregistered */
2110 		vswp->if_state &= ~(VSW_IF_UP | VSW_IF_REG);
2111 	}
2112 	RW_EXIT(&vswp->if_lockrw);
2113 
2114 	D1(vswp, "%s: exit", __func__);
2115 
2116 	return (rv);
2117 }
2118 
2119 static int
2120 vsw_m_stat(void *arg, uint_t stat, uint64_t *val)
2121 {
2122 	vsw_t			*vswp = (vsw_t *)arg;
2123 
2124 	D1(vswp, "%s: enter", __func__);
2125 
2126 	mutex_enter(&vswp->mac_lock);
2127 	if (vswp->mh == NULL) {
2128 		mutex_exit(&vswp->mac_lock);
2129 		return (EINVAL);
2130 	}
2131 
2132 	/* return stats from underlying device */
2133 	*val = mac_stat_get(vswp->mh, stat);
2134 
2135 	mutex_exit(&vswp->mac_lock);
2136 
2137 	return (0);
2138 }
2139 
2140 static void
2141 vsw_m_stop(void *arg)
2142 {
2143 	vsw_t		*vswp = (vsw_t *)arg;
2144 
2145 	D1(vswp, "%s: enter", __func__);
2146 
2147 	WRITE_ENTER(&vswp->if_lockrw);
2148 	vswp->if_state &= ~VSW_IF_UP;
2149 	RW_EXIT(&vswp->if_lockrw);
2150 
2151 	D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state);
2152 }
2153 
2154 static int
2155 vsw_m_start(void *arg)
2156 {
2157 	vsw_t		*vswp = (vsw_t *)arg;
2158 
2159 	D1(vswp, "%s: enter", __func__);
2160 
2161 	WRITE_ENTER(&vswp->if_lockrw);
2162 	vswp->if_state |= VSW_IF_UP;
2163 	RW_EXIT(&vswp->if_lockrw);
2164 
2165 	D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state);
2166 	return (0);
2167 }
2168 
2169 /*
2170  * Change the local interface address.
2171  */
2172 static int
2173 vsw_m_unicst(void *arg, const uint8_t *macaddr)
2174 {
2175 	vsw_t		*vswp = (vsw_t *)arg;
2176 
2177 	D1(vswp, "%s: enter", __func__);
2178 
2179 	WRITE_ENTER(&vswp->if_lockrw);
2180 	ether_copy(macaddr, &vswp->if_addr);
2181 	RW_EXIT(&vswp->if_lockrw);
2182 
2183 	D1(vswp, "%s: exit", __func__);
2184 
2185 	return (0);
2186 }
2187 
2188 static int
2189 vsw_m_multicst(void *arg, boolean_t add, const uint8_t *mca)
2190 {
2191 	vsw_t		*vswp = (vsw_t *)arg;
2192 	mcst_addr_t	*mcst_p = NULL;
2193 	uint64_t	addr = 0x0;
2194 	int		i, ret = 0;
2195 
2196 	D1(vswp, "%s: enter", __func__);
2197 
2198 	/*
2199 	 * Convert address into form that can be used
2200 	 * as hash table key.
2201 	 */
2202 	for (i = 0; i < ETHERADDRL; i++) {
2203 		addr = (addr << 8) | mca[i];
2204 	}
2205 
2206 	D2(vswp, "%s: addr = 0x%llx", __func__, addr);
2207 
2208 	if (add) {
2209 		D2(vswp, "%s: adding multicast", __func__);
2210 		if (vsw_add_mcst(vswp, VSW_LOCALDEV, addr, NULL) == 0) {
2211 			/*
2212 			 * Update the list of multicast addresses
2213 			 * contained within the vsw_t structure to
2214 			 * include this new one.
2215 			 */
2216 			mcst_p = kmem_zalloc(sizeof (mcst_addr_t), KM_NOSLEEP);
2217 			if (mcst_p == NULL) {
2218 				DERR(vswp, "%s unable to alloc mem", __func__);
2219 				return (1);
2220 			}
2221 			mcst_p->addr = addr;
2222 
2223 			mutex_enter(&vswp->mca_lock);
2224 			mcst_p->nextp = vswp->mcap;
2225 			vswp->mcap = mcst_p;
2226 			mutex_exit(&vswp->mca_lock);
2227 
2228 			/*
2229 			 * Call into the underlying driver to program the
2230 			 * address into HW.
2231 			 */
2232 			mutex_enter(&vswp->mac_lock);
2233 			if (vswp->mh != NULL) {
2234 				ret = mac_multicst_add(vswp->mh, mca);
2235 				if (ret != 0) {
2236 					cmn_err(CE_WARN, "!vsw%d: unable to "
2237 						"add multicast address",
2238 						vswp->instance);
2239 					mutex_exit(&vswp->mac_lock);
2240 					goto vsw_remove_addr;
2241 				}
2242 			}
2243 			mutex_exit(&vswp->mac_lock);
2244 		} else {
2245 			cmn_err(CE_WARN, "!vsw%d: unable to add multicast "
2246 				"address", vswp->instance);
2247 		}
2248 		return (ret);
2249 	}
2250 
2251 vsw_remove_addr:
2252 
2253 	D2(vswp, "%s: removing multicast", __func__);
2254 	/*
2255 	 * Remove the address from the hash table..
2256 	 */
2257 	if (vsw_del_mcst(vswp, VSW_LOCALDEV, addr, NULL) == 0) {
2258 
2259 		/*
2260 		 * ..and then from the list maintained in the
2261 		 * vsw_t structure.
2262 		 */
2263 		vsw_del_addr(VSW_LOCALDEV, vswp, addr);
2264 
2265 		mutex_enter(&vswp->mac_lock);
2266 		if (vswp->mh != NULL)
2267 			(void) mac_multicst_remove(vswp->mh, mca);
2268 		mutex_exit(&vswp->mac_lock);
2269 	}
2270 
2271 	D1(vswp, "%s: exit", __func__);
2272 
2273 	return (0);
2274 }
2275 
2276 static int
2277 vsw_m_promisc(void *arg, boolean_t on)
2278 {
2279 	vsw_t		*vswp = (vsw_t *)arg;
2280 
2281 	D1(vswp, "%s: enter", __func__);
2282 
2283 	WRITE_ENTER(&vswp->if_lockrw);
2284 	if (on)
2285 		vswp->if_state |= VSW_IF_PROMISC;
2286 	else
2287 		vswp->if_state &= ~VSW_IF_PROMISC;
2288 	RW_EXIT(&vswp->if_lockrw);
2289 
2290 	D1(vswp, "%s: exit", __func__);
2291 
2292 	return (0);
2293 }
2294 
2295 static mblk_t *
2296 vsw_m_tx(void *arg, mblk_t *mp)
2297 {
2298 	vsw_t		*vswp = (vsw_t *)arg;
2299 
2300 	D1(vswp, "%s: enter", __func__);
2301 
2302 	vswp->vsw_switch_frame(vswp, mp, VSW_LOCALDEV, NULL, NULL);
2303 
2304 	D1(vswp, "%s: exit", __func__);
2305 
2306 	return (NULL);
2307 }
2308 
2309 /*
2310  * Register for machine description (MD) updates.
2311  *
2312  * Returns 0 on success, 1 on failure.
2313  */
2314 static int
2315 vsw_mdeg_register(vsw_t *vswp)
2316 {
2317 	mdeg_prop_spec_t	*pspecp;
2318 	mdeg_node_spec_t	*inst_specp;
2319 	mdeg_handle_t		mdeg_hdl, mdeg_port_hdl;
2320 	size_t			templatesz;
2321 	int			inst, rv;
2322 
2323 	D1(vswp, "%s: enter", __func__);
2324 
2325 	/*
2326 	 * In each 'virtual-device' node in the MD there is a
2327 	 * 'cfg-handle' property which is the MD's concept of
2328 	 * an instance number (this may be completely different from
2329 	 * the device drivers instance #). OBP reads that value and
2330 	 * stores it in the 'reg' property of the appropriate node in
2331 	 * the device tree. So we use the 'reg' value when registering
2332 	 * with the mdeg framework, to ensure we get events for the
2333 	 * correct nodes.
2334 	 */
2335 	inst = ddi_prop_get_int(DDI_DEV_T_ANY, vswp->dip,
2336 		DDI_PROP_DONTPASS, reg_propname, -1);
2337 	if (inst == -1) {
2338 		cmn_err(CE_WARN, "!vsw%d: Unable to read %s property from "
2339 			"OBP device tree", vswp->instance, reg_propname);
2340 		return (1);
2341 	}
2342 
2343 	D2(vswp, "%s: instance %d registering with mdeg", __func__, inst);
2344 
2345 	/*
2346 	 * Allocate and initialize a per-instance copy
2347 	 * of the global property spec array that will
2348 	 * uniquely identify this vsw instance.
2349 	 */
2350 	templatesz = sizeof (vsw_prop_template);
2351 	pspecp = kmem_zalloc(templatesz, KM_SLEEP);
2352 
2353 	bcopy(vsw_prop_template, pspecp, templatesz);
2354 
2355 	VSW_SET_MDEG_PROP_INST(pspecp, inst);
2356 
2357 	/* initialize the complete prop spec structure */
2358 	inst_specp = kmem_zalloc(sizeof (mdeg_node_spec_t), KM_SLEEP);
2359 	inst_specp->namep = "virtual-device";
2360 	inst_specp->specp = pspecp;
2361 
2362 	/*
2363 	 * Register an interest in 'virtual-device' nodes with a
2364 	 * 'name' property of 'virtual-network-switch'
2365 	 */
2366 	rv = mdeg_register(inst_specp, &vdev_match, vsw_mdeg_cb,
2367 	    (void *)vswp, &mdeg_hdl);
2368 	if (rv != MDEG_SUCCESS) {
2369 		DERR(vswp, "%s: mdeg_register failed (%d) for vsw node",
2370 			__func__, rv);
2371 		goto mdeg_reg_fail;
2372 	}
2373 
2374 	/*
2375 	 * Register an interest in 'vsw-port' nodes.
2376 	 */
2377 	rv = mdeg_register(inst_specp, &vport_match, vsw_port_mdeg_cb,
2378 	    (void *)vswp, &mdeg_port_hdl);
2379 	if (rv != MDEG_SUCCESS) {
2380 		DERR(vswp, "%s: mdeg_register failed (%d)\n", __func__, rv);
2381 		(void) mdeg_unregister(mdeg_hdl);
2382 		goto mdeg_reg_fail;
2383 	}
2384 
2385 	/* save off data that will be needed later */
2386 	vswp->inst_spec = inst_specp;
2387 	vswp->mdeg_hdl = mdeg_hdl;
2388 	vswp->mdeg_port_hdl = mdeg_port_hdl;
2389 
2390 	D1(vswp, "%s: exit", __func__);
2391 	return (0);
2392 
2393 mdeg_reg_fail:
2394 	cmn_err(CE_WARN, "!vsw%d: Unable to register MDEG callbacks",
2395 				vswp->instance);
2396 	kmem_free(pspecp, templatesz);
2397 	kmem_free(inst_specp, sizeof (mdeg_node_spec_t));
2398 
2399 	vswp->mdeg_hdl = NULL;
2400 	vswp->mdeg_port_hdl = NULL;
2401 
2402 	return (1);
2403 }
2404 
2405 static void
2406 vsw_mdeg_unregister(vsw_t *vswp)
2407 {
2408 	D1(vswp, "vsw_mdeg_unregister: enter");
2409 
2410 	if (vswp->mdeg_hdl != NULL)
2411 		(void) mdeg_unregister(vswp->mdeg_hdl);
2412 
2413 	if (vswp->mdeg_port_hdl != NULL)
2414 		(void) mdeg_unregister(vswp->mdeg_port_hdl);
2415 
2416 	if (vswp->inst_spec != NULL) {
2417 		if (vswp->inst_spec->specp != NULL) {
2418 			(void) kmem_free(vswp->inst_spec->specp,
2419 				sizeof (vsw_prop_template));
2420 			vswp->inst_spec->specp = NULL;
2421 		}
2422 
2423 		(void) kmem_free(vswp->inst_spec,
2424 			sizeof (mdeg_node_spec_t));
2425 		vswp->inst_spec = NULL;
2426 	}
2427 
2428 	D1(vswp, "vsw_mdeg_unregister: exit");
2429 }
2430 
2431 /*
2432  * Mdeg callback invoked for the vsw node itself.
2433  */
2434 static int
2435 vsw_mdeg_cb(void *cb_argp, mdeg_result_t *resp)
2436 {
2437 	vsw_t		*vswp;
2438 	int		idx;
2439 	md_t		*mdp;
2440 	mde_cookie_t	node;
2441 	uint64_t	inst;
2442 	char		*node_name = NULL;
2443 
2444 	if (resp == NULL)
2445 		return (MDEG_FAILURE);
2446 
2447 	vswp = (vsw_t *)cb_argp;
2448 
2449 	D1(vswp, "%s: added %d : removed %d : curr matched %d"
2450 		" : prev matched %d", __func__, resp->added.nelem,
2451 		resp->removed.nelem, resp->match_curr.nelem,
2452 		resp->match_prev.nelem);
2453 
2454 	/*
2455 	 * Expect 'added' to be non-zero if virtual-network-switch
2456 	 * nodes exist in the MD when the driver attaches.
2457 	 */
2458 	for (idx = 0; idx < resp->added.nelem; idx++) {
2459 		mdp = resp->added.mdp;
2460 		node = resp->added.mdep[idx];
2461 
2462 		if (md_get_prop_str(mdp, node, "name", &node_name) != 0) {
2463 			DERR(vswp, "%s: unable to get node name for "
2464 				"node(%d) 0x%lx", __func__, idx, node);
2465 			continue;
2466 		}
2467 
2468 		if (md_get_prop_val(mdp, node, "cfg-handle", &inst)) {
2469 			DERR(vswp, "%s: prop(cfg-handle) not found port(%d)",
2470 				__func__, idx);
2471 			continue;
2472 		}
2473 
2474 		D2(vswp, "%s: added node(%d) 0x%lx with name %s "
2475 			"and inst %d", __func__, idx, node, node_name, inst);
2476 
2477 		vsw_get_initial_md_properties(vswp, mdp, node);
2478 	}
2479 
2480 	/*
2481 	 * A non-zero 'match' value indicates that the MD has been
2482 	 * updated and that a virtual-network-switch node is present
2483 	 * which may or may not have been updated. It is up to the clients
2484 	 * to examine their own nodes and determine if they have changed.
2485 	 */
2486 	for (idx = 0; idx < resp->match_curr.nelem; idx++) {
2487 		mdp = resp->match_curr.mdp;
2488 		node = resp->match_curr.mdep[idx];
2489 
2490 		if (md_get_prop_str(mdp, node, "name", &node_name) != 0) {
2491 			DERR(vswp, "%s: unable to get node name for "
2492 				"node(%d) 0x%lx", __func__, idx, node);
2493 			continue;
2494 		}
2495 
2496 		if (md_get_prop_val(mdp, node, "cfg-handle", &inst)) {
2497 			DERR(vswp, "%s: prop(cfg-handle) not found port(%d)",
2498 				__func__, idx);
2499 			continue;
2500 		}
2501 
2502 		D2(vswp, "%s: changed node(%d) 0x%lx with name %s "
2503 			"and inst %d", __func__, idx, node, node_name, inst);
2504 
2505 		vsw_update_md_prop(vswp, mdp, node);
2506 	}
2507 
2508 	return (MDEG_SUCCESS);
2509 }
2510 
2511 /*
2512  * Mdeg callback invoked for changes to the vsw-port nodes
2513  * under the vsw node.
2514  */
2515 static int
2516 vsw_port_mdeg_cb(void *cb_argp, mdeg_result_t *resp)
2517 {
2518 	vsw_t		*vswp;
2519 	int		idx;
2520 	md_t		*mdp;
2521 	mde_cookie_t	node;
2522 	uint64_t	inst;
2523 
2524 	if ((resp == NULL) || (cb_argp == NULL))
2525 		return (MDEG_FAILURE);
2526 
2527 	vswp = (vsw_t *)cb_argp;
2528 
2529 	D2(vswp, "%s: added %d : removed %d : curr matched %d"
2530 		" : prev matched %d", __func__, resp->added.nelem,
2531 		resp->removed.nelem, resp->match_curr.nelem,
2532 		resp->match_prev.nelem);
2533 
2534 	/* process added ports */
2535 	for (idx = 0; idx < resp->added.nelem; idx++) {
2536 		mdp = resp->added.mdp;
2537 		node = resp->added.mdep[idx];
2538 
2539 		D2(vswp, "%s: adding node(%d) 0x%lx", __func__, idx, node);
2540 
2541 		if (vsw_port_add(vswp, mdp, &node) != 0) {
2542 			cmn_err(CE_WARN, "!vsw%d: Unable to add new port "
2543 				"(0x%lx)", vswp->instance, node);
2544 		}
2545 	}
2546 
2547 	/* process removed ports */
2548 	for (idx = 0; idx < resp->removed.nelem; idx++) {
2549 		mdp = resp->removed.mdp;
2550 		node = resp->removed.mdep[idx];
2551 
2552 		if (md_get_prop_val(mdp, node, id_propname, &inst)) {
2553 			DERR(vswp, "%s: prop(%s) not found in port(%d)",
2554 				__func__, id_propname, idx);
2555 			continue;
2556 		}
2557 
2558 		D2(vswp, "%s: removing node(%d) 0x%lx", __func__, idx, node);
2559 
2560 		if (vsw_port_detach(vswp, inst) != 0) {
2561 			cmn_err(CE_WARN, "!vsw%d: Unable to remove port %ld",
2562 				vswp->instance, inst);
2563 		}
2564 	}
2565 
2566 	/*
2567 	 * Currently no support for updating already active ports.
2568 	 * So, ignore the match_curr and match_priv arrays for now.
2569 	 */
2570 
2571 	D1(vswp, "%s: exit", __func__);
2572 
2573 	return (MDEG_SUCCESS);
2574 }
2575 
2576 /*
2577  * Read the initial start-of-day values from the specified MD node.
2578  */
2579 static void
2580 vsw_get_initial_md_properties(vsw_t *vswp, md_t *mdp, mde_cookie_t node)
2581 {
2582 	int		i;
2583 	uint64_t 	macaddr = 0;
2584 
2585 	D1(vswp, "%s: enter", __func__);
2586 
2587 	if (vsw_get_md_physname(vswp, mdp, node, vswp->physname) == 0) {
2588 		/*
2589 		 * Note it is valid for the physname property to
2590 		 * be NULL so check actual name length to determine
2591 		 * if we have a actual device name.
2592 		 */
2593 		if (strlen(vswp->physname) > 0)
2594 			vswp->mdprops |= VSW_MD_PHYSNAME;
2595 	} else {
2596 		cmn_err(CE_WARN, "!vsw%d: Unable to read name of physical "
2597 			"device from MD", vswp->instance);
2598 		return;
2599 	}
2600 
2601 	/* mac address for vswitch device itself */
2602 	if (md_get_prop_val(mdp, node, macaddr_propname, &macaddr) != 0) {
2603 		cmn_err(CE_WARN, "!vsw%d: Unable to get MAC address from MD",
2604 			vswp->instance);
2605 
2606 		/*
2607 		 * Fallback to using the mac address of the physical
2608 		 * device.
2609 		 */
2610 		if (vsw_get_physaddr(vswp) == 0) {
2611 			cmn_err(CE_NOTE, "!vsw%d: Using MAC address from "
2612 				"physical device (%s)", vswp->instance,
2613 				vswp->physname);
2614 		} else {
2615 			cmn_err(CE_WARN, "!vsw%d: Unable to get MAC address"
2616 				"from device %s", vswp->instance,
2617 				vswp->physname);
2618 		}
2619 	} else {
2620 		WRITE_ENTER(&vswp->if_lockrw);
2621 		for (i = ETHERADDRL - 1; i >= 0; i--) {
2622 			vswp->if_addr.ether_addr_octet[i] = macaddr & 0xFF;
2623 			macaddr >>= 8;
2624 		}
2625 		RW_EXIT(&vswp->if_lockrw);
2626 		vswp->mdprops |= VSW_MD_MACADDR;
2627 	}
2628 
2629 	if (vsw_get_md_smodes(vswp, mdp, node,
2630 				vswp->smode, &vswp->smode_num)) {
2631 		cmn_err(CE_WARN, "vsw%d: Unable to read %s property from "
2632 			"MD, defaulting to programmed mode", vswp->instance,
2633 			smode_propname);
2634 
2635 		for (i = 0; i < NUM_SMODES; i++)
2636 			vswp->smode[i] = VSW_LAYER2;
2637 
2638 		vswp->smode_num = NUM_SMODES;
2639 	} else {
2640 		ASSERT(vswp->smode_num != 0);
2641 		vswp->mdprops |= VSW_MD_SMODE;
2642 	}
2643 
2644 	/*
2645 	 * Unable to setup any switching mode, nothing more
2646 	 * we can do.
2647 	 */
2648 	if (vsw_setup_switching(vswp))
2649 		return;
2650 
2651 	WRITE_ENTER(&vswp->if_lockrw);
2652 	vswp->if_state &= ~VSW_IF_UP;
2653 	RW_EXIT(&vswp->if_lockrw);
2654 	if (vswp->mdprops & (VSW_MD_MACADDR | VSW_DEV_MACADDR)) {
2655 		if (vsw_mac_register(vswp) != 0) {
2656 			/*
2657 			 * Treat this as a non-fatal error as we may be
2658 			 * able to operate in some other mode.
2659 			 */
2660 			cmn_err(CE_WARN, "vsw%d: Unable to register as "
2661 				"provider with MAC layer", vswp->instance);
2662 		}
2663 	}
2664 
2665 	D1(vswp, "%s: exit", __func__);
2666 }
2667 
2668 /*
2669  * Check to see if the relevant properties in the specified node have
2670  * changed, and if so take the appropriate action.
2671  *
2672  * If any of the properties are missing or invalid we don't take
2673  * any action, as this function should only be invoked when modifications
2674  * have been made to what we assume is a working configuration, which
2675  * we leave active.
2676  *
2677  * Note it is legal for this routine to be invoked even if none of the
2678  * properties in the port node within the MD have actually changed.
2679  */
2680 static void
2681 vsw_update_md_prop(vsw_t *vswp, md_t *mdp, mde_cookie_t node)
2682 {
2683 	char		physname[LIFNAMSIZ];
2684 	char		drv[LIFNAMSIZ];
2685 	uint_t		ddi_instance;
2686 	uint8_t		new_smode[NUM_SMODES];
2687 	int		i, smode_num = 0;
2688 	uint64_t 	macaddr = 0;
2689 	vsw_port_list_t *plist = &vswp->plist;
2690 	vsw_port_t	*port = NULL;
2691 	enum		{MD_init = 0x1,
2692 				MD_physname = 0x2,
2693 				MD_macaddr = 0x4,
2694 				MD_smode = 0x8} updated;
2695 
2696 	updated = MD_init;
2697 
2698 	D1(vswp, "%s: enter", __func__);
2699 
2700 	/*
2701 	 * Check if name of physical device in MD has changed.
2702 	 */
2703 	if (vsw_get_md_physname(vswp, mdp, node, (char *)&physname) == 0) {
2704 		/*
2705 		 * Do basic sanity check on new device name/instance,
2706 		 * if its non NULL. It is valid for the device name to
2707 		 * have changed from a non NULL to a NULL value, i.e.
2708 		 * the vsw is being changed to 'routed' mode.
2709 		 */
2710 		if ((strlen(physname) != 0) &&
2711 			(ddi_parse(physname, drv,
2712 				&ddi_instance) != DDI_SUCCESS)) {
2713 			cmn_err(CE_WARN, "!vsw%d: new device name %s is not"
2714 				" a valid device name/instance",
2715 				vswp->instance, physname);
2716 			goto fail_reconf;
2717 		}
2718 
2719 		if (strcmp(physname, vswp->physname)) {
2720 			D2(vswp, "%s: device name changed from %s to %s",
2721 					__func__, vswp->physname, physname);
2722 
2723 			updated |= MD_physname;
2724 		} else {
2725 			D2(vswp, "%s: device name unchanged at %s",
2726 					__func__, vswp->physname);
2727 		}
2728 	} else {
2729 		cmn_err(CE_WARN, "!vsw%d: Unable to read name of physical "
2730 			"device from updated MD.", vswp->instance);
2731 		goto fail_reconf;
2732 	}
2733 
2734 	/*
2735 	 * Check if MAC address has changed.
2736 	 */
2737 	if (md_get_prop_val(mdp, node, macaddr_propname, &macaddr) != 0) {
2738 		cmn_err(CE_WARN, "!vsw%d: Unable to get MAC address from MD",
2739 			vswp->instance);
2740 		goto fail_reconf;
2741 	} else {
2742 		READ_ENTER(&vswp->if_lockrw);
2743 		for (i = ETHERADDRL - 1; i >= 0; i--) {
2744 			if (vswp->if_addr.ether_addr_octet[i]
2745 							!= (macaddr & 0xFF)) {
2746 				D2(vswp, "%s: octet[%d] 0x%x != 0x%x",
2747 					__func__, i,
2748 					vswp->if_addr.ether_addr_octet[i],
2749 					(macaddr & 0xFF));
2750 				updated |= MD_macaddr;
2751 				break;
2752 			}
2753 			macaddr >>= 8;
2754 		}
2755 		RW_EXIT(&vswp->if_lockrw);
2756 	}
2757 
2758 	/*
2759 	 * Check if switching modes have changed.
2760 	 */
2761 	if (vsw_get_md_smodes(vswp, mdp, node,
2762 				new_smode, &smode_num)) {
2763 		cmn_err(CE_WARN, "!vsw%d: Unable to read %s property from MD",
2764 					vswp->instance, smode_propname);
2765 		goto fail_reconf;
2766 	} else {
2767 		ASSERT(smode_num != 0);
2768 		if (smode_num != vswp->smode_num) {
2769 			D2(vswp, "%s: number of modes changed from %d to %d",
2770 				__func__, vswp->smode_num, smode_num);
2771 		}
2772 
2773 		for (i = 0; i < smode_num; i++) {
2774 			if (new_smode[i] != vswp->smode[i]) {
2775 				D2(vswp, "%s: mode changed from %d to %d",
2776 					__func__, vswp->smode[i], new_smode[i]);
2777 				updated |= MD_smode;
2778 				break;
2779 			}
2780 		}
2781 	}
2782 
2783 	/*
2784 	 * Now make any changes which are needed...
2785 	 */
2786 
2787 	if (updated & (MD_physname | MD_smode)) {
2788 		/*
2789 		 * Disconnect all ports from the current card
2790 		 */
2791 		WRITE_ENTER(&plist->lockrw);
2792 		for (port = plist->head; port != NULL; port = port->p_next) {
2793 			/* Remove address if was programmed into HW. */
2794 			if (vsw_unset_hw(vswp, port)) {
2795 				RW_EXIT(&plist->lockrw);
2796 				goto fail_update;
2797 			}
2798 		}
2799 		RW_EXIT(&plist->lockrw);
2800 
2801 		/*
2802 		 * Stop, detach the old device..
2803 		 */
2804 		vsw_mac_detach(vswp);
2805 
2806 		/*
2807 		 * Update phys name.
2808 		 */
2809 		if (updated & MD_physname) {
2810 			cmn_err(CE_NOTE, "!vsw%d: changing from %s to %s",
2811 				vswp->instance, vswp->physname, physname);
2812 			(void) strncpy(vswp->physname,
2813 					physname, strlen(physname) + 1);
2814 
2815 			if (strlen(vswp->physname) > 0)
2816 				vswp->mdprops |= VSW_MD_PHYSNAME;
2817 		}
2818 
2819 		/*
2820 		 * Update array with the new switch mode values.
2821 		 */
2822 		if (updated & MD_smode) {
2823 			for (i = 0; i < smode_num; i++)
2824 				vswp->smode[i] = new_smode[i];
2825 
2826 			vswp->smode_num = smode_num;
2827 			vswp->smode_idx = 0;
2828 		}
2829 
2830 		/*
2831 		 * ..and attach, start the new device.
2832 		 */
2833 		if (vsw_setup_switching(vswp))
2834 			goto fail_update;
2835 
2836 		/*
2837 		 * Connect ports to new card.
2838 		 */
2839 		WRITE_ENTER(&plist->lockrw);
2840 		for (port = plist->head; port != NULL; port = port->p_next) {
2841 			if (vsw_set_hw(vswp, port)) {
2842 				RW_EXIT(&plist->lockrw);
2843 				goto fail_update;
2844 			}
2845 		}
2846 		RW_EXIT(&plist->lockrw);
2847 	}
2848 
2849 	if (updated & MD_macaddr) {
2850 		cmn_err(CE_NOTE, "!vsw%d: changing mac address to 0x%lx",
2851 				vswp->instance, macaddr);
2852 
2853 		WRITE_ENTER(&vswp->if_lockrw);
2854 		for (i = ETHERADDRL - 1; i >= 0; i--) {
2855 			vswp->if_addr.ether_addr_octet[i] = macaddr & 0xFF;
2856 			macaddr >>= 8;
2857 		}
2858 		RW_EXIT(&vswp->if_lockrw);
2859 
2860 		/*
2861 		 * Notify the MAC layer of the changed address.
2862 		 */
2863 		mac_unicst_update(vswp->if_mh, (uint8_t *)&vswp->if_addr);
2864 	}
2865 
2866 	return;
2867 
2868 fail_reconf:
2869 	cmn_err(CE_WARN, "!vsw%d: configuration unchanged", vswp->instance);
2870 	return;
2871 
2872 fail_update:
2873 	cmn_err(CE_WARN, "!vsw%d: update of configuration failed",
2874 			vswp->instance);
2875 }
2876 
2877 /*
2878  * Add a new port to the system.
2879  *
2880  * Returns 0 on success, 1 on failure.
2881  */
2882 int
2883 vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node)
2884 {
2885 	uint64_t		ldc_id;
2886 	uint8_t			*addrp;
2887 	int			i, addrsz;
2888 	int			num_nodes = 0, nchan = 0;
2889 	int			listsz = 0;
2890 	mde_cookie_t		*listp = NULL;
2891 	struct ether_addr	ea;
2892 	uint64_t		macaddr;
2893 	uint64_t		inst = 0;
2894 	vsw_port_t		*port;
2895 
2896 	if (md_get_prop_val(mdp, *node, id_propname, &inst)) {
2897 		DWARN(vswp, "%s: prop(%s) not found", __func__,
2898 			id_propname);
2899 		return (1);
2900 	}
2901 
2902 	/*
2903 	 * Find the channel endpoint node(s) (which should be under this
2904 	 * port node) which contain the channel id(s).
2905 	 */
2906 	if ((num_nodes = md_node_count(mdp)) <= 0) {
2907 		DERR(vswp, "%s: invalid number of nodes found (%d)",
2908 			__func__, num_nodes);
2909 		return (1);
2910 	}
2911 
2912 	D2(vswp, "%s: %d nodes found", __func__, num_nodes);
2913 
2914 	/* allocate enough space for node list */
2915 	listsz = num_nodes * sizeof (mde_cookie_t);
2916 	listp = kmem_zalloc(listsz, KM_SLEEP);
2917 
2918 	nchan = md_scan_dag(mdp, *node,
2919 		md_find_name(mdp, chan_propname),
2920 		md_find_name(mdp, "fwd"), listp);
2921 
2922 	if (nchan <= 0) {
2923 		DWARN(vswp, "%s: no %s nodes found", __func__, chan_propname);
2924 		kmem_free(listp, listsz);
2925 		return (1);
2926 	}
2927 
2928 	D2(vswp, "%s: %d %s nodes found", __func__, nchan, chan_propname);
2929 
2930 	/* use property from first node found */
2931 	if (md_get_prop_val(mdp, listp[0], id_propname, &ldc_id)) {
2932 		DWARN(vswp, "%s: prop(%s) not found\n", __func__,
2933 			id_propname);
2934 		kmem_free(listp, listsz);
2935 		return (1);
2936 	}
2937 
2938 	/* don't need list any more */
2939 	kmem_free(listp, listsz);
2940 
2941 	D2(vswp, "%s: ldc_id 0x%llx", __func__, ldc_id);
2942 
2943 	/* read mac-address property */
2944 	if (md_get_prop_data(mdp, *node, remaddr_propname,
2945 					&addrp, &addrsz)) {
2946 		DWARN(vswp, "%s: prop(%s) not found",
2947 				__func__, remaddr_propname);
2948 		return (1);
2949 	}
2950 
2951 	if (addrsz < ETHERADDRL) {
2952 		DWARN(vswp, "%s: invalid address size", __func__);
2953 		return (1);
2954 	}
2955 
2956 	macaddr = *((uint64_t *)addrp);
2957 	D2(vswp, "%s: remote mac address 0x%llx", __func__, macaddr);
2958 
2959 	for (i = ETHERADDRL - 1; i >= 0; i--) {
2960 		ea.ether_addr_octet[i] = macaddr & 0xFF;
2961 		macaddr >>= 8;
2962 	}
2963 
2964 	if (vsw_port_attach(vswp, (int)inst, &ldc_id, 1, &ea) != 0) {
2965 		DERR(vswp, "%s: failed to attach port", __func__);
2966 		return (1);
2967 	}
2968 
2969 	port = vsw_lookup_port(vswp, (int)inst);
2970 
2971 	/* just successfuly created the port, so it should exist */
2972 	ASSERT(port != NULL);
2973 
2974 	return (0);
2975 }
2976 
2977 /*
2978  * Attach the specified port.
2979  *
2980  * Returns 0 on success, 1 on failure.
2981  */
2982 static int
2983 vsw_port_attach(vsw_t *vswp, int p_instance, uint64_t *ldcids, int nids,
2984 struct ether_addr *macaddr)
2985 {
2986 	vsw_port_list_t		*plist = &vswp->plist;
2987 	vsw_port_t		*port, **prev_port;
2988 	int			i;
2989 
2990 	D1(vswp, "%s: enter : port %d", __func__, p_instance);
2991 
2992 	/* port already exists? */
2993 	READ_ENTER(&plist->lockrw);
2994 	for (port = plist->head; port != NULL; port = port->p_next) {
2995 		if (port->p_instance == p_instance) {
2996 			DWARN(vswp, "%s: port instance %d already attached",
2997 				__func__, p_instance);
2998 			RW_EXIT(&plist->lockrw);
2999 			return (1);
3000 		}
3001 	}
3002 	RW_EXIT(&plist->lockrw);
3003 
3004 	port = kmem_zalloc(sizeof (vsw_port_t), KM_SLEEP);
3005 	port->p_vswp = vswp;
3006 	port->p_instance = p_instance;
3007 	port->p_ldclist.num_ldcs = 0;
3008 	port->p_ldclist.head = NULL;
3009 	port->addr_set = VSW_ADDR_UNSET;
3010 
3011 	rw_init(&port->p_ldclist.lockrw, NULL, RW_DRIVER, NULL);
3012 
3013 	mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL);
3014 	mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL);
3015 
3016 	mutex_init(&port->ref_lock, NULL, MUTEX_DRIVER, NULL);
3017 	cv_init(&port->ref_cv, NULL, CV_DRIVER, NULL);
3018 
3019 	mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL);
3020 	cv_init(&port->state_cv, NULL, CV_DRIVER, NULL);
3021 	port->state = VSW_PORT_INIT;
3022 
3023 	if (nids > VSW_PORT_MAX_LDCS) {
3024 		D2(vswp, "%s: using first of %d ldc ids",
3025 			__func__, nids);
3026 		nids = VSW_PORT_MAX_LDCS;
3027 	}
3028 
3029 	D2(vswp, "%s: %d nids", __func__, nids);
3030 	for (i = 0; i < nids; i++) {
3031 		D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[i]);
3032 		if (vsw_ldc_attach(port, (uint64_t)ldcids[i]) != 0) {
3033 			DERR(vswp, "%s: ldc_attach failed", __func__);
3034 
3035 			rw_destroy(&port->p_ldclist.lockrw);
3036 
3037 			cv_destroy(&port->ref_cv);
3038 			mutex_destroy(&port->ref_lock);
3039 
3040 			cv_destroy(&port->state_cv);
3041 			mutex_destroy(&port->state_lock);
3042 
3043 			mutex_destroy(&port->tx_lock);
3044 			mutex_destroy(&port->mca_lock);
3045 			kmem_free(port, sizeof (vsw_port_t));
3046 			return (1);
3047 		}
3048 	}
3049 
3050 	ether_copy(macaddr, &port->p_macaddr);
3051 
3052 	WRITE_ENTER(&plist->lockrw);
3053 
3054 	/* create the fdb entry for this port/mac address */
3055 	(void) vsw_add_fdb(vswp, port);
3056 
3057 	(void) vsw_set_hw(vswp, port);
3058 
3059 	/* link it into the list of ports for this vsw instance */
3060 	prev_port = (vsw_port_t **)(&plist->head);
3061 	port->p_next = *prev_port;
3062 	*prev_port = port;
3063 	plist->num_ports++;
3064 	RW_EXIT(&plist->lockrw);
3065 
3066 	/*
3067 	 * Initialise the port and any ldc's under it.
3068 	 */
3069 	(void) vsw_init_ldcs(port);
3070 
3071 	D1(vswp, "%s: exit", __func__);
3072 	return (0);
3073 }
3074 
3075 /*
3076  * Detach the specified port.
3077  *
3078  * Returns 0 on success, 1 on failure.
3079  */
3080 static int
3081 vsw_port_detach(vsw_t *vswp, int p_instance)
3082 {
3083 	vsw_port_t	*port = NULL;
3084 	vsw_port_list_t	*plist = &vswp->plist;
3085 
3086 	D1(vswp, "%s: enter: port id %d", __func__, p_instance);
3087 
3088 	WRITE_ENTER(&plist->lockrw);
3089 
3090 	if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) {
3091 		RW_EXIT(&plist->lockrw);
3092 		return (1);
3093 	}
3094 
3095 	if (vsw_plist_del_node(vswp, port)) {
3096 		RW_EXIT(&plist->lockrw);
3097 		return (1);
3098 	}
3099 
3100 	/* Remove address if was programmed into HW. */
3101 	(void) vsw_unset_hw(vswp, port);
3102 
3103 	/* Remove the fdb entry for this port/mac address */
3104 	(void) vsw_del_fdb(vswp, port);
3105 
3106 	/* Remove any multicast addresses.. */
3107 	vsw_del_mcst_port(port);
3108 
3109 	/*
3110 	 * No longer need to hold writer lock on port list now
3111 	 * that we have unlinked the target port from the list.
3112 	 */
3113 	RW_EXIT(&plist->lockrw);
3114 
3115 	READ_ENTER(&plist->lockrw);
3116 
3117 	if (vswp->recfg_reqd)
3118 		(void) vsw_reconfig_hw(vswp);
3119 
3120 	RW_EXIT(&plist->lockrw);
3121 
3122 	if (vsw_port_delete(port)) {
3123 		return (1);
3124 	}
3125 
3126 	D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance);
3127 	return (0);
3128 }
3129 
3130 /*
3131  * Detach all active ports.
3132  *
3133  * Returns 0 on success, 1 on failure.
3134  */
3135 static int
3136 vsw_detach_ports(vsw_t *vswp)
3137 {
3138 	vsw_port_list_t 	*plist = &vswp->plist;
3139 	vsw_port_t		*port = NULL;
3140 
3141 	D1(vswp, "%s: enter", __func__);
3142 
3143 	WRITE_ENTER(&plist->lockrw);
3144 
3145 	while ((port = plist->head) != NULL) {
3146 		if (vsw_plist_del_node(vswp, port)) {
3147 			DERR(vswp, "%s: Error deleting port %d"
3148 				" from port list", __func__,
3149 				port->p_instance);
3150 			RW_EXIT(&plist->lockrw);
3151 			return (1);
3152 		}
3153 
3154 		/* Remove address if was programmed into HW. */
3155 		(void) vsw_unset_hw(vswp, port);
3156 
3157 		/* Remove the fdb entry for this port/mac address */
3158 		(void) vsw_del_fdb(vswp, port);
3159 
3160 		/* Remove any multicast addresses.. */
3161 		vsw_del_mcst_port(port);
3162 
3163 		/*
3164 		 * No longer need to hold the lock on the port list
3165 		 * now that we have unlinked the target port from the
3166 		 * list.
3167 		 */
3168 		RW_EXIT(&plist->lockrw);
3169 		if (vsw_port_delete(port)) {
3170 			DERR(vswp, "%s: Error deleting port %d",
3171 				__func__, port->p_instance);
3172 			return (1);
3173 		}
3174 		WRITE_ENTER(&plist->lockrw);
3175 	}
3176 	RW_EXIT(&plist->lockrw);
3177 
3178 	D1(vswp, "%s: exit", __func__);
3179 
3180 	return (0);
3181 }
3182 
3183 /*
3184  * Delete the specified port.
3185  *
3186  * Returns 0 on success, 1 on failure.
3187  */
3188 static int
3189 vsw_port_delete(vsw_port_t *port)
3190 {
3191 	vsw_ldc_list_t 		*ldcl;
3192 	vsw_t			*vswp = port->p_vswp;
3193 
3194 	D1(vswp, "%s: enter : port id %d", __func__, port->p_instance);
3195 
3196 	(void) vsw_uninit_ldcs(port);
3197 
3198 	/*
3199 	 * Wait for any pending ctrl msg tasks which reference this
3200 	 * port to finish.
3201 	 */
3202 	if (vsw_drain_port_taskq(port))
3203 		return (1);
3204 
3205 	/*
3206 	 * Wait for port reference count to hit zero.
3207 	 */
3208 	mutex_enter(&port->ref_lock);
3209 	while (port->ref_cnt != 0)
3210 		cv_wait(&port->ref_cv, &port->ref_lock);
3211 	mutex_exit(&port->ref_lock);
3212 
3213 	/*
3214 	 * Wait for any active callbacks to finish
3215 	 */
3216 	if (vsw_drain_ldcs(port))
3217 		return (1);
3218 
3219 	ldcl = &port->p_ldclist;
3220 	WRITE_ENTER(&ldcl->lockrw);
3221 	while (ldcl->num_ldcs > 0) {
3222 		if (vsw_ldc_detach(port, ldcl->head->ldc_id) != 0) {;
3223 			cmn_err(CE_WARN, "!vsw%d: unable to detach ldc %ld",
3224 					vswp->instance, ldcl->head->ldc_id);
3225 			RW_EXIT(&ldcl->lockrw);
3226 			return (1);
3227 		}
3228 	}
3229 	RW_EXIT(&ldcl->lockrw);
3230 
3231 	rw_destroy(&port->p_ldclist.lockrw);
3232 
3233 	mutex_destroy(&port->mca_lock);
3234 	mutex_destroy(&port->tx_lock);
3235 	cv_destroy(&port->ref_cv);
3236 	mutex_destroy(&port->ref_lock);
3237 
3238 	cv_destroy(&port->state_cv);
3239 	mutex_destroy(&port->state_lock);
3240 
3241 	kmem_free(port, sizeof (vsw_port_t));
3242 
3243 	D1(vswp, "%s: exit", __func__);
3244 
3245 	return (0);
3246 }
3247 
3248 /*
3249  * Attach a logical domain channel (ldc) under a specified port.
3250  *
3251  * Returns 0 on success, 1 on failure.
3252  */
3253 static int
3254 vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id)
3255 {
3256 	vsw_t 		*vswp = port->p_vswp;
3257 	vsw_ldc_list_t *ldcl = &port->p_ldclist;
3258 	vsw_ldc_t 	*ldcp = NULL;
3259 	ldc_attr_t 	attr;
3260 	ldc_status_t	istatus;
3261 	int 		status = DDI_FAILURE;
3262 	int		rv;
3263 	enum		{ PROG_init = 0x0, PROG_mblks = 0x1,
3264 				PROG_callback = 0x2}
3265 			progress;
3266 
3267 	progress = PROG_init;
3268 
3269 	D1(vswp, "%s: enter", __func__);
3270 
3271 	ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP);
3272 	if (ldcp == NULL) {
3273 		DERR(vswp, "%s: kmem_zalloc failed", __func__);
3274 		return (1);
3275 	}
3276 	ldcp->ldc_id = ldc_id;
3277 
3278 	/* allocate pool of receive mblks */
3279 	rv = vio_create_mblks(vsw_num_mblks, vsw_mblk_size, &(ldcp->rxh));
3280 	if (rv) {
3281 		DWARN(vswp, "%s: unable to create free mblk pool for"
3282 			" channel %ld (rv %d)", __func__, ldc_id, rv);
3283 		kmem_free(ldcp, sizeof (vsw_ldc_t));
3284 		return (1);
3285 	}
3286 
3287 	progress |= PROG_mblks;
3288 
3289 	mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL);
3290 	mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL);
3291 	mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL);
3292 	cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL);
3293 	rw_init(&ldcp->lane_in.dlistrw, NULL, RW_DRIVER, NULL);
3294 	rw_init(&ldcp->lane_out.dlistrw, NULL, RW_DRIVER, NULL);
3295 
3296 	/* required for handshake with peer */
3297 	ldcp->local_session = (uint64_t)ddi_get_lbolt();
3298 	ldcp->peer_session = 0;
3299 	ldcp->session_status = 0;
3300 
3301 	mutex_init(&ldcp->hss_lock, NULL, MUTEX_DRIVER, NULL);
3302 	ldcp->hss_id = 1;	/* Initial handshake session id */
3303 
3304 	/* only set for outbound lane, inbound set by peer */
3305 	mutex_init(&ldcp->lane_in.seq_lock, NULL, MUTEX_DRIVER, NULL);
3306 	mutex_init(&ldcp->lane_out.seq_lock, NULL, MUTEX_DRIVER, NULL);
3307 	vsw_set_lane_attr(vswp, &ldcp->lane_out);
3308 
3309 	attr.devclass = LDC_DEV_NT_SVC;
3310 	attr.instance = ddi_get_instance(vswp->dip);
3311 	attr.mode = LDC_MODE_UNRELIABLE;
3312 	attr.mtu = VSW_LDC_MTU;
3313 	status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle);
3314 	if (status != 0) {
3315 		DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)",
3316 		    __func__, ldc_id, status);
3317 		goto ldc_attach_fail;
3318 	}
3319 
3320 	status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp);
3321 	if (status != 0) {
3322 		DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)",
3323 		    __func__, ldc_id, status);
3324 		(void) ldc_fini(ldcp->ldc_handle);
3325 		goto ldc_attach_fail;
3326 	}
3327 
3328 	progress |= PROG_callback;
3329 
3330 	mutex_init(&ldcp->status_lock, NULL, MUTEX_DRIVER, NULL);
3331 
3332 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
3333 		DERR(vswp, "%s: ldc_status failed", __func__);
3334 		mutex_destroy(&ldcp->status_lock);
3335 		goto ldc_attach_fail;
3336 	}
3337 
3338 	ldcp->ldc_status = istatus;
3339 	ldcp->ldc_port = port;
3340 	ldcp->ldc_vswp = vswp;
3341 
3342 	/* link it into the list of channels for this port */
3343 	WRITE_ENTER(&ldcl->lockrw);
3344 	ldcp->ldc_next = ldcl->head;
3345 	ldcl->head = ldcp;
3346 	ldcl->num_ldcs++;
3347 	RW_EXIT(&ldcl->lockrw);
3348 
3349 	D1(vswp, "%s: exit", __func__);
3350 	return (0);
3351 
3352 ldc_attach_fail:
3353 	mutex_destroy(&ldcp->ldc_txlock);
3354 	mutex_destroy(&ldcp->ldc_cblock);
3355 
3356 	cv_destroy(&ldcp->drain_cv);
3357 
3358 	rw_destroy(&ldcp->lane_in.dlistrw);
3359 	rw_destroy(&ldcp->lane_out.dlistrw);
3360 
3361 	if (progress & PROG_callback) {
3362 		(void) ldc_unreg_callback(ldcp->ldc_handle);
3363 	}
3364 
3365 	if ((progress & PROG_mblks) && (ldcp->rxh != NULL)) {
3366 		if (vio_destroy_mblks(ldcp->rxh) != 0) {
3367 			/*
3368 			 * Something odd has happened, as the destroy
3369 			 * will only fail if some mblks have been allocated
3370 			 * from the pool already (which shouldn't happen)
3371 			 * and have not been returned.
3372 			 *
3373 			 * Add the pool pointer to a list maintained in
3374 			 * the device instance. Another attempt will be made
3375 			 * to free the pool when the device itself detaches.
3376 			 */
3377 			cmn_err(CE_WARN, "!vsw%d: Creation of ldc channel %ld "
3378 				"failed and cannot destroy associated mblk "
3379 				"pool", vswp->instance, ldc_id);
3380 			ldcp->rxh->nextp =  vswp->rxh;
3381 			vswp->rxh = ldcp->rxh;
3382 		}
3383 	}
3384 	mutex_destroy(&ldcp->drain_cv_lock);
3385 	mutex_destroy(&ldcp->hss_lock);
3386 
3387 	mutex_destroy(&ldcp->lane_in.seq_lock);
3388 	mutex_destroy(&ldcp->lane_out.seq_lock);
3389 	kmem_free(ldcp, sizeof (vsw_ldc_t));
3390 
3391 	return (1);
3392 }
3393 
3394 /*
3395  * Detach a logical domain channel (ldc) belonging to a
3396  * particular port.
3397  *
3398  * Returns 0 on success, 1 on failure.
3399  */
3400 static int
3401 vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id)
3402 {
3403 	vsw_t 		*vswp = port->p_vswp;
3404 	vsw_ldc_t 	*ldcp, *prev_ldcp;
3405 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
3406 	int 		rv;
3407 
3408 	prev_ldcp = ldcl->head;
3409 	for (; (ldcp = prev_ldcp) != NULL; prev_ldcp = ldcp->ldc_next) {
3410 		if (ldcp->ldc_id == ldc_id) {
3411 			break;
3412 		}
3413 	}
3414 
3415 	/* specified ldc id not found */
3416 	if (ldcp == NULL) {
3417 		DERR(vswp, "%s: ldcp = NULL", __func__);
3418 		return (1);
3419 	}
3420 
3421 	D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id);
3422 
3423 	/*
3424 	 * Before we can close the channel we must release any mapped
3425 	 * resources (e.g. drings).
3426 	 */
3427 	vsw_free_lane_resources(ldcp, INBOUND);
3428 	vsw_free_lane_resources(ldcp, OUTBOUND);
3429 
3430 	/*
3431 	 * If the close fails we are in serious trouble, as won't
3432 	 * be able to delete the parent port.
3433 	 */
3434 	if ((rv = ldc_close(ldcp->ldc_handle)) != 0) {
3435 		DERR(vswp, "%s: error %d closing channel %lld",
3436 			__func__, rv, ldcp->ldc_id);
3437 		return (1);
3438 	}
3439 
3440 	(void) ldc_fini(ldcp->ldc_handle);
3441 
3442 	ldcp->ldc_status = LDC_INIT;
3443 	ldcp->ldc_handle = NULL;
3444 	ldcp->ldc_vswp = NULL;
3445 
3446 	if (ldcp->rxh != NULL) {
3447 		if (vio_destroy_mblks(ldcp->rxh)) {
3448 			/*
3449 			 * Mostly likely some mblks are still in use and
3450 			 * have not been returned to the pool. Add the pool
3451 			 * to the list maintained in the device instance.
3452 			 * Another attempt will be made to destroy the pool
3453 			 * when the device detaches.
3454 			 */
3455 			ldcp->rxh->nextp =  vswp->rxh;
3456 			vswp->rxh = ldcp->rxh;
3457 		}
3458 	}
3459 
3460 	/* unlink it from the list */
3461 	prev_ldcp = ldcp->ldc_next;
3462 	ldcl->num_ldcs--;
3463 
3464 	mutex_destroy(&ldcp->ldc_txlock);
3465 	mutex_destroy(&ldcp->ldc_cblock);
3466 	cv_destroy(&ldcp->drain_cv);
3467 	mutex_destroy(&ldcp->drain_cv_lock);
3468 	mutex_destroy(&ldcp->hss_lock);
3469 	mutex_destroy(&ldcp->lane_in.seq_lock);
3470 	mutex_destroy(&ldcp->lane_out.seq_lock);
3471 	mutex_destroy(&ldcp->status_lock);
3472 	rw_destroy(&ldcp->lane_in.dlistrw);
3473 	rw_destroy(&ldcp->lane_out.dlistrw);
3474 
3475 	kmem_free(ldcp, sizeof (vsw_ldc_t));
3476 
3477 	return (0);
3478 }
3479 
3480 /*
3481  * Open and attempt to bring up the channel. Note that channel
3482  * can only be brought up if peer has also opened channel.
3483  *
3484  * Returns 0 if can open and bring up channel, otherwise
3485  * returns 1.
3486  */
3487 static int
3488 vsw_ldc_init(vsw_ldc_t *ldcp)
3489 {
3490 	vsw_t 		*vswp = ldcp->ldc_vswp;
3491 	ldc_status_t	istatus = 0;
3492 	int		rv;
3493 
3494 	D1(vswp, "%s: enter", __func__);
3495 
3496 	LDC_ENTER_LOCK(ldcp);
3497 
3498 	/* don't start at 0 in case clients don't like that */
3499 	ldcp->next_ident = 1;
3500 
3501 	rv = ldc_open(ldcp->ldc_handle);
3502 	if (rv != 0) {
3503 		DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)",
3504 		    __func__, ldcp->ldc_id, rv);
3505 		LDC_EXIT_LOCK(ldcp);
3506 		return (1);
3507 	}
3508 
3509 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
3510 		DERR(vswp, "%s: unable to get status", __func__);
3511 		LDC_EXIT_LOCK(ldcp);
3512 		return (1);
3513 
3514 	} else if (istatus != LDC_OPEN && istatus != LDC_READY) {
3515 		DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY",
3516 		    __func__, ldcp->ldc_id, istatus);
3517 		LDC_EXIT_LOCK(ldcp);
3518 		return (1);
3519 	}
3520 
3521 	mutex_enter(&ldcp->status_lock);
3522 	ldcp->ldc_status = istatus;
3523 	mutex_exit(&ldcp->status_lock);
3524 
3525 	rv = ldc_up(ldcp->ldc_handle);
3526 	if (rv != 0) {
3527 		/*
3528 		 * Not a fatal error for ldc_up() to fail, as peer
3529 		 * end point may simply not be ready yet.
3530 		 */
3531 		D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__,
3532 			ldcp->ldc_id, rv);
3533 		LDC_EXIT_LOCK(ldcp);
3534 		return (1);
3535 	}
3536 
3537 	/*
3538 	 * ldc_up() call is non-blocking so need to explicitly
3539 	 * check channel status to see if in fact the channel
3540 	 * is UP.
3541 	 */
3542 	mutex_enter(&ldcp->status_lock);
3543 	if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) {
3544 		DERR(vswp, "%s: unable to get status", __func__);
3545 		mutex_exit(&ldcp->status_lock);
3546 		LDC_EXIT_LOCK(ldcp);
3547 		return (1);
3548 
3549 	}
3550 
3551 	if (ldcp->ldc_status == LDC_UP) {
3552 		D2(vswp, "%s: channel %ld now UP (%ld)", __func__,
3553 			ldcp->ldc_id, istatus);
3554 		mutex_exit(&ldcp->status_lock);
3555 		LDC_EXIT_LOCK(ldcp);
3556 
3557 		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
3558 		return (0);
3559 	}
3560 
3561 	mutex_exit(&ldcp->status_lock);
3562 	LDC_EXIT_LOCK(ldcp);
3563 
3564 	D1(vswp, "%s: exit", __func__);
3565 	return (0);
3566 }
3567 
3568 /* disable callbacks on the channel */
3569 static int
3570 vsw_ldc_uninit(vsw_ldc_t *ldcp)
3571 {
3572 	vsw_t	*vswp = ldcp->ldc_vswp;
3573 	int	rv;
3574 
3575 	D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id);
3576 
3577 	LDC_ENTER_LOCK(ldcp);
3578 
3579 	rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE);
3580 	if (rv != 0) {
3581 		DERR(vswp, "vsw_ldc_uninit(%lld): error disabling "
3582 			"interrupts (rv = %d)\n", ldcp->ldc_id, rv);
3583 		LDC_EXIT_LOCK(ldcp);
3584 		return (1);
3585 	}
3586 
3587 	mutex_enter(&ldcp->status_lock);
3588 	ldcp->ldc_status = LDC_INIT;
3589 	mutex_exit(&ldcp->status_lock);
3590 
3591 	LDC_EXIT_LOCK(ldcp);
3592 
3593 	D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id);
3594 
3595 	return (0);
3596 }
3597 
3598 static int
3599 vsw_init_ldcs(vsw_port_t *port)
3600 {
3601 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
3602 	vsw_ldc_t	*ldcp;
3603 
3604 	READ_ENTER(&ldcl->lockrw);
3605 	ldcp =  ldcl->head;
3606 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
3607 		(void) vsw_ldc_init(ldcp);
3608 	}
3609 	RW_EXIT(&ldcl->lockrw);
3610 
3611 	return (0);
3612 }
3613 
3614 static int
3615 vsw_uninit_ldcs(vsw_port_t *port)
3616 {
3617 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
3618 	vsw_ldc_t	*ldcp;
3619 
3620 	D1(NULL, "vsw_uninit_ldcs: enter\n");
3621 
3622 	READ_ENTER(&ldcl->lockrw);
3623 	ldcp =  ldcl->head;
3624 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
3625 		(void) vsw_ldc_uninit(ldcp);
3626 	}
3627 	RW_EXIT(&ldcl->lockrw);
3628 
3629 	D1(NULL, "vsw_uninit_ldcs: exit\n");
3630 
3631 	return (0);
3632 }
3633 
3634 /*
3635  * Wait until the callback(s) associated with the ldcs under the specified
3636  * port have completed.
3637  *
3638  * Prior to this function being invoked each channel under this port
3639  * should have been quiesced via ldc_set_cb_mode(DISABLE).
3640  *
3641  * A short explaination of what we are doing below..
3642  *
3643  * The simplest approach would be to have a reference counter in
3644  * the ldc structure which is increment/decremented by the callbacks as
3645  * they use the channel. The drain function could then simply disable any
3646  * further callbacks and do a cv_wait for the ref to hit zero. Unfortunately
3647  * there is a tiny window here - before the callback is able to get the lock
3648  * on the channel it is interrupted and this function gets to execute. It
3649  * sees that the ref count is zero and believes its free to delete the
3650  * associated data structures.
3651  *
3652  * We get around this by taking advantage of the fact that before the ldc
3653  * framework invokes a callback it sets a flag to indicate that there is a
3654  * callback active (or about to become active). If when we attempt to
3655  * unregister a callback when this active flag is set then the unregister
3656  * will fail with EWOULDBLOCK.
3657  *
3658  * If the unregister fails we do a cv_timedwait. We will either be signaled
3659  * by the callback as it is exiting (note we have to wait a short period to
3660  * allow the callback to return fully to the ldc framework and it to clear
3661  * the active flag), or by the timer expiring. In either case we again attempt
3662  * the unregister. We repeat this until we can succesfully unregister the
3663  * callback.
3664  *
3665  * The reason we use a cv_timedwait rather than a simple cv_wait is to catch
3666  * the case where the callback has finished but the ldc framework has not yet
3667  * cleared the active flag. In this case we would never get a cv_signal.
3668  */
3669 static int
3670 vsw_drain_ldcs(vsw_port_t *port)
3671 {
3672 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
3673 	vsw_ldc_t	*ldcp;
3674 	vsw_t		*vswp = port->p_vswp;
3675 
3676 	D1(vswp, "%s: enter", __func__);
3677 
3678 	READ_ENTER(&ldcl->lockrw);
3679 
3680 	ldcp = ldcl->head;
3681 
3682 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
3683 		/*
3684 		 * If we can unregister the channel callback then we
3685 		 * know that there is no callback either running or
3686 		 * scheduled to run for this channel so move on to next
3687 		 * channel in the list.
3688 		 */
3689 		mutex_enter(&ldcp->drain_cv_lock);
3690 
3691 		/* prompt active callbacks to quit */
3692 		ldcp->drain_state = VSW_LDC_DRAINING;
3693 
3694 		if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) {
3695 			D2(vswp, "%s: unreg callback for chan %ld", __func__,
3696 				ldcp->ldc_id);
3697 			mutex_exit(&ldcp->drain_cv_lock);
3698 			continue;
3699 		} else {
3700 			/*
3701 			 * If we end up here we know that either 1) a callback
3702 			 * is currently executing, 2) is about to start (i.e.
3703 			 * the ldc framework has set the active flag but
3704 			 * has not actually invoked the callback yet, or 3)
3705 			 * has finished and has returned to the ldc framework
3706 			 * but the ldc framework has not yet cleared the
3707 			 * active bit.
3708 			 *
3709 			 * Wait for it to finish.
3710 			 */
3711 			while (ldc_unreg_callback(ldcp->ldc_handle)
3712 								== EWOULDBLOCK)
3713 				(void) cv_timedwait(&ldcp->drain_cv,
3714 					&ldcp->drain_cv_lock, lbolt + hz);
3715 
3716 			mutex_exit(&ldcp->drain_cv_lock);
3717 			D2(vswp, "%s: unreg callback for chan %ld after "
3718 				"timeout", __func__, ldcp->ldc_id);
3719 		}
3720 	}
3721 	RW_EXIT(&ldcl->lockrw);
3722 
3723 	D1(vswp, "%s: exit", __func__);
3724 	return (0);
3725 }
3726 
3727 /*
3728  * Wait until all tasks which reference this port have completed.
3729  *
3730  * Prior to this function being invoked each channel under this port
3731  * should have been quiesced via ldc_set_cb_mode(DISABLE).
3732  */
3733 static int
3734 vsw_drain_port_taskq(vsw_port_t *port)
3735 {
3736 	vsw_t		*vswp = port->p_vswp;
3737 
3738 	D1(vswp, "%s: enter", __func__);
3739 
3740 	/*
3741 	 * Mark the port as in the process of being detached, and
3742 	 * dispatch a marker task to the queue so we know when all
3743 	 * relevant tasks have completed.
3744 	 */
3745 	mutex_enter(&port->state_lock);
3746 	port->state = VSW_PORT_DETACHING;
3747 
3748 	if ((vswp->taskq_p == NULL) ||
3749 		(ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task,
3750 			port, DDI_NOSLEEP) != DDI_SUCCESS)) {
3751 		DERR(vswp, "%s: unable to dispatch marker task",
3752 			__func__);
3753 		mutex_exit(&port->state_lock);
3754 		return (1);
3755 	}
3756 
3757 	/*
3758 	 * Wait for the marker task to finish.
3759 	 */
3760 	while (port->state != VSW_PORT_DETACHABLE)
3761 		cv_wait(&port->state_cv, &port->state_lock);
3762 
3763 	mutex_exit(&port->state_lock);
3764 
3765 	D1(vswp, "%s: exit", __func__);
3766 
3767 	return (0);
3768 }
3769 
3770 static void
3771 vsw_marker_task(void *arg)
3772 {
3773 	vsw_port_t	*port = arg;
3774 	vsw_t		*vswp = port->p_vswp;
3775 
3776 	D1(vswp, "%s: enter", __func__);
3777 
3778 	mutex_enter(&port->state_lock);
3779 
3780 	/*
3781 	 * No further tasks should be dispatched which reference
3782 	 * this port so ok to mark it as safe to detach.
3783 	 */
3784 	port->state = VSW_PORT_DETACHABLE;
3785 
3786 	cv_signal(&port->state_cv);
3787 
3788 	mutex_exit(&port->state_lock);
3789 
3790 	D1(vswp, "%s: exit", __func__);
3791 }
3792 
3793 static vsw_port_t *
3794 vsw_lookup_port(vsw_t *vswp, int p_instance)
3795 {
3796 	vsw_port_list_t *plist = &vswp->plist;
3797 	vsw_port_t	*port;
3798 
3799 	for (port = plist->head; port != NULL; port = port->p_next) {
3800 		if (port->p_instance == p_instance) {
3801 			D2(vswp, "vsw_lookup_port: found p_instance\n");
3802 			return (port);
3803 		}
3804 	}
3805 
3806 	return (NULL);
3807 }
3808 
3809 /*
3810  * Search for and remove the specified port from the port
3811  * list. Returns 0 if able to locate and remove port, otherwise
3812  * returns 1.
3813  */
3814 static int
3815 vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port)
3816 {
3817 	vsw_port_list_t *plist = &vswp->plist;
3818 	vsw_port_t	*curr_p, *prev_p;
3819 
3820 	if (plist->head == NULL)
3821 		return (1);
3822 
3823 	curr_p = prev_p = plist->head;
3824 
3825 	while (curr_p != NULL) {
3826 		if (curr_p == port) {
3827 			if (prev_p == curr_p) {
3828 				plist->head = curr_p->p_next;
3829 			} else {
3830 				prev_p->p_next = curr_p->p_next;
3831 			}
3832 			plist->num_ports--;
3833 			break;
3834 		} else {
3835 			prev_p = curr_p;
3836 			curr_p = curr_p->p_next;
3837 		}
3838 	}
3839 	return (0);
3840 }
3841 
3842 /*
3843  * Interrupt handler for ldc messages.
3844  */
3845 static uint_t
3846 vsw_ldc_cb(uint64_t event, caddr_t arg)
3847 {
3848 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
3849 	vsw_t 		*vswp = ldcp->ldc_vswp;
3850 
3851 	D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
3852 
3853 	mutex_enter(&ldcp->ldc_cblock);
3854 
3855 	mutex_enter(&ldcp->status_lock);
3856 	if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) {
3857 		mutex_exit(&ldcp->status_lock);
3858 		mutex_exit(&ldcp->ldc_cblock);
3859 		return (LDC_SUCCESS);
3860 	}
3861 	mutex_exit(&ldcp->status_lock);
3862 
3863 	if (event & LDC_EVT_UP) {
3864 		/*
3865 		 * Channel has come up.
3866 		 */
3867 		D2(vswp, "%s: id(%ld) event(%llx) UP: status(%ld)",
3868 			__func__, ldcp->ldc_id, event, ldcp->ldc_status);
3869 
3870 		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
3871 
3872 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
3873 	}
3874 
3875 	if (event & LDC_EVT_READ) {
3876 		/*
3877 		 * Data available for reading.
3878 		 */
3879 		D2(vswp, "%s: id(ld) event(%llx) data READ",
3880 				__func__, ldcp->ldc_id, event);
3881 
3882 		vsw_process_pkt(ldcp);
3883 
3884 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
3885 
3886 		goto vsw_cb_exit;
3887 	}
3888 
3889 	if (event & (LDC_EVT_DOWN | LDC_EVT_RESET)) {
3890 		D2(vswp, "%s: id(%ld) event (%lx) DOWN/RESET: status(%ld)",
3891 			__func__, ldcp->ldc_id, event, ldcp->ldc_status);
3892 
3893 		vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
3894 	}
3895 
3896 	/*
3897 	 * Catch either LDC_EVT_WRITE which we don't support or any
3898 	 * unknown event.
3899 	 */
3900 	if (event & ~(LDC_EVT_UP | LDC_EVT_RESET
3901 					| LDC_EVT_DOWN | LDC_EVT_READ)) {
3902 
3903 		DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)",
3904 			__func__, ldcp->ldc_id, event, ldcp->ldc_status);
3905 	}
3906 
3907 vsw_cb_exit:
3908 	mutex_exit(&ldcp->ldc_cblock);
3909 
3910 	/*
3911 	 * Let the drain function know we are finishing if it
3912 	 * is waiting.
3913 	 */
3914 	mutex_enter(&ldcp->drain_cv_lock);
3915 	if (ldcp->drain_state == VSW_LDC_DRAINING)
3916 		cv_signal(&ldcp->drain_cv);
3917 	mutex_exit(&ldcp->drain_cv_lock);
3918 
3919 	return (LDC_SUCCESS);
3920 }
3921 
3922 /*
3923  * Reinitialise data structures associated with the channel.
3924  */
3925 static void
3926 vsw_ldc_reinit(vsw_ldc_t *ldcp)
3927 {
3928 	vsw_t		*vswp = ldcp->ldc_vswp;
3929 	vsw_port_t	*port;
3930 	vsw_ldc_list_t	*ldcl;
3931 
3932 	D1(vswp, "%s: enter", __func__);
3933 
3934 	port = ldcp->ldc_port;
3935 	ldcl = &port->p_ldclist;
3936 
3937 	READ_ENTER(&ldcl->lockrw);
3938 
3939 	D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__,
3940 		ldcp->lane_in.lstate, ldcp->lane_out.lstate);
3941 
3942 	vsw_free_lane_resources(ldcp, INBOUND);
3943 	vsw_free_lane_resources(ldcp, OUTBOUND);
3944 	RW_EXIT(&ldcl->lockrw);
3945 
3946 	ldcp->lane_in.lstate = 0;
3947 	ldcp->lane_out.lstate = 0;
3948 
3949 	/*
3950 	 * Remove parent port from any multicast groups
3951 	 * it may have registered with. Client must resend
3952 	 * multicast add command after handshake completes.
3953 	 */
3954 	(void) vsw_del_fdb(vswp, port);
3955 
3956 	vsw_del_mcst_port(port);
3957 
3958 	ldcp->peer_session = 0;
3959 	ldcp->session_status = 0;
3960 	ldcp->hcnt = 0;
3961 	ldcp->hphase = VSW_MILESTONE0;
3962 
3963 	D1(vswp, "%s: exit", __func__);
3964 }
3965 
3966 /*
3967  * Process a connection event.
3968  *
3969  * Note - care must be taken to ensure that this function is
3970  * not called with the dlistrw lock held.
3971  */
3972 static void
3973 vsw_process_conn_evt(vsw_ldc_t *ldcp, uint16_t evt)
3974 {
3975 	vsw_t		*vswp = ldcp->ldc_vswp;
3976 	vsw_conn_evt_t	*conn = NULL;
3977 
3978 	D1(vswp, "%s: enter", __func__);
3979 
3980 	/*
3981 	 * Check if either a reset or restart event is pending
3982 	 * or in progress. If so just return.
3983 	 *
3984 	 * A VSW_CONN_RESET event originates either with a LDC_RESET_EVT
3985 	 * being received by the callback handler, or a ECONNRESET error
3986 	 * code being returned from a ldc_read() or ldc_write() call.
3987 	 *
3988 	 * A VSW_CONN_RESTART event occurs when some error checking code
3989 	 * decides that there is a problem with data from the channel,
3990 	 * and that the handshake should be restarted.
3991 	 */
3992 	if (((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART)) &&
3993 			(ldstub((uint8_t *)&ldcp->reset_active)))
3994 		return;
3995 
3996 	/*
3997 	 * If it is an LDC_UP event we first check the recorded
3998 	 * state of the channel. If this is UP then we know that
3999 	 * the channel moving to the UP state has already been dealt
4000 	 * with and don't need to dispatch a  new task.
4001 	 *
4002 	 * The reason for this check is that when we do a ldc_up(),
4003 	 * depending on the state of the peer, we may or may not get
4004 	 * a LDC_UP event. As we can't depend on getting a LDC_UP evt
4005 	 * every time we do ldc_up() we explicitly check the channel
4006 	 * status to see has it come up (ldc_up() is asynch and will
4007 	 * complete at some undefined time), and take the appropriate
4008 	 * action.
4009 	 *
4010 	 * The flip side of this is that we may get a LDC_UP event
4011 	 * when we have already seen that the channel is up and have
4012 	 * dealt with that.
4013 	 */
4014 	mutex_enter(&ldcp->status_lock);
4015 	if (evt == VSW_CONN_UP) {
4016 		if ((ldcp->ldc_status == LDC_UP) ||
4017 					(ldcp->reset_active != 0)) {
4018 			mutex_exit(&ldcp->status_lock);
4019 			return;
4020 		}
4021 	}
4022 	mutex_exit(&ldcp->status_lock);
4023 
4024 	/*
4025 	 * The transaction group id allows us to identify and discard
4026 	 * any tasks which are still pending on the taskq and refer
4027 	 * to the handshake session we are about to restart or reset.
4028 	 * These stale messages no longer have any real meaning.
4029 	 */
4030 	mutex_enter(&ldcp->hss_lock);
4031 	ldcp->hss_id++;
4032 	mutex_exit(&ldcp->hss_lock);
4033 
4034 	ASSERT(vswp->taskq_p != NULL);
4035 
4036 	if ((conn = kmem_zalloc(sizeof (vsw_conn_evt_t), KM_NOSLEEP)) == NULL) {
4037 		cmn_err(CE_WARN, "!vsw%d: unable to allocate memory for"
4038 			" connection event", vswp->instance);
4039 		goto err_exit;
4040 	}
4041 
4042 	conn->evt = evt;
4043 	conn->ldcp = ldcp;
4044 
4045 	if (ddi_taskq_dispatch(vswp->taskq_p, vsw_conn_task, conn,
4046 		DDI_NOSLEEP) != DDI_SUCCESS) {
4047 		cmn_err(CE_WARN, "!vsw%d: Can't dispatch connection task",
4048 			vswp->instance);
4049 
4050 		kmem_free(conn, sizeof (vsw_conn_evt_t));
4051 		goto err_exit;
4052 	}
4053 
4054 	D1(vswp, "%s: exit", __func__);
4055 	return;
4056 
4057 err_exit:
4058 	/*
4059 	 * Have mostly likely failed due to memory shortage. Clear the flag so
4060 	 * that future requests will at least be attempted and will hopefully
4061 	 * succeed.
4062 	 */
4063 	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
4064 		ldcp->reset_active = 0;
4065 }
4066 
4067 /*
4068  * Deal with events relating to a connection. Invoked from a taskq.
4069  */
4070 static void
4071 vsw_conn_task(void *arg)
4072 {
4073 	vsw_conn_evt_t	*conn = (vsw_conn_evt_t *)arg;
4074 	vsw_ldc_t	*ldcp = NULL;
4075 	vsw_t		*vswp = NULL;
4076 	uint16_t	evt;
4077 	ldc_status_t	curr_status;
4078 
4079 	ldcp = conn->ldcp;
4080 	evt = conn->evt;
4081 	vswp = ldcp->ldc_vswp;
4082 
4083 	D1(vswp, "%s: enter", __func__);
4084 
4085 	/* can safely free now have copied out data */
4086 	kmem_free(conn, sizeof (vsw_conn_evt_t));
4087 
4088 	mutex_enter(&ldcp->status_lock);
4089 	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
4090 		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
4091 			"channel %ld", vswp->instance, ldcp->ldc_id);
4092 		mutex_exit(&ldcp->status_lock);
4093 		return;
4094 	}
4095 
4096 	/*
4097 	 * If we wish to restart the handshake on this channel, then if
4098 	 * the channel is UP we bring it DOWN to flush the underlying
4099 	 * ldc queue.
4100 	 */
4101 	if ((evt == VSW_CONN_RESTART) && (curr_status == LDC_UP))
4102 		(void) ldc_down(ldcp->ldc_handle);
4103 
4104 	/*
4105 	 * re-init all the associated data structures.
4106 	 */
4107 	vsw_ldc_reinit(ldcp);
4108 
4109 	/*
4110 	 * Bring the channel back up (note it does no harm to
4111 	 * do this even if the channel is already UP, Just
4112 	 * becomes effectively a no-op).
4113 	 */
4114 	(void) ldc_up(ldcp->ldc_handle);
4115 
4116 	/*
4117 	 * Check if channel is now UP. This will only happen if
4118 	 * peer has also done a ldc_up().
4119 	 */
4120 	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
4121 		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
4122 			"channel %ld", vswp->instance, ldcp->ldc_id);
4123 		mutex_exit(&ldcp->status_lock);
4124 		return;
4125 	}
4126 
4127 	ldcp->ldc_status = curr_status;
4128 
4129 	/* channel UP so restart handshake by sending version info */
4130 	if (curr_status == LDC_UP) {
4131 		if (ldcp->hcnt++ > vsw_num_handshakes) {
4132 			cmn_err(CE_WARN, "!vsw%d: exceeded number of permitted"
4133 				" handshake attempts (%d) on channel %ld",
4134 				vswp->instance, ldcp->hcnt, ldcp->ldc_id);
4135 			mutex_exit(&ldcp->status_lock);
4136 			return;
4137 		}
4138 
4139 		if (ddi_taskq_dispatch(vswp->taskq_p, vsw_send_ver, ldcp,
4140 			DDI_NOSLEEP) != DDI_SUCCESS) {
4141 			cmn_err(CE_WARN, "!vsw%d: Can't dispatch version task",
4142 				vswp->instance);
4143 
4144 			/*
4145 			 * Don't count as valid restart attempt if couldn't
4146 			 * send version msg.
4147 			 */
4148 			if (ldcp->hcnt > 0)
4149 				ldcp->hcnt--;
4150 		}
4151 	}
4152 
4153 	/*
4154 	 * Mark that the process is complete by clearing the flag.
4155 	 *
4156 	 * Note is it possible that the taskq dispatch above may have failed,
4157 	 * most likely due to memory shortage. We still clear the flag so
4158 	 * future attempts will at least be attempted and will hopefully
4159 	 * succeed.
4160 	 */
4161 	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
4162 		ldcp->reset_active = 0;
4163 
4164 	mutex_exit(&ldcp->status_lock);
4165 
4166 	D1(vswp, "%s: exit", __func__);
4167 }
4168 
4169 /*
4170  * returns 0 if legal for event signified by flag to have
4171  * occured at the time it did. Otherwise returns 1.
4172  */
4173 int
4174 vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag)
4175 {
4176 	vsw_t		*vswp = ldcp->ldc_vswp;
4177 	uint64_t	state;
4178 	uint64_t	phase;
4179 
4180 	if (dir == INBOUND)
4181 		state = ldcp->lane_in.lstate;
4182 	else
4183 		state = ldcp->lane_out.lstate;
4184 
4185 	phase = ldcp->hphase;
4186 
4187 	switch (flag) {
4188 	case VSW_VER_INFO_RECV:
4189 		if (phase > VSW_MILESTONE0) {
4190 			DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV"
4191 				" when in state %d\n", ldcp->ldc_id, phase);
4192 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4193 			return (1);
4194 		}
4195 		break;
4196 
4197 	case VSW_VER_ACK_RECV:
4198 	case VSW_VER_NACK_RECV:
4199 		if (!(state & VSW_VER_INFO_SENT)) {
4200 			DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK"
4201 				" or VER_NACK when in state %d\n",
4202 				ldcp->ldc_id, phase);
4203 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4204 			return (1);
4205 		} else
4206 			state &= ~VSW_VER_INFO_SENT;
4207 		break;
4208 
4209 	case VSW_ATTR_INFO_RECV:
4210 		if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) {
4211 			DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV"
4212 				" when in state %d\n", ldcp->ldc_id, phase);
4213 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4214 			return (1);
4215 		}
4216 		break;
4217 
4218 	case VSW_ATTR_ACK_RECV:
4219 	case VSW_ATTR_NACK_RECV:
4220 		if (!(state & VSW_ATTR_INFO_SENT)) {
4221 			DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK"
4222 				" or ATTR_NACK when in state %d\n",
4223 				ldcp->ldc_id, phase);
4224 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4225 			return (1);
4226 		} else
4227 			state &= ~VSW_ATTR_INFO_SENT;
4228 		break;
4229 
4230 	case VSW_DRING_INFO_RECV:
4231 		if (phase < VSW_MILESTONE1) {
4232 			DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV"
4233 				" when in state %d\n", ldcp->ldc_id, phase);
4234 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4235 			return (1);
4236 		}
4237 		break;
4238 
4239 	case VSW_DRING_ACK_RECV:
4240 	case VSW_DRING_NACK_RECV:
4241 		if (!(state & VSW_DRING_INFO_SENT)) {
4242 			DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK"
4243 				" or DRING_NACK when in state %d\n",
4244 				ldcp->ldc_id, phase);
4245 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4246 			return (1);
4247 		} else
4248 			state &= ~VSW_DRING_INFO_SENT;
4249 		break;
4250 
4251 	case VSW_RDX_INFO_RECV:
4252 		if (phase < VSW_MILESTONE3) {
4253 			DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV"
4254 				" when in state %d\n", ldcp->ldc_id, phase);
4255 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4256 			return (1);
4257 		}
4258 		break;
4259 
4260 	case VSW_RDX_ACK_RECV:
4261 	case VSW_RDX_NACK_RECV:
4262 		if (!(state & VSW_RDX_INFO_SENT)) {
4263 			DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK"
4264 				" or RDX_NACK when in state %d\n",
4265 				ldcp->ldc_id, phase);
4266 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4267 			return (1);
4268 		} else
4269 			state &= ~VSW_RDX_INFO_SENT;
4270 		break;
4271 
4272 	case VSW_MCST_INFO_RECV:
4273 		if (phase < VSW_MILESTONE3) {
4274 			DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV"
4275 				" when in state %d\n", ldcp->ldc_id, phase);
4276 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4277 			return (1);
4278 		}
4279 		break;
4280 
4281 	default:
4282 		DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)",
4283 				ldcp->ldc_id, flag);
4284 		return (1);
4285 	}
4286 
4287 	if (dir == INBOUND)
4288 		ldcp->lane_in.lstate = state;
4289 	else
4290 		ldcp->lane_out.lstate = state;
4291 
4292 	D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id);
4293 
4294 	return (0);
4295 }
4296 
4297 void
4298 vsw_next_milestone(vsw_ldc_t *ldcp)
4299 {
4300 	vsw_t		*vswp = ldcp->ldc_vswp;
4301 
4302 	D1(vswp, "%s (chan %lld): enter (phase %ld)", __func__,
4303 		ldcp->ldc_id, ldcp->hphase);
4304 
4305 	DUMP_FLAGS(ldcp->lane_in.lstate);
4306 	DUMP_FLAGS(ldcp->lane_out.lstate);
4307 
4308 	switch (ldcp->hphase) {
4309 
4310 	case VSW_MILESTONE0:
4311 		/*
4312 		 * If we haven't started to handshake with our peer,
4313 		 * start to do so now.
4314 		 */
4315 		if (ldcp->lane_out.lstate == 0) {
4316 			D2(vswp, "%s: (chan %lld) starting handshake "
4317 				"with peer", __func__, ldcp->ldc_id);
4318 			vsw_process_conn_evt(ldcp, VSW_CONN_UP);
4319 		}
4320 
4321 		/*
4322 		 * Only way to pass this milestone is to have successfully
4323 		 * negotiated version info.
4324 		 */
4325 		if ((ldcp->lane_in.lstate & VSW_VER_ACK_SENT) &&
4326 			(ldcp->lane_out.lstate & VSW_VER_ACK_RECV)) {
4327 
4328 			D2(vswp, "%s: (chan %lld) leaving milestone 0",
4329 				__func__, ldcp->ldc_id);
4330 
4331 			/*
4332 			 * Next milestone is passed when attribute
4333 			 * information has been successfully exchanged.
4334 			 */
4335 			ldcp->hphase = VSW_MILESTONE1;
4336 			vsw_send_attr(ldcp);
4337 
4338 		}
4339 		break;
4340 
4341 	case VSW_MILESTONE1:
4342 		/*
4343 		 * Only way to pass this milestone is to have successfully
4344 		 * negotiated attribute information.
4345 		 */
4346 		if (ldcp->lane_in.lstate & VSW_ATTR_ACK_SENT) {
4347 
4348 			ldcp->hphase = VSW_MILESTONE2;
4349 
4350 			/*
4351 			 * If the peer device has said it wishes to
4352 			 * use descriptor rings then we send it our ring
4353 			 * info, otherwise we just set up a private ring
4354 			 * which we use an internal buffer
4355 			 */
4356 			if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE)
4357 				vsw_send_dring_info(ldcp);
4358 		}
4359 		break;
4360 
4361 	case VSW_MILESTONE2:
4362 		/*
4363 		 * If peer has indicated in its attribute message that
4364 		 * it wishes to use descriptor rings then the only way
4365 		 * to pass this milestone is for us to have received
4366 		 * valid dring info.
4367 		 *
4368 		 * If peer is not using descriptor rings then just fall
4369 		 * through.
4370 		 */
4371 		if ((ldcp->lane_in.xfer_mode == VIO_DRING_MODE) &&
4372 			(!(ldcp->lane_in.lstate & VSW_DRING_ACK_SENT)))
4373 			break;
4374 
4375 		D2(vswp, "%s: (chan %lld) leaving milestone 2",
4376 				__func__, ldcp->ldc_id);
4377 
4378 		ldcp->hphase = VSW_MILESTONE3;
4379 		vsw_send_rdx(ldcp);
4380 		break;
4381 
4382 	case VSW_MILESTONE3:
4383 		/*
4384 		 * Pass this milestone when all paramaters have been
4385 		 * successfully exchanged and RDX sent in both directions.
4386 		 *
4387 		 * Mark outbound lane as available to transmit data.
4388 		 */
4389 		if ((ldcp->lane_out.lstate & VSW_RDX_ACK_SENT) &&
4390 			(ldcp->lane_in.lstate & VSW_RDX_ACK_RECV)) {
4391 
4392 			D2(vswp, "%s: (chan %lld) leaving milestone 3",
4393 				__func__, ldcp->ldc_id);
4394 			D2(vswp, "%s: ** handshake complete (0x%llx : "
4395 				"0x%llx) **", __func__, ldcp->lane_in.lstate,
4396 				ldcp->lane_out.lstate);
4397 			ldcp->lane_out.lstate |= VSW_LANE_ACTIVE;
4398 			ldcp->hphase = VSW_MILESTONE4;
4399 			ldcp->hcnt = 0;
4400 			DISPLAY_STATE();
4401 		} else {
4402 			D2(vswp, "%s: still in milestone 3 (0x%llx :"
4403 				" 0x%llx", __func__, ldcp->lane_in.lstate,
4404 				ldcp->lane_out.lstate);
4405 		}
4406 		break;
4407 
4408 	case VSW_MILESTONE4:
4409 		D2(vswp, "%s: (chan %lld) in milestone 4", __func__,
4410 							ldcp->ldc_id);
4411 		break;
4412 
4413 	default:
4414 		DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__,
4415 			ldcp->ldc_id, ldcp->hphase);
4416 	}
4417 
4418 	D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id,
4419 		ldcp->hphase);
4420 }
4421 
4422 /*
4423  * Check if major version is supported.
4424  *
4425  * Returns 0 if finds supported major number, and if necessary
4426  * adjusts the minor field.
4427  *
4428  * Returns 1 if can't match major number exactly. Sets mjor/minor
4429  * to next lowest support values, or to zero if no other values possible.
4430  */
4431 static int
4432 vsw_supported_version(vio_ver_msg_t *vp)
4433 {
4434 	int	i;
4435 
4436 	D1(NULL, "vsw_supported_version: enter");
4437 
4438 	for (i = 0; i < VSW_NUM_VER; i++) {
4439 		if (vsw_versions[i].ver_major == vp->ver_major) {
4440 			/*
4441 			 * Matching or lower major version found. Update
4442 			 * minor number if necessary.
4443 			 */
4444 			if (vp->ver_minor > vsw_versions[i].ver_minor) {
4445 				D2(NULL, "%s: adjusting minor value"
4446 					" from %d to %d", __func__,
4447 					vp->ver_minor,
4448 					vsw_versions[i].ver_minor);
4449 				vp->ver_minor = vsw_versions[i].ver_minor;
4450 			}
4451 
4452 			return (0);
4453 		}
4454 
4455 		if (vsw_versions[i].ver_major < vp->ver_major) {
4456 			if (vp->ver_minor > vsw_versions[i].ver_minor) {
4457 				D2(NULL, "%s: adjusting minor value"
4458 					" from %d to %d", __func__,
4459 					vp->ver_minor,
4460 					vsw_versions[i].ver_minor);
4461 				vp->ver_minor = vsw_versions[i].ver_minor;
4462 			}
4463 			return (1);
4464 		}
4465 	}
4466 
4467 	/* No match was possible, zero out fields */
4468 	vp->ver_major = 0;
4469 	vp->ver_minor = 0;
4470 
4471 	D1(NULL, "vsw_supported_version: exit");
4472 
4473 	return (1);
4474 }
4475 
4476 /*
4477  * Main routine for processing messages received over LDC.
4478  */
4479 static void
4480 vsw_process_pkt(void *arg)
4481 {
4482 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
4483 	vsw_t 		*vswp = ldcp->ldc_vswp;
4484 	size_t		msglen;
4485 	vio_msg_tag_t	tag;
4486 	def_msg_t	dmsg;
4487 	int 		rv = 0;
4488 
4489 
4490 	D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
4491 
4492 	/*
4493 	 * If channel is up read messages until channel is empty.
4494 	 */
4495 	do {
4496 		msglen = sizeof (dmsg);
4497 		rv = ldc_read(ldcp->ldc_handle, (caddr_t)&dmsg, &msglen);
4498 
4499 		if (rv != 0) {
4500 			DERR(vswp, "%s :ldc_read err id(%lld) rv(%d) "
4501 				"len(%d)\n", __func__, ldcp->ldc_id,
4502 							rv, msglen);
4503 		}
4504 
4505 		/* channel has been reset */
4506 		if (rv == ECONNRESET) {
4507 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
4508 			break;
4509 		}
4510 
4511 		if (msglen == 0) {
4512 			D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__,
4513 			ldcp->ldc_id);
4514 			break;
4515 		}
4516 
4517 		D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__,
4518 		    ldcp->ldc_id, msglen);
4519 
4520 		/*
4521 		 * Figure out what sort of packet we have gotten by
4522 		 * examining the msg tag, and then switch it appropriately.
4523 		 */
4524 		bcopy(&dmsg, &tag, sizeof (vio_msg_tag_t));
4525 
4526 		switch (tag.vio_msgtype) {
4527 		case VIO_TYPE_CTRL:
4528 			vsw_dispatch_ctrl_task(ldcp, &dmsg, tag);
4529 			break;
4530 		case VIO_TYPE_DATA:
4531 			vsw_process_data_pkt(ldcp, &dmsg, tag);
4532 			break;
4533 		case VIO_TYPE_ERR:
4534 			vsw_process_err_pkt(ldcp, &dmsg, tag);
4535 			break;
4536 		default:
4537 			DERR(vswp, "%s: Unknown tag(%lx) ", __func__,
4538 				"id(%lx)\n", tag.vio_msgtype, ldcp->ldc_id);
4539 			break;
4540 		}
4541 	} while (msglen);
4542 
4543 	D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id);
4544 }
4545 
4546 /*
4547  * Dispatch a task to process a VIO control message.
4548  */
4549 static void
4550 vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t tag)
4551 {
4552 	vsw_ctrl_task_t		*ctaskp = NULL;
4553 	vsw_port_t		*port = ldcp->ldc_port;
4554 	vsw_t			*vswp = port->p_vswp;
4555 
4556 	D1(vswp, "%s: enter", __func__);
4557 
4558 	/*
4559 	 * We need to handle RDX ACK messages in-band as once they
4560 	 * are exchanged it is possible that we will get an
4561 	 * immediate (legitimate) data packet.
4562 	 */
4563 	if ((tag.vio_subtype_env == VIO_RDX) &&
4564 		(tag.vio_subtype == VIO_SUBTYPE_ACK)) {
4565 
4566 		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_ACK_RECV))
4567 			return;
4568 
4569 		ldcp->lane_in.lstate |= VSW_RDX_ACK_RECV;
4570 		D2(vswp, "%s (%ld) handling RDX_ACK in place "
4571 			"(ostate 0x%llx : hphase %d)", __func__,
4572 			ldcp->ldc_id, ldcp->lane_in.lstate, ldcp->hphase);
4573 		vsw_next_milestone(ldcp);
4574 		return;
4575 	}
4576 
4577 	ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP);
4578 
4579 	if (ctaskp == NULL) {
4580 		DERR(vswp, "%s: unable to alloc space for ctrl"
4581 			" msg", __func__);
4582 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4583 		return;
4584 	}
4585 
4586 	ctaskp->ldcp = ldcp;
4587 	bcopy((def_msg_t *)cpkt, &ctaskp->pktp, sizeof (def_msg_t));
4588 	mutex_enter(&ldcp->hss_lock);
4589 	ctaskp->hss_id = ldcp->hss_id;
4590 	mutex_exit(&ldcp->hss_lock);
4591 
4592 	/*
4593 	 * Dispatch task to processing taskq if port is not in
4594 	 * the process of being detached.
4595 	 */
4596 	mutex_enter(&port->state_lock);
4597 	if (port->state == VSW_PORT_INIT) {
4598 		if ((vswp->taskq_p == NULL) ||
4599 			(ddi_taskq_dispatch(vswp->taskq_p,
4600 			vsw_process_ctrl_pkt, ctaskp, DDI_NOSLEEP)
4601 							!= DDI_SUCCESS)) {
4602 			DERR(vswp, "%s: unable to dispatch task to taskq",
4603 				__func__);
4604 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
4605 			mutex_exit(&port->state_lock);
4606 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4607 			return;
4608 		}
4609 	} else {
4610 		DWARN(vswp, "%s: port %d detaching, not dispatching "
4611 			"task", __func__, port->p_instance);
4612 	}
4613 
4614 	mutex_exit(&port->state_lock);
4615 
4616 	D2(vswp, "%s: dispatched task to taskq for chan %d", __func__,
4617 			ldcp->ldc_id);
4618 	D1(vswp, "%s: exit", __func__);
4619 }
4620 
4621 /*
4622  * Process a VIO ctrl message. Invoked from taskq.
4623  */
4624 static void
4625 vsw_process_ctrl_pkt(void *arg)
4626 {
4627 	vsw_ctrl_task_t	*ctaskp = (vsw_ctrl_task_t *)arg;
4628 	vsw_ldc_t	*ldcp = ctaskp->ldcp;
4629 	vsw_t 		*vswp = ldcp->ldc_vswp;
4630 	vio_msg_tag_t	tag;
4631 	uint16_t	env;
4632 
4633 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4634 
4635 	bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t));
4636 	env = tag.vio_subtype_env;
4637 
4638 	/* stale pkt check */
4639 	mutex_enter(&ldcp->hss_lock);
4640 	if (ctaskp->hss_id < ldcp->hss_id) {
4641 		DWARN(vswp, "%s: discarding stale packet belonging to"
4642 			" earlier (%ld) handshake session", __func__,
4643 			ctaskp->hss_id);
4644 		mutex_exit(&ldcp->hss_lock);
4645 		return;
4646 	}
4647 	mutex_exit(&ldcp->hss_lock);
4648 
4649 	/* session id check */
4650 	if (ldcp->session_status & VSW_PEER_SESSION) {
4651 		if (ldcp->peer_session != tag.vio_sid) {
4652 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
4653 				__func__, ldcp->ldc_id, tag.vio_sid);
4654 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
4655 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4656 			return;
4657 		}
4658 	}
4659 
4660 	/*
4661 	 * Switch on vio_subtype envelope, then let lower routines
4662 	 * decide if its an INFO, ACK or NACK packet.
4663 	 */
4664 	switch (env) {
4665 	case VIO_VER_INFO:
4666 		vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp);
4667 		break;
4668 	case VIO_DRING_REG:
4669 		vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp);
4670 		break;
4671 	case VIO_DRING_UNREG:
4672 		vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp);
4673 		break;
4674 	case VIO_ATTR_INFO:
4675 		vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp);
4676 		break;
4677 	case VNET_MCAST_INFO:
4678 		vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp);
4679 		break;
4680 	case VIO_RDX:
4681 		vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp);
4682 		break;
4683 	default:
4684 		DERR(vswp, "%s : unknown vio_subtype_env (%x)\n",
4685 							__func__, env);
4686 	}
4687 
4688 	kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
4689 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4690 }
4691 
4692 /*
4693  * Version negotiation. We can end up here either because our peer
4694  * has responded to a handshake message we have sent it, or our peer
4695  * has initiated a handshake with us. If its the former then can only
4696  * be ACK or NACK, if its the later can only be INFO.
4697  *
4698  * If its an ACK we move to the next stage of the handshake, namely
4699  * attribute exchange. If its a NACK we see if we can specify another
4700  * version, if we can't we stop.
4701  *
4702  * If it is an INFO we reset all params associated with communication
4703  * in that direction over this channel (remember connection is
4704  * essentially 2 independent simplex channels).
4705  */
4706 void
4707 vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt)
4708 {
4709 	vio_ver_msg_t	*ver_pkt;
4710 	vsw_t 		*vswp = ldcp->ldc_vswp;
4711 
4712 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4713 
4714 	/*
4715 	 * We know this is a ctrl/version packet so
4716 	 * cast it into the correct structure.
4717 	 */
4718 	ver_pkt = (vio_ver_msg_t *)pkt;
4719 
4720 	switch (ver_pkt->tag.vio_subtype) {
4721 	case VIO_SUBTYPE_INFO:
4722 		D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n");
4723 
4724 		/*
4725 		 * Record the session id, which we will use from now
4726 		 * until we see another VER_INFO msg. Even then the
4727 		 * session id in most cases will be unchanged, execpt
4728 		 * if channel was reset.
4729 		 */
4730 		if ((ldcp->session_status & VSW_PEER_SESSION) &&
4731 			(ldcp->peer_session != ver_pkt->tag.vio_sid)) {
4732 			DERR(vswp, "%s: updating session id for chan %lld "
4733 				"from %llx to %llx", __func__, ldcp->ldc_id,
4734 				ldcp->peer_session, ver_pkt->tag.vio_sid);
4735 		}
4736 
4737 		ldcp->peer_session = ver_pkt->tag.vio_sid;
4738 		ldcp->session_status |= VSW_PEER_SESSION;
4739 
4740 		/* Legal message at this time ? */
4741 		if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV))
4742 			return;
4743 
4744 		/*
4745 		 * First check the device class. Currently only expect
4746 		 * to be talking to a network device. In the future may
4747 		 * also talk to another switch.
4748 		 */
4749 		if (ver_pkt->dev_class != VDEV_NETWORK) {
4750 			DERR(vswp, "%s: illegal device class %d", __func__,
4751 				ver_pkt->dev_class);
4752 
4753 			ver_pkt->tag.vio_sid = ldcp->local_session;
4754 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
4755 
4756 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
4757 
4758 			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
4759 					sizeof (vio_ver_msg_t), B_TRUE);
4760 
4761 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
4762 			vsw_next_milestone(ldcp);
4763 			return;
4764 		} else {
4765 			ldcp->dev_class = ver_pkt->dev_class;
4766 		}
4767 
4768 		/*
4769 		 * Now check the version.
4770 		 */
4771 		if (vsw_supported_version(ver_pkt) == 0) {
4772 			/*
4773 			 * Support this major version and possibly
4774 			 * adjusted minor version.
4775 			 */
4776 
4777 			D2(vswp, "%s: accepted ver %d:%d", __func__,
4778 				ver_pkt->ver_major, ver_pkt->ver_minor);
4779 
4780 			/* Store accepted values */
4781 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
4782 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
4783 
4784 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
4785 
4786 			ldcp->lane_in.lstate |= VSW_VER_ACK_SENT;
4787 		} else {
4788 			/*
4789 			 * NACK back with the next lower major/minor
4790 			 * pairing we support (if don't suuport any more
4791 			 * versions then they will be set to zero.
4792 			 */
4793 
4794 			D2(vswp, "%s: replying with ver %d:%d", __func__,
4795 				ver_pkt->ver_major, ver_pkt->ver_minor);
4796 
4797 			/* Store updated values */
4798 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
4799 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
4800 
4801 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
4802 
4803 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
4804 		}
4805 
4806 		DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
4807 		ver_pkt->tag.vio_sid = ldcp->local_session;
4808 		(void) vsw_send_msg(ldcp, (void *)ver_pkt,
4809 			sizeof (vio_ver_msg_t), B_TRUE);
4810 
4811 		vsw_next_milestone(ldcp);
4812 		break;
4813 
4814 	case VIO_SUBTYPE_ACK:
4815 		D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__);
4816 
4817 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV))
4818 			return;
4819 
4820 		/* Store updated values */
4821 		ldcp->lane_in.ver_major = ver_pkt->ver_major;
4822 		ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
4823 
4824 
4825 		ldcp->lane_out.lstate |= VSW_VER_ACK_RECV;
4826 		vsw_next_milestone(ldcp);
4827 
4828 		break;
4829 
4830 	case VIO_SUBTYPE_NACK:
4831 		D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__);
4832 
4833 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV))
4834 			return;
4835 
4836 		/*
4837 		 * If our peer sent us a NACK with the ver fields set to
4838 		 * zero then there is nothing more we can do. Otherwise see
4839 		 * if we support either the version suggested, or a lesser
4840 		 * one.
4841 		 */
4842 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
4843 			DERR(vswp, "%s: peer unable to negotiate any "
4844 				"further.", __func__);
4845 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
4846 			vsw_next_milestone(ldcp);
4847 			return;
4848 		}
4849 
4850 		/*
4851 		 * Check to see if we support this major version or
4852 		 * a lower one. If we don't then maj/min will be set
4853 		 * to zero.
4854 		 */
4855 		(void) vsw_supported_version(ver_pkt);
4856 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
4857 			/* Nothing more we can do */
4858 			DERR(vswp, "%s: version negotiation failed.\n",
4859 								__func__);
4860 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
4861 			vsw_next_milestone(ldcp);
4862 		} else {
4863 			/* found a supported major version */
4864 			ldcp->lane_out.ver_major = ver_pkt->ver_major;
4865 			ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
4866 
4867 			D2(vswp, "%s: resending with updated values (%x, %x)",
4868 				__func__, ver_pkt->ver_major,
4869 				ver_pkt->ver_minor);
4870 
4871 			ldcp->lane_out.lstate |= VSW_VER_INFO_SENT;
4872 			ver_pkt->tag.vio_sid = ldcp->local_session;
4873 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
4874 
4875 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
4876 
4877 			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
4878 				sizeof (vio_ver_msg_t), B_TRUE);
4879 
4880 			vsw_next_milestone(ldcp);
4881 
4882 		}
4883 		break;
4884 
4885 	default:
4886 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
4887 			ver_pkt->tag.vio_subtype);
4888 	}
4889 
4890 	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
4891 }
4892 
4893 /*
4894  * Process an attribute packet. We can end up here either because our peer
4895  * has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or our
4896  * peer has sent us an attribute INFO message
4897  *
4898  * If its an ACK we then move to the next stage of the handshake which
4899  * is to send our descriptor ring info to our peer. If its a NACK then
4900  * there is nothing more we can (currently) do.
4901  *
4902  * If we get a valid/acceptable INFO packet (and we have already negotiated
4903  * a version) we ACK back and set channel state to ATTR_RECV, otherwise we
4904  * NACK back and reset channel state to INACTIV.
4905  *
4906  * FUTURE: in time we will probably negotiate over attributes, but for
4907  * the moment unacceptable attributes are regarded as a fatal error.
4908  *
4909  */
4910 void
4911 vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt)
4912 {
4913 	vnet_attr_msg_t		*attr_pkt;
4914 	vsw_t			*vswp = ldcp->ldc_vswp;
4915 	vsw_port_t		*port = ldcp->ldc_port;
4916 	uint64_t		macaddr = 0;
4917 	int			i;
4918 
4919 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
4920 
4921 	/*
4922 	 * We know this is a ctrl/attr packet so
4923 	 * cast it into the correct structure.
4924 	 */
4925 	attr_pkt = (vnet_attr_msg_t *)pkt;
4926 
4927 	switch (attr_pkt->tag.vio_subtype) {
4928 	case VIO_SUBTYPE_INFO:
4929 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
4930 
4931 		if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV))
4932 			return;
4933 
4934 		/*
4935 		 * If the attributes are unacceptable then we NACK back.
4936 		 */
4937 		if (vsw_check_attr(attr_pkt, ldcp->ldc_port)) {
4938 
4939 			DERR(vswp, "%s (chan %d): invalid attributes",
4940 				__func__, ldcp->ldc_id);
4941 
4942 			vsw_free_lane_resources(ldcp, INBOUND);
4943 
4944 			attr_pkt->tag.vio_sid = ldcp->local_session;
4945 			attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
4946 
4947 			DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
4948 			ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT;
4949 			(void) vsw_send_msg(ldcp, (void *)attr_pkt,
4950 				sizeof (vnet_attr_msg_t), B_TRUE);
4951 
4952 			vsw_next_milestone(ldcp);
4953 			return;
4954 		}
4955 
4956 		/*
4957 		 * Otherwise store attributes for this lane and update
4958 		 * lane state.
4959 		 */
4960 		ldcp->lane_in.mtu = attr_pkt->mtu;
4961 		ldcp->lane_in.addr = attr_pkt->addr;
4962 		ldcp->lane_in.addr_type = attr_pkt->addr_type;
4963 		ldcp->lane_in.xfer_mode = attr_pkt->xfer_mode;
4964 		ldcp->lane_in.ack_freq = attr_pkt->ack_freq;
4965 
4966 		macaddr = ldcp->lane_in.addr;
4967 		for (i = ETHERADDRL - 1; i >= 0; i--) {
4968 			port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF;
4969 			macaddr >>= 8;
4970 		}
4971 
4972 		/* create the fdb entry for this port/mac address */
4973 		(void) vsw_add_fdb(vswp, port);
4974 
4975 		/* setup device specifc xmit routines */
4976 		mutex_enter(&port->tx_lock);
4977 		if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE) {
4978 			D2(vswp, "%s: mode = VIO_DRING_MODE", __func__);
4979 			port->transmit = vsw_dringsend;
4980 		} else if (ldcp->lane_in.xfer_mode == VIO_DESC_MODE) {
4981 			D2(vswp, "%s: mode = VIO_DESC_MODE", __func__);
4982 			vsw_create_privring(ldcp);
4983 			port->transmit = vsw_descrsend;
4984 		}
4985 		mutex_exit(&port->tx_lock);
4986 
4987 		attr_pkt->tag.vio_sid = ldcp->local_session;
4988 		attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
4989 
4990 		DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
4991 
4992 		ldcp->lane_in.lstate |= VSW_ATTR_ACK_SENT;
4993 
4994 		(void) vsw_send_msg(ldcp, (void *)attr_pkt,
4995 				sizeof (vnet_attr_msg_t), B_TRUE);
4996 
4997 		vsw_next_milestone(ldcp);
4998 		break;
4999 
5000 	case VIO_SUBTYPE_ACK:
5001 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
5002 
5003 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV))
5004 			return;
5005 
5006 		ldcp->lane_out.lstate |= VSW_ATTR_ACK_RECV;
5007 		vsw_next_milestone(ldcp);
5008 		break;
5009 
5010 	case VIO_SUBTYPE_NACK:
5011 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
5012 
5013 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV))
5014 			return;
5015 
5016 		ldcp->lane_out.lstate |= VSW_ATTR_NACK_RECV;
5017 		vsw_next_milestone(ldcp);
5018 		break;
5019 
5020 	default:
5021 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
5022 			attr_pkt->tag.vio_subtype);
5023 	}
5024 
5025 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
5026 }
5027 
5028 /*
5029  * Process a dring info packet. We can end up here either because our peer
5030  * has ACK/NACK'ed back to an earlier DRING msg we had sent it, or our
5031  * peer has sent us a dring INFO message.
5032  *
5033  * If we get a valid/acceptable INFO packet (and we have already negotiated
5034  * a version) we ACK back and update the lane state, otherwise we NACK back.
5035  *
5036  * FUTURE: nothing to stop client from sending us info on multiple dring's
5037  * but for the moment we will just use the first one we are given.
5038  *
5039  */
5040 void
5041 vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt)
5042 {
5043 	vio_dring_reg_msg_t	*dring_pkt;
5044 	vsw_t			*vswp = ldcp->ldc_vswp;
5045 	ldc_mem_info_t		minfo;
5046 	dring_info_t		*dp, *dbp;
5047 	int			dring_found = 0;
5048 
5049 	/*
5050 	 * We know this is a ctrl/dring packet so
5051 	 * cast it into the correct structure.
5052 	 */
5053 	dring_pkt = (vio_dring_reg_msg_t *)pkt;
5054 
5055 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
5056 
5057 	switch (dring_pkt->tag.vio_subtype) {
5058 	case VIO_SUBTYPE_INFO:
5059 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
5060 
5061 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
5062 			return;
5063 
5064 		/*
5065 		 * If the dring params are unacceptable then we NACK back.
5066 		 */
5067 		if (vsw_check_dring_info(dring_pkt)) {
5068 
5069 			DERR(vswp, "%s (%lld): invalid dring info",
5070 				__func__, ldcp->ldc_id);
5071 
5072 			vsw_free_lane_resources(ldcp, INBOUND);
5073 
5074 			dring_pkt->tag.vio_sid = ldcp->local_session;
5075 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
5076 
5077 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
5078 
5079 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
5080 
5081 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
5082 				sizeof (vio_dring_reg_msg_t), B_TRUE);
5083 
5084 			vsw_next_milestone(ldcp);
5085 			return;
5086 		}
5087 
5088 		/*
5089 		 * Otherwise, attempt to map in the dring using the
5090 		 * cookie. If that succeeds we send back a unique dring
5091 		 * identifier that the sending side will use in future
5092 		 * to refer to this descriptor ring.
5093 		 */
5094 		dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
5095 
5096 		dp->num_descriptors = dring_pkt->num_descriptors;
5097 		dp->descriptor_size = dring_pkt->descriptor_size;
5098 		dp->options = dring_pkt->options;
5099 		dp->ncookies = dring_pkt->ncookies;
5100 
5101 		/*
5102 		 * Note: should only get one cookie. Enforced in
5103 		 * the ldc layer.
5104 		 */
5105 		bcopy(&dring_pkt->cookie[0], &dp->cookie[0],
5106 			sizeof (ldc_mem_cookie_t));
5107 
5108 		D2(vswp, "%s: num_desc %ld : desc_size %ld", __func__,
5109 			dp->num_descriptors, dp->descriptor_size);
5110 		D2(vswp, "%s: options 0x%lx: ncookies %ld", __func__,
5111 			dp->options, dp->ncookies);
5112 
5113 		if ((ldc_mem_dring_map(ldcp->ldc_handle, &dp->cookie[0],
5114 			dp->ncookies, dp->num_descriptors,
5115 			dp->descriptor_size, LDC_SHADOW_MAP,
5116 			&(dp->handle))) != 0) {
5117 
5118 			DERR(vswp, "%s: dring_map failed\n", __func__);
5119 
5120 			kmem_free(dp, sizeof (dring_info_t));
5121 			vsw_free_lane_resources(ldcp, INBOUND);
5122 
5123 			dring_pkt->tag.vio_sid = ldcp->local_session;
5124 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
5125 
5126 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
5127 
5128 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
5129 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
5130 				sizeof (vio_dring_reg_msg_t), B_TRUE);
5131 
5132 			vsw_next_milestone(ldcp);
5133 			return;
5134 		}
5135 
5136 		if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
5137 
5138 			DERR(vswp, "%s: dring_addr failed\n", __func__);
5139 
5140 			kmem_free(dp, sizeof (dring_info_t));
5141 			vsw_free_lane_resources(ldcp, INBOUND);
5142 
5143 			dring_pkt->tag.vio_sid = ldcp->local_session;
5144 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
5145 
5146 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
5147 
5148 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
5149 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
5150 				sizeof (vio_dring_reg_msg_t), B_TRUE);
5151 
5152 			vsw_next_milestone(ldcp);
5153 			return;
5154 		} else {
5155 			/* store the address of the pub part of ring */
5156 			dp->pub_addr = minfo.vaddr;
5157 		}
5158 
5159 		/* no private section as we are importing */
5160 		dp->priv_addr = NULL;
5161 
5162 		/*
5163 		 * Using simple mono increasing int for ident at
5164 		 * the moment.
5165 		 */
5166 		dp->ident = ldcp->next_ident;
5167 		ldcp->next_ident++;
5168 
5169 		dp->end_idx = 0;
5170 		dp->next = NULL;
5171 
5172 		/*
5173 		 * Link it onto the end of the list of drings
5174 		 * for this lane.
5175 		 */
5176 		if (ldcp->lane_in.dringp == NULL) {
5177 			D2(vswp, "%s: adding first INBOUND dring", __func__);
5178 			ldcp->lane_in.dringp = dp;
5179 		} else {
5180 			dbp = ldcp->lane_in.dringp;
5181 
5182 			while (dbp->next != NULL)
5183 				dbp = dbp->next;
5184 
5185 			dbp->next = dp;
5186 		}
5187 
5188 		/* acknowledge it */
5189 		dring_pkt->tag.vio_sid = ldcp->local_session;
5190 		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
5191 		dring_pkt->dring_ident = dp->ident;
5192 
5193 		(void) vsw_send_msg(ldcp, (void *)dring_pkt,
5194 			sizeof (vio_dring_reg_msg_t), B_TRUE);
5195 
5196 		ldcp->lane_in.lstate |= VSW_DRING_ACK_SENT;
5197 		vsw_next_milestone(ldcp);
5198 		break;
5199 
5200 	case VIO_SUBTYPE_ACK:
5201 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
5202 
5203 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV))
5204 			return;
5205 
5206 		/*
5207 		 * Peer is acknowledging our dring info and will have
5208 		 * sent us a dring identifier which we will use to
5209 		 * refer to this ring w.r.t. our peer.
5210 		 */
5211 		dp = ldcp->lane_out.dringp;
5212 		if (dp != NULL) {
5213 			/*
5214 			 * Find the ring this ident should be associated
5215 			 * with.
5216 			 */
5217 			if (vsw_dring_match(dp, dring_pkt)) {
5218 				dring_found = 1;
5219 
5220 			} else while (dp != NULL) {
5221 				if (vsw_dring_match(dp, dring_pkt)) {
5222 					dring_found = 1;
5223 					break;
5224 				}
5225 				dp = dp->next;
5226 			}
5227 
5228 			if (dring_found == 0) {
5229 				DERR(NULL, "%s: unrecognised ring cookie",
5230 					__func__);
5231 				vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
5232 				return;
5233 			}
5234 
5235 		} else {
5236 			DERR(vswp, "%s: DRING ACK received but no drings "
5237 				"allocated", __func__);
5238 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
5239 			return;
5240 		}
5241 
5242 		/* store ident */
5243 		dp->ident = dring_pkt->dring_ident;
5244 		ldcp->lane_out.lstate |= VSW_DRING_ACK_RECV;
5245 		vsw_next_milestone(ldcp);
5246 		break;
5247 
5248 	case VIO_SUBTYPE_NACK:
5249 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
5250 
5251 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV))
5252 			return;
5253 
5254 		ldcp->lane_out.lstate |= VSW_DRING_NACK_RECV;
5255 		vsw_next_milestone(ldcp);
5256 		break;
5257 
5258 	default:
5259 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
5260 			dring_pkt->tag.vio_subtype);
5261 	}
5262 
5263 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
5264 }
5265 
5266 /*
5267  * Process a request from peer to unregister a dring.
5268  *
5269  * For the moment we just restart the handshake if our
5270  * peer endpoint attempts to unregister a dring.
5271  */
5272 void
5273 vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt)
5274 {
5275 	vsw_t			*vswp = ldcp->ldc_vswp;
5276 	vio_dring_unreg_msg_t	*dring_pkt;
5277 
5278 	/*
5279 	 * We know this is a ctrl/dring packet so
5280 	 * cast it into the correct structure.
5281 	 */
5282 	dring_pkt = (vio_dring_unreg_msg_t *)pkt;
5283 
5284 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
5285 
5286 	switch (dring_pkt->tag.vio_subtype) {
5287 	case VIO_SUBTYPE_INFO:
5288 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
5289 
5290 		DWARN(vswp, "%s: restarting handshake..", __func__);
5291 		break;
5292 
5293 	case VIO_SUBTYPE_ACK:
5294 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
5295 
5296 		DWARN(vswp, "%s: restarting handshake..", __func__);
5297 		break;
5298 
5299 	case VIO_SUBTYPE_NACK:
5300 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
5301 
5302 		DWARN(vswp, "%s: restarting handshake..", __func__);
5303 		break;
5304 
5305 	default:
5306 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
5307 			dring_pkt->tag.vio_subtype);
5308 	}
5309 
5310 	vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
5311 
5312 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
5313 }
5314 
/*
 * Turn a multicast message into a NACK carrying our session id and send
 * it back to the peer. Both arguments are evaluated more than once.
 *
 * Wrapped in do { } while (0) so that the macro expands to a single
 * statement and can be used safely as the body of an unbraced if/else.
 * Callers supply the terminating semicolon.
 */
#define	SND_MCST_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		(void) vsw_send_msg((ldcp), (void *)(pkt), \
		    sizeof (vnet_mcast_msg_t), B_TRUE); \
		_NOTE(CONSTCOND) \
	} while (0)
5320 
5321 /*
5322  * Process a multicast request from a vnet.
5323  *
5324  * Vnet's specify a multicast address that they are interested in. This
5325  * address is used as a key into the hash table which forms the multicast
5326  * forwarding database (mFDB).
5327  *
5328  * The table keys are the multicast addresses, while the table entries
5329  * are pointers to lists of ports which wish to receive packets for the
5330  * specified multicast address.
5331  *
5332  * When a multicast packet is being switched we use the address as a key
5333  * into the hash table, and then walk the appropriate port list forwarding
5334  * the pkt to each port in turn.
5335  *
5336  * If a vnet is no longer interested in a particular multicast grouping
5337  * we simply find the correct location in the hash table and then delete
5338  * the relevant port from the port list.
5339  *
5340  * To deal with the case whereby a port is being deleted without first
5341  * removing itself from the lists in the hash table, we maintain a list
5342  * of multicast addresses the port has registered an interest in, within
5343  * the port structure itself. We then simply walk that list of addresses
5344  * using them as keys into the hash table and remove the port from the
5345  * appropriate lists.
5346  */
5347 static void
5348 vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt)
5349 {
5350 	vnet_mcast_msg_t	*mcst_pkt;
5351 	vsw_port_t		*port = ldcp->ldc_port;
5352 	vsw_t			*vswp = ldcp->ldc_vswp;
5353 	int			i;
5354 
5355 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
5356 
5357 	/*
5358 	 * We know this is a ctrl/mcast packet so
5359 	 * cast it into the correct structure.
5360 	 */
5361 	mcst_pkt = (vnet_mcast_msg_t *)pkt;
5362 
5363 	switch (mcst_pkt->tag.vio_subtype) {
5364 	case VIO_SUBTYPE_INFO:
5365 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
5366 
5367 		/*
5368 		 * Check if in correct state to receive a multicast
5369 		 * message (i.e. handshake complete). If not reset
5370 		 * the handshake.
5371 		 */
5372 		if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV))
5373 			return;
5374 
5375 		/*
5376 		 * Before attempting to add or remove address check
5377 		 * that they are valid multicast addresses.
5378 		 * If not, then NACK back.
5379 		 */
5380 		for (i = 0; i < mcst_pkt->count; i++) {
5381 			if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) {
5382 				DERR(vswp, "%s: invalid multicast address",
5383 								__func__);
5384 				SND_MCST_NACK(ldcp, mcst_pkt);
5385 				return;
5386 			}
5387 		}
5388 
5389 		/*
5390 		 * Now add/remove the addresses. If this fails we
5391 		 * NACK back.
5392 		 */
5393 		if (vsw_add_rem_mcst(mcst_pkt, port) != 0) {
5394 			SND_MCST_NACK(ldcp, mcst_pkt);
5395 			return;
5396 		}
5397 
5398 		mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
5399 		mcst_pkt->tag.vio_sid = ldcp->local_session;
5400 
5401 		DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt);
5402 
5403 		(void) vsw_send_msg(ldcp, (void *)mcst_pkt,
5404 				sizeof (vnet_mcast_msg_t), B_TRUE);
5405 		break;
5406 
5407 	case VIO_SUBTYPE_ACK:
5408 		DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
5409 
5410 		/*
5411 		 * We shouldn't ever get a multicast ACK message as
5412 		 * at the moment we never request multicast addresses
5413 		 * to be set on some other device. This may change in
5414 		 * the future if we have cascading switches.
5415 		 */
5416 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV))
5417 			return;
5418 
5419 				/* Do nothing */
5420 		break;
5421 
5422 	case VIO_SUBTYPE_NACK:
5423 		DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
5424 
5425 		/*
5426 		 * We shouldn't get a multicast NACK packet for the
5427 		 * same reasons as we shouldn't get a ACK packet.
5428 		 */
5429 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV))
5430 			return;
5431 
5432 				/* Do nothing */
5433 		break;
5434 
5435 	default:
5436 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
5437 			mcst_pkt->tag.vio_subtype);
5438 	}
5439 
5440 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
5441 }
5442 
5443 static void
5444 vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt)
5445 {
5446 	vio_rdx_msg_t	*rdx_pkt;
5447 	vsw_t		*vswp = ldcp->ldc_vswp;
5448 
5449 	/*
5450 	 * We know this is a ctrl/rdx packet so
5451 	 * cast it into the correct structure.
5452 	 */
5453 	rdx_pkt = (vio_rdx_msg_t *)pkt;
5454 
5455 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
5456 
5457 	switch (rdx_pkt->tag.vio_subtype) {
5458 	case VIO_SUBTYPE_INFO:
5459 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
5460 
5461 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_INFO_RECV))
5462 			return;
5463 
5464 		rdx_pkt->tag.vio_sid = ldcp->local_session;
5465 		rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
5466 
5467 		DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt);
5468 
5469 		ldcp->lane_out.lstate |= VSW_RDX_ACK_SENT;
5470 
5471 		(void) vsw_send_msg(ldcp, (void *)rdx_pkt,
5472 			sizeof (vio_rdx_msg_t), B_TRUE);
5473 
5474 		vsw_next_milestone(ldcp);
5475 		break;
5476 
5477 	case VIO_SUBTYPE_ACK:
5478 		/*
5479 		 * Should be handled in-band by callback handler.
5480 		 */
5481 		DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__);
5482 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
5483 		break;
5484 
5485 	case VIO_SUBTYPE_NACK:
5486 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
5487 
5488 		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_NACK_RECV))
5489 			return;
5490 
5491 		ldcp->lane_in.lstate |= VSW_RDX_NACK_RECV;
5492 		vsw_next_milestone(ldcp);
5493 		break;
5494 
5495 	default:
5496 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
5497 			rdx_pkt->tag.vio_subtype);
5498 	}
5499 
5500 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
5501 }
5502 
5503 static void
5504 vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t tag)
5505 {
5506 	uint16_t	env = tag.vio_subtype_env;
5507 	vsw_t		*vswp = ldcp->ldc_vswp;
5508 
5509 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
5510 
5511 	/* session id check */
5512 	if (ldcp->session_status & VSW_PEER_SESSION) {
5513 		if (ldcp->peer_session != tag.vio_sid) {
5514 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
5515 				__func__, ldcp->ldc_id, tag.vio_sid);
5516 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
5517 			return;
5518 		}
5519 	}
5520 
5521 	/*
5522 	 * It is an error for us to be getting data packets
5523 	 * before the handshake has completed.
5524 	 */
5525 	if (ldcp->hphase != VSW_MILESTONE4) {
5526 		DERR(vswp, "%s: got data packet before handshake complete "
5527 			"hphase %d (%x: %x)", __func__, ldcp->hphase,
5528 			ldcp->lane_in.lstate, ldcp->lane_out.lstate);
5529 		DUMP_FLAGS(ldcp->lane_in.lstate);
5530 		DUMP_FLAGS(ldcp->lane_out.lstate);
5531 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
5532 		return;
5533 	}
5534 
5535 	/*
5536 	 * Switch on vio_subtype envelope, then let lower routines
5537 	 * decide if its an INFO, ACK or NACK packet.
5538 	 */
5539 	if (env == VIO_DRING_DATA) {
5540 		vsw_process_data_dring_pkt(ldcp, dpkt);
5541 	} else if (env == VIO_PKT_DATA) {
5542 		vsw_process_data_raw_pkt(ldcp, dpkt);
5543 	} else if (env == VIO_DESC_DATA) {
5544 		vsw_process_data_ibnd_pkt(ldcp, dpkt);
5545 	} else {
5546 		DERR(vswp, "%s : unknown vio_subtype_env (%x)\n",
5547 							__func__, env);
5548 	}
5549 
5550 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
5551 }
5552 
/*
 * Turn the dring message 'pkt' into a NACK carrying our session id and
 * send it back to the peer on channel 'ldcp'.
 *
 * The body is wrapped in do { } while (0) and the arguments are
 * parenthesised so that the macro behaves as a single statement and can
 * be used safely in an unbraced if/else. Note that both arguments are
 * evaluated more than once.
 */
#define	SND_DRING_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		(void) vsw_send_msg((ldcp), (void *)(pkt), \
		    sizeof (vio_dring_msg_t), B_TRUE); \
	_NOTE(CONSTCOND) \
	} while (0)
5558 
5559 static void
5560 vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt)
5561 {
5562 	vio_dring_msg_t		*dring_pkt;
5563 	vnet_public_desc_t	*pub_addr = NULL;
5564 	vsw_private_desc_t	*priv_addr = NULL;
5565 	dring_info_t		*dp = NULL;
5566 	vsw_t			*vswp = ldcp->ldc_vswp;
5567 	mblk_t			*mp = NULL;
5568 	mblk_t			*bp = NULL;
5569 	mblk_t			*bpt = NULL;
5570 	size_t			nbytes = 0;
5571 	size_t			off = 0;
5572 	uint64_t		ncookies = 0;
5573 	uint64_t		chain = 0;
5574 	uint64_t		j, len;
5575 	uint32_t		pos, start, datalen;
5576 	uint32_t		range_start, range_end;
5577 	int32_t			end, num, cnt = 0;
5578 	int			i, rv, msg_rv = 0;
5579 	boolean_t		ack_needed = B_FALSE;
5580 	boolean_t		prev_desc_ack = B_FALSE;
5581 	int			read_attempts = 0;
5582 
5583 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
5584 
5585 	/*
5586 	 * We know this is a data/dring packet so
5587 	 * cast it into the correct structure.
5588 	 */
5589 	dring_pkt = (vio_dring_msg_t *)dpkt;
5590 
5591 	/*
5592 	 * Switch on the vio_subtype. If its INFO then we need to
5593 	 * process the data. If its an ACK we need to make sure
5594 	 * it makes sense (i.e did we send an earlier data/info),
5595 	 * and if its a NACK then we maybe attempt a retry.
5596 	 */
5597 	switch (dring_pkt->tag.vio_subtype) {
5598 	case VIO_SUBTYPE_INFO:
5599 		D2(vswp, "%s(%lld): VIO_SUBTYPE_INFO", __func__, ldcp->ldc_id);
5600 
5601 		READ_ENTER(&ldcp->lane_in.dlistrw);
5602 		if ((dp = vsw_ident2dring(&ldcp->lane_in,
5603 				dring_pkt->dring_ident)) == NULL) {
5604 			RW_EXIT(&ldcp->lane_in.dlistrw);
5605 
5606 			DERR(vswp, "%s(%lld): unable to find dring from "
5607 				"ident 0x%llx", __func__, ldcp->ldc_id,
5608 				dring_pkt->dring_ident);
5609 
5610 			SND_DRING_NACK(ldcp, dring_pkt);
5611 			return;
5612 		}
5613 
5614 		start = pos = dring_pkt->start_idx;
5615 		end = dring_pkt->end_idx;
5616 		len = dp->num_descriptors;
5617 
5618 		range_start = range_end = pos;
5619 
5620 		D2(vswp, "%s(%lld): start index %ld : end %ld\n",
5621 			__func__, ldcp->ldc_id, start, end);
5622 
5623 		if (end == -1) {
5624 			num = -1;
5625 		} else if (end >= 0) {
5626 			num = end >= pos ?
5627 				end - pos + 1: (len - pos + 1) + end;
5628 
5629 			/* basic sanity check */
5630 			if (end > len) {
5631 				RW_EXIT(&ldcp->lane_in.dlistrw);
5632 				DERR(vswp, "%s(%lld): endpoint %lld outside "
5633 					"ring length %lld", __func__,
5634 					ldcp->ldc_id, end, len);
5635 
5636 				SND_DRING_NACK(ldcp, dring_pkt);
5637 				return;
5638 			}
5639 		} else {
5640 			RW_EXIT(&ldcp->lane_in.dlistrw);
5641 			DERR(vswp, "%s(%lld): invalid endpoint %lld",
5642 				__func__, ldcp->ldc_id, end);
5643 			SND_DRING_NACK(ldcp, dring_pkt);
5644 			return;
5645 		}
5646 
5647 		while (cnt != num) {
5648 vsw_recheck_desc:
5649 			if ((rv = ldc_mem_dring_acquire(dp->handle,
5650 							pos, pos)) != 0) {
5651 				RW_EXIT(&ldcp->lane_in.dlistrw);
5652 				DERR(vswp, "%s(%lld): unable to acquire "
5653 					"descriptor at pos %d: err %d",
5654 					__func__, pos, ldcp->ldc_id, rv);
5655 				SND_DRING_NACK(ldcp, dring_pkt);
5656 				return;
5657 			}
5658 
5659 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + pos;
5660 
5661 			/*
5662 			 * When given a bounded range of descriptors
5663 			 * to process, its an error to hit a descriptor
5664 			 * which is not ready. In the non-bounded case
5665 			 * (end_idx == -1) this simply indicates we have
5666 			 * reached the end of the current active range.
5667 			 */
5668 			if (pub_addr->hdr.dstate != VIO_DESC_READY) {
5669 				/* unbound - no error */
5670 				if (end == -1) {
5671 					if (read_attempts == vsw_read_attempts)
5672 						break;
5673 
5674 					delay(drv_usectohz(vsw_desc_delay));
5675 					read_attempts++;
5676 					goto vsw_recheck_desc;
5677 				}
5678 
5679 				/* bounded - error - so NACK back */
5680 				RW_EXIT(&ldcp->lane_in.dlistrw);
5681 				DERR(vswp, "%s(%lld): descriptor not READY "
5682 					"(%d)", __func__, ldcp->ldc_id,
5683 					pub_addr->hdr.dstate);
5684 				SND_DRING_NACK(ldcp, dring_pkt);
5685 				return;
5686 			}
5687 
5688 			DTRACE_PROBE1(read_attempts, int, read_attempts);
5689 
5690 			range_end = pos;
5691 
5692 			/*
5693 			 * If we ACK'd the previous descriptor then now
5694 			 * record the new range start position for later
5695 			 * ACK's.
5696 			 */
5697 			if (prev_desc_ack) {
5698 				range_start = pos;
5699 
5700 				D2(vswp, "%s(%lld): updating range start "
5701 					"to be %d", __func__, ldcp->ldc_id,
5702 					range_start);
5703 
5704 				prev_desc_ack = B_FALSE;
5705 			}
5706 
5707 			/*
5708 			 * Data is padded to align on 8 byte boundary,
5709 			 * datalen is actual data length, i.e. minus that
5710 			 * padding.
5711 			 */
5712 			datalen = pub_addr->nbytes;
5713 
5714 			/*
5715 			 * Does peer wish us to ACK when we have finished
5716 			 * with this descriptor ?
5717 			 */
5718 			if (pub_addr->hdr.ack)
5719 				ack_needed = B_TRUE;
5720 
5721 			D2(vswp, "%s(%lld): processing desc %lld at pos"
5722 				" 0x%llx : dstate 0x%lx : datalen 0x%lx",
5723 				__func__, ldcp->ldc_id, pos, pub_addr,
5724 				pub_addr->hdr.dstate, datalen);
5725 
5726 			/*
5727 			 * Mark that we are starting to process descriptor.
5728 			 */
5729 			pub_addr->hdr.dstate = VIO_DESC_ACCEPTED;
5730 
5731 			mp = vio_allocb(ldcp->rxh);
5732 			if (mp == NULL) {
5733 				/*
5734 				 * No free receive buffers available, so
5735 				 * fallback onto allocb(9F). Make sure that
5736 				 * we get a data buffer which is a multiple
5737 				 * of 8 as this is required by ldc_mem_copy.
5738 				 */
5739 				DTRACE_PROBE(allocb);
5740 				mp = allocb(datalen + VNET_IPALIGN + 8,
5741 								BPRI_MED);
5742 			}
5743 
5744 			/*
5745 			 * Ensure that we ask ldc for an aligned
5746 			 * number of bytes.
5747 			 */
5748 			nbytes = datalen + VNET_IPALIGN;
5749 			if (nbytes & 0x7) {
5750 				off = 8 - (nbytes & 0x7);
5751 				nbytes += off;
5752 			}
5753 
5754 			ncookies = pub_addr->ncookies;
5755 			rv = ldc_mem_copy(ldcp->ldc_handle,
5756 				(caddr_t)mp->b_rptr, 0, &nbytes,
5757 				pub_addr->memcookie, ncookies,
5758 				LDC_COPY_IN);
5759 
5760 			if (rv != 0) {
5761 				DERR(vswp, "%s(%d): unable to copy in "
5762 					"data from %d cookies in desc %d"
5763 					" (rv %d)", __func__, ldcp->ldc_id,
5764 					ncookies, pos, rv);
5765 				freemsg(mp);
5766 
5767 				pub_addr->hdr.dstate = VIO_DESC_DONE;
5768 				(void) ldc_mem_dring_release(dp->handle,
5769 								pos, pos);
5770 				break;
5771 			} else {
5772 				D2(vswp, "%s(%d): copied in %ld bytes"
5773 					" using %d cookies", __func__,
5774 					ldcp->ldc_id, nbytes, ncookies);
5775 			}
5776 
5777 			/* adjust the read pointer to skip over the padding */
5778 			mp->b_rptr += VNET_IPALIGN;
5779 
5780 			/* point to the actual end of data */
5781 			mp->b_wptr = mp->b_rptr + datalen;
5782 
5783 			/* build a chain of received packets */
5784 			if (bp == NULL) {
5785 				/* first pkt */
5786 				bp = mp;
5787 				bp->b_next = bp->b_prev = NULL;
5788 				bpt = bp;
5789 				chain = 1;
5790 			} else {
5791 				mp->b_next = NULL;
5792 				mp->b_prev = bpt;
5793 				bpt->b_next = mp;
5794 				bpt = mp;
5795 				chain++;
5796 			}
5797 
5798 			/* mark we are finished with this descriptor */
5799 			pub_addr->hdr.dstate = VIO_DESC_DONE;
5800 
5801 			(void) ldc_mem_dring_release(dp->handle, pos, pos);
5802 
5803 			/*
5804 			 * Send an ACK back to peer if requested.
5805 			 */
5806 			if (ack_needed) {
5807 				ack_needed = B_FALSE;
5808 
5809 				dring_pkt->start_idx = range_start;
5810 				dring_pkt->end_idx = range_end;
5811 
5812 				DERR(vswp, "%s(%lld): processed %d %d, ACK"
5813 					" requested", __func__, ldcp->ldc_id,
5814 					dring_pkt->start_idx,
5815 					dring_pkt->end_idx);
5816 
5817 				dring_pkt->dring_process_state = VIO_DP_ACTIVE;
5818 				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
5819 				dring_pkt->tag.vio_sid = ldcp->local_session;
5820 				msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
5821 						sizeof (vio_dring_msg_t),
5822 						B_FALSE);
5823 
5824 				/*
5825 				 * Check if ACK was successfully sent. If not
5826 				 * we break and deal with that below.
5827 				 */
5828 				if (msg_rv != 0)
5829 					break;
5830 
5831 				prev_desc_ack = B_TRUE;
5832 				range_start = pos;
5833 			}
5834 
5835 			/* next descriptor */
5836 			pos = (pos + 1) % len;
5837 			cnt++;
5838 
5839 			/*
5840 			 * Break out of loop here and stop processing to
5841 			 * allow some other network device (or disk) to
5842 			 * get access to the cpu.
5843 			 */
5844 			if (chain > vsw_chain_len) {
5845 				D3(vswp, "%s(%lld): switching chain of %d "
5846 					"msgs", __func__, ldcp->ldc_id, chain);
5847 				break;
5848 			}
5849 		}
5850 		RW_EXIT(&ldcp->lane_in.dlistrw);
5851 
5852 		/*
5853 		 * If when we attempted to send the ACK we found that the
5854 		 * channel had been reset then now handle this. We deal with
5855 		 * it here as we cannot reset the channel while holding the
5856 		 * dlistrw lock, and we don't want to acquire/release it
5857 		 * continuously in the above loop, as a channel reset should
5858 		 * be a rare event.
5859 		 */
5860 		if (msg_rv == ECONNRESET) {
5861 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
5862 			break;
5863 		}
5864 
5865 		/* send the chain of packets to be switched */
5866 		if (bp != NULL) {
5867 			D3(vswp, "%s(%lld): switching chain of %d msgs",
5868 					__func__, ldcp->ldc_id, chain);
5869 			vswp->vsw_switch_frame(vswp, bp, VSW_VNETPORT,
5870 							ldcp->ldc_port, NULL);
5871 		}
5872 
5873 		DTRACE_PROBE1(msg_cnt, int, cnt);
5874 
5875 		/*
5876 		 * We are now finished so ACK back with the state
5877 		 * set to STOPPING so our peer knows we are finished
5878 		 */
5879 		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
5880 		dring_pkt->tag.vio_sid = ldcp->local_session;
5881 
5882 		dring_pkt->dring_process_state = VIO_DP_STOPPED;
5883 
5884 		DTRACE_PROBE(stop_process_sent);
5885 
5886 		/*
5887 		 * We have not processed any more descriptors beyond
5888 		 * the last one we ACK'd.
5889 		 */
5890 		if (prev_desc_ack)
5891 			range_start = range_end;
5892 
5893 		dring_pkt->start_idx = range_start;
5894 		dring_pkt->end_idx = range_end;
5895 
5896 		D2(vswp, "%s(%lld) processed : %d : %d, now stopping",
5897 			__func__, ldcp->ldc_id, dring_pkt->start_idx,
5898 			dring_pkt->end_idx);
5899 
5900 		(void) vsw_send_msg(ldcp, (void *)dring_pkt,
5901 				sizeof (vio_dring_msg_t), B_TRUE);
5902 		break;
5903 
5904 	case VIO_SUBTYPE_ACK:
5905 		D2(vswp, "%s(%lld): VIO_SUBTYPE_ACK", __func__, ldcp->ldc_id);
5906 		/*
5907 		 * Verify that the relevant descriptors are all
5908 		 * marked as DONE
5909 		 */
5910 		READ_ENTER(&ldcp->lane_out.dlistrw);
5911 		if ((dp = vsw_ident2dring(&ldcp->lane_out,
5912 			dring_pkt->dring_ident)) == NULL) {
5913 			RW_EXIT(&ldcp->lane_out.dlistrw);
5914 			DERR(vswp, "%s: unknown ident in ACK", __func__);
5915 			return;
5916 		}
5917 
5918 		pub_addr = (vnet_public_desc_t *)dp->pub_addr;
5919 		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
5920 
5921 		start = end = 0;
5922 		start = dring_pkt->start_idx;
5923 		end = dring_pkt->end_idx;
5924 		len = dp->num_descriptors;
5925 
5926 		j = num = 0;
5927 		/* calculate # descriptors taking into a/c wrap around */
5928 		num = end >= start ? end - start + 1: (len - start + 1) + end;
5929 
5930 		D2(vswp, "%s(%lld): start index %ld : end %ld : num %ld\n",
5931 			__func__, ldcp->ldc_id, start, end, num);
5932 
5933 		mutex_enter(&dp->dlock);
5934 		dp->last_ack_recv = end;
5935 		mutex_exit(&dp->dlock);
5936 
5937 		for (i = start; j < num; i = (i + 1) % len, j++) {
5938 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
5939 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
5940 
5941 			/*
5942 			 * If the last descriptor in a range has the ACK
5943 			 * bit set then we will get two messages from our
5944 			 * peer relating to it. The normal ACK msg and then
5945 			 * a subsequent STOP msg. The first message will have
5946 			 * resulted in the descriptor being reclaimed and
5947 			 * its state set to FREE so when we encounter a non
5948 			 * DONE descriptor we need to check to see if its
5949 			 * because we have just reclaimed it.
5950 			 */
5951 			mutex_enter(&priv_addr->dstate_lock);
5952 			if (pub_addr->hdr.dstate == VIO_DESC_DONE) {
5953 				/* clear all the fields */
5954 				bzero(priv_addr->datap, priv_addr->datalen);
5955 				priv_addr->datalen = 0;
5956 
5957 				pub_addr->hdr.dstate = VIO_DESC_FREE;
5958 				pub_addr->hdr.ack = 0;
5959 
5960 				priv_addr->dstate = VIO_DESC_FREE;
5961 				mutex_exit(&priv_addr->dstate_lock);
5962 
5963 				D3(vswp, "clearing descp %d : pub state "
5964 					"0x%llx : priv state 0x%llx", i,
5965 					pub_addr->hdr.dstate,
5966 					priv_addr->dstate);
5967 
5968 			} else {
5969 				mutex_exit(&priv_addr->dstate_lock);
5970 
5971 				if (dring_pkt->dring_process_state !=
5972 							VIO_DP_STOPPED) {
5973 					DERR(vswp, "%s: descriptor %lld at pos "
5974 						" 0x%llx not DONE (0x%lx)\n",
5975 						__func__, i, pub_addr,
5976 						pub_addr->hdr.dstate);
5977 					RW_EXIT(&ldcp->lane_out.dlistrw);
5978 					return;
5979 				}
5980 			}
5981 		}
5982 
5983 		/*
5984 		 * If our peer is stopping processing descriptors then
5985 		 * we check to make sure it has processed all the descriptors
5986 		 * we have updated. If not then we send it a new message
5987 		 * to prompt it to restart.
5988 		 */
5989 		if (dring_pkt->dring_process_state == VIO_DP_STOPPED) {
5990 			DTRACE_PROBE(stop_process_recv);
5991 			D2(vswp, "%s(%lld): got stopping msg : %d : %d",
5992 				__func__, ldcp->ldc_id, dring_pkt->start_idx,
5993 				dring_pkt->end_idx);
5994 
5995 			/*
5996 			 * Check next descriptor in public section of ring.
5997 			 * If its marked as READY then we need to prompt our
5998 			 * peer to start processing the ring again.
5999 			 */
6000 			i = (end + 1) % len;
6001 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
6002 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
6003 
6004 			/*
6005 			 * Hold the restart lock across all of this to
6006 			 * make sure that its not possible for us to
6007 			 * decide that a msg needs to be sent in the future
6008 			 * but the sending code having already checked is
6009 			 * about to exit.
6010 			 */
6011 			mutex_enter(&dp->restart_lock);
6012 			mutex_enter(&priv_addr->dstate_lock);
6013 			if (pub_addr->hdr.dstate == VIO_DESC_READY) {
6014 
6015 				mutex_exit(&priv_addr->dstate_lock);
6016 
6017 				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
6018 				dring_pkt->tag.vio_sid = ldcp->local_session;
6019 
6020 				mutex_enter(&ldcp->lane_out.seq_lock);
6021 				dring_pkt->seq_num = ldcp->lane_out.seq_num++;
6022 				mutex_exit(&ldcp->lane_out.seq_lock);
6023 
6024 				dring_pkt->start_idx = (end + 1) % len;
6025 				dring_pkt->end_idx = -1;
6026 
6027 				D2(vswp, "%s(%lld) : sending restart msg:"
6028 					" %d : %d", __func__, ldcp->ldc_id,
6029 					dring_pkt->start_idx,
6030 					dring_pkt->end_idx);
6031 
6032 				msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
6033 					sizeof (vio_dring_msg_t), B_FALSE);
6034 
6035 			} else {
6036 				mutex_exit(&priv_addr->dstate_lock);
6037 				dp->restart_reqd = B_TRUE;
6038 			}
6039 			mutex_exit(&dp->restart_lock);
6040 		}
6041 		RW_EXIT(&ldcp->lane_out.dlistrw);
6042 
6043 		/* only do channel reset after dropping dlistrw lock */
6044 		if (msg_rv == ECONNRESET)
6045 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
6046 
6047 		break;
6048 
6049 	case VIO_SUBTYPE_NACK:
6050 		DWARN(vswp, "%s(%lld): VIO_SUBTYPE_NACK",
6051 						__func__, ldcp->ldc_id);
6052 		/*
6053 		 * Something is badly wrong if we are getting NACK's
6054 		 * for our data pkts. So reset the channel.
6055 		 */
6056 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
6057 
6058 		break;
6059 
6060 	default:
6061 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
6062 			ldcp->ldc_id, dring_pkt->tag.vio_subtype);
6063 	}
6064 
6065 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
6066 }
6067 
6068 /*
6069  * VIO_PKT_DATA (a.k.a raw data mode )
6070  *
6071  * Note - currently not supported. Do nothing.
6072  */
6073 static void
6074 vsw_process_data_raw_pkt(vsw_ldc_t *ldcp, void *dpkt)
6075 {
6076 	_NOTE(ARGUNUSED(dpkt))
6077 
6078 	D1(NULL, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
6079 
6080 	DERR(NULL, "%s (%lld): currently  not supported",
6081 						__func__, ldcp->ldc_id);
6082 
6083 	D1(NULL, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
6084 }
6085 
6086 /*
6087  * Process an in-band descriptor message (most likely from
6088  * OBP).
6089  */
6090 static void
6091 vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt)
6092 {
6093 	vnet_ibnd_desc_t	*ibnd_desc;
6094 	dring_info_t		*dp = NULL;
6095 	vsw_private_desc_t	*priv_addr = NULL;
6096 	vsw_t			*vswp = ldcp->ldc_vswp;
6097 	mblk_t			*mp = NULL;
6098 	size_t			nbytes = 0;
6099 	size_t			off = 0;
6100 	uint64_t		idx = 0;
6101 	uint32_t		num = 1, len, datalen = 0;
6102 	uint64_t		ncookies = 0;
6103 	int			i, rv;
6104 	int			j = 0;
6105 
6106 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
6107 
6108 	ibnd_desc = (vnet_ibnd_desc_t *)pkt;
6109 
6110 	switch (ibnd_desc->hdr.tag.vio_subtype) {
6111 	case VIO_SUBTYPE_INFO:
6112 		D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
6113 
6114 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
6115 			return;
6116 
6117 		/*
6118 		 * Data is padded to align on a 8 byte boundary,
6119 		 * nbytes is actual data length, i.e. minus that
6120 		 * padding.
6121 		 */
6122 		datalen = ibnd_desc->nbytes;
6123 
6124 		D2(vswp, "%s(%lld): processing inband desc : "
6125 			": datalen 0x%lx", __func__, ldcp->ldc_id, datalen);
6126 
6127 		ncookies = ibnd_desc->ncookies;
6128 
6129 		/*
6130 		 * allocb(9F) returns an aligned data block. We
6131 		 * need to ensure that we ask ldc for an aligned
6132 		 * number of bytes also.
6133 		 */
6134 		nbytes = datalen;
6135 		if (nbytes & 0x7) {
6136 			off = 8 - (nbytes & 0x7);
6137 			nbytes += off;
6138 		}
6139 
6140 		mp = allocb(datalen, BPRI_MED);
6141 		if (mp == NULL) {
6142 			DERR(vswp, "%s(%lld): allocb failed",
6143 					__func__, ldcp->ldc_id);
6144 			return;
6145 		}
6146 
6147 		rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr,
6148 			0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies,
6149 			LDC_COPY_IN);
6150 
6151 		if (rv != 0) {
6152 			DERR(vswp, "%s(%d): unable to copy in data from "
6153 				"%d cookie(s)", __func__,
6154 				ldcp->ldc_id, ncookies);
6155 			freemsg(mp);
6156 			return;
6157 		} else {
6158 			D2(vswp, "%s(%d): copied in %ld bytes using %d "
6159 				"cookies", __func__, ldcp->ldc_id, nbytes,
6160 				ncookies);
6161 		}
6162 
6163 		/* point to the actual end of data */
6164 		mp->b_wptr = mp->b_rptr + datalen;
6165 
6166 		/*
6167 		 * We ACK back every in-band descriptor message we process
6168 		 */
6169 		ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK;
6170 		ibnd_desc->hdr.tag.vio_sid = ldcp->local_session;
6171 		(void) vsw_send_msg(ldcp, (void *)ibnd_desc,
6172 				sizeof (vnet_ibnd_desc_t), B_TRUE);
6173 
6174 		/* send the packet to be switched */
6175 		vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT,
6176 					ldcp->ldc_port, NULL);
6177 
6178 		break;
6179 
6180 	case VIO_SUBTYPE_ACK:
6181 		D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
6182 
6183 		/* Verify the ACK is valid */
6184 		idx = ibnd_desc->hdr.desc_handle;
6185 
6186 		if (idx >= VSW_RING_NUM_EL) {
6187 			cmn_err(CE_WARN, "!vsw%d: corrupted ACK received "
6188 				"(idx %ld)", vswp->instance, idx);
6189 			return;
6190 		}
6191 
6192 		if ((dp = ldcp->lane_out.dringp) == NULL) {
6193 			DERR(vswp, "%s: no dring found", __func__);
6194 			return;
6195 		}
6196 
6197 		len = dp->num_descriptors;
6198 		/*
6199 		 * If the descriptor we are being ACK'ed for is not the
6200 		 * one we expected, then pkts were lost somwhere, either
6201 		 * when we tried to send a msg, or a previous ACK msg from
6202 		 * our peer. In either case we now reclaim the descriptors
6203 		 * in the range from the last ACK we received up to the
6204 		 * current ACK.
6205 		 */
6206 		if (idx != dp->last_ack_recv) {
6207 			DWARN(vswp, "%s: dropped pkts detected, (%ld, %ld)",
6208 				__func__, dp->last_ack_recv, idx);
6209 			num = idx >= dp->last_ack_recv ?
6210 				idx - dp->last_ack_recv + 1:
6211 				(len - dp->last_ack_recv + 1) + idx;
6212 		}
6213 
6214 		/*
6215 		 * When we sent the in-band message to our peer we
6216 		 * marked the copy in our private ring as READY. We now
6217 		 * check that the descriptor we are being ACK'ed for is in
6218 		 * fact READY, i.e. it is one we have shared with our peer.
6219 		 *
6220 		 * If its not we flag an error, but still reset the descr
6221 		 * back to FREE.
6222 		 */
6223 		for (i = dp->last_ack_recv; j < num; i = (i + 1) % len, j++) {
6224 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
6225 			mutex_enter(&priv_addr->dstate_lock);
6226 			if (priv_addr->dstate != VIO_DESC_READY) {
6227 				DERR(vswp, "%s: (%ld) desc at index %ld not "
6228 					"READY (0x%lx)", __func__,
6229 					ldcp->ldc_id, idx, priv_addr->dstate);
6230 				DERR(vswp, "%s: bound %d: ncookies %ld : "
6231 					"datalen %ld", __func__,
6232 					priv_addr->bound, priv_addr->ncookies,
6233 					priv_addr->datalen);
6234 			}
6235 			D2(vswp, "%s: (%lld) freeing descp at %lld", __func__,
6236 				ldcp->ldc_id, idx);
6237 			/* release resources associated with sent msg */
6238 			bzero(priv_addr->datap, priv_addr->datalen);
6239 			priv_addr->datalen = 0;
6240 			priv_addr->dstate = VIO_DESC_FREE;
6241 			mutex_exit(&priv_addr->dstate_lock);
6242 		}
6243 		/* update to next expected value */
6244 		dp->last_ack_recv = (idx + 1) % dp->num_descriptors;
6245 
6246 		break;
6247 
6248 	case VIO_SUBTYPE_NACK:
6249 		DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
6250 
6251 		/*
6252 		 * We should only get a NACK if our peer doesn't like
6253 		 * something about a message we have sent it. If this
6254 		 * happens we just release the resources associated with
6255 		 * the message. (We are relying on higher layers to decide
6256 		 * whether or not to resend.
6257 		 */
6258 
6259 		/* limit check */
6260 		idx = ibnd_desc->hdr.desc_handle;
6261 
6262 		if (idx >= VSW_RING_NUM_EL) {
6263 			DERR(vswp, "%s: corrupted NACK received (idx %lld)",
6264 				__func__, idx);
6265 			return;
6266 		}
6267 
6268 		if ((dp = ldcp->lane_out.dringp) == NULL) {
6269 			DERR(vswp, "%s: no dring found", __func__);
6270 			return;
6271 		}
6272 
6273 		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
6274 
6275 		/* move to correct location in ring */
6276 		priv_addr += idx;
6277 
6278 		/* release resources associated with sent msg */
6279 		mutex_enter(&priv_addr->dstate_lock);
6280 		bzero(priv_addr->datap, priv_addr->datalen);
6281 		priv_addr->datalen = 0;
6282 		priv_addr->dstate = VIO_DESC_FREE;
6283 		mutex_exit(&priv_addr->dstate_lock);
6284 
6285 		break;
6286 
6287 	default:
6288 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
6289 			ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype);
6290 	}
6291 
6292 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
6293 }
6294 
6295 static void
6296 vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t tag)
6297 {
6298 	_NOTE(ARGUNUSED(epkt))
6299 
6300 	vsw_t		*vswp = ldcp->ldc_vswp;
6301 	uint16_t	env = tag.vio_subtype_env;
6302 
6303 	D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
6304 
6305 	/*
6306 	 * Error vio_subtypes have yet to be defined. So for
6307 	 * the moment we can't do anything.
6308 	 */
6309 	D2(vswp, "%s: (%x) vio_subtype env", __func__, env);
6310 
6311 	D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
6312 }
6313 
6314 /*
6315  * Switch the given ethernet frame when operating in layer 2 mode.
6316  *
6317  * vswp: pointer to the vsw instance
6318  * mp: pointer to chain of ethernet frame(s) to be switched
6319  * caller: identifies the source of this frame as:
6320  * 		1. VSW_VNETPORT - a vsw port (connected to a vnet).
6321  *		2. VSW_PHYSDEV - the physical ethernet device
6322  *		3. VSW_LOCALDEV - vsw configured as a virtual interface
6323  * arg: argument provided by the caller.
6324  *		1. for VNETPORT - pointer to the corresponding vsw_port_t.
6325  *		2. for PHYSDEV - NULL
6326  *		3. for LOCALDEV - pointer to this vsw_t (self)
6327  */
6328 void
6329 vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
6330 			vsw_port_t *arg, mac_resource_handle_t mrh)
6331 {
6332 	struct ether_header	*ehp;
6333 	vsw_port_t		*port = NULL;
6334 	mblk_t			*bp, *ret_m;
6335 	mblk_t			*nmp = NULL;
6336 	vsw_port_list_t		*plist = &vswp->plist;
6337 
6338 	D1(vswp, "%s: enter (caller %d)", __func__, caller);
6339 
6340 	/*
6341 	 * PERF: rather than breaking up the chain here, scan it
6342 	 * to find all mblks heading to same destination and then
6343 	 * pass that sub-chain to the lower transmit functions.
6344 	 */
6345 
6346 	/* process the chain of packets */
6347 	bp = mp;
6348 	while (bp) {
6349 		mp = bp;
6350 		bp = bp->b_next;
6351 		mp->b_next = mp->b_prev = NULL;
6352 		ehp = (struct ether_header *)mp->b_rptr;
6353 
6354 		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
6355 			__func__, MBLKSIZE(mp), MBLKL(mp));
6356 
6357 		READ_ENTER(&vswp->if_lockrw);
6358 		if (ether_cmp(&ehp->ether_dhost, &vswp->if_addr) == 0) {
6359 			/*
6360 			 * If destination is VSW_LOCALDEV (vsw as an eth
6361 			 * interface) and if the device is up & running,
6362 			 * send the packet up the stack on this host.
6363 			 * If the virtual interface is down, drop the packet.
6364 			 */
6365 			if (caller != VSW_LOCALDEV) {
6366 				if (vswp->if_state & VSW_IF_UP) {
6367 					RW_EXIT(&vswp->if_lockrw);
6368 					mac_rx(vswp->if_mh, mrh, mp);
6369 				} else {
6370 					RW_EXIT(&vswp->if_lockrw);
6371 					/* Interface down, drop pkt */
6372 					freemsg(mp);
6373 				}
6374 			} else {
6375 				RW_EXIT(&vswp->if_lockrw);
6376 				freemsg(mp);
6377 			}
6378 			continue;
6379 		}
6380 		RW_EXIT(&vswp->if_lockrw);
6381 
6382 		READ_ENTER(&plist->lockrw);
6383 		port = vsw_lookup_fdb(vswp, ehp);
6384 		if (port) {
6385 			/*
6386 			 * Mark the port as in-use.
6387 			 */
6388 			mutex_enter(&port->ref_lock);
6389 			port->ref_cnt++;
6390 			mutex_exit(&port->ref_lock);
6391 			RW_EXIT(&plist->lockrw);
6392 
6393 			/*
6394 			 * If plumbed and in promisc mode then copy msg
6395 			 * and send up the stack.
6396 			 */
6397 			READ_ENTER(&vswp->if_lockrw);
6398 			if (VSW_U_P(vswp->if_state)) {
6399 				RW_EXIT(&vswp->if_lockrw);
6400 				nmp = copymsg(mp);
6401 				if (nmp)
6402 					mac_rx(vswp->if_mh, mrh, nmp);
6403 			} else {
6404 				RW_EXIT(&vswp->if_lockrw);
6405 			}
6406 
6407 			/*
6408 			 * If the destination is in FDB, the packet
6409 			 * should be forwarded to the correponding
6410 			 * vsw_port (connected to a vnet device -
6411 			 * VSW_VNETPORT)
6412 			 */
6413 			(void) vsw_portsend(port, mp);
6414 
6415 			/*
6416 			 * Decrement use count in port and check if
6417 			 * should wake delete thread.
6418 			 */
6419 			mutex_enter(&port->ref_lock);
6420 			port->ref_cnt--;
6421 			if (port->ref_cnt == 0)
6422 				cv_signal(&port->ref_cv);
6423 			mutex_exit(&port->ref_lock);
6424 		} else {
6425 			RW_EXIT(&plist->lockrw);
6426 			/*
6427 			 * Destination not in FDB.
6428 			 *
6429 			 * If the destination is broadcast or
6430 			 * multicast forward the packet to all
6431 			 * (VNETPORTs, PHYSDEV, LOCALDEV),
6432 			 * except the caller.
6433 			 */
6434 			if (IS_BROADCAST(ehp)) {
6435 				D3(vswp, "%s: BROADCAST pkt", __func__);
6436 				(void) vsw_forward_all(vswp, mp,
6437 								caller, arg);
6438 			} else if (IS_MULTICAST(ehp)) {
6439 				D3(vswp, "%s: MULTICAST pkt", __func__);
6440 				(void) vsw_forward_grp(vswp, mp,
6441 							caller, arg);
6442 			} else {
6443 				/*
6444 				 * If the destination is unicast, and came
6445 				 * from either a logical network device or
6446 				 * the switch itself when it is plumbed, then
6447 				 * send it out on the physical device and also
6448 				 * up the stack if the logical interface is
6449 				 * in promiscious mode.
6450 				 *
6451 				 * NOTE:  The assumption here is that if we
6452 				 * cannot find the destination in our fdb, its
6453 				 * a unicast address, and came from either a
6454 				 * vnet or down the stack (when plumbed) it
6455 				 * must be destinded for an ethernet device
6456 				 * outside our ldoms.
6457 				 */
6458 				if (caller == VSW_VNETPORT) {
6459 					READ_ENTER(&vswp->if_lockrw);
6460 					if (VSW_U_P(vswp->if_state)) {
6461 						RW_EXIT(&vswp->if_lockrw);
6462 						nmp = copymsg(mp);
6463 						if (nmp)
6464 							mac_rx(vswp->if_mh,
6465 								mrh, nmp);
6466 					} else {
6467 						RW_EXIT(&vswp->if_lockrw);
6468 					}
6469 					if ((ret_m = vsw_tx_msg(vswp, mp))
6470 								!= NULL) {
6471 						DERR(vswp, "%s: drop mblks to "
6472 							"phys dev", __func__);
6473 						freemsg(ret_m);
6474 					}
6475 
6476 				} else if (caller == VSW_PHYSDEV) {
6477 					/*
6478 					 * Pkt seen because card in promisc
6479 					 * mode. Send up stack if plumbed in
6480 					 * promisc mode, else drop it.
6481 					 */
6482 					READ_ENTER(&vswp->if_lockrw);
6483 					if (VSW_U_P(vswp->if_state)) {
6484 						RW_EXIT(&vswp->if_lockrw);
6485 						mac_rx(vswp->if_mh, mrh, mp);
6486 					} else {
6487 						RW_EXIT(&vswp->if_lockrw);
6488 						freemsg(mp);
6489 					}
6490 
6491 				} else if (caller == VSW_LOCALDEV) {
6492 					/*
6493 					 * Pkt came down the stack, send out
6494 					 * over physical device.
6495 					 */
6496 					if ((ret_m = vsw_tx_msg(vswp, mp))
6497 								!= NULL) {
6498 						DERR(vswp, "%s: drop mblks to "
6499 							"phys dev", __func__);
6500 						freemsg(ret_m);
6501 					}
6502 				}
6503 			}
6504 		}
6505 	}
6506 	D1(vswp, "%s: exit\n", __func__);
6507 }
6508 
6509 /*
6510  * Switch ethernet frame when in layer 3 mode (i.e. using IP
6511  * layer to do the routing).
6512  *
6513  * There is a large amount of overlap between this function and
6514  * vsw_switch_l2_frame. At some stage we need to revisit and refactor
6515  * both these functions.
6516  */
6517 void
6518 vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
6519 			vsw_port_t *arg, mac_resource_handle_t mrh)
6520 {
6521 	struct ether_header	*ehp;
6522 	vsw_port_t		*port = NULL;
6523 	mblk_t			*bp = NULL;
6524 	vsw_port_list_t		*plist = &vswp->plist;
6525 
6526 	D1(vswp, "%s: enter (caller %d)", __func__, caller);
6527 
6528 	/*
6529 	 * In layer 3 mode should only ever be switching packets
6530 	 * between IP layer and vnet devices. So make sure thats
6531 	 * who is invoking us.
6532 	 */
6533 	if ((caller != VSW_LOCALDEV) && (caller != VSW_VNETPORT)) {
6534 		DERR(vswp, "%s: unexpected caller (%d)", __func__, caller);
6535 		freemsgchain(mp);
6536 		return;
6537 	}
6538 
6539 	/* process the chain of packets */
6540 	bp = mp;
6541 	while (bp) {
6542 		mp = bp;
6543 		bp = bp->b_next;
6544 		mp->b_next = mp->b_prev = NULL;
6545 		ehp = (struct ether_header *)mp->b_rptr;
6546 
6547 		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
6548 			__func__, MBLKSIZE(mp), MBLKL(mp));
6549 
6550 		READ_ENTER(&plist->lockrw);
6551 		port = vsw_lookup_fdb(vswp, ehp);
6552 		if (port) {
6553 			/*
6554 			 * Mark port as in-use.
6555 			 */
6556 			mutex_enter(&port->ref_lock);
6557 			port->ref_cnt++;
6558 			mutex_exit(&port->ref_lock);
6559 			RW_EXIT(&plist->lockrw);
6560 
6561 			D2(vswp, "%s: sending to target port", __func__);
6562 			(void) vsw_portsend(port, mp);
6563 
6564 			/*
6565 			 * Finished with port so decrement ref count and
6566 			 * check if should wake delete thread.
6567 			 */
6568 			mutex_enter(&port->ref_lock);
6569 			port->ref_cnt--;
6570 			if (port->ref_cnt == 0)
6571 				cv_signal(&port->ref_cv);
6572 			mutex_exit(&port->ref_lock);
6573 		} else {
6574 			RW_EXIT(&plist->lockrw);
6575 			/*
6576 			 * Destination not in FDB
6577 			 *
6578 			 * If the destination is broadcast or
6579 			 * multicast forward the packet to all
6580 			 * (VNETPORTs, PHYSDEV, LOCALDEV),
6581 			 * except the caller.
6582 			 */
6583 			if (IS_BROADCAST(ehp)) {
6584 				D2(vswp, "%s: BROADCAST pkt", __func__);
6585 				(void) vsw_forward_all(vswp, mp,
6586 								caller, arg);
6587 			} else if (IS_MULTICAST(ehp)) {
6588 				D2(vswp, "%s: MULTICAST pkt", __func__);
6589 				(void) vsw_forward_grp(vswp, mp,
6590 							caller, arg);
6591 			} else {
6592 				/*
6593 				 * Unicast pkt from vnet that we don't have
6594 				 * an FDB entry for, so must be destinded for
6595 				 * the outside world. Attempt to send up to the
6596 				 * IP layer to allow it to deal with it.
6597 				 */
6598 				if (caller == VSW_VNETPORT) {
6599 					READ_ENTER(&vswp->if_lockrw);
6600 					if (vswp->if_state & VSW_IF_UP) {
6601 						RW_EXIT(&vswp->if_lockrw);
6602 						D2(vswp, "%s: sending up",
6603 							__func__);
6604 						mac_rx(vswp->if_mh, mrh, mp);
6605 					} else {
6606 						RW_EXIT(&vswp->if_lockrw);
6607 						/* Interface down, drop pkt */
6608 						D2(vswp, "%s I/F down",
6609 								__func__);
6610 						freemsg(mp);
6611 					}
6612 				}
6613 			}
6614 		}
6615 	}
6616 
6617 	D1(vswp, "%s: exit", __func__);
6618 }
6619 
6620 /*
6621  * Forward the ethernet frame to all ports (VNETPORTs, PHYSDEV, LOCALDEV),
6622  * except the caller (port on which frame arrived).
6623  */
6624 static int
6625 vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
6626 {
6627 	vsw_port_list_t	*plist = &vswp->plist;
6628 	vsw_port_t	*portp;
6629 	mblk_t		*nmp = NULL;
6630 	mblk_t		*ret_m = NULL;
6631 	int		skip_port = 0;
6632 
6633 	D1(vswp, "vsw_forward_all: enter\n");
6634 
6635 	/*
6636 	 * Broadcast message from inside ldoms so send to outside
6637 	 * world if in either of layer 2 modes.
6638 	 */
6639 	if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) ||
6640 		(vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) &&
6641 		((caller == VSW_LOCALDEV) || (caller == VSW_VNETPORT))) {
6642 
6643 		nmp = dupmsg(mp);
6644 		if (nmp) {
6645 			if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) {
6646 				DERR(vswp, "%s: dropping pkt(s) "
6647 				"consisting of %ld bytes of data for"
6648 				" physical device", __func__, MBLKL(ret_m));
6649 			freemsg(ret_m);
6650 			}
6651 		}
6652 	}
6653 
6654 	if (caller == VSW_VNETPORT)
6655 		skip_port = 1;
6656 
6657 	/*
6658 	 * Broadcast message from other vnet (layer 2 or 3) or outside
6659 	 * world (layer 2 only), send up stack if plumbed.
6660 	 */
6661 	if ((caller == VSW_PHYSDEV) || (caller == VSW_VNETPORT)) {
6662 		READ_ENTER(&vswp->if_lockrw);
6663 		if (vswp->if_state & VSW_IF_UP) {
6664 			RW_EXIT(&vswp->if_lockrw);
6665 			nmp = copymsg(mp);
6666 			if (nmp)
6667 				mac_rx(vswp->if_mh, NULL, nmp);
6668 		} else {
6669 			RW_EXIT(&vswp->if_lockrw);
6670 		}
6671 	}
6672 
6673 	/* send it to all VNETPORTs */
6674 	READ_ENTER(&plist->lockrw);
6675 	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
6676 		D2(vswp, "vsw_forward_all: port %d", portp->p_instance);
6677 		/*
6678 		 * Caution ! - don't reorder these two checks as arg
6679 		 * will be NULL if the caller is PHYSDEV. skip_port is
6680 		 * only set if caller is VNETPORT.
6681 		 */
6682 		if ((skip_port) && (portp == arg))
6683 			continue;
6684 		else {
6685 			nmp = dupmsg(mp);
6686 			if (nmp) {
6687 				(void) vsw_portsend(portp, nmp);
6688 			} else {
6689 				DERR(vswp, "vsw_forward_all: nmp NULL");
6690 			}
6691 		}
6692 	}
6693 	RW_EXIT(&plist->lockrw);
6694 
6695 	freemsg(mp);
6696 
6697 	D1(vswp, "vsw_forward_all: exit\n");
6698 	return (0);
6699 }
6700 
6701 /*
6702  * Forward pkts to any devices or interfaces which have registered
6703  * an interest in them (i.e. multicast groups).
6704  */
6705 static int
6706 vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
6707 {
6708 	struct ether_header	*ehp = (struct ether_header *)mp->b_rptr;
6709 	mfdb_ent_t		*entp = NULL;
6710 	mfdb_ent_t		*tpp = NULL;
6711 	vsw_port_t 		*port;
6712 	uint64_t		key = 0;
6713 	mblk_t			*nmp = NULL;
6714 	mblk_t			*ret_m = NULL;
6715 	boolean_t		check_if = B_TRUE;
6716 
6717 	/*
6718 	 * Convert address to hash table key
6719 	 */
6720 	KEY_HASH(key, ehp->ether_dhost);
6721 
6722 	D1(vswp, "%s: key 0x%llx", __func__, key);
6723 
6724 	/*
6725 	 * If pkt came from either a vnet or down the stack (if we are
6726 	 * plumbed) and we are in layer 2 mode, then we send the pkt out
6727 	 * over the physical adapter, and then check to see if any other
6728 	 * vnets are interested in it.
6729 	 */
6730 	if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) ||
6731 		(vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) &&
6732 		((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV))) {
6733 		nmp = dupmsg(mp);
6734 		if (nmp) {
6735 			if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) {
6736 				DERR(vswp, "%s: dropping pkt(s) "
6737 					"consisting of %ld bytes of "
6738 					"data for physical device",
6739 					__func__, MBLKL(ret_m));
6740 				freemsg(ret_m);
6741 			}
6742 		}
6743 	}
6744 
6745 	READ_ENTER(&vswp->mfdbrw);
6746 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)key,
6747 				(mod_hash_val_t *)&entp) != 0) {
6748 		D3(vswp, "%s: no table entry found for addr 0x%llx",
6749 								__func__, key);
6750 	} else {
6751 		/*
6752 		 * Send to list of devices associated with this address...
6753 		 */
6754 		for (tpp = entp; tpp != NULL; tpp = tpp->nextp) {
6755 
6756 			/* dont send to ourselves */
6757 			if ((caller == VSW_VNETPORT) &&
6758 				(tpp->d_addr == (void *)arg)) {
6759 				port = (vsw_port_t *)tpp->d_addr;
6760 				D3(vswp, "%s: not sending to ourselves"
6761 					" : port %d", __func__,
6762 					port->p_instance);
6763 				continue;
6764 
6765 			} else if ((caller == VSW_LOCALDEV) &&
6766 				(tpp->d_type == VSW_LOCALDEV)) {
6767 				D3(vswp, "%s: not sending back up stack",
6768 					__func__);
6769 				continue;
6770 			}
6771 
6772 			if (tpp->d_type == VSW_VNETPORT) {
6773 				port = (vsw_port_t *)tpp->d_addr;
6774 				D3(vswp, "%s: sending to port %ld for "
6775 					" addr 0x%llx", __func__,
6776 					port->p_instance, key);
6777 
6778 				nmp = dupmsg(mp);
6779 				if (nmp)
6780 					(void) vsw_portsend(port, nmp);
6781 			} else {
6782 				if (vswp->if_state & VSW_IF_UP) {
6783 					nmp = copymsg(mp);
6784 					if (nmp)
6785 						mac_rx(vswp->if_mh, NULL, nmp);
6786 					check_if = B_FALSE;
6787 					D3(vswp, "%s: sending up stack"
6788 						" for addr 0x%llx", __func__,
6789 						key);
6790 				}
6791 			}
6792 		}
6793 	}
6794 
6795 	RW_EXIT(&vswp->mfdbrw);
6796 
6797 	/*
6798 	 * If the pkt came from either a vnet or from physical device,
6799 	 * and if we havent already sent the pkt up the stack then we
6800 	 * check now if we can/should (i.e. the interface is plumbed
6801 	 * and in promisc mode).
6802 	 */
6803 	if ((check_if) &&
6804 		((caller == VSW_VNETPORT) || (caller == VSW_PHYSDEV))) {
6805 		READ_ENTER(&vswp->if_lockrw);
6806 		if (VSW_U_P(vswp->if_state)) {
6807 			RW_EXIT(&vswp->if_lockrw);
6808 			D3(vswp, "%s: (caller %d) finally sending up stack"
6809 				" for addr 0x%llx", __func__, caller, key);
6810 			nmp = copymsg(mp);
6811 			if (nmp)
6812 				mac_rx(vswp->if_mh, NULL, nmp);
6813 		} else {
6814 			RW_EXIT(&vswp->if_lockrw);
6815 		}
6816 	}
6817 
6818 	freemsg(mp);
6819 
6820 	D1(vswp, "%s: exit", __func__);
6821 
6822 	return (0);
6823 }
6824 
6825 /* transmit the packet over the given port */
6826 static int
6827 vsw_portsend(vsw_port_t *port, mblk_t *mp)
6828 {
6829 	vsw_ldc_list_t 	*ldcl = &port->p_ldclist;
6830 	vsw_ldc_t 	*ldcp;
6831 	int		status = 0;
6832 
6833 
6834 	READ_ENTER(&ldcl->lockrw);
6835 	/*
6836 	 * Note for now, we have a single channel.
6837 	 */
6838 	ldcp = ldcl->head;
6839 	if (ldcp == NULL) {
6840 		DERR(port->p_vswp, "vsw_portsend: no ldc: dropping packet\n");
6841 		freemsg(mp);
6842 		RW_EXIT(&ldcl->lockrw);
6843 		return (1);
6844 	}
6845 
6846 	/*
6847 	 * Send the message out using the appropriate
6848 	 * transmit function which will free mblock when it
6849 	 * is finished with it.
6850 	 */
6851 	mutex_enter(&port->tx_lock);
6852 	if (port->transmit != NULL)
6853 		status = (*port->transmit)(ldcp, mp);
6854 	else {
6855 		freemsg(mp);
6856 	}
6857 	mutex_exit(&port->tx_lock);
6858 
6859 	RW_EXIT(&ldcl->lockrw);
6860 
6861 	return (status);
6862 }
6863 
6864 /*
6865  * Send packet out via descriptor ring to a logical device.
6866  */
6867 static int
6868 vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp)
6869 {
6870 	vio_dring_msg_t		dring_pkt;
6871 	dring_info_t		*dp = NULL;
6872 	vsw_private_desc_t	*priv_desc = NULL;
6873 	vnet_public_desc_t	*pub = NULL;
6874 	vsw_t			*vswp = ldcp->ldc_vswp;
6875 	mblk_t			*bp;
6876 	size_t			n, size;
6877 	caddr_t			bufp;
6878 	int			idx;
6879 	int			status = LDC_TX_SUCCESS;
6880 
6881 	D1(vswp, "%s(%lld): enter\n", __func__, ldcp->ldc_id);
6882 
6883 	/* TODO: make test a macro */
6884 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
6885 		(ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
6886 		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
6887 			"packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
6888 			ldcp->lane_out.lstate);
6889 		freemsg(mp);
6890 		return (LDC_TX_FAILURE);
6891 	}
6892 
6893 	/*
6894 	 * Note - using first ring only, this may change
6895 	 * in the future.
6896 	 */
6897 	READ_ENTER(&ldcp->lane_out.dlistrw);
6898 	if ((dp = ldcp->lane_out.dringp) == NULL) {
6899 		RW_EXIT(&ldcp->lane_out.dlistrw);
6900 		DERR(vswp, "%s(%lld): no dring for outbound lane on"
6901 			" channel %d", __func__, ldcp->ldc_id, ldcp->ldc_id);
6902 		freemsg(mp);
6903 		return (LDC_TX_FAILURE);
6904 	}
6905 
6906 	size = msgsize(mp);
6907 	if (size > (size_t)ETHERMAX) {
6908 		RW_EXIT(&ldcp->lane_out.dlistrw);
6909 		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
6910 		    ldcp->ldc_id, size);
6911 		freemsg(mp);
6912 		return (LDC_TX_FAILURE);
6913 	}
6914 
6915 	/*
6916 	 * Find a free descriptor
6917 	 *
6918 	 * Note: for the moment we are assuming that we will only
6919 	 * have one dring going from the switch to each of its
6920 	 * peers. This may change in the future.
6921 	 */
6922 	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
6923 		D2(vswp, "%s(%lld): no descriptor available for ring "
6924 			"at 0x%llx", __func__, ldcp->ldc_id, dp);
6925 
6926 		/* nothing more we can do */
6927 		status = LDC_TX_NORESOURCES;
6928 		goto vsw_dringsend_free_exit;
6929 	} else {
6930 		D2(vswp, "%s(%lld): free private descriptor found at pos "
6931 			"%ld addr 0x%llx\n", __func__, ldcp->ldc_id, idx,
6932 			priv_desc);
6933 	}
6934 
6935 	/* copy data into the descriptor */
6936 	bufp = priv_desc->datap;
6937 	bufp += VNET_IPALIGN;
6938 	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
6939 		n = MBLKL(bp);
6940 		bcopy(bp->b_rptr, bufp, n);
6941 		bufp += n;
6942 	}
6943 
6944 	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
6945 
6946 	pub = priv_desc->descp;
6947 	pub->nbytes = priv_desc->datalen;
6948 
6949 	mutex_enter(&priv_desc->dstate_lock);
6950 	pub->hdr.dstate = VIO_DESC_READY;
6951 	mutex_exit(&priv_desc->dstate_lock);
6952 
6953 	/*
6954 	 * Determine whether or not we need to send a message to our
6955 	 * peer prompting them to read our newly updated descriptor(s).
6956 	 */
6957 	mutex_enter(&dp->restart_lock);
6958 	if (dp->restart_reqd) {
6959 		dp->restart_reqd = B_FALSE;
6960 		mutex_exit(&dp->restart_lock);
6961 
6962 		/*
6963 		 * Send a vio_dring_msg to peer to prompt them to read
6964 		 * the updated descriptor ring.
6965 		 */
6966 		dring_pkt.tag.vio_msgtype = VIO_TYPE_DATA;
6967 		dring_pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
6968 		dring_pkt.tag.vio_subtype_env = VIO_DRING_DATA;
6969 		dring_pkt.tag.vio_sid = ldcp->local_session;
6970 
6971 		/* Note - for now using first ring */
6972 		dring_pkt.dring_ident = dp->ident;
6973 
6974 		mutex_enter(&ldcp->lane_out.seq_lock);
6975 		dring_pkt.seq_num = ldcp->lane_out.seq_num++;
6976 		mutex_exit(&ldcp->lane_out.seq_lock);
6977 
6978 		/*
6979 		 * If last_ack_recv is -1 then we know we've not
6980 		 * received any ack's yet, so this must be the first
6981 		 * msg sent, so set the start to the begining of the ring.
6982 		 */
6983 		mutex_enter(&dp->dlock);
6984 		if (dp->last_ack_recv == -1) {
6985 			dring_pkt.start_idx = 0;
6986 		} else {
6987 			dring_pkt.start_idx = (dp->last_ack_recv + 1) %
6988 						dp->num_descriptors;
6989 		}
6990 		dring_pkt.end_idx = -1;
6991 		mutex_exit(&dp->dlock);
6992 
6993 		D3(vswp, "%s(%lld): dring 0x%llx : ident 0x%llx\n", __func__,
6994 			ldcp->ldc_id, dp, dring_pkt.dring_ident);
6995 		D3(vswp, "%s(%lld): start %lld : end %lld : seq %lld\n",
6996 			__func__, ldcp->ldc_id, dring_pkt.start_idx,
6997 			dring_pkt.end_idx, dring_pkt.seq_num);
6998 
6999 		RW_EXIT(&ldcp->lane_out.dlistrw);
7000 
7001 		(void) vsw_send_msg(ldcp, (void *)&dring_pkt,
7002 					sizeof (vio_dring_msg_t), B_TRUE);
7003 
7004 		/* free the message block */
7005 		freemsg(mp);
7006 		return (status);
7007 
7008 	} else {
7009 		mutex_exit(&dp->restart_lock);
7010 		D2(vswp, "%s(%lld): updating descp %d", __func__,
7011 			ldcp->ldc_id, idx);
7012 	}
7013 
7014 vsw_dringsend_free_exit:
7015 
7016 	RW_EXIT(&ldcp->lane_out.dlistrw);
7017 
7018 	/* free the message block */
7019 	freemsg(mp);
7020 
7021 	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
7022 	return (status);
7023 }
7024 
7025 /*
7026  * Send an in-band descriptor message over ldc.
7027  */
7028 static int
7029 vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp)
7030 {
7031 	vsw_t			*vswp = ldcp->ldc_vswp;
7032 	vnet_ibnd_desc_t	ibnd_msg;
7033 	vsw_private_desc_t	*priv_desc = NULL;
7034 	dring_info_t		*dp = NULL;
7035 	size_t			n, size = 0;
7036 	caddr_t			bufp;
7037 	mblk_t			*bp;
7038 	int			idx, i;
7039 	int			status = LDC_TX_SUCCESS;
7040 	static int		warn_msg = 1;
7041 
7042 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
7043 
7044 	ASSERT(mp != NULL);
7045 
7046 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
7047 		(ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
7048 		DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt",
7049 			__func__, ldcp->ldc_id, ldcp->ldc_status,
7050 			ldcp->lane_out.lstate);
7051 		freemsg(mp);
7052 		return (LDC_TX_FAILURE);
7053 	}
7054 
7055 	/*
7056 	 * only expect single dring to exist, which we use
7057 	 * as an internal buffer, rather than a transfer channel.
7058 	 */
7059 	READ_ENTER(&ldcp->lane_out.dlistrw);
7060 	if ((dp = ldcp->lane_out.dringp) == NULL) {
7061 		DERR(vswp, "%s(%lld): no dring for outbound lane",
7062 			__func__, ldcp->ldc_id);
7063 		DERR(vswp, "%s(%lld) status(%d) state (0x%llx)",
7064 			__func__, ldcp->ldc_id, ldcp->ldc_status,
7065 			ldcp->lane_out.lstate);
7066 		RW_EXIT(&ldcp->lane_out.dlistrw);
7067 		freemsg(mp);
7068 		return (LDC_TX_FAILURE);
7069 	}
7070 
7071 	size = msgsize(mp);
7072 	if (size > (size_t)ETHERMAX) {
7073 		RW_EXIT(&ldcp->lane_out.dlistrw);
7074 		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
7075 		    ldcp->ldc_id, size);
7076 		freemsg(mp);
7077 		return (LDC_TX_FAILURE);
7078 	}
7079 
7080 	/*
7081 	 * Find a free descriptor in our buffer ring
7082 	 */
7083 	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
7084 		RW_EXIT(&ldcp->lane_out.dlistrw);
7085 		if (warn_msg) {
7086 			DERR(vswp, "%s(%lld): no descriptor available for ring "
7087 			"at 0x%llx", __func__, ldcp->ldc_id, dp);
7088 			warn_msg = 0;
7089 		}
7090 
7091 		/* nothing more we can do */
7092 		status = LDC_TX_NORESOURCES;
7093 		goto vsw_descrsend_free_exit;
7094 	} else {
7095 		D2(vswp, "%s(%lld): free private descriptor found at pos "
7096 			"%ld addr 0x%x\n", __func__, ldcp->ldc_id, idx,
7097 			priv_desc);
7098 		warn_msg = 1;
7099 	}
7100 
7101 	/* copy data into the descriptor */
7102 	bufp = priv_desc->datap;
7103 	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
7104 		n = MBLKL(bp);
7105 		bcopy(bp->b_rptr, bufp, n);
7106 		bufp += n;
7107 	}
7108 
7109 	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
7110 
7111 	/* create and send the in-band descp msg */
7112 	ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA;
7113 	ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO;
7114 	ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA;
7115 	ibnd_msg.hdr.tag.vio_sid = ldcp->local_session;
7116 
7117 	mutex_enter(&ldcp->lane_out.seq_lock);
7118 	ibnd_msg.hdr.seq_num = ldcp->lane_out.seq_num++;
7119 	mutex_exit(&ldcp->lane_out.seq_lock);
7120 
7121 	/*
7122 	 * Copy the mem cookies describing the data from the
7123 	 * private region of the descriptor ring into the inband
7124 	 * descriptor.
7125 	 */
7126 	for (i = 0; i < priv_desc->ncookies; i++) {
7127 		bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i],
7128 			sizeof (ldc_mem_cookie_t));
7129 	}
7130 
7131 	ibnd_msg.hdr.desc_handle = idx;
7132 	ibnd_msg.ncookies = priv_desc->ncookies;
7133 	ibnd_msg.nbytes = size;
7134 
7135 	RW_EXIT(&ldcp->lane_out.dlistrw);
7136 
7137 	(void) vsw_send_msg(ldcp, (void *)&ibnd_msg,
7138 			sizeof (vnet_ibnd_desc_t), B_TRUE);
7139 
7140 vsw_descrsend_free_exit:
7141 
7142 	/* free the allocated message blocks */
7143 	freemsg(mp);
7144 
7145 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
7146 	return (status);
7147 }
7148 
7149 static void
7150 vsw_send_ver(void *arg)
7151 {
7152 	vsw_ldc_t	*ldcp = (vsw_ldc_t *)arg;
7153 	vsw_t		*vswp = ldcp->ldc_vswp;
7154 	lane_t		*lp = &ldcp->lane_out;
7155 	vio_ver_msg_t	ver_msg;
7156 
7157 	D1(vswp, "%s enter", __func__);
7158 
7159 	ver_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
7160 	ver_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
7161 	ver_msg.tag.vio_subtype_env = VIO_VER_INFO;
7162 	ver_msg.tag.vio_sid = ldcp->local_session;
7163 
7164 	ver_msg.ver_major = vsw_versions[0].ver_major;
7165 	ver_msg.ver_minor = vsw_versions[0].ver_minor;
7166 	ver_msg.dev_class = VDEV_NETWORK_SWITCH;
7167 
7168 	lp->lstate |= VSW_VER_INFO_SENT;
7169 	lp->ver_major = ver_msg.ver_major;
7170 	lp->ver_minor = ver_msg.ver_minor;
7171 
7172 	DUMP_TAG(ver_msg.tag);
7173 
7174 	(void) vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t), B_TRUE);
7175 
7176 	D1(vswp, "%s (%d): exit", __func__, ldcp->ldc_id);
7177 }
7178 
7179 static void
7180 vsw_send_attr(vsw_ldc_t *ldcp)
7181 {
7182 	vsw_t			*vswp = ldcp->ldc_vswp;
7183 	lane_t			*lp = &ldcp->lane_out;
7184 	vnet_attr_msg_t		attr_msg;
7185 
7186 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
7187 
7188 	/*
7189 	 * Subtype is set to INFO by default
7190 	 */
7191 	attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
7192 	attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
7193 	attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO;
7194 	attr_msg.tag.vio_sid = ldcp->local_session;
7195 
7196 	/* payload copied from default settings for lane */
7197 	attr_msg.mtu = lp->mtu;
7198 	attr_msg.addr_type = lp->addr_type;
7199 	attr_msg.xfer_mode = lp->xfer_mode;
7200 	attr_msg.ack_freq = lp->xfer_mode;
7201 
7202 	READ_ENTER(&vswp->if_lockrw);
7203 	bcopy(&(vswp->if_addr), &(attr_msg.addr), ETHERADDRL);
7204 	RW_EXIT(&vswp->if_lockrw);
7205 
7206 	ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT;
7207 
7208 	DUMP_TAG(attr_msg.tag);
7209 
7210 	(void) vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t), B_TRUE);
7211 
7212 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
7213 }
7214 
7215 /*
7216  * Create dring info msg (which also results in the creation of
7217  * a dring).
7218  */
7219 static vio_dring_reg_msg_t *
7220 vsw_create_dring_info_pkt(vsw_ldc_t *ldcp)
7221 {
7222 	vio_dring_reg_msg_t	*mp;
7223 	dring_info_t		*dp;
7224 	vsw_t			*vswp = ldcp->ldc_vswp;
7225 
7226 	D1(vswp, "vsw_create_dring_info_pkt enter\n");
7227 
7228 	/*
7229 	 * If we can't create a dring, obviously no point sending
7230 	 * a message.
7231 	 */
7232 	if ((dp = vsw_create_dring(ldcp)) == NULL)
7233 		return (NULL);
7234 
7235 	mp = kmem_zalloc(sizeof (vio_dring_reg_msg_t), KM_SLEEP);
7236 
7237 	mp->tag.vio_msgtype = VIO_TYPE_CTRL;
7238 	mp->tag.vio_subtype = VIO_SUBTYPE_INFO;
7239 	mp->tag.vio_subtype_env = VIO_DRING_REG;
7240 	mp->tag.vio_sid = ldcp->local_session;
7241 
7242 	/* payload */
7243 	mp->num_descriptors = dp->num_descriptors;
7244 	mp->descriptor_size = dp->descriptor_size;
7245 	mp->options = dp->options;
7246 	mp->ncookies = dp->ncookies;
7247 	bcopy(&dp->cookie[0], &mp->cookie[0], sizeof (ldc_mem_cookie_t));
7248 
7249 	mp->dring_ident = 0;
7250 
7251 	D1(vswp, "vsw_create_dring_info_pkt exit\n");
7252 
7253 	return (mp);
7254 }
7255 
7256 static void
7257 vsw_send_dring_info(vsw_ldc_t *ldcp)
7258 {
7259 	vio_dring_reg_msg_t	*dring_msg;
7260 	vsw_t			*vswp = ldcp->ldc_vswp;
7261 
7262 	D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id);
7263 
7264 	dring_msg = vsw_create_dring_info_pkt(ldcp);
7265 	if (dring_msg == NULL) {
7266 		cmn_err(CE_WARN, "!vsw%d: %s: error creating msg",
7267 			vswp->instance, __func__);
7268 		return;
7269 	}
7270 
7271 	ldcp->lane_out.lstate |= VSW_DRING_INFO_SENT;
7272 
7273 	DUMP_TAG_PTR((vio_msg_tag_t *)dring_msg);
7274 
7275 	(void) vsw_send_msg(ldcp, dring_msg,
7276 		sizeof (vio_dring_reg_msg_t), B_TRUE);
7277 
7278 	kmem_free(dring_msg, sizeof (vio_dring_reg_msg_t));
7279 
7280 	D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id);
7281 }
7282 
7283 static void
7284 vsw_send_rdx(vsw_ldc_t *ldcp)
7285 {
7286 	vsw_t		*vswp = ldcp->ldc_vswp;
7287 	vio_rdx_msg_t	rdx_msg;
7288 
7289 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
7290 
7291 	rdx_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
7292 	rdx_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
7293 	rdx_msg.tag.vio_subtype_env = VIO_RDX;
7294 	rdx_msg.tag.vio_sid = ldcp->local_session;
7295 
7296 	ldcp->lane_in.lstate |= VSW_RDX_INFO_SENT;
7297 
7298 	DUMP_TAG(rdx_msg.tag);
7299 
7300 	(void) vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t), B_TRUE);
7301 
7302 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
7303 }
7304 
7305 /*
7306  * Generic routine to send message out over ldc channel.
7307  *
7308  * It is possible that when we attempt to write over the ldc channel
7309  * that we get notified that it has been reset. Depending on the value
7310  * of the handle_reset flag we either handle that event here or simply
7311  * notify the caller that the channel was reset.
7312  */
7313 static int
7314 vsw_send_msg(vsw_ldc_t *ldcp, void *msgp, int size, boolean_t handle_reset)
7315 {
7316 	int		rv;
7317 	size_t		msglen = size;
7318 	vio_msg_tag_t	*tag = (vio_msg_tag_t *)msgp;
7319 	vsw_t		*vswp = ldcp->ldc_vswp;
7320 
7321 	D1(vswp, "vsw_send_msg (%lld) enter : sending %d bytes",
7322 			ldcp->ldc_id, size);
7323 
7324 	D2(vswp, "send_msg: type 0x%llx", tag->vio_msgtype);
7325 	D2(vswp, "send_msg: stype 0x%llx", tag->vio_subtype);
7326 	D2(vswp, "send_msg: senv 0x%llx", tag->vio_subtype_env);
7327 
7328 	mutex_enter(&ldcp->ldc_txlock);
7329 	do {
7330 		msglen = size;
7331 		rv = ldc_write(ldcp->ldc_handle, (caddr_t)msgp, &msglen);
7332 	} while (rv == EWOULDBLOCK && --vsw_wretries > 0);
7333 
7334 	if ((rv != 0) || (msglen != size)) {
7335 		DERR(vswp, "vsw_send_msg:ldc_write failed: chan(%lld) "
7336 			"rv(%d) size (%d) msglen(%d)\n", ldcp->ldc_id,
7337 			rv, size, msglen);
7338 	}
7339 	mutex_exit(&ldcp->ldc_txlock);
7340 
7341 	/*
7342 	 * If channel has been reset we either handle it here or
7343 	 * simply report back that it has been reset and let caller
7344 	 * decide what to do.
7345 	 */
7346 	if (rv == ECONNRESET) {
7347 		DWARN(vswp, "%s (%lld) channel reset",
7348 					__func__, ldcp->ldc_id);
7349 
7350 		/*
7351 		 * N.B - must never be holding the dlistrw lock when
7352 		 * we do a reset of the channel.
7353 		 */
7354 		if (handle_reset) {
7355 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
7356 		}
7357 	}
7358 
7359 	return (rv);
7360 }
7361 
7362 /*
7363  * Add an entry into FDB, for the given mac address and port_id.
7364  * Returns 0 on success, 1 on failure.
7365  *
7366  * Lock protecting FDB must be held by calling process.
7367  */
7368 static int
7369 vsw_add_fdb(vsw_t *vswp, vsw_port_t *port)
7370 {
7371 	uint64_t	addr = 0;
7372 
7373 	D1(vswp, "%s: enter", __func__);
7374 
7375 	KEY_HASH(addr, port->p_macaddr);
7376 
7377 	D2(vswp, "%s: key = 0x%llx", __func__, addr);
7378 
7379 	/*
7380 	 * Note: duplicate keys will be rejected by mod_hash.
7381 	 */
7382 	if (mod_hash_insert(vswp->fdb, (mod_hash_key_t)addr,
7383 				(mod_hash_val_t)port) != 0) {
7384 		DERR(vswp, "%s: unable to add entry into fdb.", __func__);
7385 		return (1);
7386 	}
7387 
7388 	D1(vswp, "%s: exit", __func__);
7389 	return (0);
7390 }
7391 
7392 /*
7393  * Remove an entry from FDB.
7394  * Returns 0 on success, 1 on failure.
7395  */
7396 static int
7397 vsw_del_fdb(vsw_t *vswp, vsw_port_t *port)
7398 {
7399 	uint64_t	addr = 0;
7400 
7401 	D1(vswp, "%s: enter", __func__);
7402 
7403 	KEY_HASH(addr, port->p_macaddr);
7404 
7405 	D2(vswp, "%s: key = 0x%llx", __func__, addr);
7406 
7407 	(void) mod_hash_destroy(vswp->fdb, (mod_hash_val_t)addr);
7408 
7409 	D1(vswp, "%s: enter", __func__);
7410 
7411 	return (0);
7412 }
7413 
7414 /*
7415  * Search fdb for a given mac address.
7416  * Returns pointer to the entry if found, else returns NULL.
7417  */
7418 static vsw_port_t *
7419 vsw_lookup_fdb(vsw_t *vswp, struct ether_header *ehp)
7420 {
7421 	uint64_t	key = 0;
7422 	vsw_port_t	*port = NULL;
7423 
7424 	D1(vswp, "%s: enter", __func__);
7425 
7426 	KEY_HASH(key, ehp->ether_dhost);
7427 
7428 	D2(vswp, "%s: key = 0x%llx", __func__, key);
7429 
7430 	if (mod_hash_find(vswp->fdb, (mod_hash_key_t)key,
7431 				(mod_hash_val_t *)&port) != 0) {
7432 		D2(vswp, "%s: no port found", __func__);
7433 		return (NULL);
7434 	}
7435 
7436 	D1(vswp, "%s: exit", __func__);
7437 
7438 	return (port);
7439 }
7440 
7441 /*
7442  * Add or remove multicast address(es).
7443  *
7444  * Returns 0 on success, 1 on failure.
7445  */
7446 static int
7447 vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port)
7448 {
7449 	mcst_addr_t		*mcst_p = NULL;
7450 	vsw_t			*vswp = port->p_vswp;
7451 	uint64_t		addr = 0x0;
7452 	int			i;
7453 
7454 	D1(vswp, "%s: enter", __func__);
7455 
7456 	D2(vswp, "%s: %d addresses", __func__, mcst_pkt->count);
7457 
7458 	mutex_enter(&vswp->mac_lock);
7459 	if (vswp->mh == NULL) {
7460 		mutex_exit(&vswp->mac_lock);
7461 		return (1);
7462 	}
7463 	mutex_exit(&vswp->mac_lock);
7464 
7465 	for (i = 0; i < mcst_pkt->count; i++) {
7466 		/*
7467 		 * Convert address into form that can be used
7468 		 * as hash table key.
7469 		 */
7470 		KEY_HASH(addr, mcst_pkt->mca[i]);
7471 
7472 		/*
7473 		 * Add or delete the specified address/port combination.
7474 		 */
7475 		if (mcst_pkt->set == 0x1) {
7476 			D3(vswp, "%s: adding multicast address 0x%llx for "
7477 				"port %ld", __func__, addr, port->p_instance);
7478 			if (vsw_add_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
7479 				/*
7480 				 * Update the list of multicast
7481 				 * addresses contained within the
7482 				 * port structure to include this new
7483 				 * one.
7484 				 */
7485 				mcst_p = kmem_alloc(sizeof (mcst_addr_t),
7486 								KM_NOSLEEP);
7487 				if (mcst_p == NULL) {
7488 					DERR(vswp, "%s: unable to alloc mem",
7489 						__func__);
7490 					return (1);
7491 				}
7492 
7493 				mcst_p->nextp = NULL;
7494 				mcst_p->addr = addr;
7495 
7496 				mutex_enter(&port->mca_lock);
7497 				mcst_p->nextp = port->mcap;
7498 				port->mcap = mcst_p;
7499 				mutex_exit(&port->mca_lock);
7500 
7501 				/*
7502 				 * Program the address into HW. If the addr
7503 				 * has already been programmed then the MAC
7504 				 * just increments a ref counter (which is
7505 				 * used when the address is being deleted)
7506 				 */
7507 				mutex_enter(&vswp->mac_lock);
7508 				if ((vswp->mh == NULL) ||
7509 					mac_multicst_add(vswp->mh,
7510 						(uchar_t *)&mcst_pkt->mca[i])) {
7511 					mutex_exit(&vswp->mac_lock);
7512 					cmn_err(CE_WARN, "!vsw%d: unable to "
7513 						"add multicast address",
7514 						vswp->instance);
7515 					(void) vsw_del_mcst(vswp, VSW_VNETPORT,
7516 						addr, port);
7517 					vsw_del_addr(VSW_VNETPORT, port, addr);
7518 					return (1);
7519 				}
7520 				mutex_exit(&vswp->mac_lock);
7521 
7522 			} else {
7523 				DERR(vswp, "%s: error adding multicast "
7524 					"address 0x%llx for port %ld",
7525 					__func__, addr, port->p_instance);
7526 				return (1);
7527 			}
7528 		} else {
7529 			/*
7530 			 * Delete an entry from the multicast hash
7531 			 * table and update the address list
7532 			 * appropriately.
7533 			 */
7534 			if (vsw_del_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
7535 				D3(vswp, "%s: deleting multicast address "
7536 					"0x%llx for port %ld", __func__, addr,
7537 					port->p_instance);
7538 
7539 				vsw_del_addr(VSW_VNETPORT, port, addr);
7540 
7541 				/*
7542 				 * Remove the address from HW. The address
7543 				 * will actually only be removed once the ref
7544 				 * count within the MAC layer has dropped to
7545 				 * zero. I.e. we can safely call this fn even
7546 				 * if other ports are interested in this
7547 				 * address.
7548 				 */
7549 				mutex_enter(&vswp->mac_lock);
7550 				if ((vswp->mh == NULL) ||
7551 					mac_multicst_remove(vswp->mh,
7552 						(uchar_t *)&mcst_pkt->mca[i])) {
7553 					mutex_exit(&vswp->mac_lock);
7554 					cmn_err(CE_WARN, "!vsw%d: unable to "
7555 						"remove multicast address",
7556 						vswp->instance);
7557 					return (1);
7558 				}
7559 				mutex_exit(&vswp->mac_lock);
7560 
7561 			} else {
7562 				DERR(vswp, "%s: error deleting multicast "
7563 					"addr 0x%llx for port %ld",
7564 					__func__, addr, port->p_instance);
7565 				return (1);
7566 			}
7567 		}
7568 	}
7569 	D1(vswp, "%s: exit", __func__);
7570 	return (0);
7571 }
7572 
7573 /*
7574  * Add a new multicast entry.
7575  *
7576  * Search hash table based on address. If match found then
7577  * update associated val (which is chain of ports), otherwise
7578  * create new key/val (addr/port) pair and insert into table.
7579  */
7580 static int
7581 vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
7582 {
7583 	int		dup = 0;
7584 	int		rv = 0;
7585 	mfdb_ent_t	*ment = NULL;
7586 	mfdb_ent_t	*tmp_ent = NULL;
7587 	mfdb_ent_t	*new_ent = NULL;
7588 	void		*tgt = NULL;
7589 
7590 	if (devtype == VSW_VNETPORT) {
7591 		/*
7592 		 * Being invoked from a vnet.
7593 		 */
7594 		ASSERT(arg != NULL);
7595 		tgt = arg;
7596 		D2(NULL, "%s: port %d : address 0x%llx", __func__,
7597 			((vsw_port_t *)arg)->p_instance, addr);
7598 	} else {
7599 		/*
7600 		 * We are being invoked via the m_multicst mac entry
7601 		 * point.
7602 		 */
7603 		D2(NULL, "%s: address 0x%llx", __func__, addr);
7604 		tgt = (void *)vswp;
7605 	}
7606 
7607 	WRITE_ENTER(&vswp->mfdbrw);
7608 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
7609 				(mod_hash_val_t *)&ment) != 0) {
7610 
7611 		/* address not currently in table */
7612 		ment = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
7613 		ment->d_addr = (void *)tgt;
7614 		ment->d_type = devtype;
7615 		ment->nextp = NULL;
7616 
7617 		if (mod_hash_insert(vswp->mfdb, (mod_hash_key_t)addr,
7618 			(mod_hash_val_t)ment) != 0) {
7619 			DERR(vswp, "%s: hash table insertion failed", __func__);
7620 			kmem_free(ment, sizeof (mfdb_ent_t));
7621 			rv = 1;
7622 		} else {
7623 			D2(vswp, "%s: added initial entry for 0x%llx to "
7624 				"table", __func__, addr);
7625 		}
7626 	} else {
7627 		/*
7628 		 * Address in table. Check to see if specified port
7629 		 * is already associated with the address. If not add
7630 		 * it now.
7631 		 */
7632 		tmp_ent = ment;
7633 		while (tmp_ent != NULL) {
7634 			if (tmp_ent->d_addr == (void *)tgt) {
7635 				if (devtype == VSW_VNETPORT) {
7636 					DERR(vswp, "%s: duplicate port entry "
7637 						"found for portid %ld and key "
7638 						"0x%llx", __func__,
7639 						((vsw_port_t *)arg)->p_instance,
7640 						addr);
7641 				} else {
7642 					DERR(vswp, "%s: duplicate entry found"
7643 						"for key 0x%llx",
7644 						__func__, addr);
7645 				}
7646 				rv = 1;
7647 				dup = 1;
7648 				break;
7649 			}
7650 			tmp_ent = tmp_ent->nextp;
7651 		}
7652 
7653 		/*
7654 		 * Port not on list so add it to end now.
7655 		 */
7656 		if (0 == dup) {
7657 			D2(vswp, "%s: added entry for 0x%llx to table",
7658 				__func__, addr);
7659 			new_ent = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
7660 			new_ent->d_addr = (void *)tgt;
7661 			new_ent->d_type = devtype;
7662 			new_ent->nextp = NULL;
7663 
7664 			tmp_ent = ment;
7665 			while (tmp_ent->nextp != NULL)
7666 				tmp_ent = tmp_ent->nextp;
7667 
7668 			tmp_ent->nextp = new_ent;
7669 		}
7670 	}
7671 
7672 	RW_EXIT(&vswp->mfdbrw);
7673 	return (rv);
7674 }
7675 
7676 /*
7677  * Remove a multicast entry from the hashtable.
7678  *
7679  * Search hash table based on address. If match found, scan
7680  * list of ports associated with address. If specified port
7681  * found remove it from list.
7682  */
7683 static int
7684 vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
7685 {
7686 	mfdb_ent_t	*ment = NULL;
7687 	mfdb_ent_t	*curr_p, *prev_p;
7688 	void		*tgt = NULL;
7689 
7690 	D1(vswp, "%s: enter", __func__);
7691 
7692 	if (devtype == VSW_VNETPORT) {
7693 		tgt = (vsw_port_t *)arg;
7694 		D2(vswp, "%s: removing port %d from mFDB for address"
7695 			" 0x%llx", __func__, ((vsw_port_t *)tgt)->p_instance,
7696 			addr);
7697 	} else {
7698 		D2(vswp, "%s: removing entry", __func__);
7699 		tgt = (void *)vswp;
7700 	}
7701 
7702 	WRITE_ENTER(&vswp->mfdbrw);
7703 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
7704 				(mod_hash_val_t *)&ment) != 0) {
7705 		D2(vswp, "%s: address 0x%llx not in table", __func__, addr);
7706 		RW_EXIT(&vswp->mfdbrw);
7707 		return (1);
7708 	}
7709 
7710 	prev_p = curr_p = ment;
7711 
7712 	while (curr_p != NULL) {
7713 		if (curr_p->d_addr == (void *)tgt) {
7714 			if (devtype == VSW_VNETPORT) {
7715 				D2(vswp, "%s: port %d found", __func__,
7716 					((vsw_port_t *)tgt)->p_instance);
7717 			} else {
7718 				D2(vswp, "%s: instance found", __func__);
7719 			}
7720 
7721 			if (prev_p == curr_p) {
7722 				/*
7723 				 * head of list, if no other element is in
7724 				 * list then destroy this entry, otherwise
7725 				 * just replace it with updated value.
7726 				 */
7727 				ment = curr_p->nextp;
7728 				kmem_free(curr_p, sizeof (mfdb_ent_t));
7729 				if (ment == NULL) {
7730 					(void) mod_hash_destroy(vswp->mfdb,
7731 							(mod_hash_val_t)addr);
7732 				} else {
7733 					(void) mod_hash_replace(vswp->mfdb,
7734 							(mod_hash_key_t)addr,
7735 							(mod_hash_val_t)ment);
7736 				}
7737 			} else {
7738 				/*
7739 				 * Not head of list, no need to do
7740 				 * replacement, just adjust list pointers.
7741 				 */
7742 				prev_p->nextp = curr_p->nextp;
7743 				kmem_free(curr_p, sizeof (mfdb_ent_t));
7744 			}
7745 			break;
7746 		}
7747 
7748 		prev_p = curr_p;
7749 		curr_p = curr_p->nextp;
7750 	}
7751 
7752 	RW_EXIT(&vswp->mfdbrw);
7753 
7754 	D1(vswp, "%s: exit", __func__);
7755 
7756 	return (0);
7757 }
7758 
7759 /*
7760  * Port is being deleted, but has registered an interest in one
7761  * or more multicast groups. Using the list of addresses maintained
7762  * within the port structure find the appropriate entry in the hash
7763  * table and remove this port from the list of interested ports.
7764  */
7765 static void
7766 vsw_del_mcst_port(vsw_port_t *port)
7767 {
7768 	mcst_addr_t	*mcst_p = NULL;
7769 	vsw_t		*vswp = port->p_vswp;
7770 
7771 	D1(vswp, "%s: enter", __func__);
7772 
7773 	mutex_enter(&port->mca_lock);
7774 	while (port->mcap != NULL) {
7775 		(void) vsw_del_mcst(vswp, VSW_VNETPORT,
7776 					port->mcap->addr, port);
7777 
7778 		mcst_p = port->mcap->nextp;
7779 		kmem_free(port->mcap, sizeof (mcst_addr_t));
7780 		port->mcap = mcst_p;
7781 	}
7782 	mutex_exit(&port->mca_lock);
7783 
7784 	D1(vswp, "%s: exit", __func__);
7785 }
7786 
7787 /*
7788  * This vsw instance is detaching, but has registered an interest in one
7789  * or more multicast groups. Using the list of addresses maintained
7790  * within the vsw structure find the appropriate entry in the hash
7791  * table and remove this instance from the list of interested ports.
7792  */
7793 static void
7794 vsw_del_mcst_vsw(vsw_t *vswp)
7795 {
7796 	mcst_addr_t	*next_p = NULL;
7797 
7798 	D1(vswp, "%s: enter", __func__);
7799 
7800 	mutex_enter(&vswp->mca_lock);
7801 
7802 	while (vswp->mcap != NULL) {
7803 		DERR(vswp, "%s: deleting addr 0x%llx",
7804 			__func__, vswp->mcap->addr);
7805 		(void) vsw_del_mcst(vswp, VSW_LOCALDEV,
7806 				vswp->mcap->addr, NULL);
7807 
7808 		next_p = vswp->mcap->nextp;
7809 		kmem_free(vswp->mcap, sizeof (mcst_addr_t));
7810 		vswp->mcap = next_p;
7811 	}
7812 
7813 	vswp->mcap = NULL;
7814 	mutex_exit(&vswp->mca_lock);
7815 
7816 	D1(vswp, "%s: exit", __func__);
7817 }
7818 
7819 
7820 /*
7821  * Remove the specified address from the list of address maintained
7822  * in this port node.
7823  */
7824 static void
7825 vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr)
7826 {
7827 	vsw_t		*vswp = NULL;
7828 	vsw_port_t	*port = NULL;
7829 	mcst_addr_t	*prev_p = NULL;
7830 	mcst_addr_t	*curr_p = NULL;
7831 
7832 	D1(NULL, "%s: enter : devtype %d : addr 0x%llx",
7833 		__func__, devtype, addr);
7834 
7835 	if (devtype == VSW_VNETPORT) {
7836 		port = (vsw_port_t *)arg;
7837 		mutex_enter(&port->mca_lock);
7838 		prev_p = curr_p = port->mcap;
7839 	} else {
7840 		vswp = (vsw_t *)arg;
7841 		mutex_enter(&vswp->mca_lock);
7842 		prev_p = curr_p = vswp->mcap;
7843 	}
7844 
7845 	while (curr_p != NULL) {
7846 		if (curr_p->addr == addr) {
7847 			D2(NULL, "%s: address found", __func__);
7848 			/* match found */
7849 			if (prev_p == curr_p) {
7850 				/* list head */
7851 				if (devtype == VSW_VNETPORT)
7852 					port->mcap = curr_p->nextp;
7853 				else
7854 					vswp->mcap = curr_p->nextp;
7855 			} else {
7856 				prev_p->nextp = curr_p->nextp;
7857 			}
7858 			kmem_free(curr_p, sizeof (mcst_addr_t));
7859 			break;
7860 		} else {
7861 			prev_p = curr_p;
7862 			curr_p = curr_p->nextp;
7863 		}
7864 	}
7865 
7866 	if (devtype == VSW_VNETPORT)
7867 		mutex_exit(&port->mca_lock);
7868 	else
7869 		mutex_exit(&vswp->mca_lock);
7870 
7871 	D1(NULL, "%s: exit", __func__);
7872 }
7873 
7874 /*
7875  * Creates a descriptor ring (dring) and links it into the
7876  * link of outbound drings for this channel.
7877  *
7878  * Returns NULL if creation failed.
7879  */
7880 static dring_info_t *
7881 vsw_create_dring(vsw_ldc_t *ldcp)
7882 {
7883 	vsw_private_desc_t	*priv_addr = NULL;
7884 	vsw_t			*vswp = ldcp->ldc_vswp;
7885 	ldc_mem_info_t		minfo;
7886 	dring_info_t		*dp, *tp;
7887 	int			i;
7888 
7889 	dp = (dring_info_t *)kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
7890 
7891 	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
7892 
7893 	/* create public section of ring */
7894 	if ((ldc_mem_dring_create(VSW_RING_NUM_EL,
7895 			VSW_PUB_SIZE, &dp->handle)) != 0) {
7896 
7897 		DERR(vswp, "vsw_create_dring(%lld): ldc dring create "
7898 			"failed", ldcp->ldc_id);
7899 		goto create_fail_exit;
7900 	}
7901 
7902 	ASSERT(dp->handle != NULL);
7903 
7904 	/*
7905 	 * Get the base address of the public section of the ring.
7906 	 */
7907 	if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
7908 		DERR(vswp, "vsw_create_dring(%lld): dring info failed\n",
7909 			ldcp->ldc_id);
7910 		goto dring_fail_exit;
7911 	} else {
7912 		ASSERT(minfo.vaddr != 0);
7913 		dp->pub_addr = minfo.vaddr;
7914 	}
7915 
7916 	dp->num_descriptors = VSW_RING_NUM_EL;
7917 	dp->descriptor_size = VSW_PUB_SIZE;
7918 	dp->options = VIO_TX_DRING;
7919 	dp->ncookies = 1;	/* guaranteed by ldc */
7920 
7921 	/*
7922 	 * create private portion of ring
7923 	 */
7924 	dp->priv_addr = (vsw_private_desc_t *)kmem_zalloc(
7925 		(sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL), KM_SLEEP);
7926 
7927 	if (vsw_setup_ring(ldcp, dp)) {
7928 		DERR(vswp, "%s: unable to setup ring", __func__);
7929 		goto dring_fail_exit;
7930 	}
7931 
7932 	/* haven't used any descriptors yet */
7933 	dp->end_idx = 0;
7934 	dp->last_ack_recv = -1;
7935 
7936 	/* bind dring to the channel */
7937 	if ((ldc_mem_dring_bind(ldcp->ldc_handle, dp->handle,
7938 		LDC_SHADOW_MAP, LDC_MEM_RW,
7939 		&dp->cookie[0], &dp->ncookies)) != 0) {
7940 		DERR(vswp, "vsw_create_dring: unable to bind to channel "
7941 			"%lld", ldcp->ldc_id);
7942 		goto dring_fail_exit;
7943 	}
7944 
7945 	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
7946 	dp->restart_reqd = B_TRUE;
7947 
7948 	/*
7949 	 * Only ever create rings for outgoing lane. Link it onto
7950 	 * end of list.
7951 	 */
7952 	WRITE_ENTER(&ldcp->lane_out.dlistrw);
7953 	if (ldcp->lane_out.dringp == NULL) {
7954 		D2(vswp, "vsw_create_dring: adding first outbound ring");
7955 		ldcp->lane_out.dringp = dp;
7956 	} else {
7957 		tp = ldcp->lane_out.dringp;
7958 		while (tp->next != NULL)
7959 			tp = tp->next;
7960 
7961 		tp->next = dp;
7962 	}
7963 	RW_EXIT(&ldcp->lane_out.dlistrw);
7964 
7965 	return (dp);
7966 
7967 dring_fail_exit:
7968 	(void) ldc_mem_dring_destroy(dp->handle);
7969 
7970 create_fail_exit:
7971 	if (dp->priv_addr != NULL) {
7972 		priv_addr = dp->priv_addr;
7973 		for (i = 0; i < VSW_RING_NUM_EL; i++) {
7974 			if (priv_addr->memhandle != NULL)
7975 				(void) ldc_mem_free_handle(
7976 						priv_addr->memhandle);
7977 			priv_addr++;
7978 		}
7979 		kmem_free(dp->priv_addr,
7980 			(sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL));
7981 	}
7982 	mutex_destroy(&dp->dlock);
7983 
7984 	kmem_free(dp, sizeof (dring_info_t));
7985 	return (NULL);
7986 }
7987 
7988 /*
7989  * Create a ring consisting of just a private portion and link
7990  * it into the list of rings for the outbound lane.
7991  *
7992  * These type of rings are used primarily for temporary data
7993  * storage (i.e. as data buffers).
7994  */
7995 void
7996 vsw_create_privring(vsw_ldc_t *ldcp)
7997 {
7998 	dring_info_t		*dp, *tp;
7999 	vsw_t			*vswp = ldcp->ldc_vswp;
8000 
8001 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
8002 
8003 	dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
8004 
8005 	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
8006 
8007 	/* no public section */
8008 	dp->pub_addr = NULL;
8009 
8010 	dp->priv_addr = kmem_zalloc((sizeof (vsw_private_desc_t) *
8011 					VSW_RING_NUM_EL), KM_SLEEP);
8012 
8013 	dp->num_descriptors = VSW_RING_NUM_EL;
8014 
8015 	if (vsw_setup_ring(ldcp, dp)) {
8016 		DERR(vswp, "%s: setup of ring failed", __func__);
8017 		kmem_free(dp->priv_addr,
8018 			(sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL));
8019 		mutex_destroy(&dp->dlock);
8020 		kmem_free(dp, sizeof (dring_info_t));
8021 		return;
8022 	}
8023 
8024 	/* haven't used any descriptors yet */
8025 	dp->end_idx = 0;
8026 
8027 	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
8028 	dp->restart_reqd = B_TRUE;
8029 
8030 	/*
8031 	 * Only ever create rings for outgoing lane. Link it onto
8032 	 * end of list.
8033 	 */
8034 	WRITE_ENTER(&ldcp->lane_out.dlistrw);
8035 	if (ldcp->lane_out.dringp == NULL) {
8036 		D2(vswp, "%s: adding first outbound privring", __func__);
8037 		ldcp->lane_out.dringp = dp;
8038 	} else {
8039 		tp = ldcp->lane_out.dringp;
8040 		while (tp->next != NULL)
8041 			tp = tp->next;
8042 
8043 		tp->next = dp;
8044 	}
8045 	RW_EXIT(&ldcp->lane_out.dlistrw);
8046 
8047 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
8048 }
8049 
8050 /*
8051  * Setup the descriptors in the dring. Returns 0 on success, 1 on
8052  * failure.
8053  */
8054 int
8055 vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp)
8056 {
8057 	vnet_public_desc_t	*pub_addr = NULL;
8058 	vsw_private_desc_t	*priv_addr = NULL;
8059 	vsw_t			*vswp = ldcp->ldc_vswp;
8060 	uint64_t		*tmpp;
8061 	uint64_t		offset = 0;
8062 	uint32_t		ncookies = 0;
8063 	static char		*name = "vsw_setup_ring";
8064 	int			i, j, nc, rv;
8065 
8066 	priv_addr = dp->priv_addr;
8067 	pub_addr = dp->pub_addr;
8068 
8069 	/* public section may be null but private should never be */
8070 	ASSERT(priv_addr != NULL);
8071 
8072 	/*
8073 	 * Allocate the region of memory which will be used to hold
8074 	 * the data the descriptors will refer to.
8075 	 */
8076 	dp->data_sz = (VSW_RING_NUM_EL * VSW_RING_EL_DATA_SZ);
8077 	dp->data_addr = kmem_alloc(dp->data_sz, KM_SLEEP);
8078 
8079 	D2(vswp, "%s: allocated %lld bytes at 0x%llx\n", name,
8080 		dp->data_sz, dp->data_addr);
8081 
8082 	tmpp = (uint64_t *)dp->data_addr;
8083 	offset = VSW_RING_EL_DATA_SZ / sizeof (tmpp);
8084 
8085 	/*
8086 	 * Initialise some of the private and public (if they exist)
8087 	 * descriptor fields.
8088 	 */
8089 	for (i = 0; i < VSW_RING_NUM_EL; i++) {
8090 		mutex_init(&priv_addr->dstate_lock, NULL, MUTEX_DRIVER, NULL);
8091 
8092 		if ((ldc_mem_alloc_handle(ldcp->ldc_handle,
8093 			&priv_addr->memhandle)) != 0) {
8094 			DERR(vswp, "%s: alloc mem handle failed", name);
8095 			goto setup_ring_cleanup;
8096 		}
8097 
8098 		priv_addr->datap = (void *)tmpp;
8099 
8100 		rv = ldc_mem_bind_handle(priv_addr->memhandle,
8101 			(caddr_t)priv_addr->datap, VSW_RING_EL_DATA_SZ,
8102 			LDC_SHADOW_MAP, LDC_MEM_R|LDC_MEM_W,
8103 			&(priv_addr->memcookie[0]), &ncookies);
8104 		if (rv != 0) {
8105 			DERR(vswp, "%s(%lld): ldc_mem_bind_handle failed "
8106 				"(rv %d)", name, ldcp->ldc_id, rv);
8107 			goto setup_ring_cleanup;
8108 		}
8109 		priv_addr->bound = 1;
8110 
8111 		D2(vswp, "%s: %d: memcookie 0 : addr 0x%llx : size 0x%llx",
8112 			name, i, priv_addr->memcookie[0].addr,
8113 			priv_addr->memcookie[0].size);
8114 
8115 		if (ncookies >= (uint32_t)(VSW_MAX_COOKIES + 1)) {
8116 			DERR(vswp, "%s(%lld) ldc_mem_bind_handle returned "
8117 				"invalid num of cookies (%d) for size 0x%llx",
8118 				name, ldcp->ldc_id, ncookies,
8119 				VSW_RING_EL_DATA_SZ);
8120 
8121 			goto setup_ring_cleanup;
8122 		} else {
8123 			for (j = 1; j < ncookies; j++) {
8124 				rv = ldc_mem_nextcookie(priv_addr->memhandle,
8125 					&(priv_addr->memcookie[j]));
8126 				if (rv != 0) {
8127 					DERR(vswp, "%s: ldc_mem_nextcookie "
8128 						"failed rv (%d)", name, rv);
8129 					goto setup_ring_cleanup;
8130 				}
8131 				D3(vswp, "%s: memcookie %d : addr 0x%llx : "
8132 					"size 0x%llx", name, j,
8133 					priv_addr->memcookie[j].addr,
8134 					priv_addr->memcookie[j].size);
8135 			}
8136 
8137 		}
8138 		priv_addr->ncookies = ncookies;
8139 		priv_addr->dstate = VIO_DESC_FREE;
8140 
8141 		if (pub_addr != NULL) {
8142 
8143 			/* link pub and private sides */
8144 			priv_addr->descp = pub_addr;
8145 
8146 			pub_addr->ncookies = priv_addr->ncookies;
8147 
8148 			for (nc = 0; nc < pub_addr->ncookies; nc++) {
8149 				bcopy(&priv_addr->memcookie[nc],
8150 					&pub_addr->memcookie[nc],
8151 					sizeof (ldc_mem_cookie_t));
8152 			}
8153 
8154 			pub_addr->hdr.dstate = VIO_DESC_FREE;
8155 			pub_addr++;
8156 		}
8157 
8158 		/*
8159 		 * move to next element in the dring and the next
8160 		 * position in the data buffer.
8161 		 */
8162 		priv_addr++;
8163 		tmpp += offset;
8164 	}
8165 
8166 	return (0);
8167 
8168 setup_ring_cleanup:
8169 	priv_addr = dp->priv_addr;
8170 
8171 	for (j = 0; j < i; j++) {
8172 		(void) ldc_mem_unbind_handle(priv_addr->memhandle);
8173 		(void) ldc_mem_free_handle(priv_addr->memhandle);
8174 
8175 		mutex_destroy(&priv_addr->dstate_lock);
8176 
8177 		priv_addr++;
8178 	}
8179 	kmem_free(dp->data_addr, dp->data_sz);
8180 
8181 	return (1);
8182 }
8183 
8184 /*
8185  * Searches the private section of a ring for a free descriptor,
8186  * starting at the location of the last free descriptor found
8187  * previously.
8188  *
8189  * Returns 0 if free descriptor is available, and updates state
8190  * of private descriptor to VIO_DESC_READY,  otherwise returns 1.
8191  *
8192  * FUTURE: might need to return contiguous range of descriptors
8193  * as dring info msg assumes all will be contiguous.
8194  */
8195 static int
8196 vsw_dring_find_free_desc(dring_info_t *dringp,
8197 		vsw_private_desc_t **priv_p, int *idx)
8198 {
8199 	vsw_private_desc_t	*addr = NULL;
8200 	int			num = VSW_RING_NUM_EL;
8201 	int			ret = 1;
8202 
8203 	D1(NULL, "%s enter\n", __func__);
8204 
8205 	ASSERT(dringp->priv_addr != NULL);
8206 
8207 	D2(NULL, "%s: searching ring, dringp 0x%llx : start pos %lld",
8208 			__func__, dringp, dringp->end_idx);
8209 
8210 	addr = (vsw_private_desc_t *)dringp->priv_addr + dringp->end_idx;
8211 
8212 	mutex_enter(&addr->dstate_lock);
8213 	if (addr->dstate == VIO_DESC_FREE) {
8214 		addr->dstate = VIO_DESC_READY;
8215 		*priv_p = addr;
8216 		*idx = dringp->end_idx;
8217 		dringp->end_idx = (dringp->end_idx + 1) % num;
8218 		ret = 0;
8219 
8220 	}
8221 	mutex_exit(&addr->dstate_lock);
8222 
8223 	/* ring full */
8224 	if (ret == 1) {
8225 		D2(NULL, "%s: no desp free: started at %d", __func__,
8226 			dringp->end_idx);
8227 	}
8228 
8229 	D1(NULL, "%s: exit\n", __func__);
8230 
8231 	return (ret);
8232 }
8233 
8234 /*
8235  * Map from a dring identifier to the ring itself. Returns
8236  * pointer to ring or NULL if no match found.
8237  *
8238  * Should be called with dlistrw rwlock held as reader.
8239  */
8240 static dring_info_t *
8241 vsw_ident2dring(lane_t *lane, uint64_t ident)
8242 {
8243 	dring_info_t	*dp = NULL;
8244 
8245 	if ((dp = lane->dringp) == NULL) {
8246 		return (NULL);
8247 	} else {
8248 		if (dp->ident == ident)
8249 			return (dp);
8250 
8251 		while (dp != NULL) {
8252 			if (dp->ident == ident)
8253 				break;
8254 			dp = dp->next;
8255 		}
8256 	}
8257 
8258 	return (dp);
8259 }
8260 
8261 /*
8262  * Set the default lane attributes. These are copied into
8263  * the attr msg we send to our peer. If they are not acceptable
8264  * then (currently) the handshake ends.
8265  */
8266 static void
8267 vsw_set_lane_attr(vsw_t *vswp, lane_t *lp)
8268 {
8269 	bzero(lp, sizeof (lane_t));
8270 
8271 	READ_ENTER(&vswp->if_lockrw);
8272 	ether_copy(&(vswp->if_addr), &(lp->addr));
8273 	RW_EXIT(&vswp->if_lockrw);
8274 
8275 	lp->mtu = VSW_MTU;
8276 	lp->addr_type = ADDR_TYPE_MAC;
8277 	lp->xfer_mode = VIO_DRING_MODE;
8278 	lp->ack_freq = 0;	/* for shared mode */
8279 
8280 	mutex_enter(&lp->seq_lock);
8281 	lp->seq_num = VNET_ISS;
8282 	mutex_exit(&lp->seq_lock);
8283 }
8284 
8285 /*
8286  * Verify that the attributes are acceptable.
8287  *
8288  * FUTURE: If some attributes are not acceptable, change them
8289  * our desired values.
8290  */
8291 static int
8292 vsw_check_attr(vnet_attr_msg_t *pkt, vsw_port_t *port)
8293 {
8294 	int	ret = 0;
8295 
8296 	D1(NULL, "vsw_check_attr enter\n");
8297 
8298 	/*
8299 	 * Note we currently only support in-band descriptors
8300 	 * and descriptor rings, not packet based transfer (VIO_PKT_MODE)
8301 	 */
8302 	if ((pkt->xfer_mode != VIO_DESC_MODE) &&
8303 			(pkt->xfer_mode != VIO_DRING_MODE)) {
8304 		D2(NULL, "vsw_check_attr: unknown mode %x\n",
8305 			pkt->xfer_mode);
8306 		ret = 1;
8307 	}
8308 
8309 	/* Only support MAC addresses at moment. */
8310 	if ((pkt->addr_type != ADDR_TYPE_MAC) || (pkt->addr == 0)) {
8311 		D2(NULL, "vsw_check_attr: invalid addr_type %x, "
8312 			"or address 0x%llx\n", pkt->addr_type,
8313 			pkt->addr);
8314 		ret = 1;
8315 	}
8316 
8317 	/*
8318 	 * MAC address supplied by device should match that stored
8319 	 * in the vsw-port OBP node. Need to decide what to do if they
8320 	 * don't match, for the moment just warn but don't fail.
8321 	 */
8322 	if (bcmp(&pkt->addr, &port->p_macaddr, ETHERADDRL) != 0) {
8323 		DERR(NULL, "vsw_check_attr: device supplied address "
8324 			"0x%llx doesn't match node address 0x%llx\n",
8325 			pkt->addr, port->p_macaddr);
8326 	}
8327 
8328 	/*
8329 	 * Ack freq only makes sense in pkt mode, in shared
8330 	 * mode the ring descriptors say whether or not to
8331 	 * send back an ACK.
8332 	 */
8333 	if ((pkt->xfer_mode == VIO_DRING_MODE) &&
8334 				(pkt->ack_freq > 0)) {
8335 		D2(NULL, "vsw_check_attr: non zero ack freq "
8336 			" in SHM mode\n");
8337 		ret = 1;
8338 	}
8339 
8340 	/*
8341 	 * Note: for the moment we only support ETHER
8342 	 * frames. This may change in the future.
8343 	 */
8344 	if ((pkt->mtu > VSW_MTU) || (pkt->mtu <= 0)) {
8345 		D2(NULL, "vsw_check_attr: invalid MTU (0x%llx)\n",
8346 			pkt->mtu);
8347 		ret = 1;
8348 	}
8349 
8350 	D1(NULL, "vsw_check_attr exit\n");
8351 
8352 	return (ret);
8353 }
8354 
8355 /*
8356  * Returns 1 if there is a problem, 0 otherwise.
8357  */
8358 static int
8359 vsw_check_dring_info(vio_dring_reg_msg_t *pkt)
8360 {
8361 	_NOTE(ARGUNUSED(pkt))
8362 
8363 	int	ret = 0;
8364 
8365 	D1(NULL, "vsw_check_dring_info enter\n");
8366 
8367 	if ((pkt->num_descriptors == 0) ||
8368 		(pkt->descriptor_size == 0) ||
8369 		(pkt->ncookies != 1)) {
8370 		DERR(NULL, "vsw_check_dring_info: invalid dring msg");
8371 		ret = 1;
8372 	}
8373 
8374 	D1(NULL, "vsw_check_dring_info exit\n");
8375 
8376 	return (ret);
8377 }
8378 
8379 /*
8380  * Returns 1 if two memory cookies match. Otherwise returns 0.
8381  */
8382 static int
8383 vsw_mem_cookie_match(ldc_mem_cookie_t *m1, ldc_mem_cookie_t *m2)
8384 {
8385 	if ((m1->addr != m2->addr) ||
8386 		(m2->size != m2->size)) {
8387 		return (0);
8388 	} else {
8389 		return (1);
8390 	}
8391 }
8392 
8393 /*
8394  * Returns 1 if ring described in reg message matches that
8395  * described by dring_info structure. Otherwise returns 0.
8396  */
8397 static int
8398 vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg)
8399 {
8400 	if ((msg->descriptor_size != dp->descriptor_size) ||
8401 		(msg->num_descriptors != dp->num_descriptors) ||
8402 		(msg->ncookies != dp->ncookies) ||
8403 		!(vsw_mem_cookie_match(&msg->cookie[0], &dp->cookie[0]))) {
8404 		return (0);
8405 	} else {
8406 		return (1);
8407 	}
8408 
8409 }
8410 
8411 static caddr_t
8412 vsw_print_ethaddr(uint8_t *a, char *ebuf)
8413 {
8414 	(void) sprintf(ebuf, "%x:%x:%x:%x:%x:%x",
8415 	    a[0], a[1], a[2], a[3], a[4], a[5]);
8416 	return (ebuf);
8417 }
8418 
8419 /*
8420  * Reset and free all the resources associated with
8421  * the channel.
8422  */
8423 static void
8424 vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir)
8425 {
8426 	dring_info_t		*dp, *dpp;
8427 	lane_t			*lp = NULL;
8428 	int			rv = 0;
8429 
8430 	ASSERT(ldcp != NULL);
8431 
8432 	D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id);
8433 
8434 	if (dir == INBOUND) {
8435 		D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane"
8436 			" of channel %lld", __func__, ldcp->ldc_id);
8437 		lp = &ldcp->lane_in;
8438 	} else {
8439 		D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane"
8440 			" of channel %lld", __func__, ldcp->ldc_id);
8441 		lp = &ldcp->lane_out;
8442 	}
8443 
8444 	lp->lstate = VSW_LANE_INACTIV;
8445 	mutex_enter(&lp->seq_lock);
8446 	lp->seq_num = VNET_ISS;
8447 	mutex_exit(&lp->seq_lock);
8448 	if (lp->dringp) {
8449 		if (dir == INBOUND) {
8450 			WRITE_ENTER(&lp->dlistrw);
8451 			dp = lp->dringp;
8452 			while (dp != NULL) {
8453 				dpp = dp->next;
8454 				if (dp->handle != NULL)
8455 					(void) ldc_mem_dring_unmap(dp->handle);
8456 				kmem_free(dp, sizeof (dring_info_t));
8457 				dp = dpp;
8458 			}
8459 			RW_EXIT(&lp->dlistrw);
8460 		} else {
8461 			/*
8462 			 * unbind, destroy exported dring, free dring struct
8463 			 */
8464 			WRITE_ENTER(&lp->dlistrw);
8465 			dp = lp->dringp;
8466 			rv = vsw_free_ring(dp);
8467 			RW_EXIT(&lp->dlistrw);
8468 		}
8469 		if (rv == 0) {
8470 			lp->dringp = NULL;
8471 		}
8472 	}
8473 
8474 	D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id);
8475 }
8476 
8477 /*
8478  * Free ring and all associated resources.
8479  *
8480  * Should be called with dlistrw rwlock held as writer.
8481  */
8482 static int
8483 vsw_free_ring(dring_info_t *dp)
8484 {
8485 	vsw_private_desc_t	*paddr = NULL;
8486 	dring_info_t		*dpp;
8487 	int			i, rv = 1;
8488 
8489 	while (dp != NULL) {
8490 		mutex_enter(&dp->dlock);
8491 		dpp = dp->next;
8492 		if (dp->priv_addr != NULL) {
8493 			/*
8494 			 * First unbind and free the memory handles
8495 			 * stored in each descriptor within the ring.
8496 			 */
8497 			for (i = 0; i < VSW_RING_NUM_EL; i++) {
8498 				paddr = (vsw_private_desc_t *)
8499 						dp->priv_addr + i;
8500 				if (paddr->memhandle != NULL) {
8501 					if (paddr->bound == 1) {
8502 						rv = ldc_mem_unbind_handle(
8503 							paddr->memhandle);
8504 
8505 						if (rv != 0) {
8506 							DERR(NULL, "error "
8507 							"unbinding handle for "
8508 							"ring 0x%llx at pos %d",
8509 							dp, i);
8510 							mutex_exit(&dp->dlock);
8511 							return (rv);
8512 						}
8513 						paddr->bound = 0;
8514 					}
8515 
8516 					rv = ldc_mem_free_handle(
8517 							paddr->memhandle);
8518 					if (rv != 0) {
8519 						DERR(NULL, "error freeing "
8520 							"handle for ring "
8521 							"0x%llx at pos %d",
8522 							dp, i);
8523 						mutex_exit(&dp->dlock);
8524 						return (rv);
8525 					}
8526 					paddr->memhandle = NULL;
8527 				}
8528 				mutex_destroy(&paddr->dstate_lock);
8529 			}
8530 			kmem_free(dp->priv_addr, (sizeof (vsw_private_desc_t)
8531 					* VSW_RING_NUM_EL));
8532 		}
8533 
8534 		/*
8535 		 * Now unbind and destroy the ring itself.
8536 		 */
8537 		if (dp->handle != NULL) {
8538 			(void) ldc_mem_dring_unbind(dp->handle);
8539 			(void) ldc_mem_dring_destroy(dp->handle);
8540 		}
8541 
8542 		if (dp->data_addr != NULL) {
8543 			kmem_free(dp->data_addr, dp->data_sz);
8544 		}
8545 
8546 		mutex_exit(&dp->dlock);
8547 		mutex_destroy(&dp->dlock);
8548 		mutex_destroy(&dp->restart_lock);
8549 		kmem_free(dp, sizeof (dring_info_t));
8550 
8551 		dp = dpp;
8552 	}
8553 	return (0);
8554 }
8555 
8556 /*
8557  * Debugging routines
8558  */
8559 static void
8560 display_state(void)
8561 {
8562 	vsw_t		*vswp;
8563 	vsw_port_list_t	*plist;
8564 	vsw_port_t 	*port;
8565 	vsw_ldc_list_t	*ldcl;
8566 	vsw_ldc_t 	*ldcp;
8567 
8568 	cmn_err(CE_NOTE, "***** system state *****");
8569 
8570 	for (vswp = vsw_head; vswp; vswp = vswp->next) {
8571 		plist = &vswp->plist;
8572 		READ_ENTER(&plist->lockrw);
8573 		cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n",
8574 			vswp->instance, plist->num_ports);
8575 
8576 		for (port = plist->head; port != NULL; port = port->p_next) {
8577 			ldcl = &port->p_ldclist;
8578 			cmn_err(CE_CONT, "port %d : %d ldcs attached\n",
8579 				port->p_instance, ldcl->num_ldcs);
8580 			READ_ENTER(&ldcl->lockrw);
8581 			ldcp = ldcl->head;
8582 			for (; ldcp != NULL; ldcp = ldcp->ldc_next) {
8583 				cmn_err(CE_CONT, "chan %lu : dev %d : "
8584 					"status %d : phase %u\n",
8585 					ldcp->ldc_id, ldcp->dev_class,
8586 					ldcp->ldc_status, ldcp->hphase);
8587 				cmn_err(CE_CONT, "chan %lu : lsession %lu : "
8588 					"psession %lu\n",
8589 					ldcp->ldc_id,
8590 					ldcp->local_session,
8591 					ldcp->peer_session);
8592 
8593 				cmn_err(CE_CONT, "Inbound lane:\n");
8594 				display_lane(&ldcp->lane_in);
8595 				cmn_err(CE_CONT, "Outbound lane:\n");
8596 				display_lane(&ldcp->lane_out);
8597 			}
8598 			RW_EXIT(&ldcl->lockrw);
8599 		}
8600 		RW_EXIT(&plist->lockrw);
8601 	}
8602 	cmn_err(CE_NOTE, "***** system state *****");
8603 }
8604 
8605 static void
8606 display_lane(lane_t *lp)
8607 {
8608 	dring_info_t	*drp;
8609 
8610 	cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n",
8611 		lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu);
8612 	cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n",
8613 		lp->addr_type, lp->addr, lp->xfer_mode);
8614 	cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp);
8615 
8616 	cmn_err(CE_CONT, "Dring info:\n");
8617 	for (drp = lp->dringp; drp != NULL; drp = drp->next) {
8618 		cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n",
8619 			drp->num_descriptors, drp->descriptor_size);
8620 		cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->handle);
8621 		cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n",
8622 			(uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr);
8623 		cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n",
8624 			drp->ident, drp->end_idx);
8625 		display_ring(drp);
8626 	}
8627 }
8628 
8629 static void
8630 display_ring(dring_info_t *dringp)
8631 {
8632 	uint64_t		i;
8633 	uint64_t		priv_count = 0;
8634 	uint64_t		pub_count = 0;
8635 	vnet_public_desc_t	*pub_addr = NULL;
8636 	vsw_private_desc_t	*priv_addr = NULL;
8637 
8638 	for (i = 0; i < VSW_RING_NUM_EL; i++) {
8639 		if (dringp->pub_addr != NULL) {
8640 			pub_addr = (vnet_public_desc_t *)dringp->pub_addr + i;
8641 
8642 			if (pub_addr->hdr.dstate == VIO_DESC_FREE)
8643 				pub_count++;
8644 		}
8645 
8646 		if (dringp->priv_addr != NULL) {
8647 			priv_addr =
8648 				(vsw_private_desc_t *)dringp->priv_addr + i;
8649 
8650 			if (priv_addr->dstate == VIO_DESC_FREE)
8651 				priv_count++;
8652 		}
8653 	}
8654 	cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n",
8655 			i, priv_count, pub_count);
8656 }
8657 
8658 static void
8659 dump_flags(uint64_t state)
8660 {
8661 	int	i;
8662 
8663 	typedef struct flag_name {
8664 		int	flag_val;
8665 		char	*flag_name;
8666 	} flag_name_t;
8667 
8668 	flag_name_t	flags[] = {
8669 		VSW_VER_INFO_SENT, "VSW_VER_INFO_SENT",
8670 		VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV",
8671 		VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV",
8672 		VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT",
8673 		VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV",
8674 		VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT",
8675 		VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT",
8676 		VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV",
8677 		VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT",
8678 		VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV",
8679 		VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT",
8680 		VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV",
8681 		VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT",
8682 		VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV",
8683 		VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT",
8684 		VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV",
8685 		VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT",
8686 		VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV",
8687 		VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT",
8688 		VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV",
8689 		VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT",
8690 		VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV",
8691 		VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT",
8692 		VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV",
8693 		VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT",
8694 		VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV",
8695 		VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT",
8696 		VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV",
8697 		VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT",
8698 		VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV",
8699 		VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE"};
8700 
8701 	DERR(NULL, "DUMP_FLAGS: %llx\n", state);
8702 	for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) {
8703 		if (state & flags[i].flag_val)
8704 			DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name);
8705 	}
8706 }
8707