xref: /titanic_44/usr/src/uts/sun4v/io/vsw.c (revision b9238976491622ad75a67ab0c12edf99e36212b9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/errno.h>
31 #include <sys/debug.h>
32 #include <sys/time.h>
33 #include <sys/sysmacros.h>
34 #include <sys/systm.h>
35 #include <sys/user.h>
36 #include <sys/stropts.h>
37 #include <sys/stream.h>
38 #include <sys/strlog.h>
39 #include <sys/strsubr.h>
40 #include <sys/cmn_err.h>
41 #include <sys/cpu.h>
42 #include <sys/kmem.h>
43 #include <sys/conf.h>
44 #include <sys/ddi.h>
45 #include <sys/sunddi.h>
46 #include <sys/ksynch.h>
47 #include <sys/stat.h>
48 #include <sys/kstat.h>
49 #include <sys/vtrace.h>
50 #include <sys/strsun.h>
51 #include <sys/dlpi.h>
52 #include <sys/ethernet.h>
53 #include <net/if.h>
54 #include <sys/varargs.h>
55 #include <sys/machsystm.h>
56 #include <sys/modctl.h>
57 #include <sys/modhash.h>
58 #include <sys/mac.h>
59 #include <sys/mac_ether.h>
60 #include <sys/taskq.h>
61 #include <sys/note.h>
62 #include <sys/mach_descrip.h>
64 #include <sys/mdeg.h>
65 #include <sys/ldc.h>
66 #include <sys/vsw_fdb.h>
67 #include <sys/vsw.h>
68 #include <sys/vio_mailbox.h>
69 #include <sys/vnet_mailbox.h>
70 #include <sys/vnet_common.h>
71 #include <sys/vio_util.h>
72 #include <sys/sdt.h>
73 #include <sys/atomic.h>
74 
75 /*
76  * Function prototypes.
77  */
78 static	int vsw_attach(dev_info_t *, ddi_attach_cmd_t);
79 static	int vsw_detach(dev_info_t *, ddi_detach_cmd_t);
80 static	int vsw_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
81 static	int vsw_get_md_physname(vsw_t *, md_t *, mde_cookie_t, char *);
82 static	int vsw_get_md_smodes(vsw_t *, md_t *, mde_cookie_t, uint8_t *, int *);
83 static	void vsw_setup_switching_timeout(void *arg);
84 static	void vsw_stop_switching_timeout(vsw_t *vswp);
85 static	int vsw_setup_switching(vsw_t *);
86 static	int vsw_setup_layer2(vsw_t *);
87 static	int vsw_setup_layer3(vsw_t *);
88 
89 /* MAC Ring table functions. */
90 static void vsw_mac_ring_tbl_init(vsw_t *vswp);
91 static void vsw_mac_ring_tbl_destroy(vsw_t *vswp);
92 static void vsw_queue_worker(vsw_mac_ring_t *rrp);
93 static void vsw_queue_stop(vsw_queue_t *vqp);
94 static vsw_queue_t *vsw_queue_create(void);
95 static void vsw_queue_destroy(vsw_queue_t *vqp);
96 
97 /* MAC layer routines */
98 static mac_resource_handle_t vsw_mac_ring_add_cb(void *arg,
99 		mac_resource_t *mrp);
100 static	int vsw_get_hw_maddr(vsw_t *);
101 static	int vsw_set_hw(vsw_t *, vsw_port_t *, int);
102 static	int vsw_set_hw_addr(vsw_t *, mac_multi_addr_t *);
103 static	int vsw_set_hw_promisc(vsw_t *, vsw_port_t *, int);
104 static	int vsw_unset_hw(vsw_t *, vsw_port_t *, int);
105 static	int vsw_unset_hw_addr(vsw_t *, int);
106 static	int vsw_unset_hw_promisc(vsw_t *, vsw_port_t *, int);
107 static void vsw_reconfig_hw(vsw_t *);
108 static int vsw_prog_if(vsw_t *);
109 static int vsw_prog_ports(vsw_t *);
110 static int vsw_mac_attach(vsw_t *vswp);
111 static void vsw_mac_detach(vsw_t *vswp);
112 static int vsw_mac_open(vsw_t *vswp);
113 static void vsw_mac_close(vsw_t *vswp);
114 static void vsw_set_addrs(vsw_t *vswp);
115 static void vsw_unset_addrs(vsw_t *vswp);
116 
117 static void vsw_rx_queue_cb(void *, mac_resource_handle_t, mblk_t *);
118 static void vsw_rx_cb(void *, mac_resource_handle_t, mblk_t *);
119 static mblk_t *vsw_tx_msg(vsw_t *, mblk_t *);
120 static int vsw_mac_register(vsw_t *);
121 static int vsw_mac_unregister(vsw_t *);
122 static int vsw_m_stat(void *, uint_t, uint64_t *);
123 static void vsw_m_stop(void *arg);
124 static int vsw_m_start(void *arg);
125 static int vsw_m_unicst(void *arg, const uint8_t *);
126 static int vsw_m_multicst(void *arg, boolean_t, const uint8_t *);
127 static int vsw_m_promisc(void *arg, boolean_t);
128 static mblk_t *vsw_m_tx(void *arg, mblk_t *);
129 
130 /* MDEG routines */
131 static	int vsw_mdeg_register(vsw_t *vswp);
132 static	void vsw_mdeg_unregister(vsw_t *vswp);
133 static	int vsw_mdeg_cb(void *cb_argp, mdeg_result_t *);
134 static	int vsw_port_mdeg_cb(void *cb_argp, mdeg_result_t *);
135 static	int vsw_get_initial_md_properties(vsw_t *vswp, md_t *, mde_cookie_t);
136 static	void vsw_update_md_prop(vsw_t *, md_t *, mde_cookie_t);
137 static	int vsw_read_mdprops(vsw_t *vswp);
138 
139 /* Port add/deletion routines */
140 static	int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
141 static	int vsw_port_attach(vsw_t *vswp, int p_instance,
142 	uint64_t *ldcids, int nids, struct ether_addr *macaddr);
143 static	int vsw_detach_ports(vsw_t *vswp);
144 static	int vsw_port_detach(vsw_t *vswp, int p_instance);
145 static	int vsw_port_delete(vsw_port_t *port);
146 static	int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id);
147 static	int vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id);
148 static	int vsw_init_ldcs(vsw_port_t *port);
149 static	int vsw_uninit_ldcs(vsw_port_t *port);
150 static	int vsw_ldc_init(vsw_ldc_t *ldcp);
151 static	int vsw_ldc_uninit(vsw_ldc_t *ldcp);
152 static	int vsw_drain_ldcs(vsw_port_t *port);
153 static	int vsw_drain_port_taskq(vsw_port_t *port);
154 static	void vsw_marker_task(void *);
155 static	vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);
156 static	int vsw_plist_del_node(vsw_t *, vsw_port_t *port);
157 
158 /* Interrupt routines */
159 static	uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg);
160 
161 /* Handshake routines */
162 static	void vsw_ldc_reinit(vsw_ldc_t *);
163 static	void vsw_process_conn_evt(vsw_ldc_t *, uint16_t);
164 static	void vsw_conn_task(void *);
165 static	int vsw_check_flag(vsw_ldc_t *, int, uint64_t);
166 static	void vsw_next_milestone(vsw_ldc_t *);
167 static	int vsw_supported_version(vio_ver_msg_t *);
168 
169 /* Data processing routines */
170 static void vsw_process_pkt(void *);
171 static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t);
172 static void vsw_process_ctrl_pkt(void *);
173 static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *);
174 static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *);
175 static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *);
176 static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *);
177 static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *);
178 static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *);
179 static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t);
180 static void vsw_process_data_dring_pkt(vsw_ldc_t *, void *);
181 static void vsw_process_data_raw_pkt(vsw_ldc_t *, void *);
182 static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *);
183 static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t);
184 
185 /* Switching/data transmit routines */
186 static	void vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
187 	    vsw_port_t *port, mac_resource_handle_t);
188 static	void vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
189 	    vsw_port_t *port, mac_resource_handle_t);
190 static	int vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller,
191 	    vsw_port_t *port);
192 static	int vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller,
193 	    vsw_port_t *port);
194 static	int vsw_portsend(vsw_port_t *, mblk_t *);
195 static	int vsw_dringsend(vsw_ldc_t *, mblk_t *);
196 static	int vsw_descrsend(vsw_ldc_t *, mblk_t *);
197 
198 /* Packet creation routines */
199 static void vsw_send_ver(void *);
200 static void vsw_send_attr(vsw_ldc_t *);
201 static vio_dring_reg_msg_t *vsw_create_dring_info_pkt(vsw_ldc_t *);
202 static void vsw_send_dring_info(vsw_ldc_t *);
203 static void vsw_send_rdx(vsw_ldc_t *);
204 
205 static int vsw_send_msg(vsw_ldc_t *, void *, int, boolean_t);
206 
207 /* Forwarding database (FDB) routines */
208 static	int vsw_add_fdb(vsw_t *vswp, vsw_port_t *port);
209 static	int vsw_del_fdb(vsw_t *vswp, vsw_port_t *port);
210 static	vsw_port_t *vsw_lookup_fdb(vsw_t *vswp, struct ether_header *);
211 static	int vsw_add_rem_mcst(vnet_mcast_msg_t *, vsw_port_t *);
212 static	int vsw_add_mcst(vsw_t *, uint8_t, uint64_t, void *);
213 static	int vsw_del_mcst(vsw_t *, uint8_t, uint64_t, void *);
214 static	mcst_addr_t *vsw_del_addr(uint8_t, void *, uint64_t);
215 static	void vsw_del_mcst_port(vsw_port_t *);
216 static	void vsw_del_mcst_vsw(vsw_t *);
217 
218 /* Dring routines */
219 static dring_info_t *vsw_create_dring(vsw_ldc_t *);
220 static void vsw_create_privring(vsw_ldc_t *);
221 static int vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp);
222 static int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **,
223     int *);
224 static dring_info_t *vsw_ident2dring(lane_t *, uint64_t);
225 
226 static void vsw_set_lane_attr(vsw_t *, lane_t *);
227 static int vsw_check_attr(vnet_attr_msg_t *, vsw_port_t *);
228 static int vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg);
229 static int vsw_mem_cookie_match(ldc_mem_cookie_t *, ldc_mem_cookie_t *);
230 static int vsw_check_dring_info(vio_dring_reg_msg_t *);
231 
232 /* Misc support routines */
233 static	caddr_t vsw_print_ethaddr(uint8_t *addr, char *ebuf);
234 static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t);
235 static int vsw_free_ring(dring_info_t *);
236 static void vsw_save_lmacaddr(vsw_t *vswp, uint64_t macaddr);
237 
238 /* Debugging routines */
239 static void dump_flags(uint64_t);
240 static void display_state(void);
241 static void display_lane(lane_t *);
242 static void display_ring(dring_info_t *);
243 
244 int	vsw_num_handshakes = VNET_NUM_HANDSHAKES; /* # of handshake attempts */
245 int	vsw_wretries = 100;		/* # of write attempts */
246 int	vsw_chain_len = 150;		/* max # of mblks in msg chain */
247 int	vsw_desc_delay = 0;		/* delay in us */
248 int	vsw_read_attempts = 5;		/* # of reads of descriptor */
249 int	vsw_mac_open_retries = 20;	/* max # of mac_open() retries */
250 int	vsw_setup_switching_delay = 3;	/* setup sw timeout interval in sec */
251 
252 uint32_t	vsw_mblk_size = VSW_MBLK_SIZE;
253 uint32_t	vsw_num_mblks = VSW_NUM_MBLKS;
254 
255 static	mac_callbacks_t	vsw_m_callbacks = {
256 	0,			/* mc_callbacks: no optional callbacks */
257 	vsw_m_stat,		/* mc_getstat */
258 	vsw_m_start,		/* mc_start */
259 	vsw_m_stop,		/* mc_stop */
260 	vsw_m_promisc,		/* mc_setpromisc */
261 	vsw_m_multicst,		/* mc_multicst */
262 	vsw_m_unicst,		/* mc_unicst */
263 	vsw_m_tx,		/* mc_tx */
264 	NULL,			/* mc_resources */
265 	NULL,			/* mc_ioctl */
266 	NULL			/* mc_getcapab */
267 };
268 
269 static	struct	cb_ops	vsw_cb_ops = {
270 	nulldev,			/* cb_open */
271 	nulldev,			/* cb_close */
272 	nodev,				/* cb_strategy */
273 	nodev,				/* cb_print */
274 	nodev,				/* cb_dump */
275 	nodev,				/* cb_read */
276 	nodev,				/* cb_write */
277 	nodev,				/* cb_ioctl */
278 	nodev,				/* cb_devmap */
279 	nodev,				/* cb_mmap */
280 	nodev,				/* cb_segmap */
281 	nochpoll,			/* cb_chpoll */
282 	ddi_prop_op,			/* cb_prop_op */
283 	NULL,				/* cb_stream */
284 	D_MP,				/* cb_flag */
285 	CB_REV,				/* rev */
286 	nodev,				/* int (*cb_aread)() */
287 	nodev				/* int (*cb_awrite)() */
288 };
289 
290 static	struct	dev_ops	vsw_ops = {
291 	DEVO_REV,		/* devo_rev */
292 	0,			/* devo_refcnt */
293 	vsw_getinfo,		/* devo_getinfo */
294 	nulldev,		/* devo_identify */
295 	nulldev,		/* devo_probe */
296 	vsw_attach,		/* devo_attach */
297 	vsw_detach,		/* devo_detach */
298 	nodev,			/* devo_reset */
299 	&vsw_cb_ops,		/* devo_cb_ops */
300 	(struct bus_ops *)NULL,	/* devo_bus_ops */
301 	ddi_power		/* devo_power */
302 };
303 
304 extern	struct	mod_ops	mod_driverops;
305 static struct modldrv vswmodldrv = {
306 	&mod_driverops,
307 	"sun4v Virtual Switch",
308 	&vsw_ops,
309 };
310 
311 #define	LDC_ENTER_LOCK(ldcp)	\
312 				mutex_enter(&((ldcp)->ldc_cblock));\
313 				mutex_enter(&((ldcp)->ldc_txlock));
314 #define	LDC_EXIT_LOCK(ldcp)	\
315 				mutex_exit(&((ldcp)->ldc_txlock));\
316 				mutex_exit(&((ldcp)->ldc_cblock));
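/*
 * Illustrative usage (not from the original source): the macros fix
 * the lock ordering ldc_cblock -> ldc_txlock, so a caller serialising
 * both the callback and transmit paths does, e.g.:
 *
 *	LDC_ENTER_LOCK(ldcp);
 *	... update channel state ...
 *	LDC_EXIT_LOCK(ldcp);
 */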
317 
318 /* Driver soft state ptr  */
319 static void	*vsw_state;
320 
321 /*
322  * Linked list of "vsw_t" structures - one per instance.
323  */
324 vsw_t		*vsw_head = NULL;
325 krwlock_t	vsw_rw;
326 
327 /*
328  * Property names
329  */
330 static char vdev_propname[] = "virtual-device";
331 static char vsw_propname[] = "virtual-network-switch";
332 static char physdev_propname[] = "vsw-phys-dev";
333 static char smode_propname[] = "vsw-switch-mode";
334 static char macaddr_propname[] = "local-mac-address";
335 static char remaddr_propname[] = "remote-mac-address";
336 static char ldcids_propname[] = "ldc-ids";
337 static char chan_propname[] = "channel-endpoint";
338 static char id_propname[] = "id";
339 static char reg_propname[] = "reg";
340 
341 /* supported versions */
342 static	ver_sup_t	vsw_versions[] = { {1, 0} };
343 
344 /*
345  * Matching criteria passed to the MDEG to register interest
346  * in changes to 'virtual-device-port' nodes identified by their
347  * 'id' property.
348  */
349 static md_prop_match_t vport_prop_match[] = {
350 	{ MDET_PROP_VAL,    "id"   },
351 	{ MDET_LIST_END,    NULL    }
352 };
353 
354 static mdeg_node_match_t vport_match = { "virtual-device-port",
355 						vport_prop_match };
356 
357 /*
358  * Matching criteria passed to the MDEG to register interest
359  * in changes to 'virtual-device' nodes (i.e. vsw nodes) identified
360  * by their 'name' and 'cfg-handle' properties.
361  */
362 static md_prop_match_t vdev_prop_match[] = {
363 	{ MDET_PROP_STR,    "name"   },
364 	{ MDET_PROP_VAL,    "cfg-handle" },
365 	{ MDET_LIST_END,    NULL    }
366 };
367 
368 static mdeg_node_match_t vdev_match = { "virtual-device",
369 						vdev_prop_match };
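/*
 * For example (node layout illustrative), an MD node such as
 *
 *	virtual-device
 *		name = "virtual-network-switch"
 *		cfg-handle = 0x0
 *
 * satisfies vdev_match above, while its 'virtual-device-port' nodes
 * carrying an 'id' property satisfy vport_match.
 */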
370 
371 
372 /*
373  * Specification of an MD node passed to the MDEG to filter any
374  * 'vport' nodes that do not belong to the specified node. This
375  * template is copied for each vsw instance and filled in with
376  * the appropriate 'cfg-handle' value before being passed to the MDEG.
377  */
378 static mdeg_prop_spec_t vsw_prop_template[] = {
379 	{ MDET_PROP_STR,    "name",		vsw_propname },
380 	{ MDET_PROP_VAL,    "cfg-handle",	NULL	},
381 	{ MDET_LIST_END,    NULL,		NULL	}
382 };
383 
384 #define	VSW_SET_MDEG_PROP_INST(specp, val)	(specp)[1].ps_val = (val);
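/*
 * Sketch of the intended use of the template and macro (variable
 * names here are hypothetical, not from this file): a per-instance
 * copy of the template is made and its 'cfg-handle' slot filled in
 * before registering with the MDEG.
 *
 *	mdeg_prop_spec_t	*specp;
 *
 *	specp = kmem_alloc(sizeof (vsw_prop_template), KM_SLEEP);
 *	bcopy(vsw_prop_template, specp, sizeof (vsw_prop_template));
 *	VSW_SET_MDEG_PROP_INST(specp, cfg_handle);
 */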
385 
386 /*
387  * From /etc/system enable/disable thread per ring. This is a mode
388  * selection that is done at vsw driver attach time.
389  */
390 boolean_t vsw_multi_ring_enable = B_FALSE;
391 int vsw_mac_rx_rings = VSW_MAC_RX_RINGS;
392 
393 /*
394  * Print debug messages - set to 0x1f to enable all msgs
395  * or 0x0 to turn all off.
396  */
397 int vswdbg = 0x0;
398 
399 /*
400  * debug levels:
401  * 0x01:	Function entry/exit tracing
402  * 0x02:	Internal function messages
403  * 0x04:	Verbose internal messages
404  * 0x08:	Warning messages
405  * 0x10:	Error messages
406  */
407 
408 static void
409 vswdebug(vsw_t *vswp, const char *fmt, ...)
410 {
411 	char buf[512];
412 	va_list ap;
413 
414 	va_start(ap, fmt);
415 	(void) vsnprintf(buf, sizeof (buf), fmt, ap);
416 	va_end(ap);
417 
418 	if (vswp == NULL)
419 		cmn_err(CE_CONT, "%s\n", buf);
420 	else
421 		cmn_err(CE_CONT, "vsw%d: %s\n", vswp->instance, buf);
422 }
423 
424 /*
425  * For the moment the state dump routines have their own
426  * private flag.
427  */
428 #define	DUMP_STATE	0
429 
430 #if DUMP_STATE
431 
432 #define	DUMP_TAG(tag) \
433 {			\
434 	D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype); \
435 	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype);	\
436 	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env);	\
437 }
438 
439 #define	DUMP_TAG_PTR(tag) \
440 {			\
441 	D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype); \
442 	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype);	\
443 	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env);	\
444 }
445 
446 #define	DUMP_FLAGS(flags) dump_flags(flags);
447 #define	DISPLAY_STATE()	display_state()
448 
449 #else
450 
451 #define	DUMP_TAG(tag)
452 #define	DUMP_TAG_PTR(tag)
453 #define	DUMP_FLAGS(state)
454 #define	DISPLAY_STATE()
455 
456 #endif	/* DUMP_STATE */
457 
458 #ifdef DEBUG
459 
460 #define	D1		\
461 if (vswdbg & 0x01)	\
462 	vswdebug
463 
464 #define	D2		\
465 if (vswdbg & 0x02)	\
466 	vswdebug
467 
468 #define	D3		\
469 if (vswdbg & 0x04)	\
470 	vswdebug
471 
472 #define	DWARN		\
473 if (vswdbg & 0x08)	\
474 	vswdebug
475 
476 #define	DERR		\
477 if (vswdbg & 0x10)	\
478 	vswdebug
479 
480 #else
481 
482 #define	DERR		if (0)	vswdebug
483 #define	DWARN		if (0)	vswdebug
484 #define	D1		if (0)	vswdebug
485 #define	D2		if (0)	vswdebug
486 #define	D3		if (0)	vswdebug
487 
488 #endif	/* DEBUG */
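/*
 * Example (assuming a DEBUG build): setting vswdbg to 0x11 via
 * /etc/system enables only the D1 entry/exit traces (0x01) and DERR
 * error messages (0x10); D2, D3 and DWARN calls are filtered out at
 * runtime.
 */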
489 
490 static struct modlinkage modlinkage = {
491 	MODREV_1,
492 	&vswmodldrv,
493 	NULL
494 };
495 
496 int
497 _init(void)
498 {
499 	int status;
500 
501 	rw_init(&vsw_rw, NULL, RW_DRIVER, NULL);
502 
503 	status = ddi_soft_state_init(&vsw_state, sizeof (vsw_t), 1);
504 	if (status != 0) {
505 		return (status);
506 	}
507 
508 	mac_init_ops(&vsw_ops, "vsw");
509 	status = mod_install(&modlinkage);
510 	if (status != 0) {
511 		ddi_soft_state_fini(&vsw_state);
512 	}
513 	return (status);
514 }
515 
516 int
517 _fini(void)
518 {
519 	int status;
520 
521 	status = mod_remove(&modlinkage);
522 	if (status != 0)
523 		return (status);
524 	mac_fini_ops(&vsw_ops);
525 	ddi_soft_state_fini(&vsw_state);
526 
527 	rw_destroy(&vsw_rw);
528 
529 	return (status);
530 }
531 
532 int
533 _info(struct modinfo *modinfop)
534 {
535 	return (mod_info(&modlinkage, modinfop));
536 }
537 
538 static int
539 vsw_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
540 {
541 	vsw_t		*vswp;
542 	int		instance;
543 	char		hashname[MAXNAMELEN];
544 	char		qname[TASKQ_NAMELEN];
545 	enum		{ PROG_init = 0x00,
546 				PROG_locks = 0x01,
547 				PROG_readmd = 0x02,
548 				PROG_fdb = 0x04,
549 				PROG_mfdb = 0x08,
550 				PROG_taskq = 0x10,
551 				PROG_swmode = 0x20,
552 				PROG_macreg = 0x40,
553 				PROG_mdreg = 0x80}
554 			progress;
555 	int		rv;
556 
557 	progress = PROG_init;
558 
559 	switch (cmd) {
560 	case DDI_ATTACH:
561 		break;
562 	case DDI_RESUME:
563 		/* nothing to do for this non-device */
564 		return (DDI_SUCCESS);
565 	case DDI_PM_RESUME:
566 	default:
567 		return (DDI_FAILURE);
568 	}
569 
570 	instance = ddi_get_instance(dip);
571 	if (ddi_soft_state_zalloc(vsw_state, instance) != DDI_SUCCESS) {
572 		DERR(NULL, "vsw%d: ddi_soft_state_zalloc failed", instance);
573 		return (DDI_FAILURE);
574 	}
575 	vswp = ddi_get_soft_state(vsw_state, instance);
576 
577 	if (vswp == NULL) {
578 		DERR(NULL, "vsw%d: ddi_get_soft_state failed", instance);
579 		goto vsw_attach_fail;
580 	}
581 
582 	vswp->dip = dip;
583 	vswp->instance = instance;
584 	ddi_set_driver_private(dip, (caddr_t)vswp);
585 
586 	mutex_init(&vswp->hw_lock, NULL, MUTEX_DRIVER, NULL);
587 	mutex_init(&vswp->mac_lock, NULL, MUTEX_DRIVER, NULL);
588 	mutex_init(&vswp->mca_lock, NULL, MUTEX_DRIVER, NULL);
589 	mutex_init(&vswp->swtmout_lock, NULL, MUTEX_DRIVER, NULL);
590 	rw_init(&vswp->if_lockrw, NULL, RW_DRIVER, NULL);
591 	rw_init(&vswp->mfdbrw, NULL, RW_DRIVER, NULL);
592 	rw_init(&vswp->plist.lockrw, NULL, RW_DRIVER, NULL);
593 
594 	progress |= PROG_locks;
595 
596 	rv = vsw_read_mdprops(vswp);
597 	if (rv != 0)
598 		goto vsw_attach_fail;
599 
600 	progress |= PROG_readmd;
601 
602 	/* setup the unicast forwarding database */
603 	(void) snprintf(hashname, MAXNAMELEN, "vsw_unicst_table-%d",
604 	    vswp->instance);
605 	D2(vswp, "creating unicast hash table (%s)...", hashname);
606 	vswp->fdb = mod_hash_create_ptrhash(hashname, VSW_NCHAINS,
607 	    mod_hash_null_valdtor, sizeof (void *));
608 
609 	progress |= PROG_fdb;
610 
611 	/* setup the multicast forwarding database */
612 	(void) snprintf(hashname, MAXNAMELEN, "vsw_mcst_table-%d",
613 	    vswp->instance);
614 	D2(vswp, "creating multicast hash table (%s)...", hashname);
615 	vswp->mfdb = mod_hash_create_ptrhash(hashname, VSW_NCHAINS,
616 	    mod_hash_null_valdtor, sizeof (void *));
617 
618 	progress |= PROG_mfdb;
619 
620 	/*
621 	 * Create the taskq which will process all the VIO
622 	 * control messages.
623 	 */
624 	(void) snprintf(qname, TASKQ_NAMELEN, "vsw_taskq%d", vswp->instance);
625 	if ((vswp->taskq_p = ddi_taskq_create(vswp->dip, qname, 1,
626 	    TASKQ_DEFAULTPRI, 0)) == NULL) {
627 		cmn_err(CE_WARN, "!vsw%d: Unable to create task queue",
628 		    vswp->instance);
629 		goto vsw_attach_fail;
630 	}
631 
632 	progress |= PROG_taskq;
633 
634 	/* prevent auto-detaching */
635 	if (ddi_prop_update_int(DDI_DEV_T_NONE, vswp->dip,
636 	    DDI_NO_AUTODETACH, 1) != DDI_SUCCESS) {
637 		cmn_err(CE_NOTE, "!Unable to set \"%s\" property for "
638 		    "instance %u", DDI_NO_AUTODETACH, instance);
639 	}
640 
641 	/*
642 	 * Setup the required switching mode,
643 	 * based on the mdprops that we read earlier.
644 	 */
645 	rv = vsw_setup_switching(vswp);
646 	if (rv == EAGAIN) {
647 		/*
648 		 * Unable to setup switching mode;
649 		 * as the error is EAGAIN, schedule a timeout to retry.
650 		 */
651 		mutex_enter(&vswp->swtmout_lock);
652 
653 		vswp->swtmout_enabled = B_TRUE;
654 		vswp->swtmout_id =
655 		    timeout(vsw_setup_switching_timeout, vswp,
656 		    (vsw_setup_switching_delay * drv_usectohz(MICROSEC)));
657 
658 		mutex_exit(&vswp->swtmout_lock);
659 	} else if (rv != 0) {
660 		goto vsw_attach_fail;
661 	}
662 
663 	progress |= PROG_swmode;
664 
665 	/* Register with mac layer as a provider */
666 	rv = vsw_mac_register(vswp);
667 	if (rv != 0)
668 		goto vsw_attach_fail;
669 
670 	progress |= PROG_macreg;
671 
672 	/*
673 	 * Now that we have everything set up, register an interest in
674 	 * specific MD nodes.
675 	 *
676 	 * The callback is invoked in 2 cases: firstly, if upon mdeg
677 	 * registration there are existing nodes which match our specified
678 	 * criteria; and secondly, if the MD is changed and, again, there
679 	 * are nodes present within it which we are interested in. Note
680 	 * that our callback will be invoked even if our specified nodes
681 	 * have not actually changed.
682 	 *
683 	 */
684 	rv = vsw_mdeg_register(vswp);
685 	if (rv != 0)
686 		goto vsw_attach_fail;
687 
688 	progress |= PROG_mdreg;
689 
690 	WRITE_ENTER(&vsw_rw);
691 	vswp->next = vsw_head;
692 	vsw_head = vswp;
693 	RW_EXIT(&vsw_rw);
694 
695 	ddi_report_dev(vswp->dip);
696 	return (DDI_SUCCESS);
697 
698 vsw_attach_fail:
699 	DERR(NULL, "vsw_attach: failed");
700 
701 	if (progress & PROG_mdreg) {
702 		vsw_mdeg_unregister(vswp);
703 		(void) vsw_detach_ports(vswp);
704 	}
705 
706 	if (progress & PROG_macreg)
707 		(void) vsw_mac_unregister(vswp);
708 
709 	if (progress & PROG_swmode) {
710 		vsw_stop_switching_timeout(vswp);
711 		mutex_enter(&vswp->mac_lock);
712 		vsw_mac_detach(vswp);
713 		vsw_mac_close(vswp);
714 		mutex_exit(&vswp->mac_lock);
715 	}
716 
717 	if (progress & PROG_taskq)
718 		ddi_taskq_destroy(vswp->taskq_p);
719 
720 	if (progress & PROG_mfdb)
721 		mod_hash_destroy_hash(vswp->mfdb);
722 
723 	if (progress & PROG_fdb)
724 		mod_hash_destroy_hash(vswp->fdb);
725 
726 	if (progress & PROG_locks) {
727 		rw_destroy(&vswp->plist.lockrw);
728 		rw_destroy(&vswp->mfdbrw);
729 		rw_destroy(&vswp->if_lockrw);
730 		mutex_destroy(&vswp->swtmout_lock);
731 		mutex_destroy(&vswp->mca_lock);
732 		mutex_destroy(&vswp->mac_lock);
733 		mutex_destroy(&vswp->hw_lock);
734 	}
735 
736 	ddi_soft_state_free(vsw_state, instance);
737 	return (DDI_FAILURE);
738 }
739 
740 static int
741 vsw_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
742 {
743 	vio_mblk_pool_t		*poolp, *npoolp;
744 	vsw_t			**vswpp, *vswp;
745 	int 			instance;
746 
747 	instance = ddi_get_instance(dip);
748 	vswp = ddi_get_soft_state(vsw_state, instance);
749 
750 	if (vswp == NULL) {
751 		return (DDI_FAILURE);
752 	}
753 
754 	switch (cmd) {
755 	case DDI_DETACH:
756 		break;
757 	case DDI_SUSPEND:
758 	case DDI_PM_SUSPEND:
759 	default:
760 		return (DDI_FAILURE);
761 	}
762 
763 	D2(vswp, "detaching instance %d", instance);
764 
765 	/* Stop any pending timeout to setup switching mode. */
766 	vsw_stop_switching_timeout(vswp);
767 
768 	if (vswp->if_state & VSW_IF_REG) {
769 		if (vsw_mac_unregister(vswp) != 0) {
770 			cmn_err(CE_WARN, "!vsw%d: Unable to detach from "
771 			    "MAC layer", vswp->instance);
772 			return (DDI_FAILURE);
773 		}
774 	}
775 
776 	vsw_mdeg_unregister(vswp);
777 
778 	/* remove mac layer callback */
779 	mutex_enter(&vswp->mac_lock);
780 	if ((vswp->mh != NULL) && (vswp->mrh != NULL)) {
781 		mac_rx_remove(vswp->mh, vswp->mrh, B_TRUE);
782 		vswp->mrh = NULL;
783 	}
784 	mutex_exit(&vswp->mac_lock);
785 
786 	if (vsw_detach_ports(vswp) != 0) {
787 		cmn_err(CE_WARN, "!vsw%d: Unable to detach ports",
788 		    vswp->instance);
789 		return (DDI_FAILURE);
790 	}
791 
792 	rw_destroy(&vswp->if_lockrw);
793 
794 	mutex_destroy(&vswp->hw_lock);
795 
796 	/*
797 	 * Now that the ports have been deleted, stop and close
798 	 * the physical device.
799 	 */
800 	mutex_enter(&vswp->mac_lock);
801 
802 	vsw_mac_detach(vswp);
803 	vsw_mac_close(vswp);
804 
805 	mutex_exit(&vswp->mac_lock);
806 
807 	mutex_destroy(&vswp->mac_lock);
808 	mutex_destroy(&vswp->swtmout_lock);
809 
810 	/*
811 	 * Destroy any free pools that may still exist.
812 	 */
813 	poolp = vswp->rxh;
814 	while (poolp != NULL) {
815 		npoolp = vswp->rxh = poolp->nextp;
816 		if (vio_destroy_mblks(poolp) != 0) {
817 			vswp->rxh = poolp;
818 			return (DDI_FAILURE);
819 		}
820 		poolp = npoolp;
821 	}
822 
823 	/*
824 	 * Remove this instance from any entries it may be on in
825 	 * the hash table by using the list of addresses maintained
826 	 * in the vsw_t structure.
827 	 */
828 	vsw_del_mcst_vsw(vswp);
829 
830 	vswp->mcap = NULL;
831 	mutex_destroy(&vswp->mca_lock);
832 
833 	/*
834 	 * By now any pending tasks have finished and the underlying
835 	 * ldc's have been destroyed, so it's safe to delete the control
836 	 * message taskq.
837 	 */
838 	if (vswp->taskq_p != NULL)
839 		ddi_taskq_destroy(vswp->taskq_p);
840 
841 	/*
842 	 * At this stage all the data pointers in the hash table
843 	 * should be NULL, as all the ports have been removed and will
844 	 * have deleted themselves from the port lists which the data
845 	 * pointers point to. Hence we can destroy the table using the
846 	 * default destructors.
847 	 */
848 	D2(vswp, "vsw_detach: destroying hash tables..");
849 	mod_hash_destroy_hash(vswp->fdb);
850 	vswp->fdb = NULL;
851 
852 	WRITE_ENTER(&vswp->mfdbrw);
853 	mod_hash_destroy_hash(vswp->mfdb);
854 	vswp->mfdb = NULL;
855 	RW_EXIT(&vswp->mfdbrw);
856 	rw_destroy(&vswp->mfdbrw);
857 
858 	ddi_remove_minor_node(dip, NULL);
859 
860 	rw_destroy(&vswp->plist.lockrw);
861 	WRITE_ENTER(&vsw_rw);
862 	for (vswpp = &vsw_head; *vswpp; vswpp = &(*vswpp)->next) {
863 		if (*vswpp == vswp) {
864 			*vswpp = vswp->next;
865 			break;
866 		}
867 	}
868 	RW_EXIT(&vsw_rw);
869 	ddi_soft_state_free(vsw_state, instance);
870 
871 	return (DDI_SUCCESS);
872 }
873 
874 static int
875 vsw_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
876 {
877 	_NOTE(ARGUNUSED(dip))
878 
879 	vsw_t	*vswp = NULL;
880 	dev_t	dev = (dev_t)arg;
881 	int	instance;
882 
883 	instance = getminor(dev);
884 
885 	switch (infocmd) {
886 	case DDI_INFO_DEVT2DEVINFO:
887 		if ((vswp = ddi_get_soft_state(vsw_state, instance)) == NULL) {
888 			*result = NULL;
889 			return (DDI_FAILURE);
890 		}
891 		*result = vswp->dip;
892 		return (DDI_SUCCESS);
893 
894 	case DDI_INFO_DEVT2INSTANCE:
895 		*result = (void *)(uintptr_t)instance;
896 		return (DDI_SUCCESS);
897 
898 	default:
899 		*result = NULL;
900 		return (DDI_FAILURE);
901 	}
902 }
903 
904 /*
905  * Get the value of the "vsw-phys-dev" property in the specified
906  * node. This property is the name of the physical device that
907  * the virtual switch will use to talk to the outside world.
908  *
909  * Note it is valid for this property to be NULL (but the property
910  * itself must exist). Callers of this routine should verify that
911  * the value returned is what they expected (i.e. either NULL or non-NULL).
912  *
913  * On success, copies the value of the property into the region pointed
914  * to by the 'name' argument and returns 0. Otherwise returns 1.
915  */
916 static int
917 vsw_get_md_physname(vsw_t *vswp, md_t *mdp, mde_cookie_t node, char *name)
918 {
919 	int	len = 0;
920 	char	*physname = NULL;
921 	char	*dev;
922 
923 	if (md_get_prop_data(mdp, node, physdev_propname,
924 	    (uint8_t **)(&physname), &len) != 0) {
925 		cmn_err(CE_WARN, "!vsw%d: Unable to get name(s) of physical "
926 		    "device(s) from MD", vswp->instance);
927 		return (1);
928 	} else if ((strlen(physname) + 1) > LIFNAMSIZ) {
929 		cmn_err(CE_WARN, "!vsw%d: %s is too long a device name",
930 		    vswp->instance, physname);
931 		return (1);
932 	} else {
933 		(void) strncpy(name, physname, strlen(physname) + 1);
934 		D2(vswp, "%s: using first device specified (%s)",
935 		    __func__, physname);
936 	}
937 
938 #ifdef DEBUG
939 	/*
940 	 * As a temporary measure to aid testing we check to see if there
941 	 * is a vsw.conf file present. If there is we use the value of the
942 	 * vsw_physname property in the file as the name of the physical
943 	 * device, overriding the value from the MD.
944 	 *
945 	 * There may be multiple devices listed, but for the moment
946 	 * we just use the first one.
947 	 */
948 	if (ddi_prop_lookup_string(DDI_DEV_T_ANY, vswp->dip, 0,
949 	    "vsw_physname", &dev) == DDI_PROP_SUCCESS) {
950 		if ((strlen(dev) + 1) > LIFNAMSIZ) {
951 			cmn_err(CE_WARN, "vsw%d: %s is too long a device name",
952 			    vswp->instance, dev);
953 			ddi_prop_free(dev);
954 			return (1);
955 		} else {
956 			cmn_err(CE_NOTE, "vsw%d: Using device name (%s) from "
957 			    "config file", vswp->instance, dev);
958 
959 			(void) strncpy(name, dev, strlen(dev) + 1);
960 		}
961 
962 		ddi_prop_free(dev);
963 	}
964 #endif
965 
966 	return (0);
967 }
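/*
 * For example (device name hypothetical), a DEBUG build with a
 * vsw.conf containing
 *
 *	vsw_physname="e1000g0";
 *
 * uses e1000g0 as the physical device regardless of what the MD
 * specifies.
 */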
968 
969 /*
970  * Read the 'vsw-switch-mode' property from the specified MD node.
971  *
972  * Returns 0 on success and the number of modes found in 'found',
973  * otherwise returns 1.
974  */
975 static int
976 vsw_get_md_smodes(vsw_t *vswp, md_t *mdp, mde_cookie_t node,
977 						uint8_t *modes, int *found)
978 {
979 	int		len = 0;
980 	int		smode_num = 0;
981 	char		*smode = NULL;
982 	char		*curr_mode = NULL;
983 
984 	D1(vswp, "%s: enter", __func__);
985 
986 	/*
987 	 * Get the switch-mode property. The modes are listed in
988 	 * decreasing order of preference, i.e. the preferred mode is
989 	 * the first item in the list.
990 	 */
991 	len = 0;
992 	smode_num = 0;
993 	if (md_get_prop_data(mdp, node, smode_propname,
994 	    (uint8_t **)(&smode), &len) != 0) {
995 		/*
996 		 * Unable to get switch-mode property from MD, nothing
997 		 * more we can do.
998 		 */
999 		cmn_err(CE_WARN, "!vsw%d: Unable to get switch mode property"
1000 		    " from the MD", vswp->instance);
1001 		*found = 0;
1002 		return (1);
1003 	}
1004 
1005 	curr_mode = smode;
1006 	/*
1007 	 * Modes of operation:
1008 	 * 'switched'	 - layer 2 switching, underlying HW in
1009 	 *			programmed mode.
1010 	 * 'promiscuous' - layer 2 switching, underlying HW in
1011 	 *			promiscuous mode.
1012 	 * 'routed'	 - layer 3 (i.e. IP) routing, underlying HW
1013 	 *			in non-promiscuous mode.
1014 	 */
1015 	while ((curr_mode < (smode + len)) && (smode_num < NUM_SMODES)) {
1016 		D2(vswp, "%s: curr_mode = [%s]", __func__, curr_mode);
1017 		if (strcmp(curr_mode, "switched") == 0) {
1018 			modes[smode_num++] = VSW_LAYER2;
1019 		} else if (strcmp(curr_mode, "promiscuous") == 0) {
1020 			modes[smode_num++] = VSW_LAYER2_PROMISC;
1021 		} else if (strcmp(curr_mode, "routed") == 0) {
1022 			modes[smode_num++] = VSW_LAYER3;
1023 		} else {
1024 			cmn_err(CE_WARN, "!vsw%d: Unknown switch mode %s, "
1025 			    "setting to default switched mode",
1026 			    vswp->instance, curr_mode);
1027 			modes[smode_num++] = VSW_LAYER2;
1028 		}
1029 		curr_mode += strlen(curr_mode) + 1;
1030 	}
1031 	*found = smode_num;
1032 
1033 	D2(vswp, "%s: %d modes found", __func__, smode_num);
1034 
1035 	D1(vswp, "%s: exit", __func__);
1036 
1037 	return (0);
1038 }
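/*
 * For example, a 'vsw-switch-mode' property holding the string list
 * "switched\0promiscuous\0routed\0" (len == 28) yields
 * modes[] = { VSW_LAYER2, VSW_LAYER2_PROMISC, VSW_LAYER3 } and
 * *found == 3.
 */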
1039 
1040 /*
1041  * Check to see if the card supports the setting of multiple unicast
1042  * addresses.
1043  *
1044  * Returns 0 if card supports the programming of multiple unicast addresses,
1045  * otherwise returns 1.
1046  */
1047 static int
1048 vsw_get_hw_maddr(vsw_t *vswp)
1049 {
1050 	D1(vswp, "%s: enter", __func__);
1051 
1052 	ASSERT(MUTEX_HELD(&vswp->mac_lock));
1053 
1054 	if (vswp->mh == NULL)
1055 		return (1);
1056 
1057 	if (!mac_capab_get(vswp->mh, MAC_CAPAB_MULTIADDRESS, &vswp->maddr)) {
1058 		cmn_err(CE_WARN, "!vsw%d: device (%s) does not support "
1059 		    "setting multiple unicast addresses", vswp->instance,
1060 		    vswp->physname);
1061 		return (1);
1062 	}
1063 
1064 	D2(vswp, "%s: %d addrs : %d free", __func__,
1065 	    vswp->maddr.maddr_naddr, vswp->maddr.maddr_naddrfree);
1066 
1067 	D1(vswp, "%s: exit", __func__);
1068 
1069 	return (0);
1070 }
1071 
1072 /*
1073  * Program unicast and multicast addresses of vsw interface and the ports
1074  * into the physical device.
1075  */
1076 static void
1077 vsw_set_addrs(vsw_t *vswp)
1078 {
1079 	vsw_port_list_t	*plist = &vswp->plist;
1080 	vsw_port_t	*port;
1081 	mcst_addr_t	*mcap;
1082 	int		rv;
1083 
1084 	READ_ENTER(&vswp->if_lockrw);
1085 
1086 	if (vswp->if_state & VSW_IF_UP) {
1087 
1088 		/* program unicst addr of vsw interface in the physdev */
1089 		if (vswp->addr_set == VSW_ADDR_UNSET) {
1090 			mutex_enter(&vswp->hw_lock);
1091 			rv = vsw_set_hw(vswp, NULL, VSW_LOCALDEV);
1092 			mutex_exit(&vswp->hw_lock);
1093 			if (rv != 0) {
1094 				cmn_err(CE_NOTE,
1095 				    "!vsw%d: failed to program interface "
1096 				    "unicast address\n", vswp->instance);
1097 			}
1098 			/*
1099 			 * Notify the MAC layer of the changed address.
1100 			 */
1101 			mac_unicst_update(vswp->if_mh,
1102 			    (uint8_t *)&vswp->if_addr);
1103 		}
1104 
1105 		/* program mcast addrs of vsw interface in the physdev */
1106 		mutex_enter(&vswp->mca_lock);
1107 		mutex_enter(&vswp->mac_lock);
1108 		for (mcap = vswp->mcap; mcap != NULL; mcap = mcap->nextp) {
1109 			if (mcap->mac_added)
1110 				continue;
1111 			rv = mac_multicst_add(vswp->mh, (uchar_t *)&mcap->mca);
1112 			if (rv == 0) {
1113 				mcap->mac_added = B_TRUE;
1114 			} else {
1115 				cmn_err(CE_WARN, "!vsw%d: unable to add "
1116 				    "multicast address: %s\n", vswp->instance,
1117 				    ether_sprintf((void *)&mcap->mca));
1118 			}
1119 		}
1120 		mutex_exit(&vswp->mac_lock);
1121 		mutex_exit(&vswp->mca_lock);
1122 
1123 	}
1124 
1125 	RW_EXIT(&vswp->if_lockrw);
1126 
1127 	WRITE_ENTER(&plist->lockrw);
1128 
1129 	/* program unicast address of ports in the physical device */
1130 	mutex_enter(&vswp->hw_lock);
1131 	for (port = plist->head; port != NULL; port = port->p_next) {
1132 		if (port->addr_set != VSW_ADDR_UNSET) /* addr already set */
1133 			continue;
1134 		if (vsw_set_hw(vswp, port, VSW_VNETPORT)) {
1135 			cmn_err(CE_NOTE,
1136 			    "!vsw%d: port:%d failed to set unicast address\n",
1137 			    vswp->instance, port->p_instance);
1138 		}
1139 	}
1140 	mutex_exit(&vswp->hw_lock);
1141 
1142 	/* program multicast addresses of ports in the physdev */
1143 	for (port = plist->head; port != NULL; port = port->p_next) {
1144 		mutex_enter(&port->mca_lock);
1145 		mutex_enter(&vswp->mac_lock);
1146 		for (mcap = port->mcap; mcap != NULL; mcap = mcap->nextp) {
1147 			if (mcap->mac_added)
1148 				continue;
1149 			rv = mac_multicst_add(vswp->mh, (uchar_t *)&mcap->mca);
1150 			if (rv == 0) {
1151 				mcap->mac_added = B_TRUE;
1152 			} else {
1153 				cmn_err(CE_WARN, "!vsw%d: unable to add "
1154 				    "multicast address: %s\n", vswp->instance,
1155 				    ether_sprintf((void *)&mcap->mca));
1156 			}
1157 		}
1158 		mutex_exit(&vswp->mac_lock);
1159 		mutex_exit(&port->mca_lock);
1160 	}
1161 
1162 	RW_EXIT(&plist->lockrw);
1163 }
1164 
1165 /*
1166  * Remove unicast and multicast addresses of vsw interface and the ports
1167  * from the physical device.
1168  */
1169 static void
1170 vsw_unset_addrs(vsw_t *vswp)
1171 {
1172 	vsw_port_list_t	*plist = &vswp->plist;
1173 	vsw_port_t	*port;
1174 	mcst_addr_t	*mcap;
1175 
1176 	READ_ENTER(&vswp->if_lockrw);
1177 
1178 	if (vswp->if_state & VSW_IF_UP) {
1179 
1180 		/*
1181 		 * Remove unicast addr of vsw interface
1182 		 * from current physdev
1183 		 */
1184 		mutex_enter(&vswp->hw_lock);
1185 		(void) vsw_unset_hw(vswp, NULL, VSW_LOCALDEV);
1186 		mutex_exit(&vswp->hw_lock);
1187 
1188 		/*
1189 		 * Remove mcast addrs of vsw interface
1190 		 * from current physdev
1191 		 */
1192 		mutex_enter(&vswp->mca_lock);
1193 		mutex_enter(&vswp->mac_lock);
1194 		for (mcap = vswp->mcap; mcap != NULL; mcap = mcap->nextp) {
1195 			if (!mcap->mac_added)
1196 				continue;
1197 			(void) mac_multicst_remove(vswp->mh,
1198 			    (uchar_t *)&mcap->mca);
1199 			mcap->mac_added = B_FALSE;
1200 		}
1201 		mutex_exit(&vswp->mac_lock);
1202 		mutex_exit(&vswp->mca_lock);
1203 
1204 	}
1205 
1206 	RW_EXIT(&vswp->if_lockrw);
1207 
1208 	WRITE_ENTER(&plist->lockrw);
1209 
1210 	/*
1211 	 * Remove unicast address of ports from the current physical device
1212 	 */
1213 	mutex_enter(&vswp->hw_lock);
1214 	for (port = plist->head; port != NULL; port = port->p_next) {
1215 		/* Remove address if was programmed into HW. */
1216 		if (port->addr_set == VSW_ADDR_UNSET)
1217 			continue;
1218 		(void) vsw_unset_hw(vswp, port, VSW_VNETPORT);
1219 	}
1220 	mutex_exit(&vswp->hw_lock);
1221 
1222 	/* Remove multicast addresses of ports from the current physdev */
1223 	for (port = plist->head; port != NULL; port = port->p_next) {
1224 		mutex_enter(&port->mca_lock);
1225 		mutex_enter(&vswp->mac_lock);
1226 		for (mcap = port->mcap; mcap != NULL; mcap = mcap->nextp) {
1227 			if (!mcap->mac_added)
1228 				continue;
1229 			(void) mac_multicst_remove(vswp->mh,
1230 			    (uchar_t *)&mcap->mca);
1231 			mcap->mac_added = B_FALSE;
1232 		}
1233 		mutex_exit(&vswp->mac_lock);
1234 		mutex_exit(&port->mca_lock);
1235 	}
1236 
1237 	RW_EXIT(&plist->lockrw);
1238 }
1239 
1240 /* copy mac address of vsw into soft state structure */
1241 static void
1242 vsw_save_lmacaddr(vsw_t *vswp, uint64_t macaddr)
1243 {
1244 	int	i;
1245 
1246 	WRITE_ENTER(&vswp->if_lockrw);
1247 	for (i = ETHERADDRL - 1; i >= 0; i--) {
1248 		vswp->if_addr.ether_addr_octet[i] = macaddr & 0xFF;
1249 		macaddr >>= 8;
1250 	}
1251 	RW_EXIT(&vswp->if_lockrw);
1252 }
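/*
 * For example, a macaddr value of 0x0003ba12345f is stored as
 * if_addr.ether_addr_octet[] = { 0x00, 0x03, 0xba, 0x12, 0x34, 0x5f };
 * the least significant byte of the 64-bit value becomes the last
 * octet of the ethernet address.
 */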
1253 
1254 /*
1255  * Timeout routine to setup switching mode:
1256  * vsw_setup_switching() is invoked from vsw_attach() or vsw_update_md_prop()
1257  * initially. If it fails and the error is EAGAIN, then this timeout handler
1258  * is started to retry vsw_setup_switching(). vsw_setup_switching() is retried
1259  * until we successfully finish it; or the returned error is not EAGAIN.
1260  */
1261 static void
1262 vsw_setup_switching_timeout(void *arg)
1263 {
1264 	vsw_t		*vswp = (vsw_t *)arg;
1265 	int		rv;
1266 
1267 	if (vswp->swtmout_enabled == B_FALSE)
1268 		return;
1269 
1270 	rv = vsw_setup_switching(vswp);
1271 
1272 	if (rv == 0) {
1273 		/*
1274 		 * Successfully setup switching mode.
1275 		 * Program unicst, mcst addrs of vsw
1276 		 * interface and ports in the physdev.
1277 		 */
1278 		vsw_set_addrs(vswp);
1279 	}
1280 
1281 	mutex_enter(&vswp->swtmout_lock);
1282 
1283 	if (rv == EAGAIN && vswp->swtmout_enabled == B_TRUE) {
1284 		/*
1285 		 * Reschedule timeout() if the error is EAGAIN and the
1286 		 * timeout is still enabled. For errors other than EAGAIN,
1287 		 * we simply return without rescheduling timeout().
1288 		 */
1289 		vswp->swtmout_id =
1290 		    timeout(vsw_setup_switching_timeout, vswp,
1291 		    (vsw_setup_switching_delay * drv_usectohz(MICROSEC)));
1292 		goto exit;
1293 	}
1294 
1295 	/* timeout handler completed */
1296 	vswp->swtmout_enabled = B_FALSE;
1297 	vswp->swtmout_id = 0;
1298 
1299 exit:
1300 	mutex_exit(&vswp->swtmout_lock);
1301 }
1302 
1303 /*
1304  * Cancel the timeout handler used to set up the switching mode.
1305  */
1306 static void
1307 vsw_stop_switching_timeout(vsw_t *vswp)
1308 {
1309 	timeout_id_t tid;
1310 
1311 	mutex_enter(&vswp->swtmout_lock);
1312 
1313 	tid = vswp->swtmout_id;
1314 
1315 	if (tid != 0) {
1316 		/* signal timeout handler to stop */
1317 		vswp->swtmout_enabled = B_FALSE;
1318 		vswp->swtmout_id = 0;
1319 		mutex_exit(&vswp->swtmout_lock);
1320 
1321 		(void) untimeout(tid);
1322 	} else {
1323 		mutex_exit(&vswp->swtmout_lock);
1324 	}
1325 
1326 	(void) atomic_swap_32(&vswp->switching_setup_done, B_FALSE);
1327 
1328 	mutex_enter(&vswp->mac_lock);
1329 	vswp->mac_open_retries = 0;
1330 	mutex_exit(&vswp->mac_lock);
1331 }
1332 
1333 /*
1334  * Setup the required switching mode.
1335  * This routine is invoked from vsw_attach() or vsw_update_md_prop()
1336  * initially. If it fails and the error is EAGAIN, then a timeout handler
1337  * is started to retry vsw_setup_switching(), until it successfully finishes;
1338  * or the returned error is not EAGAIN.
1339  *
1340  * Returns:
1341  *  0 on success.
1342  *  EAGAIN if retry is needed.
1343  *  1 on all other failures.
1344  */
1345 static int
1346 vsw_setup_switching(vsw_t *vswp)
1347 {
1348 	int	i, rv = 1;
1349 
1350 	D1(vswp, "%s: enter", __func__);
1351 
1352 	/*
1353 	 * Select best switching mode.
1354 	 * Note that we start from the saved smode_idx. This is done as
1355 	 * this routine can be called from the timeout handler to retry
1356 	 * setting up a specific mode. Currently only the function which
1357 	 * sets up layer2/promisc mode returns EAGAIN if the underlying
1358 	 * physical device is not available yet, causing retries.
1359 	 */
1360 	for (i = vswp->smode_idx; i < vswp->smode_num; i++) {
1361 		vswp->smode_idx = i;
1362 		switch (vswp->smode[i]) {
1363 		case VSW_LAYER2:
1364 		case VSW_LAYER2_PROMISC:
1365 			rv = vsw_setup_layer2(vswp);
1366 			break;
1367 
1368 		case VSW_LAYER3:
1369 			rv = vsw_setup_layer3(vswp);
1370 			break;
1371 
1372 		default:
1373 			DERR(vswp, "unknown switch mode");
1374 			break;
1375 		}
1376 
1377 		if ((rv == 0) || (rv == EAGAIN))
1378 			break;
1379 
1380 		/* all other errors (rv != 0): continue & select the next mode */
1381 		rv = 1;
1382 	}
1383 
1384 	if (rv && (rv != EAGAIN)) {
1385 		cmn_err(CE_WARN, "!vsw%d: Unable to setup specified "
1386 		    "switching mode", vswp->instance);
1387 	} else if (rv == 0) {
1388 		(void) atomic_swap_32(&vswp->switching_setup_done, B_TRUE);
1389 	}
1390 
1391 	D2(vswp, "%s: Operating in mode %d", __func__,
1392 	    vswp->smode[vswp->smode_idx]);
1393 
1394 	D1(vswp, "%s: exit", __func__);
1395 
1396 	return (rv);
1397 }
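/*
 * For example, with smode[] = { VSW_LAYER2, VSW_LAYER2_PROMISC } the
 * loop above first attempts programmed layer 2 mode; if that fails
 * with an error other than EAGAIN, the next iteration falls back to
 * promiscuous layer 2 mode. An EAGAIN return leaves smode_idx in
 * place, so the timeout handler retries the same mode.
 */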
1398 
1399 /*
1400  * Setup for layer 2 switching.
1401  *
1402  * Returns:
1403  *  0 on success.
1404  *  EAGAIN if retry is needed.
1405  *  EIO on all other failures.
1406  */
1407 static int
1408 vsw_setup_layer2(vsw_t *vswp)
1409 {
1410 	int	rv;
1411 
1412 	D1(vswp, "%s: enter", __func__);
1413 
1414 	vswp->vsw_switch_frame = vsw_switch_l2_frame;
1415 
1416 	rv = strlen(vswp->physname);
1417 	if (rv == 0) {
1418 		/*
1419 		 * Physical device name is empty, but one is
1420 		 * required for layer 2 switching.
1421 		 */
1422 		cmn_err(CE_WARN, "!vsw%d: no physical device name specified",
1423 		    vswp->instance);
1424 		return (EIO);
1425 	}
1426 
1427 	mutex_enter(&vswp->mac_lock);
1428 
1429 	rv = vsw_mac_open(vswp);
1430 	if (rv != 0) {
1431 		if (rv != EAGAIN) {
1432 			cmn_err(CE_WARN, "!vsw%d: Unable to open physical "
1433 			    "device: %s\n", vswp->instance, vswp->physname);
1434 		}
1435 		mutex_exit(&vswp->mac_lock);
1436 		return (rv);
1437 	}
1438 
1439 	if (vswp->smode[vswp->smode_idx] == VSW_LAYER2) {
1440 		/*
1441 		 * Verify that underlying device can support multiple
1442 		 * unicast mac addresses.
1443 		 */
1444 		rv = vsw_get_hw_maddr(vswp);
1445 		if (rv != 0) {
1446 			cmn_err(CE_WARN, "!vsw%d: Unable to setup "
1447 			    "layer2 switching", vswp->instance);
1448 			goto exit_error;
1449 		}
1450 	}
1451 
1452 	/*
1453 	 * Attempt to link into the MAC layer so we can get
1454 	 * and send packets out over the physical adapter.
1455 	 */
1456 	rv = vsw_mac_attach(vswp);
1457 	if (rv != 0) {
1458 		/*
1459 		 * Registration with the MAC layer has failed,
1460 		 * so return an error so that we can fall back to the
1461 		 * next preferred switching method.
1462 		 */
1463 		cmn_err(CE_WARN, "!vsw%d: Unable to setup physical device: "
1464 		    "%s\n", vswp->instance, vswp->physname);
1465 		goto exit_error;
1466 	}
1467 
1468 	D1(vswp, "%s: exit", __func__);
1469 
1470 	mutex_exit(&vswp->mac_lock);
1471 	return (0);
1472 
1473 exit_error:
1474 	vsw_mac_close(vswp);
1475 	mutex_exit(&vswp->mac_lock);
1476 	return (EIO);
1477 }
1478 
1479 static int
1480 vsw_setup_layer3(vsw_t *vswp)
1481 {
1482 	D1(vswp, "%s: enter", __func__);
1483 
1484 	D2(vswp, "%s: operating in layer 3 mode", __func__);
1485 	vswp->vsw_switch_frame = vsw_switch_l3_frame;
1486 
1487 	D1(vswp, "%s: exit", __func__);
1488 
1489 	return (0);
1490 }
1491 
1492 /*
1493  * Open the underlying physical device for access in layer2 mode.
1494  * Returns:
1495  * 0 on success
1496  * EAGAIN if mac_open() fails due to the device being not available yet.
1497  * EIO on any other failures.
1498  */
1499 static int
1500 vsw_mac_open(vsw_t *vswp)
1501 {
1502 	char	drv[LIFNAMSIZ];
1503 	uint_t	ddi_instance;
1504 	int	rv;
1505 
1506 	ASSERT(MUTEX_HELD(&vswp->mac_lock));
1507 
1508 	if (vswp->mh != NULL) {
1509 		/* already open */
1510 		return (0);
1511 	}
1512 
1513 	if (vswp->mac_open_retries++ >= vsw_mac_open_retries) {
1514 		/* exceeded max retries */
1515 		return (EIO);
1516 	}
1517 
1518 	if (ddi_parse(vswp->physname, drv, &ddi_instance) != DDI_SUCCESS) {
1519 		cmn_err(CE_WARN, "!vsw%d: invalid device name: %s",
1520 		    vswp->instance, vswp->physname);
1521 		return (EIO);
1522 	}
1523 
1524 	/*
1525 	 * Aggregation devices are special in that the device instance
1526 	 * must be set to zero when they are being mac_open()'ed.
1527 	 *
1528 	 * The only way to determine if we are being passed an aggregated
1529 	 * device is to check the device name.
1530 	 */
1531 	if (strcmp(drv, "aggr") == 0) {
1532 		ddi_instance = 0;
1533 	}
1534 
1535 	rv = mac_open(vswp->physname, ddi_instance, &vswp->mh);
1536 	if (rv != 0) {
1537 		/*
1538 		 * If mac_open() failed and the error indicates that the
1539 		 * device is not available yet, then, we return EAGAIN to
1540 		 * indicate that it needs to be retried.
1541 		 * For example, this may happen during boot up, as the
1542 		 * required link aggregation groups(devices) have not been
1543 		 * required link aggregation groups (devices) have not been
1544 		 */
1545 		if (rv == ENOENT) {
1546 			return (EAGAIN);
1547 		} else {
1548 			cmn_err(CE_WARN, "vsw%d: mac_open %s failed rv:%x",
1549 			    vswp->instance, vswp->physname, rv);
1550 			return (EIO);
1551 		}
1552 	}
1553 
1554 	vswp->mac_open_retries = 0;
1555 
1556 	return (0);
1557 }
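/*
 * For example (device names hypothetical), a physname of "e1000g0"
 * parses to drv == "e1000g", ddi_instance == 0, while "aggr1" parses
 * to drv == "aggr", ddi_instance == 1, with the instance then forced
 * to 0 above before mac_open() is called.
 */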
1558 
1559 /*
1560  * Close the underlying physical device.
1561  */
1562 static void
1563 vsw_mac_close(vsw_t *vswp)
1564 {
1565 	ASSERT(MUTEX_HELD(&vswp->mac_lock));
1566 
1567 	if (vswp->mh != NULL) {
1568 		mac_close(vswp->mh);
1569 		vswp->mh = NULL;
1570 	}
1571 }
1572 
1573 /*
1574  * Link into the MAC layer to gain access to the services provided by
1575  * the underlying physical device driver (which should also have
1576  * registered with the MAC layer).
1577  *
1578  * Only when in layer 2 mode.
1579  */
1580 static int
1581 vsw_mac_attach(vsw_t *vswp)
1582 {
1583 	D1(vswp, "%s: enter", __func__);
1584 
1585 	ASSERT(vswp->mrh == NULL);
1586 	ASSERT(vswp->mstarted == B_FALSE);
1587 	ASSERT(vswp->mresources == B_FALSE);
1588 
1589 	ASSERT(MUTEX_HELD(&vswp->mac_lock));
1590 
1591 	ASSERT(vswp->mh != NULL);
1592 
1593 	D2(vswp, "vsw_mac_attach: using device %s", vswp->physname);
1594 
1595 	if (vsw_multi_ring_enable) {
1596 		/*
1597 		 * Initialize the ring table.
1598 		 */
1599 		vsw_mac_ring_tbl_init(vswp);
1600 
1601 		/*
1602 		 * Register our rx callback function.
1603 		 */
1604 		vswp->mrh = mac_rx_add(vswp->mh,
1605 		    vsw_rx_queue_cb, (void *)vswp);
1606 		ASSERT(vswp->mrh != NULL);
1607 
1608 		/*
1609 		 * Register our mac resource callback.
1610 		 */
1611 		mac_resource_set(vswp->mh, vsw_mac_ring_add_cb, (void *)vswp);
1612 		vswp->mresources = B_TRUE;
1613 
1614 		/*
1615 		 * Get the ring resources available to us from
1616 		 * the mac below us.
1617 		 */
1618 		mac_resources(vswp->mh);
1619 	} else {
1620 		/*
1621 		 * Just register our rx callback function
1622 		 */
1623 		vswp->mrh = mac_rx_add(vswp->mh, vsw_rx_cb, (void *)vswp);
1624 		ASSERT(vswp->mrh != NULL);
1625 	}
1626 
1627 	/* Get the MAC tx fn */
1628 	vswp->txinfo = mac_tx_get(vswp->mh);
1629 
1630 	/* start the interface */
1631 	if (mac_start(vswp->mh) != 0) {
1632 		cmn_err(CE_WARN, "!vsw%d: Could not start mac interface",
1633 		    vswp->instance);
1634 		goto mac_fail_exit;
1635 	}
1636 
1637 	vswp->mstarted = B_TRUE;
1638 
1639 	D1(vswp, "%s: exit", __func__);
1640 	return (0);
1641 
1642 mac_fail_exit:
1643 	vsw_mac_detach(vswp);
1644 
1645 	D1(vswp, "%s: exit", __func__);
1646 	return (1);
1647 }
1648 
1649 static void
1650 vsw_mac_detach(vsw_t *vswp)
1651 {
1652 	D1(vswp, "vsw_mac_detach: enter");
1653 
1654 	ASSERT(vswp != NULL);
1655 	ASSERT(MUTEX_HELD(&vswp->mac_lock));
1656 
1657 	if (vsw_multi_ring_enable) {
1658 		vsw_mac_ring_tbl_destroy(vswp);
1659 	}
1660 
1661 	if (vswp->mh != NULL) {
1662 		if (vswp->mstarted)
1663 			mac_stop(vswp->mh);
1664 		if (vswp->mrh != NULL)
1665 			mac_rx_remove(vswp->mh, vswp->mrh, B_TRUE);
1666 		if (vswp->mresources)
1667 			mac_resource_set(vswp->mh, NULL, NULL);
1668 	}
1669 
1670 	vswp->mrh = NULL;
1671 	vswp->txinfo = NULL;
1672 	vswp->mstarted = B_FALSE;
1673 
1674 	D1(vswp, "vsw_mac_detach: exit");
1675 }
1676 
1677 /*
1678  * Depending on the mode specified, and on the capabilities and
1679  * capacity of the underlying device, set up the physical device.
1680  *
1681  * If in layer 3 mode, then do nothing.
1682  *
1683  * If in layer 2 programmed mode attempt to program the unicast address
1684  * associated with the port into the physical device. If this is not
1685  * possible due to resource exhaustion or simply because the device does
1686  * not support multiple unicast addresses then, if required, fall back
1687  * to putting the card into promisc mode.
1688  *
1689  * If in promisc mode then simply set the card into promisc mode.
1690  *
1691  * Returns 0 on success, 1 on failure.
1692  */
1693 static int
1694 vsw_set_hw(vsw_t *vswp, vsw_port_t *port, int type)
1695 {
1696 	mac_multi_addr_t	mac_addr;
1697 	int			err;
1698 
1699 	D1(vswp, "%s: enter", __func__);
1700 
1701 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
1702 	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
1703 
1704 	if (vswp->smode[vswp->smode_idx] == VSW_LAYER3)
1705 		return (0);
1706 
1707 	if (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC) {
1708 		return (vsw_set_hw_promisc(vswp, port, type));
1709 	}
1710 
1711 	/*
1712 	 * Attempt to program the unicast address into the HW.
1713 	 */
1714 	mac_addr.mma_addrlen = ETHERADDRL;
1715 	if (type == VSW_VNETPORT) {
1716 		ASSERT(port != NULL);
1717 		ether_copy(&port->p_macaddr, &mac_addr.mma_addr);
1718 	} else {
1719 		ether_copy(&vswp->if_addr, &mac_addr.mma_addr);
1720 	}
1721 
1722 	err = vsw_set_hw_addr(vswp, &mac_addr);
1723 	if (err == ENOSPC) {
1724 		/*
1725 		 * Mark that attempt should be made to re-config sometime
1726 		 * in future if a port is deleted.
1727 		 */
1728 		vswp->recfg_reqd = B_TRUE;
1729 
1730 		/*
1731 		 * Only 1 mode specified, nothing more to do.
1732 		 */
1733 		if (vswp->smode_num == 1)
1734 			return (err);
1735 
1736 		/*
1737 		 * If promiscuous was next mode specified try to
1738 		 * set the card into that mode.
1739 		 */
1740 		if ((vswp->smode_idx <= (vswp->smode_num - 2)) &&
1741 		    (vswp->smode[vswp->smode_idx + 1] ==
1742 		    VSW_LAYER2_PROMISC)) {
1743 			vswp->smode_idx += 1;
1744 			return (vsw_set_hw_promisc(vswp, port, type));
1745 		}
1746 		return (err);
1747 	}
1748 
1749 	if (err != 0)
1750 		return (err);
1751 
1752 	if (type == VSW_VNETPORT) {
1753 		port->addr_slot = mac_addr.mma_slot;
1754 		port->addr_set = VSW_ADDR_HW;
1755 	} else {
1756 		vswp->addr_slot = mac_addr.mma_slot;
1757 		vswp->addr_set = VSW_ADDR_HW;
1758 	}
1759 
1760 	D2(vswp, "programmed addr %s into slot %d "
1761 	    "of device %s", ether_sprintf((void *)mac_addr.mma_addr),
1762 	    mac_addr.mma_slot, vswp->physname);
1763 
1764 	D1(vswp, "%s: exit", __func__);
1765 
1766 	return (0);
1767 }
1768 
1769 /*
1770  * If in layer 3 mode do nothing.
1771  *
1772  * If in layer 2 switched mode remove the address from the physical
1773  * device.
1774  *
1775  * If in layer 2 promiscuous mode disable promisc mode.
1776  *
1777  * Returns 0 on success.
1778  */
1779 static int
1780 vsw_unset_hw(vsw_t *vswp, vsw_port_t *port, int type)
1781 {
1782 	mac_addr_slot_t	slot;
1783 	int		rv;
1784 
1785 	D1(vswp, "%s: enter", __func__);
1786 
1787 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
1788 
1789 	if (vswp->smode[vswp->smode_idx] == VSW_LAYER3)
1790 		return (0);
1791 
1792 	switch (type) {
1793 	case VSW_VNETPORT:
1794 		ASSERT(port != NULL);
1795 
1796 		if (port->addr_set == VSW_ADDR_PROMISC) {
1797 			return (vsw_unset_hw_promisc(vswp, port, type));
1798 
1799 		} else if (port->addr_set == VSW_ADDR_HW) {
1800 			slot = port->addr_slot;
1801 			if ((rv = vsw_unset_hw_addr(vswp, slot)) == 0)
1802 				port->addr_set = VSW_ADDR_UNSET;
1803 		}
1804 
1805 		break;
1806 
1807 	case VSW_LOCALDEV:
1808 		if (vswp->addr_set == VSW_ADDR_PROMISC) {
1809 			return (vsw_unset_hw_promisc(vswp, NULL, type));
1810 
1811 		} else if (vswp->addr_set == VSW_ADDR_HW) {
1812 			slot = vswp->addr_slot;
1813 			if ((rv = vsw_unset_hw_addr(vswp, slot)) == 0)
1814 				vswp->addr_set = VSW_ADDR_UNSET;
1815 		}
1816 
1817 		break;
1818 
1819 	default:
1820 		/* should never happen */
1821 		DERR(vswp, "%s: unknown type %d", __func__, type);
1822 		ASSERT(0);
1823 		return (1);
1824 	}
1825 
1826 	D1(vswp, "%s: exit", __func__);
1827 	return (rv);
1828 }
1829 
1830 /*
1831  * Attempt to program a unicast address into HW.
1832  *
1833  * Returns 0 on success, an error value (e.g. ENOSPC) otherwise.
1834  */
1835 static int
1836 vsw_set_hw_addr(vsw_t *vswp, mac_multi_addr_t *mac)
1837 {
1838 	void	*mah;
1839 	int	rv = EINVAL;
1840 
1841 	D1(vswp, "%s: enter", __func__);
1842 
1843 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
1844 
1845 	if (vswp->maddr.maddr_handle == NULL)
1846 		return (rv);
1847 
1848 	mah = vswp->maddr.maddr_handle;
1849 
1850 	rv = vswp->maddr.maddr_add(mah, mac);
1851 
1852 	if (rv == 0)
1853 		return (rv);
1854 
1855 	/*
1856 	 * It's okay for the add to fail because we have exhausted
1857 	 * all the resources in the hardware device. Any other error
1858 	 * we want to flag.
1859 	 */
1860 	if (rv != ENOSPC) {
1861 		cmn_err(CE_WARN, "!vsw%d: error programming "
1862 		    "address %s into HW err (%d)",
1863 		    vswp->instance, ether_sprintf((void *)mac->mma_addr), rv);
1864 	}
1865 	D1(vswp, "%s: exit", __func__);
1866 	return (rv);
1867 }
1868 
1869 /*
1870  * Remove a unicast mac address which has previously been programmed
1871  * into HW.
1872  *
1873  * Returns 0 on success, 1 on failure.
1874  */
1875 static int
1876 vsw_unset_hw_addr(vsw_t *vswp, int slot)
1877 {
1878 	void	*mah;
1879 	int	rv;
1880 
1881 	D1(vswp, "%s: enter", __func__);
1882 
1883 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
1884 	ASSERT(slot >= 0);
1885 
1886 	if (vswp->maddr.maddr_handle == NULL)
1887 		return (1);
1888 
1889 	mah = vswp->maddr.maddr_handle;
1890 
1891 	rv = vswp->maddr.maddr_remove(mah, slot);
1892 	if (rv != 0) {
1893 		cmn_err(CE_WARN, "!vsw%d: unable to remove address "
1894 		    "from slot %d in device %s (err %d)",
1895 		    vswp->instance, slot, vswp->physname, rv);
1896 		return (1);
1897 	}
1898 
1899 	D2(vswp, "removed addr from slot %d in device %s",
1900 	    slot, vswp->physname);
1901 
1902 	D1(vswp, "%s: exit", __func__);
1903 	return (0);
1904 }
1905 
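/*
 * Usage sketch for the two routines above (illustrative only, not
 * driver code): the caller supplies the address, the underlying
 * driver fills in mma_slot on a successful add, and that slot is
 * what must be passed back later to remove the address.
 */
#if 0
	mac_multi_addr_t	maddr;

	maddr.mma_addrlen = ETHERADDRL;
	ether_copy(&port->p_macaddr, &maddr.mma_addr);
	if (vsw_set_hw_addr(vswp, &maddr) == 0) {
		port->addr_slot = maddr.mma_slot;
		/* ... later, when the port goes away ... */
		(void) vsw_unset_hw_addr(vswp, port->addr_slot);
	}
#endif
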
1906 /*
1907  * Set network card into promisc mode.
1908  *
1909  * Returns 0 on success, 1 on failure.
1910  */
1911 static int
1912 vsw_set_hw_promisc(vsw_t *vswp, vsw_port_t *port, int type)
1913 {
1914 	D1(vswp, "%s: enter", __func__);
1915 
1916 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
1917 	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
1918 
1919 	mutex_enter(&vswp->mac_lock);
1920 	if (vswp->mh == NULL) {
1921 		mutex_exit(&vswp->mac_lock);
1922 		return (1);
1923 	}
1924 
1925 	if (vswp->promisc_cnt++ == 0) {
1926 		if (mac_promisc_set(vswp->mh, B_TRUE, MAC_DEVPROMISC) != 0) {
1927 			vswp->promisc_cnt--;
1928 			mutex_exit(&vswp->mac_lock);
1929 			return (1);
1930 		}
1931 		cmn_err(CE_NOTE, "!vsw%d: switching device %s into "
1932 		    "promiscuous mode", vswp->instance, vswp->physname);
1933 	}
1934 	mutex_exit(&vswp->mac_lock);
1935 
1936 	if (type == VSW_VNETPORT) {
1937 		ASSERT(port != NULL);
1938 		port->addr_set = VSW_ADDR_PROMISC;
1939 	} else {
1940 		vswp->addr_set = VSW_ADDR_PROMISC;
1941 	}
1942 
1943 	D1(vswp, "%s: exit", __func__);
1944 
1945 	return (0);
1946 }
1947 
1948 /*
1949  * Turn off promiscuous mode on network card.
1950  *
1951  * Returns 0 on success, 1 on failure.
1952  */
1953 static int
1954 vsw_unset_hw_promisc(vsw_t *vswp, vsw_port_t *port, int type)
1955 {
1956 	vsw_port_list_t 	*plist = &vswp->plist;
1957 
1958 	D2(vswp, "%s: enter", __func__);
1959 
1960 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
1961 	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
1962 
1963 	mutex_enter(&vswp->mac_lock);
1964 	if (vswp->mh == NULL) {
1965 		mutex_exit(&vswp->mac_lock);
1966 		return (1);
1967 	}
1968 
1969 	if (--vswp->promisc_cnt == 0) {
1970 		if (mac_promisc_set(vswp->mh, B_FALSE, MAC_DEVPROMISC) != 0) {
1971 			vswp->promisc_cnt++;
1972 			mutex_exit(&vswp->mac_lock);
1973 			return (1);
1974 		}
1975 
1976 		/*
1977 		 * We are exiting promisc mode either because we had
1978 		 * failed over into it from switched mode due to HW
1979 		 * resource issues, or because the user wanted the card
1980 		 * in promisc mode for all the ports and the last
1981 		 * port is now being deleted. Tweak the message
1982 		 * accordingly.
1983 		 */
1984 		if (plist->num_ports != 0) {
1985 			cmn_err(CE_NOTE, "!vsw%d: switching device %s back to "
1986 			    "programmed mode", vswp->instance, vswp->physname);
1987 		} else {
1988 			cmn_err(CE_NOTE, "!vsw%d: switching device %s out of "
1989 			    "promiscuous mode", vswp->instance, vswp->physname);
1990 		}
1991 	}
1992 	mutex_exit(&vswp->mac_lock);
1993 
1994 	if (type == VSW_VNETPORT) {
1995 		ASSERT(port != NULL);
1996 		ASSERT(port->addr_set == VSW_ADDR_PROMISC);
1997 		port->addr_set = VSW_ADDR_UNSET;
1998 	} else {
1999 		ASSERT(vswp->addr_set == VSW_ADDR_PROMISC);
2000 		vswp->addr_set = VSW_ADDR_UNSET;
2001 	}
2002 
2003 	D1(vswp, "%s: exit", __func__);
2004 	return (0);
2005 }
2006 
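/*
 * Illustrative pairing (hypothetical caller, not driver code): every
 * successful vsw_set_hw_promisc() must be balanced by a matching
 * vsw_unset_hw_promisc(); promisc_cnt keeps the device promiscuous
 * until its last user is removed.
 */
#if 0
	mutex_enter(&vswp->hw_lock);
	if (vsw_set_hw_promisc(vswp, port, VSW_VNETPORT) == 0) {
		/* ... traffic for the port flows via promisc mode ... */
		(void) vsw_unset_hw_promisc(vswp, port, VSW_VNETPORT);
	}
	mutex_exit(&vswp->hw_lock);
#endif
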
2007 /*
2008  * Determine whether or not we are operating in our preferred
2009  * mode and, if not, whether the physical resources now allow us
2010  * to operate in it.
2011  *
2012  * If a port is being removed, this should only be invoked after
2013  * the port has been removed from the port list.
2014  */
2015 static void
2016 vsw_reconfig_hw(vsw_t *vswp)
2017 {
2018 	int			s_idx;
2019 
2020 	D1(vswp, "%s: enter", __func__);
2021 
2022 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
2023 
2024 	if (vswp->maddr.maddr_handle == NULL) {
2025 		return;
2026 	}
2027 
2028 	/*
2029 	 * If we are in layer 2 (i.e. switched) or would like to be
2030 	 * in layer 2 then check if any ports or the vswitch itself
2031 	 * need to be programmed into the HW.
2032 	 *
2033 	 * This can happen in two cases - switched was specified as
2034 	 * the preferred mode of operation but we exhausted the HW
2035 	 * resources and so failed over to the next specified mode,
2036 	 * or switched was the only mode specified, so after HW
2037 	 * resources were exhausted there was nothing more we
2038 	 * could do.
2039 	 */
2040 	if (vswp->smode_idx > 0)
2041 		s_idx = vswp->smode_idx - 1;
2042 	else
2043 		s_idx = vswp->smode_idx;
2044 
2045 	if (vswp->smode[s_idx] != VSW_LAYER2) {
2046 		return;
2047 	}
2048 
2049 	D2(vswp, "%s: attempting reconfig..", __func__);
2050 
2051 	/*
2052 	 * First, attempt to set the vswitch mac address into HW,
2053 	 * if required.
2054 	 */
2055 	if (vsw_prog_if(vswp)) {
2056 		return;
2057 	}
2058 
2059 	/*
2060 	 * Next, attempt to set any ports which have not yet been
2061 	 * programmed into HW.
2062 	 */
2063 	if (vsw_prog_ports(vswp)) {
2064 		return;
2065 	}
2066 
2067 	/*
2068 	 * By now we know that we have programmed all desired ports etc.
2069 	 * into HW, so it is safe to mark the reconfiguration as complete.
2070 	 */
2071 	vswp->recfg_reqd = B_FALSE;
2072 
2073 	vswp->smode_idx = s_idx;
2074 
2075 	D1(vswp, "%s: exit", __func__);
2076 }
2077 
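/*
 * Worked example (illustrative values): with smode[] set to
 * { VSW_LAYER2, VSW_LAYER2_PROMISC } and smode_num = 2, running out
 * of HW unicast slots fails us over from index 0 to index 1
 * (promiscuous fallback) and flags recfg_reqd. Once slots become
 * free again, vsw_reconfig_hw() programs the outstanding addresses
 * and steps smode_idx back to 0 (switched mode).
 */
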
2078 /*
2079  * Check to see if vsw itself is plumbed, and if so whether or not
2080  * its mac address should be written into HW.
2081  *
2082  * Returns 0 if the address was set, or did not need to be set.
2083  * Returns 1 if failed to set address.
2084  */
2085 static int
2086 vsw_prog_if(vsw_t *vswp)
2087 {
2088 	mac_multi_addr_t	addr;
2089 
2090 	D1(vswp, "%s: enter", __func__);
2091 
2092 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
2093 
2094 	READ_ENTER(&vswp->if_lockrw);
2095 	if ((vswp->if_state & VSW_IF_UP) &&
2096 	    (vswp->addr_set != VSW_ADDR_HW)) {
2097 
2098 		addr.mma_addrlen = ETHERADDRL;
2099 		ether_copy(&vswp->if_addr, &addr.mma_addr);
2100 
2101 		if (vsw_set_hw_addr(vswp, &addr) != 0) {
2102 			RW_EXIT(&vswp->if_lockrw);
2103 			return (1);
2104 		}
2105 
2106 		vswp->addr_slot = addr.mma_slot;
2107 
2108 		/*
2109 		 * If the interface had previously been placed into
2110 		 * promisc mode when it was plumbed, reverse that now.
2111 		 *
2112 		 * Note that the interface will only actually be set into
2113 		 * non-promisc mode when the last port/interface has been
2114 		 * programmed into HW.
2115 		 */
2116 		if (vswp->addr_set == VSW_ADDR_PROMISC)
2117 			(void) vsw_unset_hw_promisc(vswp, NULL, VSW_LOCALDEV);
2118 
2119 		vswp->addr_set = VSW_ADDR_HW;
2120 	}
2121 	RW_EXIT(&vswp->if_lockrw);
2122 
2123 	D1(vswp, "%s: exit", __func__);
2124 	return (0);
2125 }
2126 
2127 /*
2128  * Scan the port list for any ports which have not yet been set
2129  * into HW. For those found, attempt to program their mac addresses
2130  * into the physical device.
2131  *
2132  * Returns 0 if all required ports (possibly none) were programmed into HW.
2133  * Returns 1 if failed to set at least one mac address.
2134  */
2135 static int
2136 vsw_prog_ports(vsw_t *vswp)
2137 {
2138 	mac_multi_addr_t	addr;
2139 	vsw_port_list_t		*plist = &vswp->plist;
2140 	vsw_port_t		*tp;
2141 	int			rv = 0;
2142 
2143 	D1(vswp, "%s: enter", __func__);
2144 
2145 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
2146 
2147 	READ_ENTER(&plist->lockrw);
2148 	for (tp = plist->head; tp != NULL; tp = tp->p_next) {
2149 		if (tp->addr_set != VSW_ADDR_HW) {
2150 			addr.mma_addrlen = ETHERADDRL;
2151 			ether_copy(&tp->p_macaddr, &addr.mma_addr);
2152 
2153 			if (vsw_set_hw_addr(vswp, &addr) != 0) {
2154 				rv = 1;
2155 				break;
2156 			}
2157 
2158 			tp->addr_slot = addr.mma_slot;
2159 
2160 			/*
2161 			 * If the interface had been placed into promisc
2162 			 * mode when this port first attached, reverse
2163 			 * that now.
2164 			 *
2165 			 * Note that the interface will not actually
2166 			 * change to non-promisc mode until all ports
2167 			 * have been programmed.
2168 			 */
2169 			if (tp->addr_set == VSW_ADDR_PROMISC)
2170 				(void) vsw_unset_hw_promisc(vswp,
2171 				    tp, VSW_VNETPORT);
2172 
2173 			tp->addr_set = VSW_ADDR_HW;
2174 		}
2175 	}
2176 	RW_EXIT(&plist->lockrw);
2177 
2178 	D1(vswp, "%s: exit", __func__);
2179 	return (rv);
2180 }
2181 
2182 static void
2183 vsw_mac_ring_tbl_entry_init(vsw_t *vswp, vsw_mac_ring_t *ringp)
2184 {
2185 	ringp->ring_state = VSW_MAC_RING_FREE;
2186 	ringp->ring_arg = NULL;
2187 	ringp->ring_blank = NULL;
2188 	ringp->ring_vqp = NULL;
2189 	ringp->ring_vswp = vswp;
2190 }
2191 
2192 static void
2193 vsw_mac_ring_tbl_init(vsw_t *vswp)
2194 {
2195 	int		i;
2196 
2197 	mutex_init(&vswp->mac_ring_lock, NULL, MUTEX_DRIVER, NULL);
2198 
2199 	vswp->mac_ring_tbl_sz = vsw_mac_rx_rings;
2200 	vswp->mac_ring_tbl  =
2201 	    kmem_alloc(vsw_mac_rx_rings * sizeof (vsw_mac_ring_t), KM_SLEEP);
2202 
2203 	for (i = 0; i < vswp->mac_ring_tbl_sz; i++)
2204 		vsw_mac_ring_tbl_entry_init(vswp, &vswp->mac_ring_tbl[i]);
2205 }
2206 
2207 static void
2208 vsw_mac_ring_tbl_destroy(vsw_t *vswp)
2209 {
2210 	int		i;
2211 	vsw_mac_ring_t	*ringp;
2212 
2213 	mutex_enter(&vswp->mac_ring_lock);
2214 	for (i = 0; i < vswp->mac_ring_tbl_sz; i++) {
2215 		ringp = &vswp->mac_ring_tbl[i];
2216 
2217 		if (ringp->ring_state != VSW_MAC_RING_FREE) {
2218 			/*
2219 			 * Destroy the queue.
2220 			 */
2221 			vsw_queue_stop(ringp->ring_vqp);
2222 			vsw_queue_destroy(ringp->ring_vqp);
2223 
2224 			/*
2225 			 * Re-initialize the structure.
2226 			 */
2227 			vsw_mac_ring_tbl_entry_init(vswp, ringp);
2228 		}
2229 	}
2230 	mutex_exit(&vswp->mac_ring_lock);
2231 
2232 	mutex_destroy(&vswp->mac_ring_lock);
2233 	kmem_free(vswp->mac_ring_tbl,
2234 	    vswp->mac_ring_tbl_sz * sizeof (vsw_mac_ring_t));
2235 	vswp->mac_ring_tbl_sz = 0;
2236 }
2237 
2238 /*
2239  * Handle resource add callbacks from the driver below.
2240  */
2241 static mac_resource_handle_t
2242 vsw_mac_ring_add_cb(void *arg, mac_resource_t *mrp)
2243 {
2244 	vsw_t		*vswp = (vsw_t *)arg;
2245 	mac_rx_fifo_t	*mrfp = (mac_rx_fifo_t *)mrp;
2246 	vsw_mac_ring_t	*ringp;
2247 	vsw_queue_t	*vqp;
2248 	int		i;
2249 
2250 	ASSERT(vswp != NULL);
2251 	ASSERT(mrp != NULL);
2252 	ASSERT(vswp->mac_ring_tbl != NULL);
2253 
2254 	D1(vswp, "%s: enter", __func__);
2255 
2256 	/*
2257 	 * Check to make sure we have the correct resource type.
2258 	 */
2259 	if (mrp->mr_type != MAC_RX_FIFO)
2260 		return (NULL);
2261 
2262 	/*
2263 	 * Find an open entry in the ring table.
2264 	 */
2265 	mutex_enter(&vswp->mac_ring_lock);
2266 	for (i = 0; i < vswp->mac_ring_tbl_sz; i++) {
2267 		ringp = &vswp->mac_ring_tbl[i];
2268 
2269 		/*
2270 		 * Check for an empty slot; if found, set up the queue
2271 		 * and worker thread.
2272 		 */
2273 		if (ringp->ring_state == VSW_MAC_RING_FREE) {
2274 			/*
2275 			 * Create the queue for this ring.
2276 			 */
2277 			vqp = vsw_queue_create();
2278 
2279 			/*
2280 			 * Initialize the ring data structure.
2281 			 */
2282 			ringp->ring_vqp = vqp;
2283 			ringp->ring_arg = mrfp->mrf_arg;
2284 			ringp->ring_blank = mrfp->mrf_blank;
2285 			ringp->ring_state = VSW_MAC_RING_INUSE;
2286 
2287 			/*
2288 			 * Create the worker thread.
2289 			 */
2290 			vqp->vq_worker = thread_create(NULL, 0,
2291 			    vsw_queue_worker, ringp, 0, &p0,
2292 			    TS_RUN, minclsyspri);
2293 			if (vqp->vq_worker == NULL) {
2294 				vsw_queue_destroy(vqp);
2295 				vsw_mac_ring_tbl_entry_init(vswp, ringp);
2296 				ringp = NULL;
2297 			}
2298 
2299 			if (ringp != NULL) {
2300 				/*
2301 				 * Wait until the thread reaches the running
2302 				 * state for this ring.
2303 				 */
2304 				mutex_enter(&vqp->vq_lock);
2305 				while ((vqp->vq_state != VSW_QUEUE_RUNNING) &&
2306 				    (vqp->vq_state != VSW_QUEUE_DRAINED)) {
2307 					cv_wait(&vqp->vq_cv, &vqp->vq_lock);
2308 				}
2309 
2310 				/*
2311 				 * If the thread is not running, clean up; drop
2312 				 * the lock first: vsw_queue_destroy() frees it.
2313 				 */
2314 				mutex_exit(&vqp->vq_lock);
2315 				if (vqp->vq_state == VSW_QUEUE_DRAINED) {
2316 					vsw_queue_destroy(vqp);
2317 					vsw_mac_ring_tbl_entry_init(vswp, ringp);
2318 					ringp = NULL;
2319 				}
2320 			}
2321 
2322 			mutex_exit(&vswp->mac_ring_lock);
2323 			D1(vswp, "%s: exit", __func__);
2324 			return ((mac_resource_handle_t)ringp);
2325 		}
2326 	}
2327 	mutex_exit(&vswp->mac_ring_lock);
2328 
2329 	/*
2330 	 * No slots in the ring table available.
2331 	 */
2332 	D1(vswp, "%s: exit", __func__);
2333 	return (NULL);
2334 }
2335 
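/*
 * Sketch of how this callback is expected to be wired up (the actual
 * registration is done elsewhere in this driver and is not shown
 * here; the calls below are an illustrative reconstruction, not a
 * copy): the MAC layer invokes vsw_mac_ring_add_cb() once per
 * receive FIFO, then delivers packets to vsw_rx_queue_cb() tagged
 * with the returned handle.
 */
#if 0
	mac_resource_set(vswp->mh, vsw_mac_ring_add_cb, (void *)vswp);
	vswp->mrh = mac_rx_add(vswp->mh, vsw_rx_queue_cb, (void *)vswp);
#endif
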
2336 static void
2337 vsw_queue_stop(vsw_queue_t *vqp)
2338 {
2339 	mutex_enter(&vqp->vq_lock);
2340 
2341 	if (vqp->vq_state == VSW_QUEUE_RUNNING) {
2342 		vqp->vq_state = VSW_QUEUE_STOP;
2343 		cv_signal(&vqp->vq_cv);
2344 
2345 		while (vqp->vq_state != VSW_QUEUE_DRAINED)
2346 			cv_wait(&vqp->vq_cv, &vqp->vq_lock);
2347 	}
2348 
2349 	vqp->vq_state = VSW_QUEUE_STOPPED;
2350 
2351 	mutex_exit(&vqp->vq_lock);
2352 }
2353 
2354 static vsw_queue_t *
2355 vsw_queue_create()
2356 {
2357 	vsw_queue_t *vqp;
2358 
2359 	vqp = kmem_zalloc(sizeof (vsw_queue_t), KM_SLEEP);
2360 
2361 	mutex_init(&vqp->vq_lock, NULL, MUTEX_DRIVER, NULL);
2362 	cv_init(&vqp->vq_cv, NULL, CV_DRIVER, NULL);
2363 	vqp->vq_first = NULL;
2364 	vqp->vq_last = NULL;
2365 	vqp->vq_state = VSW_QUEUE_STOPPED;
2366 
2367 	return (vqp);
2368 }
2369 
2370 static void
2371 vsw_queue_destroy(vsw_queue_t *vqp)
2372 {
2373 	cv_destroy(&vqp->vq_cv);
2374 	mutex_destroy(&vqp->vq_lock);
2375 	kmem_free(vqp, sizeof (vsw_queue_t));
2376 }
2377 
2378 static void
2379 vsw_queue_worker(vsw_mac_ring_t *rrp)
2380 {
2381 	mblk_t		*mp;
2382 	vsw_queue_t	*vqp = rrp->ring_vqp;
2383 	vsw_t		*vswp = rrp->ring_vswp;
2384 
2385 	mutex_enter(&vqp->vq_lock);
2386 
2387 	ASSERT(vqp->vq_state == VSW_QUEUE_STOPPED);
2388 
2389 	/*
2390 	 * Set the state to running, since the thread is now active.
2391 	 */
2392 	vqp->vq_state = VSW_QUEUE_RUNNING;
2393 	cv_signal(&vqp->vq_cv);
2394 
2395 	while (vqp->vq_state == VSW_QUEUE_RUNNING) {
2396 		/*
2397 		 * Wait until there is work to do, or the state
2398 		 * is no longer running.
2399 		 */
2400 		while ((vqp->vq_state == VSW_QUEUE_RUNNING) &&
2401 		    (vqp->vq_first == NULL)) {
2402 			cv_wait(&vqp->vq_cv, &vqp->vq_lock);
2403 		}
2404 
2405 		/*
2406 		 * Process packets that we received from the interface.
2407 		 */
2408 		if (vqp->vq_first != NULL) {
2409 			mp = vqp->vq_first;
2410 
2411 			vqp->vq_first = NULL;
2412 			vqp->vq_last = NULL;
2413 
2414 			mutex_exit(&vqp->vq_lock);
2415 
2416 			/* switch the chain of packets received */
2417 			vswp->vsw_switch_frame(vswp, mp,
2418 			    VSW_PHYSDEV, NULL, NULL);
2419 
2420 			mutex_enter(&vqp->vq_lock);
2421 		}
2422 	}
2423 
2424 	/*
2425 	 * We are drained; signal that we are done.
2426 	 */
2427 	vqp->vq_state = VSW_QUEUE_DRAINED;
2428 	cv_signal(&vqp->vq_cv);
2429 
2430 	/*
2431 	 * Release the lock.
2432 	 */
2433 	mutex_exit(&vqp->vq_lock);
2434 
2435 	/*
2436 	 * Exit the thread.
2437 	 */
2438 	thread_exit();
2439 }
2440 
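/*
 * Summary of the queue state machine implemented by
 * vsw_queue_worker() and vsw_queue_stop() above (added for clarity):
 *
 *	STOPPED --(worker thread starts)--------> RUNNING
 *	RUNNING --(vsw_queue_stop())------------> STOP
 *	STOP    --(worker drains, thread_exit)--> DRAINED
 *	DRAINED --(vsw_queue_stop())------------> STOPPED
 *
 * A single condition variable (vq_cv) serves both directions of the
 * handshake, so every state change is followed by a cv_signal().
 */
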
2441 /*
2442  * static void
2443  * vsw_rx_queue_cb() - Receive callback routine when
2444  *	vsw_multi_ring_enable is non-zero.  Queue the packets
2445  *	to a packet queue for a worker thread to process.
2446  */
2447 static void
2448 vsw_rx_queue_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
2449 {
2450 	vsw_mac_ring_t	*ringp = (vsw_mac_ring_t *)mrh;
2451 	vsw_t		*vswp = (vsw_t *)arg;
2452 	vsw_queue_t	*vqp;
2453 	mblk_t		*bp, *last;
2454 
2455 	ASSERT(mrh != NULL);
2456 	ASSERT(vswp != NULL);
2457 	ASSERT(mp != NULL);
2458 
2459 	D1(vswp, "%s: enter", __func__);
2460 
2461 	/*
2462 	 * Find the last element in the mblk chain.
2463 	 */
2464 	bp = mp;
2465 	do {
2466 		last = bp;
2467 		bp = bp->b_next;
2468 	} while (bp != NULL);
2469 
2470 	/* Get the queue for the packets */
2471 	vqp = ringp->ring_vqp;
2472 
2473 	/*
2474 	 * Grab the lock so that we can queue the packets.
2475 	 */
2476 	mutex_enter(&vqp->vq_lock);
2477 
2478 	if (vqp->vq_state != VSW_QUEUE_RUNNING) {
2479 		freemsg(mp);
2480 		mutex_exit(&vqp->vq_lock);
2481 		goto vsw_rx_queue_cb_exit;
2482 	}
2483 
2484 	/*
2485 	 * Add the mblk chain to the queue.  If there
2486 	 * are already mblks in the queue, then add the
2487 	 * new chain to the end.
2488 	 */
2489 	if (vqp->vq_first == NULL)
2490 		vqp->vq_first = mp;
2491 	else
2492 		vqp->vq_last->b_next = mp;
2493 
2494 	vqp->vq_last = last;
2495 
2496 	/*
2497 	 * Signal the worker thread that there is work to
2498 	 * do.
2499 	 */
2500 	cv_signal(&vqp->vq_cv);
2501 
2502 	/*
2503 	 * Let go of the lock and exit.
2504 	 */
2505 	mutex_exit(&vqp->vq_lock);
2506 
2507 vsw_rx_queue_cb_exit:
2508 	D1(vswp, "%s: exit", __func__);
2509 }
2510 
2511 /*
2512  * Receive callback routine. Invoked by the MAC layer when there
2513  * are pkts being passed up from the physical device.
2514  *
2515  * PERF: It may be more efficient when the card is in promisc
2516  * mode to check the dest address of the pkts here (against
2517  * the FDB) rather than checking later. Needs to be investigated.
2518  */
2519 static void
2520 vsw_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
2521 {
2522 	_NOTE(ARGUNUSED(mrh))
2523 
2524 	vsw_t		*vswp = (vsw_t *)arg;
2525 
2526 	ASSERT(vswp != NULL);
2527 
2528 	D1(vswp, "vsw_rx_cb: enter");
2529 
2530 	/* switch the chain of packets received */
2531 	vswp->vsw_switch_frame(vswp, mp, VSW_PHYSDEV, NULL, NULL);
2532 
2533 	D1(vswp, "vsw_rx_cb: exit");
2534 }
2535 
2536 /*
2537  * Send a message out over the physical device via the MAC layer.
2538  *
2539  * Returns any mblks that it was unable to transmit.
2540  */
2541 static mblk_t *
2542 vsw_tx_msg(vsw_t *vswp, mblk_t *mp)
2543 {
2544 	const mac_txinfo_t	*mtp;
2545 	mblk_t			*nextp;
2546 
2547 	mutex_enter(&vswp->mac_lock);
2548 	if ((vswp->mh == NULL) || (vswp->mstarted == B_FALSE)) {
2549 
2550 		DERR(vswp, "vsw_tx_msg: dropping pkts: no tx routine avail");
2551 		mutex_exit(&vswp->mac_lock);
2552 		return (mp);
2553 	} else {
2554 		for (;;) {
2555 			nextp = mp->b_next;
2556 			mp->b_next = NULL;
2557 
2558 			mtp = vswp->txinfo;
2559 
2560 			if ((mp = mtp->mt_fn(mtp->mt_arg, mp)) != NULL) {
2561 				mp->b_next = nextp;
2562 				break;
2563 			}
2564 
2565 			if ((mp = nextp) == NULL)
2566 				break;
2567 		}
2568 	}
2569 	mutex_exit(&vswp->mac_lock);
2570 
2571 	return (mp);
2572 }
2573 
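/*
 * Usage sketch (hypothetical caller, not driver code): vsw_tx_msg()
 * hands back any chain it could not transmit, so the caller chooses
 * the policy for the residual packets. A minimal drop policy might
 * look like this.
 */
#if 0
	mblk_t	*resid;

	if ((resid = vsw_tx_msg(vswp, mp)) != NULL) {
		DERR(vswp, "unable to transmit, dropping residual chain");
		freemsgchain(resid);
	}
#endif
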
2574 /*
2575  * Register with the MAC layer as a network device, so we
2576  * can be plumbed if necessary.
2577  */
2578 static int
2579 vsw_mac_register(vsw_t *vswp)
2580 {
2581 	mac_register_t	*macp;
2582 	int		rv;
2583 
2584 	D1(vswp, "%s: enter", __func__);
2585 
2586 	if ((macp = mac_alloc(MAC_VERSION)) == NULL)
2587 		return (EINVAL);
2588 	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
2589 	macp->m_driver = vswp;
2590 	macp->m_dip = vswp->dip;
2591 	macp->m_src_addr = (uint8_t *)&vswp->if_addr;
2592 	macp->m_callbacks = &vsw_m_callbacks;
2593 	macp->m_min_sdu = 0;
2594 	macp->m_max_sdu = ETHERMTU;
2595 	rv = mac_register(macp, &vswp->if_mh);
2596 	mac_free(macp);
2597 	if (rv != 0) {
2598 		/*
2599 		 * Treat this as a non-fatal error as we may be
2600 		 * able to operate in some other mode.
2601 		 */
2602 		cmn_err(CE_NOTE, "!vsw%d: Unable to register as "
2603 		    "a provider with MAC layer", vswp->instance);
2604 		return (rv);
2605 	}
2606 
2607 	vswp->if_state |= VSW_IF_REG;
2608 
2609 	D1(vswp, "%s: exit", __func__);
2610 
2611 	return (rv);
2612 }
2613 
2614 static int
2615 vsw_mac_unregister(vsw_t *vswp)
2616 {
2617 	int		rv = 0;
2618 
2619 	D1(vswp, "%s: enter", __func__);
2620 
2621 	WRITE_ENTER(&vswp->if_lockrw);
2622 
2623 	if (vswp->if_state & VSW_IF_REG) {
2624 		rv = mac_unregister(vswp->if_mh);
2625 		if (rv != 0) {
2626 			DWARN(vswp, "%s: unable to unregister from MAC "
2627 			    "framework", __func__);
2628 
2629 			RW_EXIT(&vswp->if_lockrw);
2630 			D1(vswp, "%s: fail exit", __func__);
2631 			return (rv);
2632 		}
2633 
2634 		/* mark i/f as down and unregistered */
2635 		vswp->if_state &= ~(VSW_IF_UP | VSW_IF_REG);
2636 	}
2637 	RW_EXIT(&vswp->if_lockrw);
2638 
2639 	D1(vswp, "%s: exit", __func__);
2640 
2641 	return (rv);
2642 }
2643 
2644 static int
2645 vsw_m_stat(void *arg, uint_t stat, uint64_t *val)
2646 {
2647 	vsw_t			*vswp = (vsw_t *)arg;
2648 
2649 	D1(vswp, "%s: enter", __func__);
2650 
2651 	mutex_enter(&vswp->mac_lock);
2652 	if (vswp->mh == NULL) {
2653 		mutex_exit(&vswp->mac_lock);
2654 		return (EINVAL);
2655 	}
2656 
2657 	/* return stats from underlying device */
2658 	*val = mac_stat_get(vswp->mh, stat);
2659 
2660 	mutex_exit(&vswp->mac_lock);
2661 
2662 	return (0);
2663 }
2664 
2665 static void
2666 vsw_m_stop(void *arg)
2667 {
2668 	vsw_t		*vswp = (vsw_t *)arg;
2669 
2670 	D1(vswp, "%s: enter", __func__);
2671 
2672 	WRITE_ENTER(&vswp->if_lockrw);
2673 	vswp->if_state &= ~VSW_IF_UP;
2674 	RW_EXIT(&vswp->if_lockrw);
2675 
2676 	mutex_enter(&vswp->hw_lock);
2677 
2678 	(void) vsw_unset_hw(vswp, NULL, VSW_LOCALDEV);
2679 
2680 	if (vswp->recfg_reqd)
2681 		vsw_reconfig_hw(vswp);
2682 
2683 	mutex_exit(&vswp->hw_lock);
2684 
2685 	D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state);
2686 }
2687 
2688 static int
2689 vsw_m_start(void *arg)
2690 {
2691 	vsw_t		*vswp = (vsw_t *)arg;
2692 
2693 	D1(vswp, "%s: enter", __func__);
2694 
2695 	WRITE_ENTER(&vswp->if_lockrw);
2696 
2697 	vswp->if_state |= VSW_IF_UP;
2698 
2699 	if (vswp->switching_setup_done == B_FALSE) {
2700 		/*
2701 		 * If the switching mode has not been set up yet, just
2702 		 * return. The unicast address will be programmed
2703 		 * after the physical device is successfully setup by the
2704 		 * timeout handler.
2705 		 */
2706 		RW_EXIT(&vswp->if_lockrw);
2707 		return (0);
2708 	}
2709 
2710 	/* if in layer2 mode, program unicast address. */
2711 	if (vswp->mh != NULL) {
2712 		mutex_enter(&vswp->hw_lock);
2713 		(void) vsw_set_hw(vswp, NULL, VSW_LOCALDEV);
2714 		mutex_exit(&vswp->hw_lock);
2715 	}
2716 
2717 	RW_EXIT(&vswp->if_lockrw);
2718 
2719 	D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state);
2720 	return (0);
2721 }
2722 
2723 /*
2724  * Change the local interface address.
2725  *
2726  * Note: we don't support this entry point. The local
2727  * mac address of the switch can only be changed via its
2728  * MD node properties.
2729  */
2730 static int
2731 vsw_m_unicst(void *arg, const uint8_t *macaddr)
2732 {
2733 	_NOTE(ARGUNUSED(arg, macaddr))
2734 
2735 	return (DDI_FAILURE);
2736 }
2737 
2738 static int
2739 vsw_m_multicst(void *arg, boolean_t add, const uint8_t *mca)
2740 {
2741 	vsw_t		*vswp = (vsw_t *)arg;
2742 	mcst_addr_t	*mcst_p = NULL;
2743 	uint64_t	addr = 0x0;
2744 	int		i, ret = 0;
2745 
2746 	D1(vswp, "%s: enter", __func__);
2747 
2748 	/*
2749 	 * Convert address into form that can be used
2750 	 * as hash table key.
2751 	 */
2752 	for (i = 0; i < ETHERADDRL; i++) {
2753 		addr = (addr << 8) | mca[i];
2754 	}
2755 
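	/*
	 * For example (illustrative): mca = 00:14:4f:f9:c4:01 packs
	 * into addr = 0x00144ff9c401, which is the key used for the
	 * multicast hash table operations below.
	 */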
2756 	D2(vswp, "%s: addr = 0x%llx", __func__, addr);
2757 
2758 	if (add) {
2759 		D2(vswp, "%s: adding multicast", __func__);
2760 		if (vsw_add_mcst(vswp, VSW_LOCALDEV, addr, NULL) == 0) {
2761 			/*
2762 			 * Update the list of multicast addresses
2763 			 * contained within the vsw_t structure to
2764 			 * include this new one.
2765 			 */
2766 			mcst_p = kmem_zalloc(sizeof (mcst_addr_t), KM_NOSLEEP);
2767 			if (mcst_p == NULL) {
2768 				DERR(vswp, "%s unable to alloc mem", __func__);
2769 				(void) vsw_del_mcst(vswp,
2770 				    VSW_LOCALDEV, addr, NULL);
2771 				return (1);
2772 			}
2773 			mcst_p->addr = addr;
2774 			ether_copy(mca, &mcst_p->mca);
2775 
2776 			/*
2777 			 * Call into the underlying driver to program the
2778 			 * address into HW.
2779 			 */
2780 			mutex_enter(&vswp->mac_lock);
2781 			if (vswp->mh != NULL) {
2782 				ret = mac_multicst_add(vswp->mh, mca);
2783 				if (ret != 0) {
2784 					cmn_err(CE_WARN, "!vsw%d: unable to "
2785 					    "add multicast address",
2786 					    vswp->instance);
2787 					mutex_exit(&vswp->mac_lock);
2788 					(void) vsw_del_mcst(vswp,
2789 					    VSW_LOCALDEV, addr, NULL);
2790 					kmem_free(mcst_p, sizeof (*mcst_p));
2791 					return (ret);
2792 				}
2793 				mcst_p->mac_added = B_TRUE;
2794 			}
2795 			mutex_exit(&vswp->mac_lock);
2796 
2797 			mutex_enter(&vswp->mca_lock);
2798 			mcst_p->nextp = vswp->mcap;
2799 			vswp->mcap = mcst_p;
2800 			mutex_exit(&vswp->mca_lock);
2801 		} else {
2802 			cmn_err(CE_WARN, "!vsw%d: unable to add multicast "
2803 			    "address", vswp->instance);
2804 		}
2805 		return (ret);
2806 	}
2807 
2808 	D2(vswp, "%s: removing multicast", __func__);
2809 	/*
2810 	 * Remove the address from the hash table..
2811 	 */
2812 	if (vsw_del_mcst(vswp, VSW_LOCALDEV, addr, NULL) == 0) {
2813 
2814 		/*
2815 		 * ..and then from the list maintained in the
2816 		 * vsw_t structure.
2817 		 */
2818 		mcst_p = vsw_del_addr(VSW_LOCALDEV, vswp, addr);
2819 		ASSERT(mcst_p != NULL);
2820 
2821 		mutex_enter(&vswp->mac_lock);
2822 		if (vswp->mh != NULL && mcst_p->mac_added) {
2823 			(void) mac_multicst_remove(vswp->mh, mca);
2824 			mcst_p->mac_added = B_FALSE;
2825 		}
2826 		mutex_exit(&vswp->mac_lock);
2827 		kmem_free(mcst_p, sizeof (*mcst_p));
2828 	}
2829 
2830 	D1(vswp, "%s: exit", __func__);
2831 
2832 	return (0);
2833 }
2834 
2835 static int
2836 vsw_m_promisc(void *arg, boolean_t on)
2837 {
2838 	vsw_t		*vswp = (vsw_t *)arg;
2839 
2840 	D1(vswp, "%s: enter", __func__);
2841 
2842 	WRITE_ENTER(&vswp->if_lockrw);
2843 	if (on)
2844 		vswp->if_state |= VSW_IF_PROMISC;
2845 	else
2846 		vswp->if_state &= ~VSW_IF_PROMISC;
2847 	RW_EXIT(&vswp->if_lockrw);
2848 
2849 	D1(vswp, "%s: exit", __func__);
2850 
2851 	return (0);
2852 }
2853 
2854 static mblk_t *
2855 vsw_m_tx(void *arg, mblk_t *mp)
2856 {
2857 	vsw_t		*vswp = (vsw_t *)arg;
2858 
2859 	D1(vswp, "%s: enter", __func__);
2860 
2861 	vswp->vsw_switch_frame(vswp, mp, VSW_LOCALDEV, NULL, NULL);
2862 
2863 	D1(vswp, "%s: exit", __func__);
2864 
2865 	return (NULL);
2866 }
2867 
2868 /*
2869  * Register for machine description (MD) updates.
2870  *
2871  * Returns 0 on success, 1 on failure.
2872  */
2873 static int
2874 vsw_mdeg_register(vsw_t *vswp)
2875 {
2876 	mdeg_prop_spec_t	*pspecp;
2877 	mdeg_node_spec_t	*inst_specp;
2878 	mdeg_handle_t		mdeg_hdl, mdeg_port_hdl;
2879 	size_t			templatesz;
2880 	int			rv;
2881 
2882 	D1(vswp, "%s: enter", __func__);
2883 
2884 	/*
2885 	 * Allocate and initialize a per-instance copy
2886 	 * of the global property spec array that will
2887 	 * uniquely identify this vsw instance.
2888 	 */
2889 	templatesz = sizeof (vsw_prop_template);
2890 	pspecp = kmem_zalloc(templatesz, KM_SLEEP);
2891 
2892 	bcopy(vsw_prop_template, pspecp, templatesz);
2893 
2894 	VSW_SET_MDEG_PROP_INST(pspecp, vswp->regprop);
2895 
2896 	/* initialize the complete prop spec structure */
2897 	inst_specp = kmem_zalloc(sizeof (mdeg_node_spec_t), KM_SLEEP);
2898 	inst_specp->namep = "virtual-device";
2899 	inst_specp->specp = pspecp;
2900 
2901 	D2(vswp, "%s: instance %d registering with mdeg", __func__,
2902 	    vswp->regprop);
2903 	/*
2904 	 * Register an interest in 'virtual-device' nodes with a
2905 	 * 'name' property of 'virtual-network-switch'
2906 	 */
2907 	rv = mdeg_register(inst_specp, &vdev_match, vsw_mdeg_cb,
2908 	    (void *)vswp, &mdeg_hdl);
2909 	if (rv != MDEG_SUCCESS) {
2910 		DERR(vswp, "%s: mdeg_register failed (%d) for vsw node",
2911 		    __func__, rv);
2912 		goto mdeg_reg_fail;
2913 	}
2914 
2915 	/*
2916 	 * Register an interest in 'vsw-port' nodes.
2917 	 */
2918 	rv = mdeg_register(inst_specp, &vport_match, vsw_port_mdeg_cb,
2919 	    (void *)vswp, &mdeg_port_hdl);
2920 	if (rv != MDEG_SUCCESS) {
2921 		DERR(vswp, "%s: mdeg_register failed (%d)\n", __func__, rv);
2922 		(void) mdeg_unregister(mdeg_hdl);
2923 		goto mdeg_reg_fail;
2924 	}
2925 
2926 	/* save off data that will be needed later */
2927 	vswp->inst_spec = inst_specp;
2928 	vswp->mdeg_hdl = mdeg_hdl;
2929 	vswp->mdeg_port_hdl = mdeg_port_hdl;
2930 
2931 	D1(vswp, "%s: exit", __func__);
2932 	return (0);
2933 
2934 mdeg_reg_fail:
2935 	cmn_err(CE_WARN, "!vsw%d: Unable to register MDEG callbacks",
2936 	    vswp->instance);
2937 	kmem_free(pspecp, templatesz);
2938 	kmem_free(inst_specp, sizeof (mdeg_node_spec_t));
2939 
2940 	vswp->mdeg_hdl = NULL;
2941 	vswp->mdeg_port_hdl = NULL;
2942 
2943 	return (1);
2944 }
2945 
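/*
 * For reference (illustrative only; the real vsw_prop_template is
 * defined earlier in this file and is not reproduced here), a
 * property specification of roughly this shape is what the code
 * above copies and registers:
 */
#if 0
static mdeg_prop_spec_t example_prop_template[] = {
	{ MDET_PROP_STR,	"name",		"virtual-network-switch" },
	{ MDET_PROP_VAL,	"cfg-handle",	NULL },
	{ MDET_LIST_END,	NULL,		NULL }
};
#endif
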
2946 static void
2947 vsw_mdeg_unregister(vsw_t *vswp)
2948 {
2949 	D1(vswp, "vsw_mdeg_unregister: enter");
2950 
2951 	if (vswp->mdeg_hdl != NULL)
2952 		(void) mdeg_unregister(vswp->mdeg_hdl);
2953 
2954 	if (vswp->mdeg_port_hdl != NULL)
2955 		(void) mdeg_unregister(vswp->mdeg_port_hdl);
2956 
2957 	if (vswp->inst_spec != NULL) {
2958 		if (vswp->inst_spec->specp != NULL) {
2959 			(void) kmem_free(vswp->inst_spec->specp,
2960 			    sizeof (vsw_prop_template));
2961 			vswp->inst_spec->specp = NULL;
2962 		}
2963 
2964 		(void) kmem_free(vswp->inst_spec, sizeof (mdeg_node_spec_t));
2965 		vswp->inst_spec = NULL;
2966 	}
2967 
2968 	D1(vswp, "vsw_mdeg_unregister: exit");
2969 }
2970 
2971 /*
2972  * Mdeg callback invoked for the vsw node itself.
2973  */
2974 static int
2975 vsw_mdeg_cb(void *cb_argp, mdeg_result_t *resp)
2976 {
2977 	vsw_t		*vswp;
2978 	md_t		*mdp;
2979 	mde_cookie_t	node;
2980 	uint64_t	inst;
2981 	char		*node_name = NULL;
2982 
2983 	if (resp == NULL)
2984 		return (MDEG_FAILURE);
2985 
2986 	vswp = (vsw_t *)cb_argp;
2987 
2988 	D1(vswp, "%s: added %d : removed %d : curr matched %d"
2989 	    " : prev matched %d", __func__, resp->added.nelem,
2990 	    resp->removed.nelem, resp->match_curr.nelem,
2991 	    resp->match_prev.nelem);
2992 
2993 	/*
2994 	 * We get an initial callback for this node as 'added'
2995 	 * after registering with mdeg. Note that we would have
2996 	 * already gathered information about this vsw node by
2997 	 * walking MD earlier during attach (in vsw_read_mdprops()).
2998 	 * So, there is a window where the properties of this
2999 	 * node might have changed when we get this initial 'added'
3000 	 * callback. We handle this as if an update occured
3001  * callback. We handle this as if an update occurred
3002 	 * the properties of this vsw-node if any.
3003 	 *
3004 	 * A non-zero 'match' value indicates that the MD has been
3005 	 * updated and that a virtual-network-switch node is
3006 	 * present which may or may not have been updated. It is
3007 	 * up to the clients to examine their own nodes and
3008 	 * determine if they have changed.
3009 	 */
3010 	if (resp->added.nelem != 0) {
3011 
3012 		if (resp->added.nelem != 1) {
3013 			cmn_err(CE_NOTE, "!vsw%d: number of nodes added "
3014 			    "invalid: %d\n", vswp->instance, resp->added.nelem);
3015 			return (MDEG_FAILURE);
3016 		}
3017 
3018 		mdp = resp->added.mdp;
3019 		node = resp->added.mdep[0];
3020 
3021 	} else if (resp->match_curr.nelem != 0) {
3022 
3023 		if (resp->match_curr.nelem != 1) {
3024 			cmn_err(CE_NOTE, "!vsw%d: number of nodes updated "
3025 			    "invalid: %d\n", vswp->instance,
3026 			    resp->match_curr.nelem);
3027 			return (MDEG_FAILURE);
3028 		}
3029 
3030 		mdp = resp->match_curr.mdp;
3031 		node = resp->match_curr.mdep[0];
3032 
3033 	} else {
3034 		return (MDEG_FAILURE);
3035 	}
3036 
3037 	/* Validate name and instance */
3038 	if (md_get_prop_str(mdp, node, "name", &node_name) != 0) {
3039 		DERR(vswp, "%s: unable to get node name\n",  __func__);
3040 		return (MDEG_FAILURE);
3041 	}
3042 
3043 	/* is this a virtual-network-switch? */
3044 	if (strcmp(node_name, vsw_propname) != 0) {
3045 		DERR(vswp, "%s: Invalid node name: %s\n",
3046 		    __func__, node_name);
3047 		return (MDEG_FAILURE);
3048 	}
3049 
3050 	if (md_get_prop_val(mdp, node, "cfg-handle", &inst)) {
3051 		DERR(vswp, "%s: prop(cfg-handle) not found\n",
3052 		    __func__);
3053 		return (MDEG_FAILURE);
3054 	}
3055 
3056 	/* is this the right instance of vsw? */
3057 	if (inst != vswp->regprop) {
3058 		DERR(vswp, "%s: Invalid cfg-handle: %lx\n",
3059 		    __func__, inst);
3060 		return (MDEG_FAILURE);
3061 	}
3062 
3063 	vsw_update_md_prop(vswp, mdp, node);
3064 
3065 	return (MDEG_SUCCESS);
3066 }
3067 
3068 /*
3069  * Mdeg callback invoked for changes to the vsw-port nodes
3070  * under the vsw node.
3071  */
3072 static int
3073 vsw_port_mdeg_cb(void *cb_argp, mdeg_result_t *resp)
3074 {
3075 	vsw_t		*vswp;
3076 	int		idx;
3077 	md_t		*mdp;
3078 	mde_cookie_t	node;
3079 	uint64_t	inst;
3080 
3081 	if ((resp == NULL) || (cb_argp == NULL))
3082 		return (MDEG_FAILURE);
3083 
3084 	vswp = (vsw_t *)cb_argp;
3085 
3086 	D2(vswp, "%s: added %d : removed %d : curr matched %d"
3087 	    " : prev matched %d", __func__, resp->added.nelem,
3088 	    resp->removed.nelem, resp->match_curr.nelem,
3089 	    resp->match_prev.nelem);
3090 
3091 	/* process added ports */
3092 	for (idx = 0; idx < resp->added.nelem; idx++) {
3093 		mdp = resp->added.mdp;
3094 		node = resp->added.mdep[idx];
3095 
3096 		D2(vswp, "%s: adding node(%d) 0x%lx", __func__, idx, node);
3097 
3098 		if (vsw_port_add(vswp, mdp, &node) != 0) {
3099 			cmn_err(CE_WARN, "!vsw%d: Unable to add new port "
3100 			    "(0x%lx)", vswp->instance, node);
3101 		}
3102 	}
3103 
3104 	/* process removed ports */
3105 	for (idx = 0; idx < resp->removed.nelem; idx++) {
3106 		mdp = resp->removed.mdp;
3107 		node = resp->removed.mdep[idx];
3108 
3109 		if (md_get_prop_val(mdp, node, id_propname, &inst)) {
3110 			DERR(vswp, "%s: prop(%s) not found in port(%d)",
3111 			    __func__, id_propname, idx);
3112 			continue;
3113 		}
3114 
3115 		D2(vswp, "%s: removing node(%d) 0x%lx", __func__, idx, node);
3116 
3117 		if (vsw_port_detach(vswp, inst) != 0) {
3118 			cmn_err(CE_WARN, "!vsw%d: Unable to remove port %ld",
3119 			    vswp->instance, inst);
3120 		}
3121 	}
3122 
3123 	/*
3124 	 * Currently no support for updating already active ports.
3125 	 * So, ignore the match_curr and match_priv arrays for now.
3126  * So, ignore the match_curr and match_prev arrays for now.
3127 
3128 	D1(vswp, "%s: exit", __func__);
3129 
3130 	return (MDEG_SUCCESS);
3131 }
3132 
3133 /*
3134  * Scan the machine description for this instance of vsw
3135  * and read its properties. Called only from vsw_attach().
3136  * Returns: 0 on success, 1 on failure.
3137  */
3138 static int
3139 vsw_read_mdprops(vsw_t *vswp)
3140 {
3141 	md_t		*mdp = NULL;
3142 	mde_cookie_t	rootnode;
3143 	mde_cookie_t	*listp = NULL;
3144 	uint64_t	inst;
3145 	uint64_t	cfgh;
3146 	char		*name;
3147 	int		rv = 1;
3148 	int		num_nodes = 0;
3149 	int		num_devs = 0;
3150 	int		listsz = 0;
3151 	int		i;
3152 
3153 	/*
3154 	 * In each 'virtual-device' node in the MD there is a
3155 	 * 'cfg-handle' property which is the MD's concept of
3156 	 * an instance number (this may be completely different from
3157 	 * the device driver's instance #). OBP reads that value and
3158 	 * stores it in the 'reg' property of the appropriate node in
3159 	 * the device tree. We first read this reg property and use this
3160 	 * to compare against the 'cfg-handle' property of vsw nodes
3161 	 * in MD to get to this specific vsw instance and then read
3162 	 * other properties that we are interested in.
3163 	 * We also cache the value of 'reg' property and use it later
3164 	 * to register callbacks with mdeg (see vsw_mdeg_register())
3165 	 */
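	/*
	 * For example (illustrative values): a vsw MD node with
	 * cfg-handle = 0x1 shows up in the OBP device tree with
	 * reg = 0x1, so the 'reg' value read below can be matched
	 * against 'cfg-handle' later on.
	 */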
3166 	inst = ddi_prop_get_int(DDI_DEV_T_ANY, vswp->dip,
3167 	    DDI_PROP_DONTPASS, reg_propname, -1);
3168 	if (inst == -1) {
3169 		cmn_err(CE_NOTE, "!vsw%d: Unable to read %s property from "
3170 		    "OBP device tree", vswp->instance, reg_propname);
3171 		return (rv);
3172 	}
3173 
3174 	vswp->regprop = inst;
3175 
3176 	if ((mdp = md_get_handle()) == NULL) {
3177 		DWARN(vswp, "%s: cannot init MD\n", __func__);
3178 		return (rv);
3179 	}
3180 
3181 	num_nodes = md_node_count(mdp);
3182 	ASSERT(num_nodes > 0);
3183 
3184 	listsz = num_nodes * sizeof (mde_cookie_t);
3185 	listp = (mde_cookie_t *)kmem_zalloc(listsz, KM_SLEEP);
3186 
3187 	rootnode = md_root_node(mdp);
3188 
3189 	/* search for all "virtual-device" nodes */
3190 	num_devs = md_scan_dag(mdp, rootnode,
3191 	    md_find_name(mdp, vdev_propname),
3192 	    md_find_name(mdp, "fwd"), listp);
3193 	if (num_devs <= 0) {
3194 		DWARN(vswp, "%s: invalid num_devs:%d\n", __func__, num_devs);
3195 		goto vsw_readmd_exit;
3196 	}
3197 
3198 	/*
3199 	 * Now loop through the list of virtual-devices looking for
3200 	 * devices with name "virtual-network-switch" and for each
3201 	 * such device compare its instance with what we have from
3202 	 * the 'reg' property to find the right node in MD and then
3203 	 * read all its properties.
3204 	 */
3205 	for (i = 0; i < num_devs; i++) {
3206 
3207 		if (md_get_prop_str(mdp, listp[i], "name", &name) != 0) {
3208 			DWARN(vswp, "%s: name property not found\n",
3209 			    __func__);
3210 			goto vsw_readmd_exit;
3211 		}
3212 
3213 		/* is this a virtual-network-switch? */
3214 		if (strcmp(name, vsw_propname) != 0)
3215 			continue;
3216 
3217 		if (md_get_prop_val(mdp, listp[i], "cfg-handle", &cfgh) != 0) {
3218 			DWARN(vswp, "%s: cfg-handle property not found\n",
3219 			    __func__);
3220 			goto vsw_readmd_exit;
3221 		}
3222 
3223 		/* is this the required instance of vsw? */
3224 		if (inst != cfgh)
3225 			continue;
3226 
3227 		/* now read all properties of this vsw instance */
3228 		rv = vsw_get_initial_md_properties(vswp, mdp, listp[i]);
3229 		break;
3230 	}
3231 
3232 vsw_readmd_exit:
3233 
3234 	kmem_free(listp, listsz);
3235 	(void) md_fini_handle(mdp);
3236 	return (rv);
3237 }
3238 
3239 /*
3240  * Read the initial start-of-day values from the specified MD node.
3241  */
3242 static int
3243 vsw_get_initial_md_properties(vsw_t *vswp, md_t *mdp, mde_cookie_t node)
3244 {
3245 	int		i;
3246 	uint64_t 	macaddr = 0;
3247 
3248 	D1(vswp, "%s: enter", __func__);
3249 
3250 	if (vsw_get_md_physname(vswp, mdp, node, vswp->physname) != 0) {
3251 		return (1);
3252 	}
3253 
3254 	/* mac address for vswitch device itself */
3255 	if (md_get_prop_val(mdp, node, macaddr_propname, &macaddr) != 0) {
3256 		cmn_err(CE_WARN, "!vsw%d: Unable to get MAC address from MD",
3257 		    vswp->instance);
3258 		return (1);
3259 	}
3260 
3261 	vsw_save_lmacaddr(vswp, macaddr);
3262 
3263 	if (vsw_get_md_smodes(vswp, mdp, node, vswp->smode, &vswp->smode_num)) {
3264 		cmn_err(CE_WARN, "!vsw%d: Unable to read %s property from "
3265 		    "MD, defaulting to programmed mode", vswp->instance,
3266 		    smode_propname);
3267 
3268 		for (i = 0; i < NUM_SMODES; i++)
3269 			vswp->smode[i] = VSW_LAYER2;
3270 
3271 		vswp->smode_num = NUM_SMODES;
3272 	} else {
3273 		ASSERT(vswp->smode_num != 0);
3274 	}
3275 
3276 	D1(vswp, "%s: exit", __func__);
3277 	return (0);
3278 }
3279 
3280 /*
3281  * Check to see if the relevant properties in the specified node have
3282  * changed, and if so take the appropriate action.
3283  *
3284  * If any of the properties are missing or invalid we don't take
3285  * any action, as this function should only be invoked when modifications
3286  * have been made to what we assume is a working configuration, which
3287  * we leave active.
3288  *
3289  * Note it is legal for this routine to be invoked even if none of the
3290  * properties in the port node within the MD have actually changed.
3291  */
3292 static void
3293 vsw_update_md_prop(vsw_t *vswp, md_t *mdp, mde_cookie_t node)
3294 {
3295 	char		physname[LIFNAMSIZ];
3296 	char		drv[LIFNAMSIZ];
3297 	uint_t		ddi_instance;
3298 	uint8_t		new_smode[NUM_SMODES];
3299 	int		i, smode_num = 0;
3300 	uint64_t 	macaddr = 0;
3301 	enum		{MD_init = 0x1,
3302 				MD_physname = 0x2,
3303 				MD_macaddr = 0x4,
3304 				MD_smode = 0x8} updated;
3305 	int		rv;
3306 
3307 	updated = MD_init;
3308 
3309 	D1(vswp, "%s: enter", __func__);
3310 
3311 	/*
3312 	 * Check if name of physical device in MD has changed.
3313 	 */
3314 	if (vsw_get_md_physname(vswp, mdp, node, (char *)&physname) == 0) {
3315 		/*
3316 		 * Do a basic sanity check on the new device name/instance,
3317 		 * if it is non-NULL. It is valid for the device name to
3318 		 * have changed from a non-NULL to a NULL value, i.e.
3319 		 * the vsw is being changed to 'routed' mode.
3320 		 */
3321 		if ((strlen(physname) != 0) &&
3322 		    (ddi_parse(physname, drv,
3323 		    &ddi_instance) != DDI_SUCCESS)) {
3324 			cmn_err(CE_WARN, "!vsw%d: new device name %s is not"
3325 			    " a valid device name/instance",
3326 			    vswp->instance, physname);
3327 			goto fail_reconf;
3328 		}
3329 
3330 		if (strcmp(physname, vswp->physname)) {
3331 			D2(vswp, "%s: device name changed from %s to %s",
3332 			    __func__, vswp->physname, physname);
3333 
3334 			updated |= MD_physname;
3335 		} else {
3336 			D2(vswp, "%s: device name unchanged at %s",
3337 			    __func__, vswp->physname);
3338 		}
3339 	} else {
3340 		cmn_err(CE_WARN, "!vsw%d: Unable to read name of physical "
3341 		    "device from updated MD.", vswp->instance);
3342 		goto fail_reconf;
3343 	}
3344 
3345 	/*
3346 	 * Check if MAC address has changed.
3347 	 */
3348 	if (md_get_prop_val(mdp, node, macaddr_propname, &macaddr) != 0) {
3349 		cmn_err(CE_WARN, "!vsw%d: Unable to get MAC address from MD",
3350 		    vswp->instance);
3351 		goto fail_reconf;
3352 	} else {
3353 		uint64_t maddr = macaddr;
3354 		READ_ENTER(&vswp->if_lockrw);
3355 		for (i = ETHERADDRL - 1; i >= 0; i--) {
3356 			if (vswp->if_addr.ether_addr_octet[i]
3357 			    != (macaddr & 0xFF)) {
3358 				D2(vswp, "%s: octet[%d] 0x%x != 0x%x",
3359 				    __func__, i,
3360 				    vswp->if_addr.ether_addr_octet[i],
3361 				    (macaddr & 0xFF));
3362 				updated |= MD_macaddr;
3363 				macaddr = maddr;
3364 				break;
3365 			}
3366 			macaddr >>= 8;
3367 		}
3368 		RW_EXIT(&vswp->if_lockrw);
3369 		if (updated & MD_macaddr) {
3370 			vsw_save_lmacaddr(vswp, macaddr);
3371 		}
3372 	}
3373 
3374 	/*
3375 	 * Check if switching modes have changed.
3376 	 */
3377 	if (vsw_get_md_smodes(vswp, mdp, node,
3378 	    new_smode, &smode_num)) {
3379 		cmn_err(CE_WARN, "!vsw%d: Unable to read %s property from MD",
3380 		    vswp->instance, smode_propname);
3381 		goto fail_reconf;
3382 	} else {
3383 		ASSERT(smode_num != 0);
3384 		if (smode_num != vswp->smode_num) {
3385 			D2(vswp, "%s: number of modes changed from %d to %d",
3386 			    __func__, vswp->smode_num, smode_num);
3387 		}
3388 
3389 		for (i = 0; i < smode_num; i++) {
3390 			if (new_smode[i] != vswp->smode[i]) {
3391 				D2(vswp, "%s: mode changed from %d to %d",
3392 				    __func__, vswp->smode[i], new_smode[i]);
3393 				updated |= MD_smode;
3394 				break;
3395 			}
3396 		}
3397 	}
3398 
3399 	/*
3400 	 * Now make any changes which are needed...
3401 	 */
3402 
3403 	if (updated & (MD_physname | MD_smode)) {
3404 
3405 		/*
3406 		 * Stop any pending timeout to setup switching mode.
3407 		 */
3408 		vsw_stop_switching_timeout(vswp);
3409 
3410 		/*
3411 		 * Remove unicst, mcst addrs of vsw interface
3412 		 * and ports from the physdev.
3413 		 */
3414 		vsw_unset_addrs(vswp);
3415 
3416 		/*
3417 		 * Stop, detach and close the old device..
3418 		 */
3419 		mutex_enter(&vswp->mac_lock);
3420 
3421 		vsw_mac_detach(vswp);
3422 		vsw_mac_close(vswp);
3423 
3424 		mutex_exit(&vswp->mac_lock);
3425 
3426 		/*
3427 		 * Update phys name.
3428 		 */
3429 		if (updated & MD_physname) {
3430 			cmn_err(CE_NOTE, "!vsw%d: changing from %s to %s",
3431 			    vswp->instance, vswp->physname, physname);
3432 			(void) strncpy(vswp->physname,
3433 			    physname, strlen(physname) + 1);
3434 		}
3435 
3436 		/*
3437 		 * Update array with the new switch mode values.
3438 		 */
3439 		if (updated & MD_smode) {
3440 			for (i = 0; i < smode_num; i++)
3441 				vswp->smode[i] = new_smode[i];
3442 
3443 			vswp->smode_num = smode_num;
3444 			vswp->smode_idx = 0;
3445 		}
3446 
3447 		/*
3448 		 * ..and attach, start the new device.
3449 		 */
3450 		rv = vsw_setup_switching(vswp);
3451 		if (rv == EAGAIN) {
3452 			/*
3453 			 * Unable to setup switching mode.
3454 			 * As the error is EAGAIN, schedule a timeout to retry
3455 			 * and return. Programming addresses of ports and
3456 			 * vsw interface will be done when the timeout handler
3457 			 * completes successfully.
3458 			 */
3459 			mutex_enter(&vswp->swtmout_lock);
3460 
3461 			vswp->swtmout_enabled = B_TRUE;
3462 			vswp->swtmout_id =
3463 			    timeout(vsw_setup_switching_timeout, vswp,
3464 			    (vsw_setup_switching_delay *
3465 			    drv_usectohz(MICROSEC)));
3466 
3467 			mutex_exit(&vswp->swtmout_lock);
3468 
3469 			return;
3470 
3471 		} else if (rv) {
3472 			goto fail_update;
3473 		}
3474 
3475 		/*
3476 		 * program unicst, mcst addrs of vsw interface
3477 		 * and ports in the physdev.
3478 		 */
3479 		vsw_set_addrs(vswp);
3480 
3481 	} else if (updated & MD_macaddr) {
3482 		/*
3483 		 * We enter here if only MD_macaddr is exclusively updated.
3484 		 * If MD_physname and/or MD_smode are also updated, then
3485 		 * as part of that, we would have implicitly processed
3486 		 * MD_macaddr update (above).
3487 		 */
3488 		cmn_err(CE_NOTE, "!vsw%d: changing mac address to 0x%lx",
3489 		    vswp->instance, macaddr);
3490 
3491 		READ_ENTER(&vswp->if_lockrw);
3492 		if (vswp->if_state & VSW_IF_UP) {
3493 
3494 			mutex_enter(&vswp->hw_lock);
3495 			/*
3496 			 * Remove old mac address of vsw interface
3497 			 * from the physdev
3498 			 */
3499 			(void) vsw_unset_hw(vswp, NULL, VSW_LOCALDEV);
3500 			/*
3501 			 * Program new mac address of vsw interface
3502 			 * in the physdev
3503 			 */
3504 			rv = vsw_set_hw(vswp, NULL, VSW_LOCALDEV);
3505 			mutex_exit(&vswp->hw_lock);
3506 			if (rv != 0) {
3507 				cmn_err(CE_NOTE,
3508 				    "!vsw%d: failed to program interface "
3509 				    "unicast address\n", vswp->instance);
3510 			}
3511 			/*
3512 			 * Notify the MAC layer of the changed address.
3513 			 */
3514 			mac_unicst_update(vswp->if_mh,
3515 			    (uint8_t *)&vswp->if_addr);
3516 
3517 		}
3518 		RW_EXIT(&vswp->if_lockrw);
3519 
3520 	}
3521 
3522 	return;
3523 
3524 fail_reconf:
3525 	cmn_err(CE_WARN, "!vsw%d: configuration unchanged", vswp->instance);
3526 	return;
3527 
3528 fail_update:
3529 	cmn_err(CE_WARN, "!vsw%d: update of configuration failed",
3530 	    vswp->instance);
3531 }
3532 
3533 /*
3534  * Add a new port to the system.
3535  *
3536  * Returns 0 on success, 1 on failure.
3537  */
3538 int
3539 vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node)
3540 {
3541 	uint64_t		ldc_id;
3542 	uint8_t			*addrp;
3543 	int			i, addrsz;
3544 	int			num_nodes = 0, nchan = 0;
3545 	int			listsz = 0;
3546 	mde_cookie_t		*listp = NULL;
3547 	struct ether_addr	ea;
3548 	uint64_t		macaddr;
3549 	uint64_t		inst = 0;
3550 	vsw_port_t		*port;
3551 
3552 	if (md_get_prop_val(mdp, *node, id_propname, &inst)) {
3553 		DWARN(vswp, "%s: prop(%s) not found", __func__,
3554 		    id_propname);
3555 		return (1);
3556 	}
3557 
3558 	/*
3559 	 * Find the channel endpoint node(s) (which should be under this
3560 	 * port node) which contain the channel id(s).
3561 	 */
3562 	if ((num_nodes = md_node_count(mdp)) <= 0) {
3563 		DERR(vswp, "%s: invalid number of nodes found (%d)",
3564 		    __func__, num_nodes);
3565 		return (1);
3566 	}
3567 
3568 	D2(vswp, "%s: %d nodes found", __func__, num_nodes);
3569 
3570 	/* allocate enough space for node list */
3571 	listsz = num_nodes * sizeof (mde_cookie_t);
3572 	listp = kmem_zalloc(listsz, KM_SLEEP);
3573 
3574 	nchan = md_scan_dag(mdp, *node, md_find_name(mdp, chan_propname),
3575 	    md_find_name(mdp, "fwd"), listp);
3576 
3577 	if (nchan <= 0) {
3578 		DWARN(vswp, "%s: no %s nodes found", __func__, chan_propname);
3579 		kmem_free(listp, listsz);
3580 		return (1);
3581 	}
3582 
3583 	D2(vswp, "%s: %d %s nodes found", __func__, nchan, chan_propname);
3584 
3585 	/* use property from first node found */
3586 	if (md_get_prop_val(mdp, listp[0], id_propname, &ldc_id)) {
3587 		DWARN(vswp, "%s: prop(%s) not found\n", __func__,
3588 		    id_propname);
3589 		kmem_free(listp, listsz);
3590 		return (1);
3591 	}
3592 
3593 	/* don't need list any more */
3594 	kmem_free(listp, listsz);
3595 
3596 	D2(vswp, "%s: ldc_id 0x%llx", __func__, ldc_id);
3597 
3598 	/* read mac-address property */
3599 	if (md_get_prop_data(mdp, *node, remaddr_propname,
3600 	    &addrp, &addrsz)) {
3601 		DWARN(vswp, "%s: prop(%s) not found",
3602 		    __func__, remaddr_propname);
3603 		return (1);
3604 	}
3605 
3606 	if (addrsz < ETHERADDRL) {
3607 		DWARN(vswp, "%s: invalid address size", __func__);
3608 		return (1);
3609 	}
3610 
3611 	macaddr = *((uint64_t *)addrp);
3612 	D2(vswp, "%s: remote mac address 0x%llx", __func__, macaddr);
3613 
3614 	for (i = ETHERADDRL - 1; i >= 0; i--) {
3615 		ea.ether_addr_octet[i] = macaddr & 0xFF;
3616 		macaddr >>= 8;
3617 	}
3618 
3619 	if (vsw_port_attach(vswp, (int)inst, &ldc_id, 1, &ea) != 0) {
3620 		DERR(vswp, "%s: failed to attach port", __func__);
3621 		return (1);
3622 	}
3623 
3624 	port = vsw_lookup_port(vswp, (int)inst);
3625 
3626 	/* we just successfully created the port, so it should exist */
3627 	ASSERT(port != NULL);
3628 
3629 	return (0);
3630 }
3631 
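/*
 * Illustrative MD layout consumed by vsw_port_add() (the property
 * names are the ones read above; the values are invented):
 *
 *	vsw-port
 *	    id = 0x0			(port instance)
 *	    remote-mac-address = [ 00 14 4f f9 c4 01 ]
 *	    channel-endpoint
 *	        id = 0x23		(ldc_id)
 */
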
3632 /*
3633  * Attach the specified port.
3634  *
3635  * Returns 0 on success, 1 on failure.
3636  */
3637 static int
3638 vsw_port_attach(vsw_t *vswp, int p_instance, uint64_t *ldcids, int nids,
3639     struct ether_addr *macaddr)
3640 {
3641 	vsw_port_list_t		*plist = &vswp->plist;
3642 	vsw_port_t		*port, **prev_port;
3643 	int			i;
3644 
3645 	D1(vswp, "%s: enter : port %d", __func__, p_instance);
3646 
3647 	/* port already exists? */
3648 	READ_ENTER(&plist->lockrw);
3649 	for (port = plist->head; port != NULL; port = port->p_next) {
3650 		if (port->p_instance == p_instance) {
3651 			DWARN(vswp, "%s: port instance %d already attached",
3652 			    __func__, p_instance);
3653 			RW_EXIT(&plist->lockrw);
3654 			return (1);
3655 		}
3656 	}
3657 	RW_EXIT(&plist->lockrw);
3658 
3659 	port = kmem_zalloc(sizeof (vsw_port_t), KM_SLEEP);
3660 	port->p_vswp = vswp;
3661 	port->p_instance = p_instance;
3662 	port->p_ldclist.num_ldcs = 0;
3663 	port->p_ldclist.head = NULL;
3664 	port->addr_set = VSW_ADDR_UNSET;
3665 
3666 	rw_init(&port->p_ldclist.lockrw, NULL, RW_DRIVER, NULL);
3667 
3668 	mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL);
3669 	mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL);
3670 
3671 	mutex_init(&port->ref_lock, NULL, MUTEX_DRIVER, NULL);
3672 	cv_init(&port->ref_cv, NULL, CV_DRIVER, NULL);
3673 
3674 	mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL);
3675 	cv_init(&port->state_cv, NULL, CV_DRIVER, NULL);
3676 	port->state = VSW_PORT_INIT;
3677 
3678 	if (nids > VSW_PORT_MAX_LDCS) {
3679 		D2(vswp, "%s: using first of %d ldc ids",
3680 		    __func__, nids);
3681 		nids = VSW_PORT_MAX_LDCS;
3682 	}
3683 
3684 	D2(vswp, "%s: %d nids", __func__, nids);
3685 	for (i = 0; i < nids; i++) {
3686 		D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[i]);
3687 		if (vsw_ldc_attach(port, (uint64_t)ldcids[i]) != 0) {
3688 			DERR(vswp, "%s: ldc_attach failed", __func__);
3689 
3690 			rw_destroy(&port->p_ldclist.lockrw);
3691 
3692 			cv_destroy(&port->ref_cv);
3693 			mutex_destroy(&port->ref_lock);
3694 
3695 			cv_destroy(&port->state_cv);
3696 			mutex_destroy(&port->state_lock);
3697 
3698 			mutex_destroy(&port->tx_lock);
3699 			mutex_destroy(&port->mca_lock);
3700 			kmem_free(port, sizeof (vsw_port_t));
3701 			return (1);
3702 		}
3703 	}
3704 
3705 	ether_copy(macaddr, &port->p_macaddr);
3706 
3707 	if (vswp->switching_setup_done == B_TRUE) {
3708 		/*
3709 		 * If the underlying physical device has been set up,
3710 		 * program the mac address of this port in it.
3711 		 * Otherwise, the port macaddr will be set after the physical
3712 		 * device is successfully set up by the timeout handler.
3713 		 */
3714 		mutex_enter(&vswp->hw_lock);
3715 		(void) vsw_set_hw(vswp, port, VSW_VNETPORT);
3716 		mutex_exit(&vswp->hw_lock);
3717 	}
3718 
3719 	WRITE_ENTER(&plist->lockrw);
3720 
3721 	/* create the fdb entry for this port/mac address */
3722 	(void) vsw_add_fdb(vswp, port);
3723 
3724 	/* link it into the list of ports for this vsw instance */
3725 	prev_port = (vsw_port_t **)(&plist->head);
3726 	port->p_next = *prev_port;
3727 	*prev_port = port;
3728 	plist->num_ports++;
3729 
3730 	RW_EXIT(&plist->lockrw);
3731 
3732 	/*
3733 	 * Initialize the port and any ldcs under it.
3734 	 */
3735 	(void) vsw_init_ldcs(port);
3736 
3737 	D1(vswp, "%s: exit", __func__);
3738 	return (0);
3739 }
3740 
3741 /*
3742  * Detach the specified port.
3743  *
3744  * Returns 0 on success, 1 on failure.
3745  */
3746 static int
3747 vsw_port_detach(vsw_t *vswp, int p_instance)
3748 {
3749 	vsw_port_t	*port = NULL;
3750 	vsw_port_list_t	*plist = &vswp->plist;
3751 
3752 	D1(vswp, "%s: enter: port id %d", __func__, p_instance);
3753 
3754 	WRITE_ENTER(&plist->lockrw);
3755 
3756 	if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) {
3757 		RW_EXIT(&plist->lockrw);
3758 		return (1);
3759 	}
3760 
3761 	if (vsw_plist_del_node(vswp, port)) {
3762 		RW_EXIT(&plist->lockrw);
3763 		return (1);
3764 	}
3765 
3766 	/* Remove the fdb entry for this port/mac address */
3767 	(void) vsw_del_fdb(vswp, port);
3768 
3769 	/* Remove any multicast addresses.. */
3770 	vsw_del_mcst_port(port);
3771 
3772 	/*
3773 	 * No longer need to hold writer lock on port list now
3774 	 * that we have unlinked the target port from the list.
3775 	 */
3776 	RW_EXIT(&plist->lockrw);
3777 
3778 	/* Remove address if was programmed into HW. */
3779 	mutex_enter(&vswp->hw_lock);
3780 
3781 	/*
3782 	 * The port's address may not have been set in hardware. This could
3783 	 * happen if the underlying physical device is not yet available
3784 	 * and vsw_setup_switching_timeout() is still in progress.
3785 	 * We remove its addr from hardware only if it was set before.
3786 	 */
3787 	if (port->addr_set != VSW_ADDR_UNSET)
3788 		(void) vsw_unset_hw(vswp, port, VSW_VNETPORT);
3789 
3790 	if (vswp->recfg_reqd)
3791 		vsw_reconfig_hw(vswp);
3792 
3793 	mutex_exit(&vswp->hw_lock);
3794 
3795 	if (vsw_port_delete(port)) {
3796 		return (1);
3797 	}
3798 
3799 	D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance);
3800 	return (0);
3801 }
3802 
3803 /*
3804  * Detach all active ports.
3805  *
3806  * Returns 0 on success, 1 on failure.
3807  */
3808 static int
3809 vsw_detach_ports(vsw_t *vswp)
3810 {
3811 	vsw_port_list_t 	*plist = &vswp->plist;
3812 	vsw_port_t		*port = NULL;
3813 
3814 	D1(vswp, "%s: enter", __func__);
3815 
3816 	WRITE_ENTER(&plist->lockrw);
3817 
3818 	while ((port = plist->head) != NULL) {
3819 		if (vsw_plist_del_node(vswp, port)) {
3820 			DERR(vswp, "%s: Error deleting port %d"
3821 			    " from port list", __func__, port->p_instance);
3822 			RW_EXIT(&plist->lockrw);
3823 			return (1);
3824 		}
3825 
3826 		/* Remove address if was programmed into HW. */
3827 		mutex_enter(&vswp->hw_lock);
3828 		(void) vsw_unset_hw(vswp, port, VSW_VNETPORT);
3829 		mutex_exit(&vswp->hw_lock);
3830 
3831 		/* Remove the fdb entry for this port/mac address */
3832 		(void) vsw_del_fdb(vswp, port);
3833 
3834 		/* Remove any multicast addresses.. */
3835 		vsw_del_mcst_port(port);
3836 
3837 		/*
3838 		 * No longer need to hold the lock on the port list
3839 		 * now that we have unlinked the target port from the
3840 		 * list.
3841 		 */
3842 		RW_EXIT(&plist->lockrw);
3843 		if (vsw_port_delete(port)) {
3844 			DERR(vswp, "%s: Error deleting port %d",
3845 			    __func__, port->p_instance);
3846 			return (1);
3847 		}
3848 		WRITE_ENTER(&plist->lockrw);
3849 	}
3850 	RW_EXIT(&plist->lockrw);
3851 
3852 	D1(vswp, "%s: exit", __func__);
3853 
3854 	return (0);
3855 }
3856 
3857 /*
3858  * Delete the specified port.
3859  *
3860  * Returns 0 on success, 1 on failure.
3861  */
3862 static int
3863 vsw_port_delete(vsw_port_t *port)
3864 {
3865 	vsw_ldc_list_t 		*ldcl;
3866 	vsw_t			*vswp = port->p_vswp;
3867 
3868 	D1(vswp, "%s: enter : port id %d", __func__, port->p_instance);
3869 
3870 	(void) vsw_uninit_ldcs(port);
3871 
3872 	/*
3873 	 * Wait for any pending ctrl msg tasks which reference this
3874 	 * port to finish.
3875 	 */
3876 	if (vsw_drain_port_taskq(port))
3877 		return (1);
3878 
3879 	/*
3880 	 * Wait for port reference count to hit zero.
3881 	 */
3882 	mutex_enter(&port->ref_lock);
3883 	while (port->ref_cnt != 0)
3884 		cv_wait(&port->ref_cv, &port->ref_lock);
3885 	mutex_exit(&port->ref_lock);
3886 
3887 	/*
3888 	 * Wait for any active callbacks to finish
3889 	 */
3890 	if (vsw_drain_ldcs(port))
3891 		return (1);
3892 
3893 	ldcl = &port->p_ldclist;
3894 	WRITE_ENTER(&ldcl->lockrw);
3895 	while (ldcl->num_ldcs > 0) {
3896 		if (vsw_ldc_detach(port, ldcl->head->ldc_id) != 0) {
3897 			cmn_err(CE_WARN, "!vsw%d: unable to detach ldc %ld",
3898 			    vswp->instance, ldcl->head->ldc_id);
3899 			RW_EXIT(&ldcl->lockrw);
3900 			return (1);
3901 		}
3902 	}
3903 	RW_EXIT(&ldcl->lockrw);
3904 
3905 	rw_destroy(&port->p_ldclist.lockrw);
3906 
3907 	mutex_destroy(&port->mca_lock);
3908 	mutex_destroy(&port->tx_lock);
3909 	cv_destroy(&port->ref_cv);
3910 	mutex_destroy(&port->ref_lock);
3911 
3912 	cv_destroy(&port->state_cv);
3913 	mutex_destroy(&port->state_lock);
3914 
3915 	kmem_free(port, sizeof (vsw_port_t));
3916 
3917 	D1(vswp, "%s: exit", __func__);
3918 
3919 	return (0);
3920 }
3921 
3922 /*
3923  * Attach a logical domain channel (ldc) under a specified port.
3924  *
3925  * Returns 0 on success, 1 on failure.
3926  */
3927 static int
3928 vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id)
3929 {
3930 	vsw_t 		*vswp = port->p_vswp;
3931 	vsw_ldc_list_t *ldcl = &port->p_ldclist;
3932 	vsw_ldc_t 	*ldcp = NULL;
3933 	ldc_attr_t 	attr;
3934 	ldc_status_t	istatus;
3935 	int 		status = DDI_FAILURE;
3936 	int		rv;
3937 	enum		{ PROG_init = 0x0, PROG_mblks = 0x1,
3938 				PROG_callback = 0x2}
3939 			progress;
3940 
3941 	progress = PROG_init;
3942 
3943 	D1(vswp, "%s: enter", __func__);
3944 
3945 	ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP);
3946 	if (ldcp == NULL) {
3947 		DERR(vswp, "%s: kmem_zalloc failed", __func__);
3948 		return (1);
3949 	}
3950 	ldcp->ldc_id = ldc_id;
3951 
3952 	/* allocate pool of receive mblks */
3953 	rv = vio_create_mblks(vsw_num_mblks, vsw_mblk_size, &(ldcp->rxh));
3954 	if (rv) {
3955 		DWARN(vswp, "%s: unable to create free mblk pool for"
3956 		    " channel %ld (rv %d)", __func__, ldc_id, rv);
3957 		kmem_free(ldcp, sizeof (vsw_ldc_t));
3958 		return (1);
3959 	}
3960 
3961 	progress |= PROG_mblks;
3962 
3963 	mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL);
3964 	mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL);
3965 	mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL);
3966 	cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL);
3967 	rw_init(&ldcp->lane_in.dlistrw, NULL, RW_DRIVER, NULL);
3968 	rw_init(&ldcp->lane_out.dlistrw, NULL, RW_DRIVER, NULL);
3969 
3970 	/* required for handshake with peer */
3971 	ldcp->local_session = (uint64_t)ddi_get_lbolt();
3972 	ldcp->peer_session = 0;
3973 	ldcp->session_status = 0;
3974 
3975 	mutex_init(&ldcp->hss_lock, NULL, MUTEX_DRIVER, NULL);
3976 	ldcp->hss_id = 1;	/* Initial handshake session id */
3977 
3978 	/* only set for outbound lane, inbound set by peer */
3979 	mutex_init(&ldcp->lane_in.seq_lock, NULL, MUTEX_DRIVER, NULL);
3980 	mutex_init(&ldcp->lane_out.seq_lock, NULL, MUTEX_DRIVER, NULL);
3981 	vsw_set_lane_attr(vswp, &ldcp->lane_out);
3982 
3983 	attr.devclass = LDC_DEV_NT_SVC;
3984 	attr.instance = ddi_get_instance(vswp->dip);
3985 	attr.mode = LDC_MODE_UNRELIABLE;
3986 	attr.mtu = VSW_LDC_MTU;
3987 	status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle);
3988 	if (status != 0) {
3989 		DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)",
3990 		    __func__, ldc_id, status);
3991 		goto ldc_attach_fail;
3992 	}
3993 
3994 	status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp);
3995 	if (status != 0) {
3996 		DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)",
3997 		    __func__, ldc_id, status);
3998 		(void) ldc_fini(ldcp->ldc_handle);
3999 		goto ldc_attach_fail;
4000 	}
4001 
4002 	progress |= PROG_callback;
4003 
4004 	mutex_init(&ldcp->status_lock, NULL, MUTEX_DRIVER, NULL);
4005 
4006 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
4007 		DERR(vswp, "%s: ldc_status failed", __func__);
4008 		mutex_destroy(&ldcp->status_lock);
4009 		goto ldc_attach_fail;
4010 	}
4011 
4012 	ldcp->ldc_status = istatus;
4013 	ldcp->ldc_port = port;
4014 	ldcp->ldc_vswp = vswp;
4015 
4016 	/* link it into the list of channels for this port */
4017 	WRITE_ENTER(&ldcl->lockrw);
4018 	ldcp->ldc_next = ldcl->head;
4019 	ldcl->head = ldcp;
4020 	ldcl->num_ldcs++;
4021 	RW_EXIT(&ldcl->lockrw);
4022 
4023 	D1(vswp, "%s: exit", __func__);
4024 	return (0);
4025 
4026 ldc_attach_fail:
4027 	mutex_destroy(&ldcp->ldc_txlock);
4028 	mutex_destroy(&ldcp->ldc_cblock);
4029 
4030 	cv_destroy(&ldcp->drain_cv);
4031 
4032 	rw_destroy(&ldcp->lane_in.dlistrw);
4033 	rw_destroy(&ldcp->lane_out.dlistrw);
4034 
4035 	if (progress & PROG_callback) {
4036 		(void) ldc_unreg_callback(ldcp->ldc_handle);
4037 	}
4038 
4039 	if ((progress & PROG_mblks) && (ldcp->rxh != NULL)) {
4040 		if (vio_destroy_mblks(ldcp->rxh) != 0) {
4041 			/*
4042 			 * Something odd has happened, as the destroy
4043 			 * will only fail if some mblks have been allocated
4044 			 * from the pool already (which shouldn't happen)
4045 			 * and have not been returned.
4046 			 *
4047 			 * Add the pool pointer to a list maintained in
4048 			 * the device instance. Another attempt will be made
4049 			 * to free the pool when the device itself detaches.
4050 			 */
4051 			cmn_err(CE_WARN, "!vsw%d: Creation of ldc channel %ld "
4052 			    "failed and cannot destroy associated mblk "
4053 			    "pool", vswp->instance, ldc_id);
4054 			ldcp->rxh->nextp =  vswp->rxh;
4055 			vswp->rxh = ldcp->rxh;
4056 		}
4057 	}
4058 	mutex_destroy(&ldcp->drain_cv_lock);
4059 	mutex_destroy(&ldcp->hss_lock);
4060 
4061 	mutex_destroy(&ldcp->lane_in.seq_lock);
4062 	mutex_destroy(&ldcp->lane_out.seq_lock);
4063 	kmem_free(ldcp, sizeof (vsw_ldc_t));
4064 
4065 	return (1);
4066 }
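
/*
 * Editor's note: the error path in vsw_ldc_attach() above uses a
 * "progress" bitmask so that a single exit label can unwind exactly
 * the resources that were set up before the failure. The idiom, as a
 * minimal sketch with hypothetical resources a and b:
 *
 *	enum { P_NONE = 0x0, P_A = 0x1, P_B = 0x2 } progress = P_NONE;
 *
 *	if (alloc_a(&a) != 0)
 *		goto fail;
 *	progress |= P_A;
 *	if (alloc_b(&b) != 0)
 *		goto fail;
 *	progress |= P_B;
 *	return (0);
 * fail:
 *	if (progress & P_B)
 *		free_b(b);
 *	if (progress & P_A)
 *		free_a(a);
 *	return (1);
 */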
4067 
4068 /*
4069  * Detach a logical domain channel (ldc) belonging to a
4070  * particular port.
4071  *
4072  * Returns 0 on success, 1 on failure.
4073  */
4074 static int
4075 vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id)
4076 {
4077 	vsw_t 		*vswp = port->p_vswp;
4078 	vsw_ldc_t 	*ldcp, **prev_ldcp;
4079 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
4080 	int 		rv;
4081 
4082 	prev_ldcp = &ldcl->head;
4083 	for (; (ldcp = *prev_ldcp) != NULL; prev_ldcp = &ldcp->ldc_next) {
4084 		if (ldcp->ldc_id == ldc_id) {
4085 			break;
4086 		}
4087 	}
4088 
4089 	/* specified ldc id not found */
4090 	if (ldcp == NULL) {
4091 		DERR(vswp, "%s: ldcp = NULL", __func__);
4092 		return (1);
4093 	}
4094 
4095 	D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id);
4096 
4097 	/*
4098 	 * Before we can close the channel we must release any mapped
4099 	 * resources (e.g. drings).
4100 	 */
4101 	vsw_free_lane_resources(ldcp, INBOUND);
4102 	vsw_free_lane_resources(ldcp, OUTBOUND);
4103 
4104 	/*
4105 	 * If the close fails we are in serious trouble, as we won't
4106 	 * be able to delete the parent port.
4107 	 */
4108 	if ((rv = ldc_close(ldcp->ldc_handle)) != 0) {
4109 		DERR(vswp, "%s: error %d closing channel %lld",
4110 		    __func__, rv, ldcp->ldc_id);
4111 		return (1);
4112 	}
4113 
4114 	(void) ldc_fini(ldcp->ldc_handle);
4115 
4116 	ldcp->ldc_status = LDC_INIT;
4117 	ldcp->ldc_handle = NULL;
4118 	ldcp->ldc_vswp = NULL;
4119 
4120 	if (ldcp->rxh != NULL) {
4121 		if (vio_destroy_mblks(ldcp->rxh)) {
4122 			/*
4123 			 * Most likely some mblks are still in use and
4124 			 * have not been returned to the pool. Add the pool
4125 			 * to the list maintained in the device instance.
4126 			 * Another attempt will be made to destroy the pool
4127 			 * when the device detaches.
4128 			 */
4129 			ldcp->rxh->nextp =  vswp->rxh;
4130 			vswp->rxh = ldcp->rxh;
4131 		}
4132 	}
4133 
4134 	/* unlink it from the list */
4135 	*prev_ldcp = ldcp->ldc_next;
4136 	ldcl->num_ldcs--;
4137 
4138 	mutex_destroy(&ldcp->ldc_txlock);
4139 	mutex_destroy(&ldcp->ldc_cblock);
4140 	cv_destroy(&ldcp->drain_cv);
4141 	mutex_destroy(&ldcp->drain_cv_lock);
4142 	mutex_destroy(&ldcp->hss_lock);
4143 	mutex_destroy(&ldcp->lane_in.seq_lock);
4144 	mutex_destroy(&ldcp->lane_out.seq_lock);
4145 	mutex_destroy(&ldcp->status_lock);
4146 	rw_destroy(&ldcp->lane_in.dlistrw);
4147 	rw_destroy(&ldcp->lane_out.dlistrw);
4148 
4149 	kmem_free(ldcp, sizeof (vsw_ldc_t));
4150 
4151 	return (0);
4152 }
4153 
4154 /*
4155  * Open and attempt to bring up the channel. Note that the channel
4156  * can only be brought up if the peer has also opened its end.
4157  *
4158  * Returns 0 if the channel could be opened and brought up,
4159  * otherwise returns 1.
4160  */
4161 static int
4162 vsw_ldc_init(vsw_ldc_t *ldcp)
4163 {
4164 	vsw_t 		*vswp = ldcp->ldc_vswp;
4165 	ldc_status_t	istatus = 0;
4166 	int		rv;
4167 
4168 	D1(vswp, "%s: enter", __func__);
4169 
4170 	LDC_ENTER_LOCK(ldcp);
4171 
4172 	/* don't start at 0 in case clients don't like that */
4173 	ldcp->next_ident = 1;
4174 
4175 	rv = ldc_open(ldcp->ldc_handle);
4176 	if (rv != 0) {
4177 		DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)",
4178 		    __func__, ldcp->ldc_id, rv);
4179 		LDC_EXIT_LOCK(ldcp);
4180 		return (1);
4181 	}
4182 
4183 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
4184 		DERR(vswp, "%s: unable to get status", __func__);
4185 		LDC_EXIT_LOCK(ldcp);
4186 		return (1);
4187 
4188 	} else if (istatus != LDC_OPEN && istatus != LDC_READY) {
4189 		DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY",
4190 		    __func__, ldcp->ldc_id, istatus);
4191 		LDC_EXIT_LOCK(ldcp);
4192 		return (1);
4193 	}
4194 
4195 	mutex_enter(&ldcp->status_lock);
4196 	ldcp->ldc_status = istatus;
4197 	mutex_exit(&ldcp->status_lock);
4198 
4199 	rv = ldc_up(ldcp->ldc_handle);
4200 	if (rv != 0) {
4201 		/*
4202 		 * Not a fatal error for ldc_up() to fail, as peer
4203 		 * end point may simply not be ready yet.
4204 		 */
4205 		D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__,
4206 		    ldcp->ldc_id, rv);
4207 		LDC_EXIT_LOCK(ldcp);
4208 		return (1);
4209 	}
4210 
4211 	/*
4212 	 * The ldc_up() call is non-blocking, so we need to explicitly
4213 	 * check the channel status to see whether the channel is in
4214 	 * fact UP.
4215 	 */
4216 	mutex_enter(&ldcp->status_lock);
4217 	if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) {
4218 		DERR(vswp, "%s: unable to get status", __func__);
4219 		mutex_exit(&ldcp->status_lock);
4220 		LDC_EXIT_LOCK(ldcp);
4221 		return (1);
4222 
4223 	}
4224 
4225 	if (ldcp->ldc_status == LDC_UP) {
4226 		D2(vswp, "%s: channel %ld now UP (%ld)", __func__,
4227 		    ldcp->ldc_id, istatus);
4228 		mutex_exit(&ldcp->status_lock);
4229 		LDC_EXIT_LOCK(ldcp);
4230 
4231 		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
4232 		return (0);
4233 	}
4234 
4235 	mutex_exit(&ldcp->status_lock);
4236 	LDC_EXIT_LOCK(ldcp);
4237 
4238 	D1(vswp, "%s: exit", __func__);
4239 	return (0);
4240 }
4241 
4242 /* disable callbacks on the channel */
4243 static int
4244 vsw_ldc_uninit(vsw_ldc_t *ldcp)
4245 {
4246 	vsw_t	*vswp = ldcp->ldc_vswp;
4247 	int	rv;
4248 
4249 	D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id);
4250 
4251 	LDC_ENTER_LOCK(ldcp);
4252 
4253 	rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE);
4254 	if (rv != 0) {
4255 		DERR(vswp, "vsw_ldc_uninit(%lld): error disabling "
4256 		    "interrupts (rv = %d)\n", ldcp->ldc_id, rv);
4257 		LDC_EXIT_LOCK(ldcp);
4258 		return (1);
4259 	}
4260 
4261 	mutex_enter(&ldcp->status_lock);
4262 	ldcp->ldc_status = LDC_INIT;
4263 	mutex_exit(&ldcp->status_lock);
4264 
4265 	LDC_EXIT_LOCK(ldcp);
4266 
4267 	D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id);
4268 
4269 	return (0);
4270 }
4271 
4272 static int
4273 vsw_init_ldcs(vsw_port_t *port)
4274 {
4275 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
4276 	vsw_ldc_t	*ldcp;
4277 
4278 	READ_ENTER(&ldcl->lockrw);
4279 	ldcp =  ldcl->head;
4280 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
4281 		(void) vsw_ldc_init(ldcp);
4282 	}
4283 	RW_EXIT(&ldcl->lockrw);
4284 
4285 	return (0);
4286 }
4287 
4288 static int
4289 vsw_uninit_ldcs(vsw_port_t *port)
4290 {
4291 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
4292 	vsw_ldc_t	*ldcp;
4293 
4294 	D1(NULL, "vsw_uninit_ldcs: enter\n");
4295 
4296 	READ_ENTER(&ldcl->lockrw);
4297 	ldcp =  ldcl->head;
4298 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
4299 		(void) vsw_ldc_uninit(ldcp);
4300 	}
4301 	RW_EXIT(&ldcl->lockrw);
4302 
4303 	D1(NULL, "vsw_uninit_ldcs: exit\n");
4304 
4305 	return (0);
4306 }
4307 
4308 /*
4309  * Wait until the callback(s) associated with the ldcs under the specified
4310  * port have completed.
4311  *
4312  * Prior to this function being invoked each channel under this port
4313  * should have been quiesced via ldc_set_cb_mode(DISABLE).
4314  *
4315  * A short explanation of what we are doing below:
4316  *
4317  * The simplest approach would be to have a reference counter in
4318  * the ldc structure which is incremented/decremented by the callbacks as
4319  * they use the channel. The drain function could then simply disable any
4320  * further callbacks and do a cv_wait for the ref to hit zero. Unfortunately
4321  * there is a tiny window here - before the callback is able to get the lock
4322  * on the channel it is interrupted and this function gets to execute. It
4323  * sees that the ref count is zero and believes it is free to delete the
4324  * associated data structures.
4325  *
4326  * We get around this by taking advantage of the fact that before the ldc
4327  * framework invokes a callback it sets a flag to indicate that there is a
4328  * callback active (or about to become active). If we attempt to
4329  * unregister a callback while this active flag is set, the unregister
4330  * will fail with EWOULDBLOCK.
4331  *
4332  * If the unregister fails we do a cv_timedwait. We will either be signaled
4333  * by the callback as it is exiting (note we have to wait a short period to
4334  * allow the callback to return fully to the ldc framework and it to clear
4335  * the active flag), or by the timer expiring. In either case we again attempt
4336  * the unregister. We repeat this until we can successfully unregister the
4337  * callback.
4338  *
4339  * The reason we use a cv_timedwait rather than a simple cv_wait is to catch
4340  * the case where the callback has finished but the ldc framework has not yet
4341  * cleared the active flag. In this case we would never get a cv_signal.
4342  */
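/*
 * In condensed form, the retry loop below pairs with the signalling at
 * the tail of vsw_ldc_cb():
 *
 *	mutex_enter(&ldcp->drain_cv_lock);
 *	ldcp->drain_state = VSW_LDC_DRAINING;
 *	while (ldc_unreg_callback(ldcp->ldc_handle) == EWOULDBLOCK)
 *		(void) cv_timedwait(&ldcp->drain_cv,
 *		    &ldcp->drain_cv_lock, lbolt + hz);
 *	mutex_exit(&ldcp->drain_cv_lock);
 *
 * The one-second timeout is what guarantees progress when the callback
 * has already returned but the framework has not yet cleared its active
 * flag, i.e. when no cv_signal() will ever arrive.
 */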
4343 static int
4344 vsw_drain_ldcs(vsw_port_t *port)
4345 {
4346 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
4347 	vsw_ldc_t	*ldcp;
4348 	vsw_t		*vswp = port->p_vswp;
4349 
4350 	D1(vswp, "%s: enter", __func__);
4351 
4352 	READ_ENTER(&ldcl->lockrw);
4353 
4354 	ldcp = ldcl->head;
4355 
4356 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
4357 		/*
4358 		 * If we can unregister the channel callback then we
4359 		 * know that there is no callback either running or
4360 		 * scheduled to run for this channel so move on to next
4361 		 * channel in the list.
4362 		 */
4363 		mutex_enter(&ldcp->drain_cv_lock);
4364 
4365 		/* prompt active callbacks to quit */
4366 		ldcp->drain_state = VSW_LDC_DRAINING;
4367 
4368 		if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) {
4369 			D2(vswp, "%s: unreg callback for chan %ld", __func__,
4370 			    ldcp->ldc_id);
4371 			mutex_exit(&ldcp->drain_cv_lock);
4372 			continue;
4373 		} else {
4374 			/*
4375 			 * If we end up here we know that either 1) a callback
4376 			 * is currently executing, 2) one is about to start
4377 			 * (i.e. the ldc framework has set the active flag but
4378 			 * has not actually invoked the callback yet), or 3)
4379 			 * one has finished and has returned to the ldc
4380 			 * framework but the ldc framework has not yet cleared
4381 			 * the active bit.
4382 			 *
4383 			 * Wait for it to finish.
4384 			 */
4385 			while (ldc_unreg_callback(ldcp->ldc_handle)
4386 			    == EWOULDBLOCK)
4387 				(void) cv_timedwait(&ldcp->drain_cv,
4388 				    &ldcp->drain_cv_lock, lbolt + hz);
4389 
4390 			mutex_exit(&ldcp->drain_cv_lock);
4391 			D2(vswp, "%s: unreg callback for chan %ld after "
4392 			    "timeout", __func__, ldcp->ldc_id);
4393 		}
4394 	}
4395 	RW_EXIT(&ldcl->lockrw);
4396 
4397 	D1(vswp, "%s: exit", __func__);
4398 	return (0);
4399 }
4400 
4401 /*
4402  * Wait until all tasks which reference this port have completed.
4403  *
4404  * Prior to this function being invoked each channel under this port
4405  * should have been quiesced via ldc_set_cb_mode(DISABLE).
4406  */
4407 static int
4408 vsw_drain_port_taskq(vsw_port_t *port)
4409 {
4410 	vsw_t		*vswp = port->p_vswp;
4411 
4412 	D1(vswp, "%s: enter", __func__);
4413 
4414 	/*
4415 	 * Mark the port as in the process of being detached, and
4416 	 * dispatch a marker task to the queue so we know when all
4417 	 * relevant tasks have completed.
4418 	 */
4419 	mutex_enter(&port->state_lock);
4420 	port->state = VSW_PORT_DETACHING;
4421 
4422 	if ((vswp->taskq_p == NULL) ||
4423 	    (ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task,
4424 	    port, DDI_NOSLEEP) != DDI_SUCCESS)) {
4425 		DERR(vswp, "%s: unable to dispatch marker task",
4426 		    __func__);
4427 		mutex_exit(&port->state_lock);
4428 		return (1);
4429 	}
4430 
4431 	/*
4432 	 * Wait for the marker task to finish.
4433 	 */
4434 	while (port->state != VSW_PORT_DETACHABLE)
4435 		cv_wait(&port->state_cv, &port->state_lock);
4436 
4437 	mutex_exit(&port->state_lock);
4438 
4439 	D1(vswp, "%s: exit", __func__);
4440 
4441 	return (0);
4442 }
4443 
4444 static void
4445 vsw_marker_task(void *arg)
4446 {
4447 	vsw_port_t	*port = arg;
4448 	vsw_t		*vswp = port->p_vswp;
4449 
4450 	D1(vswp, "%s: enter", __func__);
4451 
4452 	mutex_enter(&port->state_lock);
4453 
4454 	/*
4455 	 * No further tasks should be dispatched which reference
4456 	 * this port, so it is ok to mark it as safe to detach.
4457 	 */
4458 	port->state = VSW_PORT_DETACHABLE;
4459 
4460 	cv_signal(&port->state_cv);
4461 
4462 	mutex_exit(&port->state_lock);
4463 
4464 	D1(vswp, "%s: exit", __func__);
4465 }
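
/*
 * Editor's note: the drain/marker pair above is a general way to flush
 * a taskq. It relies on the taskq having a single processing thread
 * (assumed for the vsw taskq here) so that the marker cannot overtake
 * earlier tasks; once the marker has run, every task dispatched before
 * it has finished. A sketch of the idiom with hypothetical names:
 *
 *	mutex_enter(&lock);
 *	state = DRAINING;
 *	(void) ddi_taskq_dispatch(tq, marker_fn, arg, DDI_NOSLEEP);
 *	while (state != DRAINED)
 *		cv_wait(&cv, &lock);	(marker_fn sets DRAINED, signals)
 *	mutex_exit(&lock);
 */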
4466 
4467 static vsw_port_t *
4468 vsw_lookup_port(vsw_t *vswp, int p_instance)
4469 {
4470 	vsw_port_list_t *plist = &vswp->plist;
4471 	vsw_port_t	*port;
4472 
4473 	for (port = plist->head; port != NULL; port = port->p_next) {
4474 		if (port->p_instance == p_instance) {
4475 			D2(vswp, "vsw_lookup_port: found p_instance\n");
4476 			return (port);
4477 		}
4478 	}
4479 
4480 	return (NULL);
4481 }
4482 
4483 /*
4484  * Search for and remove the specified port from the port
4485  * list. Returns 0 if able to locate and remove the port, otherwise
4486  * returns 1.
4487  */
4488 static int
4489 vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port)
4490 {
4491 	vsw_port_list_t *plist = &vswp->plist;
4492 	vsw_port_t	*curr_p, *prev_p;
4493 
4494 	if (plist->head == NULL)
4495 		return (1);
4496 
4497 	curr_p = prev_p = plist->head;
4498 
4499 	while (curr_p != NULL) {
4500 		if (curr_p == port) {
4501 			if (prev_p == curr_p) {
4502 				plist->head = curr_p->p_next;
4503 			} else {
4504 				prev_p->p_next = curr_p->p_next;
4505 			}
4506 			plist->num_ports--;
4507 			return (0);
4508 		} else {
4509 			prev_p = curr_p;
4510 			curr_p = curr_p->p_next;
4511 		}
4512 	}
4513 	return (1);	/* specified port not found */
4514 }
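
/*
 * Editor's note: vsw_ldc_detach() uses the indirect-pointer form of
 * this walk, which avoids special-casing the list head. Sketch:
 *
 *	vsw_ldc_t **pp = &ldcl->head;
 *	for (; *pp != NULL; pp = &(*pp)->ldc_next) {
 *		if ((*pp)->ldc_id == ldc_id) {
 *			*pp = (*pp)->ldc_next;	(unlink the node)
 *			break;
 *		}
 *	}
 */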
4515 
4516 /*
4517  * Interrupt handler for ldc messages.
4518  */
4519 static uint_t
4520 vsw_ldc_cb(uint64_t event, caddr_t arg)
4521 {
4522 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
4523 	vsw_t 		*vswp = ldcp->ldc_vswp;
4524 
4525 	D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
4526 
4527 	mutex_enter(&ldcp->ldc_cblock);
4528 
4529 	mutex_enter(&ldcp->status_lock);
4530 	if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) {
4531 		mutex_exit(&ldcp->status_lock);
4532 		mutex_exit(&ldcp->ldc_cblock);
4533 		return (LDC_SUCCESS);
4534 	}
4535 	mutex_exit(&ldcp->status_lock);
4536 
4537 	if (event & LDC_EVT_UP) {
4538 		/*
4539 		 * Channel has come up.
4540 		 */
4541 		D2(vswp, "%s: id(%ld) event(%llx) UP: status(%ld)",
4542 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
4543 
4544 		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
4545 
4546 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
4547 	}
4548 
4549 	if (event & LDC_EVT_READ) {
4550 		/*
4551 		 * Data available for reading.
4552 		 */
4553 		D2(vswp, "%s: id(%ld) event(%llx) data READ",
4554 		    __func__, ldcp->ldc_id, event);
4555 
4556 		vsw_process_pkt(ldcp);
4557 
4558 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
4559 
4560 		goto vsw_cb_exit;
4561 	}
4562 
4563 	if (event & (LDC_EVT_DOWN | LDC_EVT_RESET)) {
4564 		D2(vswp, "%s: id(%ld) event (%lx) DOWN/RESET: status(%ld)",
4565 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
4566 
4567 		vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
4568 	}
4569 
4570 	/*
4571 	 * Catch either LDC_EVT_WRITE which we don't support or any
4572 	 * unknown event.
4573 	 */
4574 	if (event &
4575 	    ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ)) {
4576 		DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)",
4577 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
4578 	}
4579 
4580 vsw_cb_exit:
4581 	mutex_exit(&ldcp->ldc_cblock);
4582 
4583 	/*
4584 	 * Let the drain function know we are finishing if it
4585 	 * is waiting.
4586 	 */
4587 	mutex_enter(&ldcp->drain_cv_lock);
4588 	if (ldcp->drain_state == VSW_LDC_DRAINING)
4589 		cv_signal(&ldcp->drain_cv);
4590 	mutex_exit(&ldcp->drain_cv_lock);
4591 
4592 	return (LDC_SUCCESS);
4593 }
4594 
4595 /*
4596  * Reinitialise data structures associated with the channel.
4597  */
4598 static void
4599 vsw_ldc_reinit(vsw_ldc_t *ldcp)
4600 {
4601 	vsw_t		*vswp = ldcp->ldc_vswp;
4602 	vsw_port_t	*port;
4603 	vsw_ldc_list_t	*ldcl;
4604 
4605 	D1(vswp, "%s: enter", __func__);
4606 
4607 	port = ldcp->ldc_port;
4608 	ldcl = &port->p_ldclist;
4609 
4610 	READ_ENTER(&ldcl->lockrw);
4611 
4612 	D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__,
4613 	    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
4614 
4615 	vsw_free_lane_resources(ldcp, INBOUND);
4616 	vsw_free_lane_resources(ldcp, OUTBOUND);
4617 	RW_EXIT(&ldcl->lockrw);
4618 
4619 	ldcp->lane_in.lstate = 0;
4620 	ldcp->lane_out.lstate = 0;
4621 
4622 	/*
4623 	 * Remove parent port from any multicast groups
4624 	 * it may have registered with. Client must resend
4625 	 * multicast add command after handshake completes.
4626 	 */
4627 	(void) vsw_del_fdb(vswp, port);
4628 
4629 	vsw_del_mcst_port(port);
4630 
4631 	ldcp->peer_session = 0;
4632 	ldcp->session_status = 0;
4633 	ldcp->hcnt = 0;
4634 	ldcp->hphase = VSW_MILESTONE0;
4635 
4636 	D1(vswp, "%s: exit", __func__);
4637 }
4638 
4639 /*
4640  * Process a connection event.
4641  *
4642  * Note - care must be taken to ensure that this function is
4643  * not called with the dlistrw lock held.
4644  */
4645 static void
4646 vsw_process_conn_evt(vsw_ldc_t *ldcp, uint16_t evt)
4647 {
4648 	vsw_t		*vswp = ldcp->ldc_vswp;
4649 	vsw_conn_evt_t	*conn = NULL;
4650 
4651 	D1(vswp, "%s: enter", __func__);
4652 
4653 	/*
4654 	 * Check if either a reset or restart event is pending
4655 	 * or in progress. If so just return.
4656 	 *
4657 	 * A VSW_CONN_RESET event originates either with an LDC_EVT_RESET
4658 	 * being received by the callback handler, or an ECONNRESET error
4659 	 * code being returned from a ldc_read() or ldc_write() call.
4660 	 *
4661 	 * A VSW_CONN_RESTART event occurs when some error checking code
4662 	 * decides that there is a problem with data from the channel,
4663 	 * and that the handshake should be restarted.
4664 	 */
4665 	if (((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART)) &&
4666 	    (ldstub((uint8_t *)&ldcp->reset_active)))
4667 		return;
4668 
4669 	/*
4670 	 * If it is an LDC_UP event we first check the recorded
4671 	 * state of the channel. If this is UP then we know that
4672 	 * the channel moving to the UP state has already been dealt
4673 	 * with and don't need to dispatch a new task.
4674 	 *
4675 	 * The reason for this check is that when we do a ldc_up(),
4676 	 * depending on the state of the peer, we may or may not get
4677 	 * a LDC_UP event. As we can't depend on getting a LDC_UP evt
4678 	 * every time we do ldc_up() we explicitly check the channel
4679 	 * status to see whether it has come up (ldc_up() is asynch and will
4680 	 * complete at some undefined time), and take the appropriate
4681 	 * action.
4682 	 *
4683 	 * The flip side of this is that we may get a LDC_UP event
4684 	 * when we have already seen that the channel is up and have
4685 	 * dealt with that.
4686 	 */
4687 	mutex_enter(&ldcp->status_lock);
4688 	if (evt == VSW_CONN_UP) {
4689 		if ((ldcp->ldc_status == LDC_UP) || (ldcp->reset_active != 0)) {
4690 			mutex_exit(&ldcp->status_lock);
4691 			return;
4692 		}
4693 	}
4694 	mutex_exit(&ldcp->status_lock);
4695 
4696 	/*
4697 	 * The transaction group id allows us to identify and discard
4698 	 * any tasks which are still pending on the taskq and refer
4699 	 * to the handshake session we are about to restart or reset.
4700 	 * These stale messages no longer have any real meaning.
4701 	 */
4702 	mutex_enter(&ldcp->hss_lock);
4703 	ldcp->hss_id++;
4704 	mutex_exit(&ldcp->hss_lock);
4705 
4706 	ASSERT(vswp->taskq_p != NULL);
4707 
4708 	if ((conn = kmem_zalloc(sizeof (vsw_conn_evt_t), KM_NOSLEEP)) == NULL) {
4709 		cmn_err(CE_WARN, "!vsw%d: unable to allocate memory for"
4710 		    " connection event", vswp->instance);
4711 		goto err_exit;
4712 	}
4713 
4714 	conn->evt = evt;
4715 	conn->ldcp = ldcp;
4716 
4717 	if (ddi_taskq_dispatch(vswp->taskq_p, vsw_conn_task, conn,
4718 	    DDI_NOSLEEP) != DDI_SUCCESS) {
4719 		cmn_err(CE_WARN, "!vsw%d: Can't dispatch connection task",
4720 		    vswp->instance);
4721 
4722 		kmem_free(conn, sizeof (vsw_conn_evt_t));
4723 		goto err_exit;
4724 	}
4725 
4726 	D1(vswp, "%s: exit", __func__);
4727 	return;
4728 
4729 err_exit:
4730 	/*
4731 	 * We have most likely failed due to a memory shortage. Clear the
4732 	 * flag so that future requests will at least be attempted and
4733 	 * will hopefully succeed.
4734 	 */
4735 	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
4736 		ldcp->reset_active = 0;
4737 }
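
/*
 * Editor's note: ldstub() performs an atomic test-and-set on a byte
 * (it stores 0xFF and returns the previous value), so the check at the
 * top of vsw_process_conn_evt() lets exactly one of several racing
 * reset/restart requests through:
 *
 *	if (ldstub((uint8_t *)&ldcp->reset_active))
 *		return;			(a reset is already in hand)
 *	... perform the reset ...
 *	ldcp->reset_active = 0;		(re-arm for the next event)
 */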
4738 
4739 /*
4740  * Deal with events relating to a connection. Invoked from a taskq.
4741  */
4742 static void
4743 vsw_conn_task(void *arg)
4744 {
4745 	vsw_conn_evt_t	*conn = (vsw_conn_evt_t *)arg;
4746 	vsw_ldc_t	*ldcp = NULL;
4747 	vsw_t		*vswp = NULL;
4748 	uint16_t	evt;
4749 	ldc_status_t	curr_status;
4750 
4751 	ldcp = conn->ldcp;
4752 	evt = conn->evt;
4753 	vswp = ldcp->ldc_vswp;
4754 
4755 	D1(vswp, "%s: enter", __func__);
4756 
4757 	/* can safely free now have copied out data */
4758 	kmem_free(conn, sizeof (vsw_conn_evt_t));
4759 
4760 	mutex_enter(&ldcp->status_lock);
4761 	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
4762 		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
4763 		    "channel %ld", vswp->instance, ldcp->ldc_id);
4764 		mutex_exit(&ldcp->status_lock);
4765 		return;
4766 	}
4767 
4768 	/*
4769 	 * If we wish to restart the handshake on this channel, then if
4770 	 * the channel is UP we bring it DOWN to flush the underlying
4771 	 * ldc queue.
4772 	 */
4773 	if ((evt == VSW_CONN_RESTART) && (curr_status == LDC_UP))
4774 		(void) ldc_down(ldcp->ldc_handle);
4775 
4776 	/*
4777 	 * re-init all the associated data structures.
4778 	 */
4779 	vsw_ldc_reinit(ldcp);
4780 
4781 	/*
4782 	 * Bring the channel back up (note it does no harm to
4783 	 * do this even if the channel is already UP; it just
4784 	 * becomes effectively a no-op).
4785 	 */
4786 	(void) ldc_up(ldcp->ldc_handle);
4787 
4788 	/*
4789 	 * Check if channel is now UP. This will only happen if
4790 	 * peer has also done a ldc_up().
4791 	 */
4792 	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
4793 		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
4794 		    "channel %ld", vswp->instance, ldcp->ldc_id);
4795 		mutex_exit(&ldcp->status_lock);
4796 		return;
4797 	}
4798 
4799 	ldcp->ldc_status = curr_status;
4800 
4801 	/* channel UP so restart handshake by sending version info */
4802 	if (curr_status == LDC_UP) {
4803 		if (ldcp->hcnt++ > vsw_num_handshakes) {
4804 			cmn_err(CE_WARN, "!vsw%d: exceeded number of permitted"
4805 			    " handshake attempts (%d) on channel %ld",
4806 			    vswp->instance, ldcp->hcnt, ldcp->ldc_id);
4807 			mutex_exit(&ldcp->status_lock);
4808 			return;
4809 		}
4810 
4811 		if (ddi_taskq_dispatch(vswp->taskq_p, vsw_send_ver, ldcp,
4812 		    DDI_NOSLEEP) != DDI_SUCCESS) {
4813 			cmn_err(CE_WARN, "!vsw%d: Can't dispatch version task",
4814 			    vswp->instance);
4815 
4816 			/*
4817 			 * Don't count as valid restart attempt if couldn't
4818 			 * send version msg.
4819 			 */
4820 			if (ldcp->hcnt > 0)
4821 				ldcp->hcnt--;
4822 		}
4823 	}
4824 
4825 	/*
4826 	 * Mark that the process is complete by clearing the flag.
4827 	 *
4828 	 * Note it is possible that the taskq dispatch above may have failed,
4829 	 * most likely due to a memory shortage. We still clear the flag so
4830 	 * that future requests will at least be attempted and will hopefully
4831 	 * succeed.
4832 	 */
4833 	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
4834 		ldcp->reset_active = 0;
4835 
4836 	mutex_exit(&ldcp->status_lock);
4837 
4838 	D1(vswp, "%s: exit", __func__);
4839 }
4840 
4841 /*
4842  * Returns 0 if it was legal for the event signified by the flag to have
4843  * occurred at the time it did. Otherwise returns 1.
4844  */
4845 int
4846 vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag)
4847 {
4848 	vsw_t		*vswp = ldcp->ldc_vswp;
4849 	uint64_t	state;
4850 	uint64_t	phase;
4851 
4852 	if (dir == INBOUND)
4853 		state = ldcp->lane_in.lstate;
4854 	else
4855 		state = ldcp->lane_out.lstate;
4856 
4857 	phase = ldcp->hphase;
4858 
4859 	switch (flag) {
4860 	case VSW_VER_INFO_RECV:
4861 		if (phase > VSW_MILESTONE0) {
4862 			DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV"
4863 			    " when in state %d\n", ldcp->ldc_id, phase);
4864 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4865 			return (1);
4866 		}
4867 		break;
4868 
4869 	case VSW_VER_ACK_RECV:
4870 	case VSW_VER_NACK_RECV:
4871 		if (!(state & VSW_VER_INFO_SENT)) {
4872 			DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK or "
4873 			    "VER_NACK when in state %d\n", ldcp->ldc_id, phase);
4874 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4875 			return (1);
4876 		} else
4877 			state &= ~VSW_VER_INFO_SENT;
4878 		break;
4879 
4880 	case VSW_ATTR_INFO_RECV:
4881 		if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) {
4882 			DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV"
4883 			    " when in state %d\n", ldcp->ldc_id, phase);
4884 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4885 			return (1);
4886 		}
4887 		break;
4888 
4889 	case VSW_ATTR_ACK_RECV:
4890 	case VSW_ATTR_NACK_RECV:
4891 		if (!(state & VSW_ATTR_INFO_SENT)) {
4892 			DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK"
4893 			    " or ATTR_NACK when in state %d\n",
4894 			    ldcp->ldc_id, phase);
4895 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4896 			return (1);
4897 		} else
4898 			state &= ~VSW_ATTR_INFO_SENT;
4899 		break;
4900 
4901 	case VSW_DRING_INFO_RECV:
4902 		if (phase < VSW_MILESTONE1) {
4903 			DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV"
4904 			    " when in state %d\n", ldcp->ldc_id, phase);
4905 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4906 			return (1);
4907 		}
4908 		break;
4909 
4910 	case VSW_DRING_ACK_RECV:
4911 	case VSW_DRING_NACK_RECV:
4912 		if (!(state & VSW_DRING_INFO_SENT)) {
4913 			DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK "
4914 			    " or DRING_NACK when in state %d\n",
4915 			    ldcp->ldc_id, phase);
4916 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4917 			return (1);
4918 		} else
4919 			state &= ~VSW_DRING_INFO_SENT;
4920 		break;
4921 
4922 	case VSW_RDX_INFO_RECV:
4923 		if (phase < VSW_MILESTONE3) {
4924 			DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV"
4925 			    " when in state %d\n", ldcp->ldc_id, phase);
4926 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4927 			return (1);
4928 		}
4929 		break;
4930 
4931 	case VSW_RDX_ACK_RECV:
4932 	case VSW_RDX_NACK_RECV:
4933 		if (!(state & VSW_RDX_INFO_SENT)) {
4934 			DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK or "
4935 			    "RDX_NACK when in state %d\n", ldcp->ldc_id, phase);
4936 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4937 			return (1);
4938 		} else
4939 			state &= ~VSW_RDX_INFO_SENT;
4940 		break;
4941 
4942 	case VSW_MCST_INFO_RECV:
4943 		if (phase < VSW_MILESTONE3) {
4944 			DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV"
4945 			    " when in state %d\n", ldcp->ldc_id, phase);
4946 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4947 			return (1);
4948 		}
4949 		break;
4950 
4951 	default:
4952 		DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)",
4953 		    ldcp->ldc_id, flag);
4954 		return (1);
4955 	}
4956 
4957 	if (dir == INBOUND)
4958 		ldcp->lane_in.lstate = state;
4959 	else
4960 		ldcp->lane_out.lstate = state;
4961 
4962 	D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id);
4963 
4964 	return (0);
4965 }
4966 
4967 void
4968 vsw_next_milestone(vsw_ldc_t *ldcp)
4969 {
4970 	vsw_t		*vswp = ldcp->ldc_vswp;
4971 
4972 	D1(vswp, "%s (chan %lld): enter (phase %ld)", __func__,
4973 	    ldcp->ldc_id, ldcp->hphase);
4974 
4975 	DUMP_FLAGS(ldcp->lane_in.lstate);
4976 	DUMP_FLAGS(ldcp->lane_out.lstate);
4977 
4978 	switch (ldcp->hphase) {
4979 
4980 	case VSW_MILESTONE0:
4981 		/*
4982 		 * If we haven't started to handshake with our peer,
4983 		 * start to do so now.
4984 		 */
4985 		if (ldcp->lane_out.lstate == 0) {
4986 			D2(vswp, "%s: (chan %lld) starting handshake "
4987 			    "with peer", __func__, ldcp->ldc_id);
4988 			vsw_process_conn_evt(ldcp, VSW_CONN_UP);
4989 		}
4990 
4991 		/*
4992 		 * Only way to pass this milestone is to have successfully
4993 		 * negotiated version info.
4994 		 */
4995 		if ((ldcp->lane_in.lstate & VSW_VER_ACK_SENT) &&
4996 		    (ldcp->lane_out.lstate & VSW_VER_ACK_RECV)) {
4997 
4998 			D2(vswp, "%s: (chan %lld) leaving milestone 0",
4999 			    __func__, ldcp->ldc_id);
5000 
5001 			/*
5002 			 * Next milestone is passed when attribute
5003 			 * information has been successfully exchanged.
5004 			 */
5005 			ldcp->hphase = VSW_MILESTONE1;
5006 			vsw_send_attr(ldcp);
5007 
5008 		}
5009 		break;
5010 
5011 	case VSW_MILESTONE1:
5012 		/*
5013 		 * Only way to pass this milestone is to have successfully
5014 		 * negotiated attribute information.
5015 		 */
5016 		if (ldcp->lane_in.lstate & VSW_ATTR_ACK_SENT) {
5017 
5018 			ldcp->hphase = VSW_MILESTONE2;
5019 
5020 			/*
5021 			 * If the peer device has said it wishes to
5022 			 * use descriptor rings then we send it our ring
5023 			 * info, otherwise we just set up a private ring
5024 			 * which uses an internal buffer.
5025 			 */
5026 			if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE)
5027 				vsw_send_dring_info(ldcp);
5028 		}
5029 		break;
5030 
5031 	case VSW_MILESTONE2:
5032 		/*
5033 		 * If peer has indicated in its attribute message that
5034 		 * it wishes to use descriptor rings then the only way
5035 		 * to pass this milestone is for us to have received
5036 		 * valid dring info.
5037 		 *
5038 		 * If peer is not using descriptor rings then just fall
5039 		 * through.
5040 		 */
5041 		if ((ldcp->lane_in.xfer_mode == VIO_DRING_MODE) &&
5042 		    (!(ldcp->lane_in.lstate & VSW_DRING_ACK_SENT)))
5043 			break;
5044 
5045 		D2(vswp, "%s: (chan %lld) leaving milestone 2",
5046 		    __func__, ldcp->ldc_id);
5047 
5048 		ldcp->hphase = VSW_MILESTONE3;
5049 		vsw_send_rdx(ldcp);
5050 		break;
5051 
5052 	case VSW_MILESTONE3:
5053 		/*
5054 		 * Pass this milestone when all parameters have been
5055 		 * successfully exchanged and RDX sent in both directions.
5056 		 *
5057 		 * Mark outbound lane as available to transmit data.
5058 		 */
5059 		if ((ldcp->lane_out.lstate & VSW_RDX_ACK_SENT) &&
5060 		    (ldcp->lane_in.lstate & VSW_RDX_ACK_RECV)) {
5061 
5062 			D2(vswp, "%s: (chan %lld) leaving milestone 3",
5063 			    __func__, ldcp->ldc_id);
5064 			D2(vswp, "%s: ** handshake complete (0x%llx : "
5065 			    "0x%llx) **", __func__, ldcp->lane_in.lstate,
5066 			    ldcp->lane_out.lstate);
5067 			ldcp->lane_out.lstate |= VSW_LANE_ACTIVE;
5068 			ldcp->hphase = VSW_MILESTONE4;
5069 			ldcp->hcnt = 0;
5070 			DISPLAY_STATE();
5071 		} else {
5072 			D2(vswp, "%s: still in milestone 3 (0x%llx : 0x%llx)",
5073 			    __func__, ldcp->lane_in.lstate,
5074 			    ldcp->lane_out.lstate);
5075 		}
5076 		break;
5077 
5078 	case VSW_MILESTONE4:
5079 		D2(vswp, "%s: (chan %lld) in milestone 4", __func__,
5080 		    ldcp->ldc_id);
5081 		break;
5082 
5083 	default:
5084 		DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__,
5085 		    ldcp->ldc_id, ldcp->hphase);
5086 	}
5087 
5088 	D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id,
5089 	    ldcp->hphase);
5090 }
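
/*
 * Editor's summary of the handshake phases driven above:
 *
 *	VSW_MILESTONE0	version negotiation (VER INFO/ACK)
 *	VSW_MILESTONE1	attribute exchange (ATTR INFO/ACK)
 *	VSW_MILESTONE2	dring registration, if peer uses VIO_DRING_MODE
 *	VSW_MILESTONE3	RDX exchanged in both directions
 *	VSW_MILESTONE4	handshake complete; outbound lane marked active
 */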
5091 
5092 /*
5093  * Check if major version is supported.
5094  *
5095  * Returns 0 if it finds a supported major number, and if necessary
5096  * adjusts the minor field.
5097  *
5098  * Returns 1 if it can't match the major number exactly. Sets major/minor
5099  * to the next lowest supported values, or to zero if no others are possible.
5100  */
5101 static int
5102 vsw_supported_version(vio_ver_msg_t *vp)
5103 {
5104 	int	i;
5105 
5106 	D1(NULL, "vsw_supported_version: enter");
5107 
5108 	for (i = 0; i < VSW_NUM_VER; i++) {
5109 		if (vsw_versions[i].ver_major == vp->ver_major) {
5110 			/*
5111 			 * Matching major version found. Update the
5112 			 * minor number if necessary.
5113 			 */
5114 			if (vp->ver_minor > vsw_versions[i].ver_minor) {
5115 				D2(NULL, "%s: adjusting minor value from %d "
5116 				    "to %d", __func__, vp->ver_minor,
5117 				    vsw_versions[i].ver_minor);
5118 				vp->ver_minor = vsw_versions[i].ver_minor;
5119 			}
5120 
5121 			return (0);
5122 		}
5123 
5124 		if (vsw_versions[i].ver_major < vp->ver_major) {
5125 			/* next lower major version we support */
5126 			vp->ver_major = vsw_versions[i].ver_major;
5127 			if (vp->ver_minor > vsw_versions[i].ver_minor) {
5128 				D2(NULL, "%s: adjusting minor value from %d "
5129 				    "to %d", __func__, vp->ver_minor,
5130 				    vsw_versions[i].ver_minor);
5131 				vp->ver_minor = vsw_versions[i].ver_minor;
5132 			}
5133 			return (1);
5134 		}
5133 	}
5134 
5135 	/* No match was possible, zero out fields */
5136 	vp->ver_major = 0;
5137 	vp->ver_minor = 0;
5138 
5139 	D1(NULL, "vsw_supported_version: exit");
5140 
5141 	return (1);
5142 }
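
/*
 * Editor's note: vsw_supported_version() assumes vsw_versions[] is
 * sorted in descending major order. For example, with a table of just
 * { 1, 0 }:
 *
 *	peer offers 1.3  ->  returns 0, adjusted to 1.0 (accept)
 *	peer offers 2.0  ->  returns 1, set to 1.0 (counter-offer)
 *	peer offers 0.5  ->  returns 1, set to 0.0 (give up)
 */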
5143 
5144 /*
5145  * Main routine for processing messages received over LDC.
5146  */
5147 static void
5148 vsw_process_pkt(void *arg)
5149 {
5150 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
5151 	vsw_t 		*vswp = ldcp->ldc_vswp;
5152 	size_t		msglen;
5153 	vio_msg_tag_t	tag;
5154 	def_msg_t	dmsg;
5155 	int 		rv = 0;
5156 
5157 
5158 	D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
5159 
5160 	/*
5161 	 * If the channel is up, read messages until the channel is empty.
5162 	 */
5163 	do {
5164 		msglen = sizeof (dmsg);
5165 		rv = ldc_read(ldcp->ldc_handle, (caddr_t)&dmsg, &msglen);
5166 
5167 		if (rv != 0) {
5168 			DERR(vswp, "%s :ldc_read err id(%lld) rv(%d) len(%d)\n",
5169 			    __func__, ldcp->ldc_id, rv, msglen);
5170 		}
5171 
5172 		/* channel has been reset */
5173 		if (rv == ECONNRESET) {
5174 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
5175 			break;
5176 		}
5177 
5178 		if (msglen == 0) {
5179 			D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__,
5180 			    ldcp->ldc_id);
5181 			break;
5182 		}
5183 
5184 		D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__,
5185 		    ldcp->ldc_id, msglen);
5186 
5187 		/*
5188 		 * Figure out what sort of packet we have gotten by
5189 		 * examining the msg tag, and then switch it appropriately.
5190 		 */
5191 		bcopy(&dmsg, &tag, sizeof (vio_msg_tag_t));
5192 
5193 		switch (tag.vio_msgtype) {
5194 		case VIO_TYPE_CTRL:
5195 			vsw_dispatch_ctrl_task(ldcp, &dmsg, tag);
5196 			break;
5197 		case VIO_TYPE_DATA:
5198 			vsw_process_data_pkt(ldcp, &dmsg, tag);
5199 			break;
5200 		case VIO_TYPE_ERR:
5201 			vsw_process_err_pkt(ldcp, &dmsg, tag);
5202 			break;
5203 		default:
5204 			DERR(vswp, "%s: Unknown tag(%lx) id(%lx)\n",
5205 			    __func__, tag.vio_msgtype, ldcp->ldc_id);
5206 			break;
5207 		}
5208 	} while (msglen);
5209 
5210 	D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id);
5211 }
5212 
5213 /*
5214  * Dispatch a task to process a VIO control message.
5215  */
5216 static void
5217 vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t tag)
5218 {
5219 	vsw_ctrl_task_t		*ctaskp = NULL;
5220 	vsw_port_t		*port = ldcp->ldc_port;
5221 	vsw_t			*vswp = port->p_vswp;
5222 
5223 	D1(vswp, "%s: enter", __func__);
5224 
5225 	/*
5226 	 * We need to handle RDX ACK messages in-band as once they
5227 	 * are exchanged it is possible that we will get an
5228 	 * immediate (legitimate) data packet.
5229 	 */
5230 	if ((tag.vio_subtype_env == VIO_RDX) &&
5231 	    (tag.vio_subtype == VIO_SUBTYPE_ACK)) {
5232 
5233 		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_ACK_RECV))
5234 			return;
5235 
5236 		ldcp->lane_in.lstate |= VSW_RDX_ACK_RECV;
5237 		D2(vswp, "%s (%ld) handling RDX_ACK in place "
5238 		    "(ostate 0x%llx : hphase %d)", __func__,
5239 		    ldcp->ldc_id, ldcp->lane_in.lstate, ldcp->hphase);
5240 		vsw_next_milestone(ldcp);
5241 		return;
5242 	}
5243 
5244 	ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP);
5245 
5246 	if (ctaskp == NULL) {
5247 		DERR(vswp, "%s: unable to alloc space for ctrl msg", __func__);
5248 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
5249 		return;
5250 	}
5251 
5252 	ctaskp->ldcp = ldcp;
5253 	bcopy((def_msg_t *)cpkt, &ctaskp->pktp, sizeof (def_msg_t));
5254 	mutex_enter(&ldcp->hss_lock);
5255 	ctaskp->hss_id = ldcp->hss_id;
5256 	mutex_exit(&ldcp->hss_lock);
5257 
5258 	/*
5259 	 * Dispatch task to processing taskq if port is not in
5260 	 * the process of being detached.
5261 	 */
5262 	mutex_enter(&port->state_lock);
5263 	if (port->state == VSW_PORT_INIT) {
5264 		if ((vswp->taskq_p == NULL) ||
5265 		    (ddi_taskq_dispatch(vswp->taskq_p, vsw_process_ctrl_pkt,
5266 		    ctaskp, DDI_NOSLEEP) != DDI_SUCCESS)) {
5267 			DERR(vswp, "%s: unable to dispatch task to taskq",
5268 			    __func__);
5269 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
5270 			mutex_exit(&port->state_lock);
5271 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
5272 			return;
5273 		}
5274 	} else {
5275 		DWARN(vswp, "%s: port %d detaching, not dispatching "
5276 		    "task", __func__, port->p_instance);
5277 	}
5278 
5279 	mutex_exit(&port->state_lock);
5280 
5281 	D2(vswp, "%s: dispatched task to taskq for chan %d", __func__,
5282 	    ldcp->ldc_id);
5283 	D1(vswp, "%s: exit", __func__);
5284 }
5285 
5286 /*
5287  * Process a VIO ctrl message. Invoked from taskq.
5288  */
5289 static void
5290 vsw_process_ctrl_pkt(void *arg)
5291 {
5292 	vsw_ctrl_task_t	*ctaskp = (vsw_ctrl_task_t *)arg;
5293 	vsw_ldc_t	*ldcp = ctaskp->ldcp;
5294 	vsw_t 		*vswp = ldcp->ldc_vswp;
5295 	vio_msg_tag_t	tag;
5296 	uint16_t	env;
5297 
5298 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
5299 
5300 	bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t));
5301 	env = tag.vio_subtype_env;
5302 
5303 	/* stale pkt check */
5304 	mutex_enter(&ldcp->hss_lock);
5305 	if (ctaskp->hss_id < ldcp->hss_id) {
5306 		DWARN(vswp, "%s: discarding stale packet belonging to earlier"
5307 		    " (%ld) handshake session", __func__, ctaskp->hss_id);
5308 		mutex_exit(&ldcp->hss_lock);
5309 		return;
5310 	}
5311 	mutex_exit(&ldcp->hss_lock);
5312 
5313 	/* session id check */
5314 	if (ldcp->session_status & VSW_PEER_SESSION) {
5315 		if (ldcp->peer_session != tag.vio_sid) {
5316 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
5317 			    __func__, ldcp->ldc_id, tag.vio_sid);
5318 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
5319 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
5320 			return;
5321 		}
5322 	}
5323 
5324 	/*
5325 	 * Switch on vio_subtype envelope, then let lower routines
5326 	 * decide if it's an INFO, ACK or NACK packet.
5327 	 */
5328 	switch (env) {
5329 	case VIO_VER_INFO:
5330 		vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp);
5331 		break;
5332 	case VIO_DRING_REG:
5333 		vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp);
5334 		break;
5335 	case VIO_DRING_UNREG:
5336 		vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp);
5337 		break;
5338 	case VIO_ATTR_INFO:
5339 		vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp);
5340 		break;
5341 	case VNET_MCAST_INFO:
5342 		vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp);
5343 		break;
5344 	case VIO_RDX:
5345 		vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp);
5346 		break;
5347 	default:
5348 		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env);
5349 	}
5350 
5351 	kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
5352 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
5353 }
5354 
5355 /*
5356  * Version negotiation. We can end up here either because our peer
5357  * has responded to a handshake message we have sent it, or our peer
5358  * has initiated a handshake with us. If it's the former then it can only
5359  * be an ACK or NACK; if it's the latter it can only be an INFO.
5360  *
5361  * If it's an ACK we move to the next stage of the handshake, namely
5362  * attribute exchange. If it's a NACK we see if we can specify another
5363  * version; if we can't, we stop.
5364  *
5365  * If it is an INFO we reset all params associated with communication
5366  * in that direction over this channel (remember connection is
5367  * essentially 2 independent simplex channels).
5368  */
5369 void
5370 vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt)
5371 {
5372 	vio_ver_msg_t	*ver_pkt;
5373 	vsw_t 		*vswp = ldcp->ldc_vswp;
5374 
5375 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
5376 
5377 	/*
5378 	 * We know this is a ctrl/version packet so
5379 	 * cast it into the correct structure.
5380 	 */
5381 	ver_pkt = (vio_ver_msg_t *)pkt;
5382 
5383 	switch (ver_pkt->tag.vio_subtype) {
5384 	case VIO_SUBTYPE_INFO:
5385 		D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n");
5386 
5387 		/*
5388 		 * Record the session id, which we will use from now
5389 		 * until we see another VER_INFO msg. Even then the
5390 		 * session id in most cases will be unchanged, except
5391 		 * if the channel was reset.
5392 		 */
5393 		if ((ldcp->session_status & VSW_PEER_SESSION) &&
5394 		    (ldcp->peer_session != ver_pkt->tag.vio_sid)) {
5395 			DERR(vswp, "%s: updating session id for chan %lld "
5396 			    "from %llx to %llx", __func__, ldcp->ldc_id,
5397 			    ldcp->peer_session, ver_pkt->tag.vio_sid);
5398 		}
5399 
5400 		ldcp->peer_session = ver_pkt->tag.vio_sid;
5401 		ldcp->session_status |= VSW_PEER_SESSION;
5402 
5403 		/* Legal message at this time ? */
5404 		if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV))
5405 			return;
5406 
5407 		/*
5408 		 * First check the device class. Currently only expect
5409 		 * to be talking to a network device. In the future may
5410 		 * also talk to another switch.
5411 		 */
5412 		if (ver_pkt->dev_class != VDEV_NETWORK) {
5413 			DERR(vswp, "%s: illegal device class %d", __func__,
5414 			    ver_pkt->dev_class);
5415 
5416 			ver_pkt->tag.vio_sid = ldcp->local_session;
5417 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
5418 
5419 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
5420 
5421 			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
5422 			    sizeof (vio_ver_msg_t), B_TRUE);
5423 
5424 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
5425 			vsw_next_milestone(ldcp);
5426 			return;
5427 		} else {
5428 			ldcp->dev_class = ver_pkt->dev_class;
5429 		}
5430 
5431 		/*
5432 		 * Now check the version.
5433 		 */
5434 		if (vsw_supported_version(ver_pkt) == 0) {
5435 			/*
5436 			 * Support this major version and possibly
5437 			 * adjusted minor version.
5438 			 */
5439 
5440 			D2(vswp, "%s: accepted ver %d:%d", __func__,
5441 			    ver_pkt->ver_major, ver_pkt->ver_minor);
5442 
5443 			/* Store accepted values */
5444 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
5445 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
5446 
5447 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
5448 
5449 			ldcp->lane_in.lstate |= VSW_VER_ACK_SENT;
5450 		} else {
5451 			/*
5452 			 * NACK back with the next lower major/minor
5453 			 * pairing we support (if we don't support any
5454 			 * lower versions then they will be set to zero).
5455 			 */
5456 
5457 			D2(vswp, "%s: replying with ver %d:%d", __func__,
5458 			    ver_pkt->ver_major, ver_pkt->ver_minor);
5459 
5460 			/* Store updated values */
5461 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
5462 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
5463 
5464 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
5465 
5466 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
5467 		}
5468 
5469 		DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
5470 		ver_pkt->tag.vio_sid = ldcp->local_session;
5471 		(void) vsw_send_msg(ldcp, (void *)ver_pkt,
5472 		    sizeof (vio_ver_msg_t), B_TRUE);
5473 
5474 		vsw_next_milestone(ldcp);
5475 		break;
5476 
5477 	case VIO_SUBTYPE_ACK:
5478 		D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__);
5479 
5480 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV))
5481 			return;
5482 
5483 		/* Store updated values */
5484 		ldcp->lane_in.ver_major = ver_pkt->ver_major;
5485 		ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
5486 
5487 		ldcp->lane_out.lstate |= VSW_VER_ACK_RECV;
5488 		vsw_next_milestone(ldcp);
5489 
5490 		break;
5491 
5492 	case VIO_SUBTYPE_NACK:
5493 		D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__);
5494 
5495 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV))
5496 			return;
5497 
5498 		/*
5499 		 * If our peer sent us a NACK with the ver fields set to
5500 		 * zero then there is nothing more we can do. Otherwise see
5501 		 * if we support either the version suggested, or a lesser
5502 		 * one.
5503 		 */
5504 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
5505 			DERR(vswp, "%s: peer unable to negotiate any "
5506 			    "further.", __func__);
5507 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
5508 			vsw_next_milestone(ldcp);
5509 			return;
5510 		}
5511 
5512 		/*
5513 		 * Check to see if we support this major version or
5514 		 * a lower one. If we don't then maj/min will be set
5515 		 * to zero.
5516 		 */
5517 		(void) vsw_supported_version(ver_pkt);
5518 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
5519 			/* Nothing more we can do */
5520 			DERR(vswp, "%s: version negotiation failed.\n",
5521 			    __func__);
5522 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
5523 			vsw_next_milestone(ldcp);
5524 		} else {
5525 			/* found a supported major version */
5526 			ldcp->lane_out.ver_major = ver_pkt->ver_major;
5527 			ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
5528 
5529 			D2(vswp, "%s: resending with updated values (%x, %x)",
5530 			    __func__, ver_pkt->ver_major, ver_pkt->ver_minor);
5531 
5532 			ldcp->lane_out.lstate |= VSW_VER_INFO_SENT;
5533 			ver_pkt->tag.vio_sid = ldcp->local_session;
5534 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
5535 
5536 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
5537 
5538 			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
5539 			    sizeof (vio_ver_msg_t), B_TRUE);
5540 
5541 			vsw_next_milestone(ldcp);
5542 
5543 		}
5544 		break;
5545 
5546 	default:
5547 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
5548 		    ver_pkt->tag.vio_subtype);
5549 	}
5550 
5551 	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
5552 }
5553 
5554 /*
5555  * Process an attribute packet. We can end up here either because our peer
5556  * has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or our
5557  * peer has sent us an attribute INFO message.
5558  *
5559  * If it's an ACK we then move to the next stage of the handshake, which
5560  * is to send our descriptor ring info to our peer. If it's a NACK then
5561  * there is nothing more we can (currently) do.
5562  *
5563  * If we get a valid/acceptable INFO packet (and we have already negotiated
5564  * a version) we ACK back and set channel state to ATTR_RECV, otherwise we
5565  * NACK back and reset the channel state to INACTIVE.
5566  *
5567  * FUTURE: in time we will probably negotiate over attributes, but for
5568  * the moment unacceptable attributes are regarded as a fatal error.
5569  *
5570  */
5571 void
5572 vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt)
5573 {
5574 	vnet_attr_msg_t		*attr_pkt;
5575 	vsw_t			*vswp = ldcp->ldc_vswp;
5576 	vsw_port_t		*port = ldcp->ldc_port;
5577 	uint64_t		macaddr = 0;
5578 	int			i;
5579 
5580 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
5581 
5582 	/*
5583 	 * We know this is a ctrl/attr packet so
5584 	 * cast it into the correct structure.
5585 	 */
5586 	attr_pkt = (vnet_attr_msg_t *)pkt;
5587 
5588 	switch (attr_pkt->tag.vio_subtype) {
5589 	case VIO_SUBTYPE_INFO:
5590 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
5591 
5592 		if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV))
5593 			return;
5594 
5595 		/*
5596 		 * If the attributes are unacceptable then we NACK back.
5597 		 */
5598 		if (vsw_check_attr(attr_pkt, ldcp->ldc_port)) {
5599 
5600 			DERR(vswp, "%s (chan %d): invalid attributes",
5601 			    __func__, ldcp->ldc_id);
5602 
5603 			vsw_free_lane_resources(ldcp, INBOUND);
5604 
5605 			attr_pkt->tag.vio_sid = ldcp->local_session;
5606 			attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
5607 
5608 			DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
5609 			ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT;
5610 			(void) vsw_send_msg(ldcp, (void *)attr_pkt,
5611 			    sizeof (vnet_attr_msg_t), B_TRUE);
5612 
5613 			vsw_next_milestone(ldcp);
5614 			return;
5615 		}
5616 
5617 		/*
5618 		 * Otherwise store attributes for this lane and update
5619 		 * lane state.
5620 		 */
5621 		ldcp->lane_in.mtu = attr_pkt->mtu;
5622 		ldcp->lane_in.addr = attr_pkt->addr;
5623 		ldcp->lane_in.addr_type = attr_pkt->addr_type;
5624 		ldcp->lane_in.xfer_mode = attr_pkt->xfer_mode;
5625 		ldcp->lane_in.ack_freq = attr_pkt->ack_freq;
5626 
5627 		macaddr = ldcp->lane_in.addr;
5628 		for (i = ETHERADDRL - 1; i >= 0; i--) {
5629 			port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF;
5630 			macaddr >>= 8;
5631 		}
5632 
5633 		/* create the fdb entry for this port/mac address */
5634 		(void) vsw_add_fdb(vswp, port);
5635 
5636 		/* set up device-specific xmit routines */
5637 		mutex_enter(&port->tx_lock);
5638 		if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE) {
5639 			D2(vswp, "%s: mode = VIO_DRING_MODE", __func__);
5640 			port->transmit = vsw_dringsend;
5641 		} else if (ldcp->lane_in.xfer_mode == VIO_DESC_MODE) {
5642 			D2(vswp, "%s: mode = VIO_DESC_MODE", __func__);
5643 			vsw_create_privring(ldcp);
5644 			port->transmit = vsw_descrsend;
5645 		}
5646 		mutex_exit(&port->tx_lock);
5647 
5648 		attr_pkt->tag.vio_sid = ldcp->local_session;
5649 		attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
5650 
5651 		DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
5652 
5653 		ldcp->lane_in.lstate |= VSW_ATTR_ACK_SENT;
5654 
5655 		(void) vsw_send_msg(ldcp, (void *)attr_pkt,
5656 		    sizeof (vnet_attr_msg_t), B_TRUE);
5657 
5658 		vsw_next_milestone(ldcp);
5659 		break;
5660 
5661 	case VIO_SUBTYPE_ACK:
5662 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
5663 
5664 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV))
5665 			return;
5666 
5667 		ldcp->lane_out.lstate |= VSW_ATTR_ACK_RECV;
5668 		vsw_next_milestone(ldcp);
5669 		break;
5670 
5671 	case VIO_SUBTYPE_NACK:
5672 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
5673 
5674 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV))
5675 			return;
5676 
5677 		ldcp->lane_out.lstate |= VSW_ATTR_NACK_RECV;
5678 		vsw_next_milestone(ldcp);
5679 		break;
5680 
5681 	default:
5682 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
5683 		    attr_pkt->tag.vio_subtype);
5684 	}
5685 
5686 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
5687 }
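
/*
 * Editor's note: the attribute message carries the MAC address in the
 * low 48 bits of a uint64_t with the most significant octet first; the
 * loop in the INFO case above unpacks it into port->p_macaddr. The
 * inverse, as a sketch:
 *
 *	uint64_t macaddr = 0;
 *	for (i = 0; i < ETHERADDRL; i++) {
 *		macaddr <<= 8;
 *		macaddr |= port->p_macaddr.ether_addr_octet[i];
 *	}
 */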
5688 
5689 /*
5690  * Process a dring info packet. We can end up here either because our peer
5691  * has ACK/NACK'ed back to an earlier DRING msg we had sent it, or our
5692  * peer has sent us a dring INFO message.
5693  *
5694  * If we get a valid/acceptable INFO packet (and we have already negotiated
5695  * a version) we ACK back and update the lane state, otherwise we NACK back.
5696  *
5697  * FUTURE: nothing stops the client from sending us info on multiple drings,
5698  * but for the moment we will just use the first one we are given.
5699  *
5700  */
5701 void
5702 vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt)
5703 {
5704 	vio_dring_reg_msg_t	*dring_pkt;
5705 	vsw_t			*vswp = ldcp->ldc_vswp;
5706 	ldc_mem_info_t		minfo;
5707 	dring_info_t		*dp, *dbp;
5708 	int			dring_found = 0;
5709 
5710 	/*
5711 	 * We know this is a ctrl/dring packet so
5712 	 * cast it into the correct structure.
5713 	 */
5714 	dring_pkt = (vio_dring_reg_msg_t *)pkt;
5715 
5716 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
5717 
5718 	switch (dring_pkt->tag.vio_subtype) {
5719 	case VIO_SUBTYPE_INFO:
5720 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
5721 
5722 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
5723 			return;
5724 
5725 		/*
5726 		 * If the dring params are unacceptable then we NACK back.
5727 		 */
5728 		if (vsw_check_dring_info(dring_pkt)) {
5729 
5730 			DERR(vswp, "%s (%lld): invalid dring info",
5731 			    __func__, ldcp->ldc_id);
5732 
5733 			vsw_free_lane_resources(ldcp, INBOUND);
5734 
5735 			dring_pkt->tag.vio_sid = ldcp->local_session;
5736 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
5737 
5738 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
5739 
5740 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
5741 
5742 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
5743 			    sizeof (vio_dring_reg_msg_t), B_TRUE);
5744 
5745 			vsw_next_milestone(ldcp);
5746 			return;
5747 		}
5748 
5749 		/*
5750 		 * Otherwise, attempt to map in the dring using the
5751 		 * cookie. If that succeeds we send back a unique dring
5752 		 * identifier that the sending side will use in future
5753 		 * to refer to this descriptor ring.
5754 		 */
5755 		dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
5756 
5757 		dp->num_descriptors = dring_pkt->num_descriptors;
5758 		dp->descriptor_size = dring_pkt->descriptor_size;
5759 		dp->options = dring_pkt->options;
5760 		dp->ncookies = dring_pkt->ncookies;
5761 
5762 		/*
5763 		 * Note: should only get one cookie. Enforced in
5764 		 * the ldc layer.
5765 		 */
5766 		bcopy(&dring_pkt->cookie[0], &dp->cookie[0],
5767 		    sizeof (ldc_mem_cookie_t));
5768 
5769 		D2(vswp, "%s: num_desc %ld : desc_size %ld", __func__,
5770 		    dp->num_descriptors, dp->descriptor_size);
5771 		D2(vswp, "%s: options 0x%lx: ncookies %ld", __func__,
5772 		    dp->options, dp->ncookies);
5773 
5774 		if ((ldc_mem_dring_map(ldcp->ldc_handle, &dp->cookie[0],
5775 		    dp->ncookies, dp->num_descriptors, dp->descriptor_size,
5776 		    LDC_SHADOW_MAP, &(dp->handle))) != 0) {
5777 
5778 			DERR(vswp, "%s: dring_map failed\n", __func__);
5779 
5780 			kmem_free(dp, sizeof (dring_info_t));
5781 			vsw_free_lane_resources(ldcp, INBOUND);
5782 
5783 			dring_pkt->tag.vio_sid = ldcp->local_session;
5784 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
5785 
5786 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
5787 
5788 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
5789 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
5790 			    sizeof (vio_dring_reg_msg_t), B_TRUE);
5791 
5792 			vsw_next_milestone(ldcp);
5793 			return;
5794 		}
5795 
5796 		if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
5797 
5798 			DERR(vswp, "%s: dring_info failed\n", __func__);
5799 
5800 			kmem_free(dp, sizeof (dring_info_t));
5801 			vsw_free_lane_resources(ldcp, INBOUND);
5802 
5803 			dring_pkt->tag.vio_sid = ldcp->local_session;
5804 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
5805 
5806 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
5807 
5808 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
5809 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
5810 			    sizeof (vio_dring_reg_msg_t), B_TRUE);
5811 
5812 			vsw_next_milestone(ldcp);
5813 			return;
5814 		} else {
5815 			/* store the address of the pub part of ring */
5816 			dp->pub_addr = minfo.vaddr;
5817 		}
5818 
5819 		/* no private section as we are importing */
5820 		dp->priv_addr = NULL;
5821 
5822 		/*
5823 		 * Using a simple monotonically increasing integer
5824 		 * for the ident at the moment.
5825 		 */
5826 		dp->ident = ldcp->next_ident;
5827 		ldcp->next_ident++;
5828 
5829 		dp->end_idx = 0;
5830 		dp->next = NULL;
5831 
5832 		/*
5833 		 * Link it onto the end of the list of drings
5834 		 * for this lane.
5835 		 */
5836 		if (ldcp->lane_in.dringp == NULL) {
5837 			D2(vswp, "%s: adding first INBOUND dring", __func__);
5838 			ldcp->lane_in.dringp = dp;
5839 		} else {
5840 			dbp = ldcp->lane_in.dringp;
5841 
5842 			while (dbp->next != NULL)
5843 				dbp = dbp->next;
5844 
5845 			dbp->next = dp;
5846 		}
5847 
5848 		/* acknowledge it */
5849 		dring_pkt->tag.vio_sid = ldcp->local_session;
5850 		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
5851 		dring_pkt->dring_ident = dp->ident;
5852 
5853 		(void) vsw_send_msg(ldcp, (void *)dring_pkt,
5854 		    sizeof (vio_dring_reg_msg_t), B_TRUE);
5855 
5856 		ldcp->lane_in.lstate |= VSW_DRING_ACK_SENT;
5857 		vsw_next_milestone(ldcp);
5858 		break;
5859 
5860 	case VIO_SUBTYPE_ACK:
5861 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
5862 
5863 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV))
5864 			return;
5865 
5866 		/*
5867 		 * Peer is acknowledging our dring info and will have
5868 		 * sent us a dring identifier which we will use to
5869 		 * refer to this ring w.r.t. our peer.
5870 		 */
5871 		dp = ldcp->lane_out.dringp;
5872 		if (dp != NULL) {
5873 			/*
5874 			 * Find the ring this ident should be associated
5875 			 * with.
5876 			 */
5877 			while (dp != NULL) {
5878 				if (vsw_dring_match(dp, dring_pkt)) {
5879 					dring_found = 1;
5880 					break;
5881 				}
5882 				dp = dp->next;
5883 			}
5887 
5888 			if (dring_found == 0) {
5889 				DERR(vswp, "%s: unrecognised ring cookie",
5890 				    __func__);
5891 				vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
5892 				return;
5893 			}
5894 
5895 		} else {
5896 			DERR(vswp, "%s: DRING ACK received but no drings "
5897 			    "allocated", __func__);
5898 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
5899 			return;
5900 		}
5901 
5902 		/* store ident */
5903 		dp->ident = dring_pkt->dring_ident;
5904 		ldcp->lane_out.lstate |= VSW_DRING_ACK_RECV;
5905 		vsw_next_milestone(ldcp);
5906 		break;
5907 
5908 	case VIO_SUBTYPE_NACK:
5909 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
5910 
5911 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV))
5912 			return;
5913 
5914 		ldcp->lane_out.lstate |= VSW_DRING_NACK_RECV;
5915 		vsw_next_milestone(ldcp);
5916 		break;
5917 
5918 	default:
5919 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
5920 		    dring_pkt->tag.vio_subtype);
5921 	}
5922 
5923 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
5924 }
5925 
5926 /*
5927  * Process a request from peer to unregister a dring.
5928  *
5929  * For the moment we just restart the handshake if our
5930  * peer endpoint attempts to unregister a dring.
5931  */
5932 void
5933 vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt)
5934 {
5935 	vsw_t			*vswp = ldcp->ldc_vswp;
5936 	vio_dring_unreg_msg_t	*dring_pkt;
5937 
5938 	/*
5939 	 * We know this is a ctrl/dring packet so
5940 	 * cast it into the correct structure.
5941 	 */
5942 	dring_pkt = (vio_dring_unreg_msg_t *)pkt;
5943 
5944 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
5945 
5946 	switch (dring_pkt->tag.vio_subtype) {
5947 	case VIO_SUBTYPE_INFO:
5948 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
5949 
5950 		DWARN(vswp, "%s: restarting handshake...", __func__);
5951 		break;
5952 
5953 	case VIO_SUBTYPE_ACK:
5954 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
5955 
5956 		DWARN(vswp, "%s: restarting handshake...", __func__);
5957 		break;
5958 
5959 	case VIO_SUBTYPE_NACK:
5960 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
5961 
5962 		DWARN(vswp, "%s: restarting handshake...", __func__);
5963 		break;
5964 
5965 	default:
5966 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
5967 		    dring_pkt->tag.vio_subtype);
5968 	}
5969 
5970 	vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
5971 
5972 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
5973 }
5974 
5975 #define	SND_MCST_NACK(ldcp, pkt) do { \
5976 	pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \
5977 	pkt->tag.vio_sid = ldcp->local_session; \
5978 	(void) vsw_send_msg(ldcp, (void *)pkt, \
5979 	    sizeof (vnet_mcast_msg_t), B_TRUE); \
5980 } while (0)

5981 /*
5982  * Process a multicast request from a vnet.
5983  *
5984  * Vnets specify a multicast address that they are interested in. This
5985  * address is used as a key into the hash table which forms the multicast
5986  * forwarding database (mFDB).
5987  *
5988  * The table keys are the multicast addresses, while the table entries
5989  * are pointers to lists of ports which wish to receive packets for the
5990  * specified multicast address.
5991  *
5992  * When a multicast packet is being switched we use the address as a key
5993  * into the hash table, and then walk the appropriate port list forwarding
5994  * the pkt to each port in turn.
5995  *
5996  * If a vnet is no longer interested in a particular multicast grouping
5997  * we simply find the correct location in the hash table and then delete
5998  * the relevant port from the port list.
5999  *
6000  * To deal with the case whereby a port is being deleted without first
6001  * removing itself from the lists in the hash table, we maintain a list
6002  * of multicast addresses the port has registered an interest in, within
6003  * the port structure itself. We then simply walk that list of addresses
6004  * using them as keys into the hash table and remove the port from the
6005  * appropriate lists.
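 *
 * For example, if vnet ports A and B have both registered an interest
 * in 01:00:5e:00:00:01, the mFDB entry keyed by that address holds a
 * two-element port list (A, B); a packet switched to that address is
 * duplicated to each of them. When port A is later deleted, the list
 * of addresses kept in A's port structure is walked to remove A from
 * this entry.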
6006  */
6007 static void
6008 vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt)
6009 {
6010 	vnet_mcast_msg_t	*mcst_pkt;
6011 	vsw_port_t		*port = ldcp->ldc_port;
6012 	vsw_t			*vswp = ldcp->ldc_vswp;
6013 	int			i;
6014 
6015 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
6016 
6017 	/*
6018 	 * We know this is a ctrl/mcast packet so
6019 	 * cast it into the correct structure.
6020 	 */
6021 	mcst_pkt = (vnet_mcast_msg_t *)pkt;
6022 
6023 	switch (mcst_pkt->tag.vio_subtype) {
6024 	case VIO_SUBTYPE_INFO:
6025 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
6026 
6027 		/*
6028 		 * Check if in correct state to receive a multicast
6029 		 * message (i.e. handshake complete). If not reset
6030 		 * message (i.e. handshake complete). If not, reset
6031 		 */
6032 		if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV))
6033 			return;
6034 
6035 		/*
6036 		 * Before attempting to add or remove addresses, check
6037 		 * that they are valid multicast addresses.
6038 		 * If not, then NACK back.
6039 		 */
6040 		for (i = 0; i < mcst_pkt->count; i++) {
6041 			if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) {
6042 				DERR(vswp, "%s: invalid multicast address",
6043 				    __func__);
6044 				SND_MCST_NACK(ldcp, mcst_pkt);
6045 				return;
6046 			}
6047 		}
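
		/*
		 * Illustrative example: the check above tests the
		 * individual/group (I/G) bit, i.e. the least significant
		 * bit of the first octet. A multicast address such as
		 * 01:00:5e:00:00:01 has octet[0] == 0x01 and passes, while
		 * a unicast address such as 00:14:4f:00:00:01 has
		 * octet[0] == 0x00 and is NACK'd.
		 */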
6048 
6049 		/*
6050 		 * Now add/remove the addresses. If this fails we
6051 		 * NACK back.
6052 		 */
6053 		if (vsw_add_rem_mcst(mcst_pkt, port) != 0) {
6054 			SND_MCST_NACK(ldcp, mcst_pkt);
6055 			return;
6056 		}
6057 
6058 		mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
6059 		mcst_pkt->tag.vio_sid = ldcp->local_session;
6060 
6061 		DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt);
6062 
6063 		(void) vsw_send_msg(ldcp, (void *)mcst_pkt,
6064 		    sizeof (vnet_mcast_msg_t), B_TRUE);
6065 		break;
6066 
6067 	case VIO_SUBTYPE_ACK:
6068 		DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
6069 
6070 		/*
6071 		 * We shouldn't ever get a multicast ACK message as
6072 		 * at the moment we never request multicast addresses
6073 		 * to be set on some other device. This may change in
6074 		 * the future if we have cascading switches.
6075 		 */
6076 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV))
6077 			return;
6078 
6079 		/* Do nothing */
6080 		break;
6081 
6082 	case VIO_SUBTYPE_NACK:
6083 		DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
6084 
6085 		/*
6086 		 * We shouldn't get a multicast NACK packet for the
6087 		 * same reasons as we shouldn't get an ACK packet.
6088 		 */
6089 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV))
6090 			return;
6091 
6092 		/* Do nothing */
6093 		break;
6094 
6095 	default:
6096 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
6097 		    mcst_pkt->tag.vio_subtype);
6098 	}
6099 
6100 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
6101 }
6102 
6103 static void
6104 vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt)
6105 {
6106 	vio_rdx_msg_t	*rdx_pkt;
6107 	vsw_t		*vswp = ldcp->ldc_vswp;
6108 
6109 	/*
6110 	 * We know this is a ctrl/rdx packet so
6111 	 * cast it into the correct structure.
6112 	 */
6113 	rdx_pkt = (vio_rdx_msg_t *)pkt;
6114 
6115 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
6116 
6117 	switch (rdx_pkt->tag.vio_subtype) {
6118 	case VIO_SUBTYPE_INFO:
6119 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
6120 
6121 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_INFO_RECV))
6122 			return;
6123 
6124 		rdx_pkt->tag.vio_sid = ldcp->local_session;
6125 		rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
6126 
6127 		DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt);
6128 
6129 		ldcp->lane_out.lstate |= VSW_RDX_ACK_SENT;
6130 
6131 		(void) vsw_send_msg(ldcp, (void *)rdx_pkt,
6132 		    sizeof (vio_rdx_msg_t), B_TRUE);
6133 
6134 		vsw_next_milestone(ldcp);
6135 		break;
6136 
6137 	case VIO_SUBTYPE_ACK:
6138 		/*
6139 		 * Should be handled in-band by callback handler.
6140 		 */
6141 		DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__);
6142 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
6143 		break;
6144 
6145 	case VIO_SUBTYPE_NACK:
6146 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
6147 
6148 		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_NACK_RECV))
6149 			return;
6150 
6151 		ldcp->lane_in.lstate |= VSW_RDX_NACK_RECV;
6152 		vsw_next_milestone(ldcp);
6153 		break;
6154 
6155 	default:
6156 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
6157 		    rdx_pkt->tag.vio_subtype);
6158 	}
6159 
6160 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
6161 }
6162 
6163 static void
6164 vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t tag)
6165 {
6166 	uint16_t	env = tag.vio_subtype_env;
6167 	vsw_t		*vswp = ldcp->ldc_vswp;
6168 
6169 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
6170 
6171 	/* session id check */
6172 	if (ldcp->session_status & VSW_PEER_SESSION) {
6173 		if (ldcp->peer_session != tag.vio_sid) {
6174 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
6175 			    __func__, ldcp->ldc_id, tag.vio_sid);
6176 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
6177 			return;
6178 		}
6179 	}
6180 
6181 	/*
6182 	 * It is an error for us to be getting data packets
6183 	 * before the handshake has completed.
6184 	 */
6185 	if (ldcp->hphase != VSW_MILESTONE4) {
6186 		DERR(vswp, "%s: got data packet before handshake complete "
6187 		    "hphase %d (%x: %x)", __func__, ldcp->hphase,
6188 		    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
6189 		DUMP_FLAGS(ldcp->lane_in.lstate);
6190 		DUMP_FLAGS(ldcp->lane_out.lstate);
6191 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
6192 		return;
6193 	}
6194 
6195 	/*
6196 	 * Switch on vio_subtype envelope, then let lower routines
6197 	 * decide if it's an INFO, ACK or NACK packet.
6198 	 */
6199 	if (env == VIO_DRING_DATA) {
6200 		vsw_process_data_dring_pkt(ldcp, dpkt);
6201 	} else if (env == VIO_PKT_DATA) {
6202 		vsw_process_data_raw_pkt(ldcp, dpkt);
6203 	} else if (env == VIO_DESC_DATA) {
6204 		vsw_process_data_ibnd_pkt(ldcp, dpkt);
6205 	} else {
6206 		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env);
6207 	}
6208 
6209 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
6210 }
6211 
6212 #define	SND_DRING_NACK(ldcp, pkt) do { \
6213 	pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \
6214 	pkt->tag.vio_sid = ldcp->local_session; \
6215 	(void) vsw_send_msg(ldcp, (void *)pkt, \
6216 	    sizeof (vio_dring_msg_t), B_TRUE); \
6217 } while (0)

6218 static void
6219 vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt)
6220 {
6221 	vio_dring_msg_t		*dring_pkt;
6222 	vnet_public_desc_t	*pub_addr = NULL;
6223 	vsw_private_desc_t	*priv_addr = NULL;
6224 	dring_info_t		*dp = NULL;
6225 	vsw_t			*vswp = ldcp->ldc_vswp;
6226 	mblk_t			*mp = NULL;
6227 	mblk_t			*bp = NULL;
6228 	mblk_t			*bpt = NULL;
6229 	size_t			nbytes = 0;
6230 	size_t			off = 0;
6231 	uint64_t		ncookies = 0;
6232 	uint64_t		chain = 0;
6233 	uint64_t		j, len;
6234 	uint32_t		pos, start, datalen;
6235 	uint32_t		range_start, range_end;
6236 	int32_t			end, num, cnt = 0;
6237 	int			i, rv, msg_rv = 0;
6238 	boolean_t		ack_needed = B_FALSE;
6239 	boolean_t		prev_desc_ack = B_FALSE;
6240 	int			read_attempts = 0;
6241 
6242 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
6243 
6244 	/*
6245 	 * We know this is a data/dring packet so
6246 	 * cast it into the correct structure.
6247 	 */
6248 	dring_pkt = (vio_dring_msg_t *)dpkt;
6249 
6250 	/*
6251 	 * Switch on the vio_subtype. If it's INFO then we need to
6252 	 * process the data. If it's an ACK we need to make sure
6253 	 * it makes sense (i.e. did we send an earlier data/info),
6254 	 * and if it's a NACK then we may attempt a retry.
6255 	 */
6256 	switch (dring_pkt->tag.vio_subtype) {
6257 	case VIO_SUBTYPE_INFO:
6258 		D2(vswp, "%s(%lld): VIO_SUBTYPE_INFO", __func__, ldcp->ldc_id);
6259 
6260 		READ_ENTER(&ldcp->lane_in.dlistrw);
6261 		if ((dp = vsw_ident2dring(&ldcp->lane_in,
6262 		    dring_pkt->dring_ident)) == NULL) {
6263 			RW_EXIT(&ldcp->lane_in.dlistrw);
6264 
6265 			DERR(vswp, "%s(%lld): unable to find dring from "
6266 			    "ident 0x%llx", __func__, ldcp->ldc_id,
6267 			    dring_pkt->dring_ident);
6268 
6269 			SND_DRING_NACK(ldcp, dring_pkt);
6270 			return;
6271 		}
6272 
6273 		start = pos = dring_pkt->start_idx;
6274 		end = dring_pkt->end_idx;
6275 		len = dp->num_descriptors;
6276 
6277 		range_start = range_end = pos;
6278 
6279 		D2(vswp, "%s(%lld): start index %ld : end %ld\n",
6280 		    __func__, ldcp->ldc_id, start, end);
6281 
6282 		if (end == -1) {
6283 			num = -1;
6284 		} else if (end >= 0) {
6285 			num = end >= pos ? end - pos + 1: (len - pos + 1) + end;
6286 			num = end >= pos ? end - pos + 1 : (len - pos + 1) + end;
6287 			/* basic sanity check */
6288 			if (end >= len) {
6289 				RW_EXIT(&ldcp->lane_in.dlistrw);
6290 				DERR(vswp, "%s(%lld): endpoint %lld outside "
6291 				    "ring length %lld", __func__,
6292 				    ldcp->ldc_id, end, len);
6293 
6294 				SND_DRING_NACK(ldcp, dring_pkt);
6295 				return;
6296 			}
6297 		} else {
6298 			RW_EXIT(&ldcp->lane_in.dlistrw);
6299 			DERR(vswp, "%s(%lld): invalid endpoint %lld",
6300 			    __func__, ldcp->ldc_id, end);
6301 			SND_DRING_NACK(ldcp, dring_pkt);
6302 			return;
6303 		}
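
		/*
		 * Worked example of the wrap-around arithmetic above:
		 * with len == 512, pos == 510 and end == 1 we get
		 * num = (512 - 510 + 1) + 1 = 4, i.e. descriptors
		 * 510, 511, 0 and 1 are to be processed.
		 */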
6304 
6305 		while (cnt != num) {
6306 vsw_recheck_desc:
6307 			if ((rv = ldc_mem_dring_acquire(dp->handle,
6308 			    pos, pos)) != 0) {
6309 				RW_EXIT(&ldcp->lane_in.dlistrw);
6310 				DERR(vswp, "%s(%lld): unable to acquire "
6311 				    "descriptor at pos %d: err %d",
6312 				    __func__, ldcp->ldc_id, pos, rv);
6313 				SND_DRING_NACK(ldcp, dring_pkt);
6314 				return;
6315 			}
6316 
6317 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + pos;
6318 
6319 			/*
6320 			 * When given a bounded range of descriptors
6321 			 * to process, it's an error to hit a descriptor
6322 			 * which is not ready. In the non-bounded case
6323 			 * (end_idx == -1) this simply indicates we have
6324 			 * reached the end of the current active range.
6325 			 */
6326 			if (pub_addr->hdr.dstate != VIO_DESC_READY) {
6327 				/* unbound - no error */
6328 				if (end == -1) {
6329 					if (read_attempts == vsw_read_attempts)
6330 						break;
6331 
6332 					delay(drv_usectohz(vsw_desc_delay));
6333 					read_attempts++;
6334 					goto vsw_recheck_desc;
6335 				}
6336 
6337 				/* bounded - error - so NACK back */
6338 				RW_EXIT(&ldcp->lane_in.dlistrw);
6339 				DERR(vswp, "%s(%lld): descriptor not READY "
6340 				    "(%d)", __func__, ldcp->ldc_id,
6341 				    pub_addr->hdr.dstate);
6342 				SND_DRING_NACK(ldcp, dring_pkt);
6343 				return;
6344 			}
6345 
6346 			DTRACE_PROBE1(read_attempts, int, read_attempts);
6347 
6348 			range_end = pos;
6349 
6350 			/*
6351 			 * If we ACK'd the previous descriptor, then record
6352 			 * the new range start position for later
6353 			 * ACKs.
6354 			 */
6355 			if (prev_desc_ack) {
6356 				range_start = pos;
6357 
6358 				D2(vswp, "%s(%lld): updating range start to be "
6359 				    "%d", __func__, ldcp->ldc_id, range_start);
6360 
6361 				prev_desc_ack = B_FALSE;
6362 			}
6363 
6364 			/*
6365 			 * Data is padded to align on an 8-byte boundary;
6366 			 * datalen is the actual data length, i.e. minus
6367 			 * that padding.
6368 			 */
6369 			datalen = pub_addr->nbytes;
6370 
6371 			/*
6372 			 * Does the peer wish us to ACK when we have
6373 			 * finished with this descriptor?
6374 			 */
6375 			if (pub_addr->hdr.ack)
6376 				ack_needed = B_TRUE;
6377 
6378 			D2(vswp, "%s(%lld): processing desc %lld at"
6379 			    " 0x%llx : dstate 0x%lx : datalen 0x%lx",
6380 			    __func__, ldcp->ldc_id, pos, pub_addr,
6381 			    pub_addr->hdr.dstate, datalen);
6382 
6383 			/*
6384 			 * Mark that we are starting to process the descriptor.
6385 			 */
6386 			pub_addr->hdr.dstate = VIO_DESC_ACCEPTED;
6387 
6388 			mp = vio_allocb(ldcp->rxh);
6389 			if (mp == NULL) {
6390 				/*
6391 				 * No free receive buffers available, so
6392 				 * fall back on allocb(9F). Make sure that
6393 				 * we get a data buffer which is a multiple
6394 				 * of 8 as this is required by ldc_mem_copy.
6395 				 */
6396 				DTRACE_PROBE(allocb);
6397 				if ((mp = allocb(datalen + VNET_IPALIGN + 8,
6398 				    BPRI_MED)) == NULL) {
6399 					DERR(vswp, "%s(%ld): allocb failed",
6400 					    __func__, ldcp->ldc_id);
6401 					pub_addr->hdr.dstate = VIO_DESC_DONE;
6402 					(void) ldc_mem_dring_release(dp->handle,
6403 					    pos, pos);
6404 					break;
6405 				}
6406 			}
6407 
6408 			/*
6409 			 * Ensure that we ask ldc for an aligned
6410 			 * number of bytes.
6411 			 */
6412 			nbytes = datalen + VNET_IPALIGN;
6413 			if (nbytes & 0x7) {
6414 				off = 8 - (nbytes & 0x7);
6415 				nbytes += off;
6416 			}
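
			/*
			 * Example of the padding arithmetic above: if
			 * datalen + VNET_IPALIGN came to 61 bytes, then
			 * 61 & 0x7 == 5, off = 8 - 5 = 3, and nbytes is
			 * rounded up to 64 for ldc_mem_copy() below.
			 */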
6417 
6418 			ncookies = pub_addr->ncookies;
6419 			rv = ldc_mem_copy(ldcp->ldc_handle,
6420 			    (caddr_t)mp->b_rptr, 0, &nbytes,
6421 			    pub_addr->memcookie, ncookies, LDC_COPY_IN);
6422 
6423 			if (rv != 0) {
6424 				DERR(vswp, "%s(%d): unable to copy in data "
6425 				    "from %d cookies in desc %d (rv %d)",
6426 				    __func__, ldcp->ldc_id, ncookies, pos, rv);
6427 				freemsg(mp);
6428 
6429 				pub_addr->hdr.dstate = VIO_DESC_DONE;
6430 				(void) ldc_mem_dring_release(dp->handle,
6431 				    pos, pos);
6432 				break;
6433 			} else {
6434 				D2(vswp, "%s(%d): copied in %ld bytes"
6435 				    " using %d cookies", __func__,
6436 				    ldcp->ldc_id, nbytes, ncookies);
6437 			}
6438 
6439 			/* adjust the read pointer to skip over the padding */
6440 			mp->b_rptr += VNET_IPALIGN;
6441 
6442 			/* point to the actual end of data */
6443 			mp->b_wptr = mp->b_rptr + datalen;
6444 
6445 			/* build a chain of received packets */
6446 			if (bp == NULL) {
6447 				/* first pkt */
6448 				bp = mp;
6449 				bp->b_next = bp->b_prev = NULL;
6450 				bpt = bp;
6451 				chain = 1;
6452 			} else {
6453 				mp->b_next = NULL;
6454 				mp->b_prev = bpt;
6455 				bpt->b_next = mp;
6456 				bpt = mp;
6457 				chain++;
6458 			}
6459 
6460 			/* mark we are finished with this descriptor */
6461 			pub_addr->hdr.dstate = VIO_DESC_DONE;
6462 
6463 			(void) ldc_mem_dring_release(dp->handle, pos, pos);
6464 
6465 			/*
6466 			 * Send an ACK back to peer if requested.
6467 			 */
6468 			if (ack_needed) {
6469 				ack_needed = B_FALSE;
6470 
6471 				dring_pkt->start_idx = range_start;
6472 				dring_pkt->end_idx = range_end;
6473 
6474 				DERR(vswp, "%s(%lld): processed %d %d, ACK"
6475 				    " requested", __func__, ldcp->ldc_id,
6476 				    dring_pkt->start_idx, dring_pkt->end_idx);
6477 
6478 				dring_pkt->dring_process_state = VIO_DP_ACTIVE;
6479 				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
6480 				dring_pkt->tag.vio_sid = ldcp->local_session;
6481 
6482 				msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
6483 				    sizeof (vio_dring_msg_t), B_FALSE);
6484 
6485 				/*
6486 				 * Check if ACK was successfully sent. If not
6487 				 * we break and deal with that below.
6488 				 */
6489 				if (msg_rv != 0)
6490 					break;
6491 
6492 				prev_desc_ack = B_TRUE;
6493 				range_start = pos;
6494 			}
6495 
6496 			/* next descriptor */
6497 			pos = (pos + 1) % len;
6498 			cnt++;
6499 
6500 			/*
6501 			 * Break out of loop here and stop processing to
6502 			 * allow some other network device (or disk) to
6503 			 * get access to the CPU.
6504 			 */
6505 			if (chain > vsw_chain_len) {
6506 				D3(vswp, "%s(%lld): switching chain of %d "
6507 				    "msgs", __func__, ldcp->ldc_id, chain);
6508 				break;
6509 			}
6510 		}
6511 		RW_EXIT(&ldcp->lane_in.dlistrw);
6512 
6513 		/*
6514 		 * If, when we attempted to send the ACK, we found that the
6515 		 * channel had been reset, handle that now. We deal with
6516 		 * it here as we cannot reset the channel while holding the
6517 		 * dlistrw lock, and we don't want to acquire/release it
6518 		 * continuously in the above loop, as a channel reset should
6519 		 * be a rare event.
6520 		 */
6521 		if (msg_rv == ECONNRESET) {
6522 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
6523 			break;
6524 		}
6525 
6526 		/* send the chain of packets to be switched */
6527 		if (bp != NULL) {
6528 			D3(vswp, "%s(%lld): switching chain of %d msgs",
6529 			    __func__, ldcp->ldc_id, chain);
6530 			vswp->vsw_switch_frame(vswp, bp, VSW_VNETPORT,
6531 			    ldcp->ldc_port, NULL);
6532 		}
6533 
6534 		DTRACE_PROBE1(msg_cnt, int, cnt);
6535 
6536 		/*
6537 		 * We are now finished, so ACK back with the state
6538 		 * set to STOPPED so our peer knows we are finished.
6539 		 */
6540 		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
6541 		dring_pkt->tag.vio_sid = ldcp->local_session;
6542 
6543 		dring_pkt->dring_process_state = VIO_DP_STOPPED;
6544 
6545 		DTRACE_PROBE(stop_process_sent);
6546 
6547 		/*
6548 		 * We have not processed any more descriptors beyond
6549 		 * the last one we ACK'd.
6550 		 */
6551 		if (prev_desc_ack)
6552 			range_start = range_end;
6553 
6554 		dring_pkt->start_idx = range_start;
6555 		dring_pkt->end_idx = range_end;
6556 
6557 		D2(vswp, "%s(%lld) processed : %d : %d, now stopping",
6558 		    __func__, ldcp->ldc_id, dring_pkt->start_idx,
6559 		    dring_pkt->end_idx);
6560 
6561 		(void) vsw_send_msg(ldcp, (void *)dring_pkt,
6562 		    sizeof (vio_dring_msg_t), B_TRUE);
6563 		break;
6564 
6565 	case VIO_SUBTYPE_ACK:
6566 		D2(vswp, "%s(%lld): VIO_SUBTYPE_ACK", __func__, ldcp->ldc_id);
6567 		/*
6568 		 * Verify that the relevant descriptors are all
6569 		 * marked as DONE
6570 		 */
6571 		READ_ENTER(&ldcp->lane_out.dlistrw);
6572 		if ((dp = vsw_ident2dring(&ldcp->lane_out,
6573 		    dring_pkt->dring_ident)) == NULL) {
6574 			RW_EXIT(&ldcp->lane_out.dlistrw);
6575 			DERR(vswp, "%s: unknown ident in ACK", __func__);
6576 			return;
6577 		}
6578 
6579 		pub_addr = (vnet_public_desc_t *)dp->pub_addr;
6580 		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
6581 
6582 		start = end = 0;
6583 		start = dring_pkt->start_idx;
6584 		end = dring_pkt->end_idx;
6585 		len = dp->num_descriptors;
6586 
6587 		j = num = 0;
6588 		/* calculate # of descriptors, taking wrap-around into account */
6589 		num = end >= start ? end - start + 1 : (len - start + 1) + end;
6590 
6591 		D2(vswp, "%s(%lld): start index %ld : end %ld : num %ld\n",
6592 		    __func__, ldcp->ldc_id, start, end, num);
6593 
6594 		mutex_enter(&dp->dlock);
6595 		dp->last_ack_recv = end;
6596 		mutex_exit(&dp->dlock);
6597 
6598 		for (i = start; j < num; i = (i + 1) % len, j++) {
6599 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
6600 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
6601 
6602 			/*
6603 			 * If the last descriptor in a range has the ACK
6604 			 * bit set then we will get two messages from our
6605 			 * peer relating to it. The normal ACK msg and then
6606 			 * a subsequent STOP msg. The first message will have
6607 			 * resulted in the descriptor being reclaimed and
6608 			 * its state set to FREE, so when we encounter a
6609 			 * non-DONE descriptor we need to check whether it's
6610 			 * because we have just reclaimed it.
6611 			 */
6612 			mutex_enter(&priv_addr->dstate_lock);
6613 			if (pub_addr->hdr.dstate == VIO_DESC_DONE) {
6614 				/* clear all the fields */
6615 				bzero(priv_addr->datap, priv_addr->datalen);
6616 				priv_addr->datalen = 0;
6617 
6618 				pub_addr->hdr.dstate = VIO_DESC_FREE;
6619 				pub_addr->hdr.ack = 0;
6620 
6621 				priv_addr->dstate = VIO_DESC_FREE;
6622 				mutex_exit(&priv_addr->dstate_lock);
6623 
6624 				D3(vswp, "clearing descp %d : pub state "
6625 				    "0x%llx : priv state 0x%llx", i,
6626 				    pub_addr->hdr.dstate, priv_addr->dstate);
6627 
6628 			} else {
6629 				mutex_exit(&priv_addr->dstate_lock);
6630 
6631 				if (dring_pkt->dring_process_state !=
6632 				    VIO_DP_STOPPED) {
6633 					DERR(vswp, "%s: descriptor %lld at pos "
6634 					    " 0x%llx not DONE (0x%lx)\n",
6635 					    __func__, i, pub_addr,
6636 					    pub_addr->hdr.dstate);
6637 					RW_EXIT(&ldcp->lane_out.dlistrw);
6638 					return;
6639 				}
6640 			}
6641 		}
6642 
6643 		/*
6644 		 * If our peer is stopping processing descriptors then
6645 		 * we check to make sure it has processed all the descriptors
6646 		 * we have updated. If not then we send it a new message
6647 		 * to prompt it to restart.
6648 		 */
6649 		if (dring_pkt->dring_process_state == VIO_DP_STOPPED) {
6650 			DTRACE_PROBE(stop_process_recv);
6651 			D2(vswp, "%s(%lld): got stopping msg : %d : %d",
6652 			    __func__, ldcp->ldc_id, dring_pkt->start_idx,
6653 			    dring_pkt->end_idx);
6654 
6655 			/*
6656 			 * Check next descriptor in public section of ring.
6657 			 * If it's marked as READY then we need to prompt our
6658 			 * peer to start processing the ring again.
6659 			 */
6660 			i = (end + 1) % len;
6661 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
6662 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
6663 
6664 			/*
6665 			 * Hold the restart lock across all of this to
6666 			 * make sure that it's not possible for us to
6667 			 * decide that a msg needs to be sent in the future
6668 			 * while the sending code, having already checked,
6669 			 * is about to exit.
6670 			 */
6671 			mutex_enter(&dp->restart_lock);
6672 			mutex_enter(&priv_addr->dstate_lock);
6673 			if (pub_addr->hdr.dstate == VIO_DESC_READY) {
6674 
6675 				mutex_exit(&priv_addr->dstate_lock);
6676 
6677 				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
6678 				dring_pkt->tag.vio_sid = ldcp->local_session;
6679 
6680 				mutex_enter(&ldcp->lane_out.seq_lock);
6681 				dring_pkt->seq_num = ldcp->lane_out.seq_num++;
6682 				mutex_exit(&ldcp->lane_out.seq_lock);
6683 
6684 				dring_pkt->start_idx = (end + 1) % len;
6685 				dring_pkt->end_idx = -1;
6686 
6687 				D2(vswp, "%s(%lld) : sending restart msg:"
6688 				    " %d : %d", __func__, ldcp->ldc_id,
6689 				    dring_pkt->start_idx, dring_pkt->end_idx);
6690 
6691 				msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
6692 				    sizeof (vio_dring_msg_t), B_FALSE);
6693 
6694 			} else {
6695 				mutex_exit(&priv_addr->dstate_lock);
6696 				dp->restart_reqd = B_TRUE;
6697 			}
6698 			mutex_exit(&dp->restart_lock);
6699 		}
6700 		RW_EXIT(&ldcp->lane_out.dlistrw);
6701 
6702 		/* only do channel reset after dropping dlistrw lock */
6703 		if (msg_rv == ECONNRESET)
6704 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
6705 
6706 		break;
6707 
6708 	case VIO_SUBTYPE_NACK:
6709 		DWARN(vswp, "%s(%lld): VIO_SUBTYPE_NACK",
6710 		    __func__, ldcp->ldc_id);
6711 		/*
6712 		 * Something is badly wrong if we are getting NACKs
6713 		 * for our data pkts, so reset the channel.
6714 		 */
6715 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
6716 
6717 		break;
6718 
6719 	default:
6720 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
6721 		    ldcp->ldc_id, dring_pkt->tag.vio_subtype);
6722 	}
6723 
6724 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
6725 }
6726 
6727 /*
6728  * VIO_PKT_DATA (a.k.a. raw data mode)
6729  *
6730  * Note - currently not supported. Do nothing.
6731  */
6732 static void
6733 vsw_process_data_raw_pkt(vsw_ldc_t *ldcp, void *dpkt)
6734 {
6735 	_NOTE(ARGUNUSED(dpkt))
6736 
6737 	D1(NULL, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
6738 	DERR(NULL, "%s (%lld): currently unsupported", __func__, ldcp->ldc_id);
6739 	D1(NULL, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
6740 }
6741 
6742 /*
6743  * Process an in-band descriptor message (most likely from
6744  * OBP).
6745  */
6746 static void
6747 vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt)
6748 {
6749 	vnet_ibnd_desc_t	*ibnd_desc;
6750 	dring_info_t		*dp = NULL;
6751 	vsw_private_desc_t	*priv_addr = NULL;
6752 	vsw_t			*vswp = ldcp->ldc_vswp;
6753 	mblk_t			*mp = NULL;
6754 	size_t			nbytes = 0;
6755 	size_t			off = 0;
6756 	uint64_t		idx = 0;
6757 	uint32_t		num = 1, len, datalen = 0;
6758 	uint64_t		ncookies = 0;
6759 	int			i, rv;
6760 	int			j = 0;
6761 
6762 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
6763 
6764 	ibnd_desc = (vnet_ibnd_desc_t *)pkt;
6765 
6766 	switch (ibnd_desc->hdr.tag.vio_subtype) {
6767 	case VIO_SUBTYPE_INFO:
6768 		D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
6769 
6770 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
6771 			return;
6772 
6773 		/*
6774 		 * Data is padded to align on an 8-byte boundary;
6775 		 * the descriptor's nbytes field is the actual data
6776 		 * length, i.e. minus that padding.
6777 		 */
6778 		datalen = ibnd_desc->nbytes;
6779 
6780 		D2(vswp, "%s(%lld): processing inband desc : "
6781 		    ": datalen 0x%lx", __func__, ldcp->ldc_id, datalen);
6782 
6783 		ncookies = ibnd_desc->ncookies;
6784 
6785 		/*
6786 		 * allocb(9F) returns an aligned data block. We
6787 		 * need to ensure that we ask ldc for an aligned
6788 		 * number of bytes also.
6789 		 */
6790 		nbytes = datalen;
6791 		if (nbytes & 0x7) {
6792 			off = 8 - (nbytes & 0x7);
6793 			nbytes += off;
6794 		}
6795 
6796 		mp = allocb(nbytes, BPRI_MED);
6797 		if (mp == NULL) {
6798 			DERR(vswp, "%s(%lld): allocb failed",
6799 			    __func__, ldcp->ldc_id);
6800 			return;
6801 		}
6802 
6803 		rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr,
6804 		    0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies,
6805 		    LDC_COPY_IN);
6806 
6807 		if (rv != 0) {
6808 			DERR(vswp, "%s(%d): unable to copy in data from "
6809 			    "%d cookie(s)", __func__, ldcp->ldc_id, ncookies);
6810 			freemsg(mp);
6811 			return;
6812 		}
6813 
6814 		D2(vswp, "%s(%d): copied in %ld bytes using %d cookies",
6815 		    __func__, ldcp->ldc_id, nbytes, ncookies);
6816 
6817 		/* point to the actual end of data */
6818 		mp->b_wptr = mp->b_rptr + datalen;
6819 
6820 		/*
6821 		 * We ACK back every in-band descriptor message we process
6822 		 */
6823 		ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK;
6824 		ibnd_desc->hdr.tag.vio_sid = ldcp->local_session;
6825 		(void) vsw_send_msg(ldcp, (void *)ibnd_desc,
6826 		    sizeof (vnet_ibnd_desc_t), B_TRUE);
6827 
6828 		/* send the packet to be switched */
6829 		vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT,
6830 		    ldcp->ldc_port, NULL);
6831 
6832 		break;
6833 
6834 	case VIO_SUBTYPE_ACK:
6835 		D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
6836 
6837 		/* Verify the ACK is valid */
6838 		idx = ibnd_desc->hdr.desc_handle;
6839 
6840 		if (idx >= VSW_RING_NUM_EL) {
6841 			cmn_err(CE_WARN, "!vsw%d: corrupted ACK received "
6842 			    "(idx %ld)", vswp->instance, idx);
6843 			return;
6844 		}
6845 
6846 		if ((dp = ldcp->lane_out.dringp) == NULL) {
6847 			DERR(vswp, "%s: no dring found", __func__);
6848 			return;
6849 		}
6850 
6851 		len = dp->num_descriptors;
6852 		/*
6853 		 * If the descriptor we are being ACK'ed for is not the
6854 		 * one we expected, then pkts were lost somewhere, either
6855 		 * when we tried to send a msg, or a previous ACK msg from
6856 		 * our peer. In either case we now reclaim the descriptors
6857 		 * in the range from the last ACK we received up to the
6858 		 * current ACK.
6859 		 */
6860 		if (idx != dp->last_ack_recv) {
6861 			DWARN(vswp, "%s: dropped pkts detected, (%ld, %ld)",
6862 			    __func__, dp->last_ack_recv, idx);
6863 			num = idx >= dp->last_ack_recv ?
6864 			    idx - dp->last_ack_recv + 1:
6865 			    (len - dp->last_ack_recv + 1) + idx;
6866 		}
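
		/*
		 * Example of the reclaim arithmetic above: with
		 * last_ack_recv == 2 and idx == 5, num = 5 - 2 + 1 = 4,
		 * so descriptors 2, 3, 4 and 5 are reclaimed below.
		 */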
6867 
6868 		/*
6869 		 * When we sent the in-band message to our peer we
6870 		 * marked the copy in our private ring as READY. We now
6871 		 * check that the descriptor we are being ACK'ed for is in
6872 		 * fact READY, i.e. it is one we have shared with our peer.
6873 		 *
6874 		 * If it's not, we flag an error but still reset the
6875 		 * descriptor back to FREE.
6876 		 */
6877 		for (i = dp->last_ack_recv; j < num; i = (i + 1) % len, j++) {
6878 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
6879 			mutex_enter(&priv_addr->dstate_lock);
6880 			if (priv_addr->dstate != VIO_DESC_READY) {
6881 				DERR(vswp, "%s: (%ld) desc at index %ld not "
6882 				    "READY (0x%lx)", __func__,
6883 				    ldcp->ldc_id, i, priv_addr->dstate);
6884 				DERR(vswp, "%s: bound %d: ncookies %ld : "
6885 				    "datalen %ld", __func__,
6886 				    priv_addr->bound, priv_addr->ncookies,
6887 				    priv_addr->datalen);
6888 			}
6889 			D2(vswp, "%s: (%lld) freeing descp at %lld", __func__,
6890 			    ldcp->ldc_id, i);
6891 			/* release resources associated with sent msg */
6892 			bzero(priv_addr->datap, priv_addr->datalen);
6893 			priv_addr->datalen = 0;
6894 			priv_addr->dstate = VIO_DESC_FREE;
6895 			mutex_exit(&priv_addr->dstate_lock);
6896 		}
6897 		/* update to next expected value */
6898 		dp->last_ack_recv = (idx + 1) % dp->num_descriptors;
6899 
6900 		break;
6901 
6902 	case VIO_SUBTYPE_NACK:
6903 		DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
6904 
6905 		/*
6906 		 * We should only get a NACK if our peer doesn't like
6907 		 * something about a message we have sent it. If this
6908 		 * happens we just release the resources associated with
6909 		 * the message. (We are relying on higher layers to decide
6910 		 * whether or not to resend.)
6911 		 */
6912 
6913 		/* limit check */
6914 		idx = ibnd_desc->hdr.desc_handle;
6915 
6916 		if (idx >= VSW_RING_NUM_EL) {
6917 			DERR(vswp, "%s: corrupted NACK received (idx %lld)",
6918 			    __func__, idx);
6919 			return;
6920 		}
6921 
6922 		if ((dp = ldcp->lane_out.dringp) == NULL) {
6923 			DERR(vswp, "%s: no dring found", __func__);
6924 			return;
6925 		}
6926 
6927 		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
6928 
6929 		/* move to correct location in ring */
6930 		priv_addr += idx;
6931 
6932 		/* release resources associated with sent msg */
6933 		mutex_enter(&priv_addr->dstate_lock);
6934 		bzero(priv_addr->datap, priv_addr->datalen);
6935 		priv_addr->datalen = 0;
6936 		priv_addr->dstate = VIO_DESC_FREE;
6937 		mutex_exit(&priv_addr->dstate_lock);
6938 
6939 		break;
6940 
6941 	default:
6942 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
6943 		    ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype);
6944 	}
6945 
6946 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
6947 }
6948 
6949 static void
6950 vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t tag)
6951 {
6952 	_NOTE(ARGUNUSED(epkt))
6953 
6954 	vsw_t		*vswp = ldcp->ldc_vswp;
6955 	uint16_t	env = tag.vio_subtype_env;
6956 
6957 	D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
6958 
6959 	/*
6960 	 * Error vio_subtypes have yet to be defined. So for
6961 	 * the moment we can't do anything.
6962 	 */
6963 	D2(vswp, "%s: vio_subtype_env (%x)", __func__, env);
6964 
6965 	D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
6966 }
6967 
6968 /*
6969  * Switch the given ethernet frame when operating in layer 2 mode.
6970  *
6971  * vswp: pointer to the vsw instance
6972  * mp: pointer to chain of ethernet frame(s) to be switched
6973  * caller: identifies the source of this frame as:
6974  *		1. VSW_VNETPORT - a vsw port (connected to a vnet).
6975  *		2. VSW_PHYSDEV - the physical ethernet device
6976  *		3. VSW_LOCALDEV - vsw configured as a virtual interface
6977  * arg: argument provided by the caller.
6978  *		1. for VNETPORT - pointer to the corresponding vsw_port_t.
6979  *		2. for PHYSDEV - NULL
6980  *		3. for LOCALDEV - pointer to this vsw_t (self)
6981  */
6982 void
6983 vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
6984 			vsw_port_t *arg, mac_resource_handle_t mrh)
6985 {
6986 	struct ether_header	*ehp;
6987 	vsw_port_t		*port = NULL;
6988 	mblk_t			*bp, *ret_m;
6989 	mblk_t			*nmp = NULL;
6990 	vsw_port_list_t		*plist = &vswp->plist;
6991 
6992 	D1(vswp, "%s: enter (caller %d)", __func__, caller);
6993 
6994 	/*
6995 	 * PERF: rather than breaking up the chain here, scan it
6996 	 * to find all mblks heading to the same destination and then
6997 	 * pass that sub-chain to the lower transmit functions.
6998 	 */
6999 
7000 	/* process the chain of packets */
7001 	bp = mp;
7002 	while (bp) {
7003 		mp = bp;
7004 		bp = bp->b_next;
7005 		mp->b_next = mp->b_prev = NULL;
7006 		ehp = (struct ether_header *)mp->b_rptr;
7007 
7008 		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
7009 		    __func__, MBLKSIZE(mp), MBLKL(mp));
7010 
7011 		READ_ENTER(&vswp->if_lockrw);
7012 		if (ether_cmp(&ehp->ether_dhost, &vswp->if_addr) == 0) {
7013 			/*
7014 			 * If destination is VSW_LOCALDEV (vsw as an eth
7015 			 * interface) and if the device is up & running,
7016 			 * send the packet up the stack on this host.
7017 			 * If the virtual interface is down, drop the packet.
7018 			 */
7019 			if (caller != VSW_LOCALDEV) {
7020 				if (vswp->if_state & VSW_IF_UP) {
7021 					RW_EXIT(&vswp->if_lockrw);
7022 					mac_rx(vswp->if_mh, mrh, mp);
7023 				} else {
7024 					RW_EXIT(&vswp->if_lockrw);
7025 					/* Interface down, drop pkt */
7026 					freemsg(mp);
7027 				}
7028 			} else {
7029 				RW_EXIT(&vswp->if_lockrw);
7030 				freemsg(mp);
7031 			}
7032 			continue;
7033 		}
7034 		RW_EXIT(&vswp->if_lockrw);
7035 
7036 		READ_ENTER(&plist->lockrw);
7037 		port = vsw_lookup_fdb(vswp, ehp);
7038 		if (port) {
7039 			/*
7040 			 * Mark the port as in-use.
7041 			 */
7042 			mutex_enter(&port->ref_lock);
7043 			port->ref_cnt++;
7044 			mutex_exit(&port->ref_lock);
7045 			RW_EXIT(&plist->lockrw);
7046 
7047 			/*
7048 			 * If plumbed and in promisc mode then copy msg
7049 			 * and send up the stack.
7050 			 */
7051 			READ_ENTER(&vswp->if_lockrw);
7052 			if (VSW_U_P(vswp->if_state)) {
7053 				RW_EXIT(&vswp->if_lockrw);
7054 				nmp = copymsg(mp);
7055 				if (nmp)
7056 					mac_rx(vswp->if_mh, mrh, nmp);
7057 			} else {
7058 				RW_EXIT(&vswp->if_lockrw);
7059 			}
7060 
7061 			/*
7062 			 * If the destination is in FDB, the packet
7063 			 * should be forwarded to the corresponding
7064 			 * vsw_port (connected to a vnet device -
7065 			 * VSW_VNETPORT)
7066 			 */
7067 			(void) vsw_portsend(port, mp);
7068 
7069 			/*
7070 			 * Decrement use count in port and check if we
7071 			 * should wake the delete thread.
7072 			 */
7073 			mutex_enter(&port->ref_lock);
7074 			port->ref_cnt--;
7075 			if (port->ref_cnt == 0)
7076 				cv_signal(&port->ref_cv);
7077 			mutex_exit(&port->ref_lock);
7078 		} else {
7079 			RW_EXIT(&plist->lockrw);
7080 			/*
7081 			 * Destination not in FDB.
7082 			 *
7083 			 * If the destination is broadcast or
7084 			 * multicast forward the packet to all
7085 			 * (VNETPORTs, PHYSDEV, LOCALDEV),
7086 			 * except the caller.
7087 			 */
7088 			if (IS_BROADCAST(ehp)) {
7089 				D3(vswp, "%s: BROADCAST pkt", __func__);
7090 				(void) vsw_forward_all(vswp, mp, caller, arg);
7091 			} else if (IS_MULTICAST(ehp)) {
7092 				D3(vswp, "%s: MULTICAST pkt", __func__);
7093 				(void) vsw_forward_grp(vswp, mp, caller, arg);
7094 			} else {
7095 				/*
7096 				 * If the destination is unicast, and came
7097 				 * from either a logical network device or
7098 				 * the switch itself when it is plumbed, then
7099 				 * send it out on the physical device and also
7100 				 * up the stack if the logical interface is
7101 				 * in promiscuous mode.
7102 				 *
7103 				 * NOTE:  The assumption here is that if we
7104 				 * cannot find the destination in our fdb, it
7105 				 * is a unicast address which came from either
7106 				 * a vnet or down the stack (when plumbed), and
7107 				 * must be destined for an ethernet device
7108 				 * outside our ldoms.
7109 				 */
7110 				if (caller == VSW_VNETPORT) {
7111 					READ_ENTER(&vswp->if_lockrw);
7112 					if (VSW_U_P(vswp->if_state)) {
7113 						RW_EXIT(&vswp->if_lockrw);
7114 						nmp = copymsg(mp);
7115 						if (nmp)
7116 							mac_rx(vswp->if_mh,
7117 							    mrh, nmp);
7118 					} else {
7119 						RW_EXIT(&vswp->if_lockrw);
7120 					}
7121 					if ((ret_m = vsw_tx_msg(vswp, mp))
7122 					    != NULL) {
7123 						DERR(vswp, "%s: drop mblks to "
7124 						    "phys dev", __func__);
7125 						freemsg(ret_m);
7126 					}
7127 
7128 				} else if (caller == VSW_PHYSDEV) {
7129 					/*
7130 					 * Pkt seen because card in promisc
7131 					 * mode. Send up stack if plumbed in
7132 					 * promisc mode, else drop it.
7133 					 */
7134 					READ_ENTER(&vswp->if_lockrw);
7135 					if (VSW_U_P(vswp->if_state)) {
7136 						RW_EXIT(&vswp->if_lockrw);
7137 						mac_rx(vswp->if_mh, mrh, mp);
7138 					} else {
7139 						RW_EXIT(&vswp->if_lockrw);
7140 						freemsg(mp);
7141 					}
7142 
7143 				} else if (caller == VSW_LOCALDEV) {
7144 					/*
7145 					 * Pkt came down the stack, send out
7146 					 * over physical device.
7147 					 */
7148 					if ((ret_m = vsw_tx_msg(vswp, mp))
7149 					    != NULL) {
7150 						DERR(vswp, "%s: drop mblks to "
7151 						    "phys dev", __func__);
7152 						freemsg(ret_m);
7153 					}
7154 				}
7155 			}
7156 		}
7157 	}
7158 	D1(vswp, "%s: exit\n", __func__);
7159 }
7160 
7161 /*
7162  * Switch ethernet frame when in layer 3 mode (i.e. using IP
7163  * layer to do the routing).
7164  *
7165  * There is a large amount of overlap between this function and
7166  * vsw_switch_l2_frame. At some stage we need to revisit and refactor
7167  * both these functions.
7168  */
7169 void
7170 vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
7171 			vsw_port_t *arg, mac_resource_handle_t mrh)
7172 {
7173 	struct ether_header	*ehp;
7174 	vsw_port_t		*port = NULL;
7175 	mblk_t			*bp = NULL;
7176 	vsw_port_list_t		*plist = &vswp->plist;
7177 
7178 	D1(vswp, "%s: enter (caller %d)", __func__, caller);
7179 
7180 	/*
7181 	 * In layer 3 mode we should only ever be switching packets
7182 	 * between the IP layer and vnet devices, so make sure that's
7183 	 * who is invoking us.
7184 	 */
7185 	if ((caller != VSW_LOCALDEV) && (caller != VSW_VNETPORT)) {
7186 		DERR(vswp, "%s: unexpected caller (%d)", __func__, caller);
7187 		freemsgchain(mp);
7188 		return;
7189 	}
7190 
7191 	/* process the chain of packets */
7192 	bp = mp;
7193 	while (bp) {
7194 		mp = bp;
7195 		bp = bp->b_next;
7196 		mp->b_next = mp->b_prev = NULL;
7197 		ehp = (struct ether_header *)mp->b_rptr;
7198 
7199 		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
7200 		    __func__, MBLKSIZE(mp), MBLKL(mp));
7201 
7202 		READ_ENTER(&plist->lockrw);
7203 		port = vsw_lookup_fdb(vswp, ehp);
7204 		if (port) {
7205 			/*
7206 			 * Mark port as in-use.
7207 			 */
7208 			mutex_enter(&port->ref_lock);
7209 			port->ref_cnt++;
7210 			mutex_exit(&port->ref_lock);
7211 			RW_EXIT(&plist->lockrw);
7212 
7213 			D2(vswp, "%s: sending to target port", __func__);
7214 			(void) vsw_portsend(port, mp);
7215 
7216 			/*
7217 			 * Finished with port, so decrement ref count and
7218 			 * check if we should wake the delete thread.
7219 			 */
7220 			mutex_enter(&port->ref_lock);
7221 			port->ref_cnt--;
7222 			if (port->ref_cnt == 0)
7223 				cv_signal(&port->ref_cv);
7224 			mutex_exit(&port->ref_lock);
7225 		} else {
7226 			RW_EXIT(&plist->lockrw);
7227 			/*
7228 			 * Destination not in FDB
7229 			 *
7230 			 * If the destination is broadcast or
7231 			 * multicast forward the packet to all
7232 			 * (VNETPORTs, PHYSDEV, LOCALDEV),
7233 			 * except the caller.
7234 			 */
7235 			if (IS_BROADCAST(ehp)) {
7236 				D2(vswp, "%s: BROADCAST pkt", __func__);
7237 				(void) vsw_forward_all(vswp, mp, caller, arg);
7238 			} else if (IS_MULTICAST(ehp)) {
7239 				D2(vswp, "%s: MULTICAST pkt", __func__);
7240 				(void) vsw_forward_grp(vswp, mp, caller, arg);
7241 			} else {
7242 				/*
7243 				 * Unicast pkt from vnet that we don't have
7244 				 * an FDB entry for, so must be destined for
7245 				 * the outside world. Attempt to send up to the
7246 				 * IP layer to allow it to deal with it.
7247 				 */
7248 				if (caller == VSW_VNETPORT) {
7249 					READ_ENTER(&vswp->if_lockrw);
7250 					if (vswp->if_state & VSW_IF_UP) {
7251 						RW_EXIT(&vswp->if_lockrw);
7252 						D2(vswp, "%s: sending up",
7253 						    __func__);
7254 						mac_rx(vswp->if_mh, mrh, mp);
7255 					} else {
7256 						RW_EXIT(&vswp->if_lockrw);
7257 						/* Interface down, drop pkt */
7258 						D2(vswp, "%s I/F down",
7259 						    __func__);
7260 						freemsg(mp);
7261 					}
7262 				}
7263 			}
7264 		}
7265 	}
7266 
7267 	D1(vswp, "%s: exit", __func__);
7268 }
7269 
7270 /*
7271  * Forward the ethernet frame to all ports (VNETPORTs, PHYSDEV, LOCALDEV),
7272  * except the caller (port on which frame arrived).
7273  */
7274 static int
7275 vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
7276 {
7277 	vsw_port_list_t	*plist = &vswp->plist;
7278 	vsw_port_t	*portp;
7279 	mblk_t		*nmp = NULL;
7280 	mblk_t		*ret_m = NULL;
7281 	int		skip_port = 0;
7282 
7283 	D1(vswp, "vsw_forward_all: enter\n");
7284 
7285 	/*
7286 	 * Broadcast message from inside ldoms so send to outside
7287 	 * world if in either of layer 2 modes.
7288 	 */
7289 	if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) ||
7290 	    (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) &&
7291 	    ((caller == VSW_LOCALDEV) || (caller == VSW_VNETPORT))) {
7292 
7293 		nmp = dupmsg(mp);
7294 		if (nmp) {
7295 			if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) {
7296 				DERR(vswp, "%s: dropping pkt(s) "
7297 				    "consisting of %ld bytes of data for"
7298 				    " physical device", __func__, MBLKL(ret_m));
7299 				freemsg(ret_m);
7300 			}
7301 		}
7302 	}
7303 
7304 	if (caller == VSW_VNETPORT)
7305 		skip_port = 1;
7306 
7307 	/*
7308 	 * Broadcast message from other vnet (layer 2 or 3) or outside
7309 	 * world (layer 2 only), send up stack if plumbed.
7310 	 */
7311 	if ((caller == VSW_PHYSDEV) || (caller == VSW_VNETPORT)) {
7312 		READ_ENTER(&vswp->if_lockrw);
7313 		if (vswp->if_state & VSW_IF_UP) {
7314 			RW_EXIT(&vswp->if_lockrw);
7315 			nmp = copymsg(mp);
7316 			if (nmp)
7317 				mac_rx(vswp->if_mh, NULL, nmp);
7318 		} else {
7319 			RW_EXIT(&vswp->if_lockrw);
7320 		}
7321 	}
7322 
7323 	/* send it to all VNETPORTs */
7324 	READ_ENTER(&plist->lockrw);
7325 	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
7326 		D2(vswp, "vsw_forward_all: port %d", portp->p_instance);
7327 		/*
7328 		 * Caution! - don't reorder these two checks as arg
7329 		 * will be NULL if the caller is PHYSDEV. skip_port is
7330 		 * only set if caller is VNETPORT.
7331 		 */
7332 		if ((skip_port) && (portp == arg))
7333 			continue;
7334 		else {
7335 			nmp = dupmsg(mp);
7336 			if (nmp) {
7337 				(void) vsw_portsend(portp, nmp);
7338 			} else {
7339 				DERR(vswp, "vsw_forward_all: nmp NULL");
7340 			}
7341 		}
7342 	}
7343 	RW_EXIT(&plist->lockrw);
7344 
7345 	freemsg(mp);
7346 
7347 	D1(vswp, "vsw_forward_all: exit\n");
7348 	return (0);
7349 }
7350 
7351 /*
7352  * Forward pkts to any devices or interfaces which have registered
7353  * an interest in them (i.e. multicast groups).
7354  */
7355 static int
7356 vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
7357 {
7358 	struct ether_header	*ehp = (struct ether_header *)mp->b_rptr;
7359 	mfdb_ent_t		*entp = NULL;
7360 	mfdb_ent_t		*tpp = NULL;
7361 	vsw_port_t 		*port;
7362 	uint64_t		key = 0;
7363 	mblk_t			*nmp = NULL;
7364 	mblk_t			*ret_m = NULL;
7365 	boolean_t		check_if = B_TRUE;
7366 
7367 	/*
7368 	 * Convert address to hash table key
7369 	 */
7370 	KEY_HASH(key, ehp->ether_dhost);
7371 
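	/*
	 * Illustrative sketch (assuming KEY_HASH packs the six octets of
	 * the destination address into the low 48 bits of the key, most
	 * significant octet first): 01:00:5e:00:00:01 would hash to the
	 * key 0x01005e000001.
	 */
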
7372 	D1(vswp, "%s: key 0x%llx", __func__, key);
7373 
7374 	/*
7375 	 * If pkt came from either a vnet or down the stack (if we are
7376 	 * plumbed) and we are in layer 2 mode, then we send the pkt out
7377 	 * over the physical adapter, and then check to see if any other
7378 	 * vnets are interested in it.
7379 	 */
7380 	if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) ||
7381 	    (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) &&
7382 	    ((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV))) {
7383 		nmp = dupmsg(mp);
7384 		if (nmp) {
7385 			if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) {
7386 				DERR(vswp, "%s: dropping pkt(s) consisting of "
7387 				    "%ld bytes of data for physical device",
7388 				    __func__, MBLKL(ret_m));
7389 				freemsg(ret_m);
7390 			}
7391 		}
7392 	}
7393 
7394 	READ_ENTER(&vswp->mfdbrw);
7395 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)key,
7396 	    (mod_hash_val_t *)&entp) != 0) {
7397 		D3(vswp, "%s: no table entry found for addr 0x%llx",
7398 		    __func__, key);
7399 	} else {
7400 		/*
7401 		 * Send to list of devices associated with this address...
7402 		 */
7403 		for (tpp = entp; tpp != NULL; tpp = tpp->nextp) {
7404 
7405 			/* don't send to ourselves */
7406 			if ((caller == VSW_VNETPORT) &&
7407 			    (tpp->d_addr == (void *)arg)) {
7408 				port = (vsw_port_t *)tpp->d_addr;
7409 				D3(vswp, "%s: not sending to ourselves"
7410 				    " : port %d", __func__, port->p_instance);
7411 				continue;
7412 
7413 			} else if ((caller == VSW_LOCALDEV) &&
7414 			    (tpp->d_type == VSW_LOCALDEV)) {
7415 				D3(vswp, "%s: not sending back up stack",
7416 				    __func__);
7417 				continue;
7418 			}
7419 
7420 			if (tpp->d_type == VSW_VNETPORT) {
7421 				port = (vsw_port_t *)tpp->d_addr;
7422 				D3(vswp, "%s: sending to port %ld for addr "
7423 				    "0x%llx", __func__, port->p_instance, key);
7424 
7425 				nmp = dupmsg(mp);
7426 				if (nmp)
7427 					(void) vsw_portsend(port, nmp);
7428 			} else {
7429 				if (vswp->if_state & VSW_IF_UP) {
7430 					nmp = copymsg(mp);
7431 					if (nmp)
7432 						mac_rx(vswp->if_mh, NULL, nmp);
7433 					check_if = B_FALSE;
7434 					D3(vswp, "%s: sending up stack"
7435 					    " for addr 0x%llx", __func__, key);
7436 				}
7437 			}
7438 		}
7439 	}
7440 
7441 	RW_EXIT(&vswp->mfdbrw);
7442 
7443 	/*
7444 	 * If the pkt came from either a vnet or from physical device,
7445 	 * and if we haven't already sent the pkt up the stack then we
7446 	 * check now if we can/should (i.e. the interface is plumbed
7447 	 * and in promisc mode).
7448 	 */
7449 	if ((check_if) &&
7450 	    ((caller == VSW_VNETPORT) || (caller == VSW_PHYSDEV))) {
7451 		READ_ENTER(&vswp->if_lockrw);
7452 		if (VSW_U_P(vswp->if_state)) {
7453 			RW_EXIT(&vswp->if_lockrw);
7454 			D3(vswp, "%s: (caller %d) finally sending up stack"
7455 			    " for addr 0x%llx", __func__, caller, key);
7456 			nmp = copymsg(mp);
7457 			if (nmp)
7458 				mac_rx(vswp->if_mh, NULL, nmp);
7459 		} else {
7460 			RW_EXIT(&vswp->if_lockrw);
7461 		}
7462 	}
7463 
7464 	freemsg(mp);
7465 
7466 	D1(vswp, "%s: exit", __func__);
7467 
7468 	return (0);
7469 }
7470 
7471 /* transmit the packet over the given port */
7472 static int
7473 vsw_portsend(vsw_port_t *port, mblk_t *mp)
7474 {
7475 	vsw_ldc_list_t 	*ldcl = &port->p_ldclist;
7476 	vsw_ldc_t 	*ldcp;
7477 	int		status = 0;
7478 
7480 	READ_ENTER(&ldcl->lockrw);
7481 	/*
7482 	 * Note: for now, we have a single channel.
7483 	 */
7484 	ldcp = ldcl->head;
7485 	if (ldcp == NULL) {
7486 		DERR(port->p_vswp, "vsw_portsend: no ldc: dropping packet\n");
7487 		freemsg(mp);
7488 		RW_EXIT(&ldcl->lockrw);
7489 		return (1);
7490 	}
7491 
7492 	/*
7493 	 * Send the message out using the appropriate
7494 	 * transmit function, which will free the mblk when it
7495 	 * is finished with it.
7496 	 */
7497 	mutex_enter(&port->tx_lock);
7498 	if (port->transmit != NULL) {
7499 		status = (*port->transmit)(ldcp, mp);
7500 	} else {
7501 		freemsg(mp);
7502 	}
7503 	mutex_exit(&port->tx_lock);
7504 
7505 	RW_EXIT(&ldcl->lockrw);
7506 
7507 	return (status);
7508 }
7509 
7510 /*
7511  * Send packet out via descriptor ring to a logical device.
7512  */
7513 static int
7514 vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp)
7515 {
7516 	vio_dring_msg_t		dring_pkt;
7517 	dring_info_t		*dp = NULL;
7518 	vsw_private_desc_t	*priv_desc = NULL;
7519 	vnet_public_desc_t	*pub = NULL;
7520 	vsw_t			*vswp = ldcp->ldc_vswp;
7521 	mblk_t			*bp;
7522 	size_t			n, size;
7523 	caddr_t			bufp;
7524 	int			idx;
7525 	int			status = LDC_TX_SUCCESS;
7526 
7527 	D1(vswp, "%s(%lld): enter\n", __func__, ldcp->ldc_id);
7528 
7529 	/* TODO: make test a macro */
7530 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
7531 	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
7532 		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
7533 		    "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
7534 		    ldcp->lane_out.lstate);
7535 		freemsg(mp);
7536 		return (LDC_TX_FAILURE);
7537 	}
7538 
7539 	/*
7540 	 * Note - using first ring only, this may change
7541 	 * in the future.
7542 	 */
7543 	READ_ENTER(&ldcp->lane_out.dlistrw);
7544 	if ((dp = ldcp->lane_out.dringp) == NULL) {
7545 		RW_EXIT(&ldcp->lane_out.dlistrw);
7546 		DERR(vswp, "%s(%lld): no dring for outbound lane",
7547 		    __func__, ldcp->ldc_id);
7548 		freemsg(mp);
7549 		return (LDC_TX_FAILURE);
7550 	}
7551 
7552 	size = msgsize(mp);
7553 	if (size > (size_t)ETHERMAX) {
7554 		RW_EXIT(&ldcp->lane_out.dlistrw);
7555 		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
7556 		    ldcp->ldc_id, size);
7557 		freemsg(mp);
7558 		return (LDC_TX_FAILURE);
7559 	}
7560 
7561 	/*
7562 	 * Find a free descriptor
7563 	 *
7564 	 * Note: for the moment we are assuming that we will only
7565 	 * have one dring going from the switch to each of its
7566 	 * peers. This may change in the future.
7567 	 */
7568 	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
7569 		D2(vswp, "%s(%lld): no descriptor available for ring "
7570 		    "at 0x%llx", __func__, ldcp->ldc_id, dp);
7571 
7572 		/* nothing more we can do */
7573 		status = LDC_TX_NORESOURCES;
7574 		goto vsw_dringsend_free_exit;
7575 	} else {
7576 		D2(vswp, "%s(%lld): free private descriptor found at pos %ld "
7577 		    "addr 0x%llx\n", __func__, ldcp->ldc_id, idx, priv_desc);
7578 	}
7579 
7580 	/* copy data into the descriptor */
7581 	bufp = priv_desc->datap;
7582 	bufp += VNET_IPALIGN;
7583 	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
7584 		n = MBLKL(bp);
7585 		bcopy(bp->b_rptr, bufp, n);
7586 		bufp += n;
7587 	}
7588 
7589 	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
7590 
7591 	pub = priv_desc->descp;
7592 	pub->nbytes = priv_desc->datalen;
7593 
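	/*
	 * Publish the descriptor. Once dstate is VIO_DESC_READY the
	 * peer may process it at any time, so all other updates to
	 * the descriptor must be complete before this point; the
	 * dstate_lock serialises the transition against the ACK
	 * processing path, which is assumed to reclaim descriptors
	 * (marking them VIO_DESC_FREE again) once the peer indicates
	 * it is done with them.
	 */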
7594 	mutex_enter(&priv_desc->dstate_lock);
7595 	pub->hdr.dstate = VIO_DESC_READY;
7596 	mutex_exit(&priv_desc->dstate_lock);
7597 
7598 	/*
7599 	 * Determine whether or not we need to send a message to our
7600 	 * peer prompting them to read our newly updated descriptor(s).
7601 	 */
7602 	mutex_enter(&dp->restart_lock);
7603 	if (dp->restart_reqd) {
7604 		dp->restart_reqd = B_FALSE;
7605 		mutex_exit(&dp->restart_lock);
7606 
7607 		/*
7608 		 * Send a vio_dring_msg to peer to prompt them to read
7609 		 * the updated descriptor ring.
7610 		 */
7611 		dring_pkt.tag.vio_msgtype = VIO_TYPE_DATA;
7612 		dring_pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
7613 		dring_pkt.tag.vio_subtype_env = VIO_DRING_DATA;
7614 		dring_pkt.tag.vio_sid = ldcp->local_session;
7615 
7616 		/* Note - for now using first ring */
7617 		dring_pkt.dring_ident = dp->ident;
7618 
7619 		mutex_enter(&ldcp->lane_out.seq_lock);
7620 		dring_pkt.seq_num = ldcp->lane_out.seq_num++;
7621 		mutex_exit(&ldcp->lane_out.seq_lock);
7622 
7623 		/*
7624 		 * If last_ack_recv is -1 then we know we've not
7625 		 * If last_ack_recv is -1 then we know we've not
7626 		 * received any ACKs yet, so this must be the first
7627 		 * msg sent; start from the beginning of the ring.
7628 		mutex_enter(&dp->dlock);
7629 		if (dp->last_ack_recv == -1) {
7630 			dring_pkt.start_idx = 0;
7631 		} else {
7632 			dring_pkt.start_idx =
7633 			    (dp->last_ack_recv + 1) % dp->num_descriptors;
7634 		}
7635 		dring_pkt.end_idx = -1;
7636 		mutex_exit(&dp->dlock);
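
		/*
		 * Example: with num_descriptors == 512 and
		 * last_ack_recv == 511, start_idx wraps to
		 * (511 + 1) % 512 == 0. The end_idx of -1 marks the
		 * range as open-ended: the peer is expected to keep
		 * processing from start_idx until it finds a
		 * descriptor which is not in the READY state.
		 */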
7637 
7638 		D3(vswp, "%s(%lld): dring 0x%llx : ident 0x%llx\n", __func__,
7639 		    ldcp->ldc_id, dp, dring_pkt.dring_ident);
7640 		D3(vswp, "%s(%lld): start %lld : end %lld : seq %lld\n",
7641 		    __func__, ldcp->ldc_id, dring_pkt.start_idx,
7642 		    dring_pkt.end_idx, dring_pkt.seq_num);
7643 
7644 		RW_EXIT(&ldcp->lane_out.dlistrw);
7645 
7646 		(void) vsw_send_msg(ldcp, (void *)&dring_pkt,
7647 		    sizeof (vio_dring_msg_t), B_TRUE);
7648 
7649 		/* free the message block */
7650 		freemsg(mp);
7651 		return (status);
7652 
7653 	} else {
7654 		mutex_exit(&dp->restart_lock);
7655 		D2(vswp, "%s(%lld): updating descp %d", __func__,
7656 		    ldcp->ldc_id, idx);
7657 	}
7658 
7659 vsw_dringsend_free_exit:
7660 
7661 	RW_EXIT(&ldcp->lane_out.dlistrw);
7662 
7663 	/* free the message block */
7664 	freemsg(mp);
7665 
7666 	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
7667 	return (status);
7668 }
7669 
7670 /*
7671  * Send an in-band descriptor message over ldc.
7672  */
7673 static int
7674 vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp)
7675 {
7676 	vsw_t			*vswp = ldcp->ldc_vswp;
7677 	vnet_ibnd_desc_t	ibnd_msg;
7678 	vsw_private_desc_t	*priv_desc = NULL;
7679 	dring_info_t		*dp = NULL;
7680 	size_t			n, size = 0;
7681 	caddr_t			bufp;
7682 	mblk_t			*bp;
7683 	int			idx, i;
7684 	int			status = LDC_TX_SUCCESS;
7685 	static int		warn_msg = 1;
7686 
7687 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
7688 
7689 	ASSERT(mp != NULL);
7690 
7691 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
7692 	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
7693 		DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt",
7694 		    __func__, ldcp->ldc_id, ldcp->ldc_status,
7695 		    ldcp->lane_out.lstate);
7696 		freemsg(mp);
7697 		return (LDC_TX_FAILURE);
7698 	}
7699 
7700 	/*
7701 	 * We only expect a single dring to exist, which we use
7702 	 * as an internal buffer rather than a transfer channel.
7703 	 */
7704 	READ_ENTER(&ldcp->lane_out.dlistrw);
7705 	if ((dp = ldcp->lane_out.dringp) == NULL) {
7706 		DERR(vswp, "%s(%lld): no dring for outbound lane",
7707 		    __func__, ldcp->ldc_id);
7708 		DERR(vswp, "%s(%lld) status(%d) state (0x%llx)", __func__,
7709 		    ldcp->ldc_id, ldcp->ldc_status, ldcp->lane_out.lstate);
7710 		RW_EXIT(&ldcp->lane_out.dlistrw);
7711 		freemsg(mp);
7712 		return (LDC_TX_FAILURE);
7713 	}
7714 
7715 	size = msgsize(mp);
7716 	if (size > (size_t)ETHERMAX) {
7717 		RW_EXIT(&ldcp->lane_out.dlistrw);
7718 		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
7719 		    ldcp->ldc_id, size);
7720 		freemsg(mp);
7721 		return (LDC_TX_FAILURE);
7722 	}
7723 
7724 	/*
7725 	 * Find a free descriptor in our buffer ring
7726 	 */
7727 	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
7728 		RW_EXIT(&ldcp->lane_out.dlistrw);
7729 		if (warn_msg) {
7730 			DERR(vswp, "%s(%lld): no descriptor available for ring "
7731 			    "at 0x%llx", __func__, ldcp->ldc_id, dp);
7732 			warn_msg = 0;
7733 		}
7734 
7735 		/* nothing more we can do */
7736 		status = LDC_TX_NORESOURCES;
7737 		goto vsw_descrsend_free_exit;
7738 	} else {
7739 		D2(vswp, "%s(%lld): free private descriptor found at pos "
7740 		    "%ld addr 0x%llx\n", __func__, ldcp->ldc_id, idx, priv_desc);
7741 		warn_msg = 1;
7742 	}
7743 
7744 	/* copy data into the descriptor */
7745 	bufp = priv_desc->datap;
7746 	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
7747 		n = MBLKL(bp);
7748 		bcopy(bp->b_rptr, bufp, n);
7749 		bufp += n;
7750 	}
7751 
7752 	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
7753 
7754 	/* create and send the in-band descp msg */
7755 	ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA;
7756 	ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO;
7757 	ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA;
7758 	ibnd_msg.hdr.tag.vio_sid = ldcp->local_session;
7759 
7760 	mutex_enter(&ldcp->lane_out.seq_lock);
7761 	ibnd_msg.hdr.seq_num = ldcp->lane_out.seq_num++;
7762 	mutex_exit(&ldcp->lane_out.seq_lock);
7763 
7764 	/*
7765 	 * Copy the mem cookies describing the data from the
7766 	 * private region of the descriptor ring into the inband
7767 	 * descriptor.
7768 	 */
7769 	for (i = 0; i < priv_desc->ncookies; i++) {
7770 		bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i],
7771 		    sizeof (ldc_mem_cookie_t));
7772 	}
7773 
7774 	ibnd_msg.hdr.desc_handle = idx;
7775 	ibnd_msg.ncookies = priv_desc->ncookies;
7776 	ibnd_msg.nbytes = size;
7777 
7778 	RW_EXIT(&ldcp->lane_out.dlistrw);
7779 
7780 	(void) vsw_send_msg(ldcp, (void *)&ibnd_msg,
7781 	    sizeof (vnet_ibnd_desc_t), B_TRUE);
7782 
7783 vsw_descrsend_free_exit:
7784 
7785 	/* free the allocated message blocks */
7786 	freemsg(mp);
7787 
7788 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
7789 	return (status);
7790 }
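
/*
 * Note the contrast with vsw_dringsend() above: in in-band mode the
 * peer never maps a registered dring; each vnet_ibnd_desc_t message
 * instead carries the LDC memory cookies describing the data buffer
 * directly, and the dring here serves purely as a local buffer pool.
 * The peer is expected to echo the descriptor handle (idx) back in
 * its ACK so the buffer can be marked VIO_DESC_FREE again.
 */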
7791 
7792 static void
7793 vsw_send_ver(void *arg)
7794 {
7795 	vsw_ldc_t	*ldcp = (vsw_ldc_t *)arg;
7796 	vsw_t		*vswp = ldcp->ldc_vswp;
7797 	lane_t		*lp = &ldcp->lane_out;
7798 	vio_ver_msg_t	ver_msg;
7799 
7800 	D1(vswp, "%s enter", __func__);
7801 
7802 	ver_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
7803 	ver_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
7804 	ver_msg.tag.vio_subtype_env = VIO_VER_INFO;
7805 	ver_msg.tag.vio_sid = ldcp->local_session;
7806 
7807 	ver_msg.ver_major = vsw_versions[0].ver_major;
7808 	ver_msg.ver_minor = vsw_versions[0].ver_minor;
7809 	ver_msg.dev_class = VDEV_NETWORK_SWITCH;
7810 
7811 	lp->lstate |= VSW_VER_INFO_SENT;
7812 	lp->ver_major = ver_msg.ver_major;
7813 	lp->ver_minor = ver_msg.ver_minor;
7814 
7815 	DUMP_TAG(ver_msg.tag);
7816 
7817 	(void) vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t), B_TRUE);
7818 
7819 	D1(vswp, "%s (%d): exit", __func__, ldcp->ldc_id);
7820 }
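
/*
 * Note: vsw_versions[0] is assumed to be the highest version this
 * driver supports; negotiation starts there, and if the peer NACKs
 * with a lower version the handshake code is expected to retry with
 * the next entry in the vsw_versions[] array.
 */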
7821 
7822 static void
7823 vsw_send_attr(vsw_ldc_t *ldcp)
7824 {
7825 	vsw_t			*vswp = ldcp->ldc_vswp;
7826 	lane_t			*lp = &ldcp->lane_out;
7827 	vnet_attr_msg_t		attr_msg;
7828 
7829 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
7830 
7831 	/*
7832 	 * Subtype is set to INFO by default
7833 	 */
7834 	attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
7835 	attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
7836 	attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO;
7837 	attr_msg.tag.vio_sid = ldcp->local_session;
7838 
7839 	/* payload copied from default settings for lane */
7840 	attr_msg.mtu = lp->mtu;
7841 	attr_msg.addr_type = lp->addr_type;
7842 	attr_msg.xfer_mode = lp->xfer_mode;
7843 	attr_msg.ack_freq = lp->ack_freq;
7844 
7845 	READ_ENTER(&vswp->if_lockrw);
7846 	bcopy(&(vswp->if_addr), &(attr_msg.addr), ETHERADDRL);
7847 	RW_EXIT(&vswp->if_lockrw);
7848 
7849 	ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT;
7850 
7851 	DUMP_TAG(attr_msg.tag);
7852 
7853 	(void) vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t), B_TRUE);
7854 
7855 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
7856 }
7857 
7858 /*
7859  * Create dring info msg (which also results in the creation of
7860  * a dring).
7861  */
7862 static vio_dring_reg_msg_t *
7863 vsw_create_dring_info_pkt(vsw_ldc_t *ldcp)
7864 {
7865 	vio_dring_reg_msg_t	*mp;
7866 	dring_info_t		*dp;
7867 	vsw_t			*vswp = ldcp->ldc_vswp;
7868 
7869 	D1(vswp, "vsw_create_dring_info_pkt enter\n");
7870 
7871 	/*
7872 	 * If we can't create a dring, obviously no point sending
7873 	 * a message.
7874 	 */
7875 	if ((dp = vsw_create_dring(ldcp)) == NULL)
7876 		return (NULL);
7877 
7878 	mp = kmem_zalloc(sizeof (vio_dring_reg_msg_t), KM_SLEEP);
7879 
7880 	mp->tag.vio_msgtype = VIO_TYPE_CTRL;
7881 	mp->tag.vio_subtype = VIO_SUBTYPE_INFO;
7882 	mp->tag.vio_subtype_env = VIO_DRING_REG;
7883 	mp->tag.vio_sid = ldcp->local_session;
7884 
7885 	/* payload */
7886 	mp->num_descriptors = dp->num_descriptors;
7887 	mp->descriptor_size = dp->descriptor_size;
7888 	mp->options = dp->options;
7889 	mp->ncookies = dp->ncookies;
7890 	bcopy(&dp->cookie[0], &mp->cookie[0], sizeof (ldc_mem_cookie_t));
7891 
7892 	mp->dring_ident = 0;
7893 
7894 	D1(vswp, "vsw_create_dring_info_pkt exit\n");
7895 
7896 	return (mp);
7897 }
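
/*
 * Note: dring_ident is sent as 0 in the registration INFO message;
 * the peer is expected to allocate an identifier and return it in
 * its ACK, and that ident is then used to tag subsequent
 * VIO_DRING_DATA messages (see dring_pkt.dring_ident in
 * vsw_dringsend()).
 */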
7898 
7899 static void
7900 vsw_send_dring_info(vsw_ldc_t *ldcp)
7901 {
7902 	vio_dring_reg_msg_t	*dring_msg;
7903 	vsw_t			*vswp = ldcp->ldc_vswp;
7904 
7905 	D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id);
7906 
7907 	dring_msg = vsw_create_dring_info_pkt(ldcp);
7908 	if (dring_msg == NULL) {
7909 		cmn_err(CE_WARN, "!vsw%d: %s: error creating msg",
7910 		    vswp->instance, __func__);
7911 		return;
7912 	}
7913 
7914 	ldcp->lane_out.lstate |= VSW_DRING_INFO_SENT;
7915 
7916 	DUMP_TAG_PTR((vio_msg_tag_t *)dring_msg);
7917 
7918 	(void) vsw_send_msg(ldcp, dring_msg,
7919 	    sizeof (vio_dring_reg_msg_t), B_TRUE);
7920 
7921 	kmem_free(dring_msg, sizeof (vio_dring_reg_msg_t));
7922 
7923 	D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id);
7924 }
7925 
7926 static void
7927 vsw_send_rdx(vsw_ldc_t *ldcp)
7928 {
7929 	vsw_t		*vswp = ldcp->ldc_vswp;
7930 	vio_rdx_msg_t	rdx_msg;
7931 
7932 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
7933 
7934 	rdx_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
7935 	rdx_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
7936 	rdx_msg.tag.vio_subtype_env = VIO_RDX;
7937 	rdx_msg.tag.vio_sid = ldcp->local_session;
7938 
7939 	ldcp->lane_in.lstate |= VSW_RDX_INFO_SENT;
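	/*
	 * Note: RDX state is tracked on lane_in rather than lane_out;
	 * the RDX we send is understood to announce that our inbound
	 * lane is ready to receive data.
	 */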
7940 
7941 	DUMP_TAG(rdx_msg.tag);
7942 
7943 	(void) vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t), B_TRUE);
7944 
7945 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
7946 }
7947 
7948 /*
7949  * Generic routine to send message out over ldc channel.
7950  *
7951  * It is possible that when we attempt to write over the ldc channel
7952  * that we get notified that it has been reset. Depending on the value
7953  * of the handle_reset flag we either handle that event here or simply
7954  * notify the caller that the channel was reset.
7955  */
7956 static int
7957 vsw_send_msg(vsw_ldc_t *ldcp, void *msgp, int size, boolean_t handle_reset)
7958 {
7959 	int		rv, retries = vsw_wretries;
7960 	size_t		msglen = size;
7961 	vio_msg_tag_t	*tag = (vio_msg_tag_t *)msgp;
7962 	vsw_t		*vswp = ldcp->ldc_vswp;
7963 
7964 	D1(vswp, "vsw_send_msg (%lld) enter : sending %d bytes",
7965 	    ldcp->ldc_id, size);
7966 
7967 	D2(vswp, "send_msg: type 0x%llx", tag->vio_msgtype);
7968 	D2(vswp, "send_msg: stype 0x%llx", tag->vio_subtype);
7969 	D2(vswp, "send_msg: senv 0x%llx", tag->vio_subtype_env);
7970 
7971 	mutex_enter(&ldcp->ldc_txlock);
7972 	do {
7973 		msglen = size;
7974 		rv = ldc_write(ldcp->ldc_handle, (caddr_t)msgp, &msglen);
7975 	} while (rv == EWOULDBLOCK && --retries > 0);
7976 
7977 	if ((rv != 0) || (msglen != size)) {
7978 		DERR(vswp, "vsw_send_msg: ldc_write failed: chan(%lld) rv(%d) "
7979 		    "size(%d) msglen(%ld)\n", ldcp->ldc_id, rv, size, msglen);
7980 	}
7981 	mutex_exit(&ldcp->ldc_txlock);
7982 
7983 	/*
7984 	 * If channel has been reset we either handle it here or
7985 	 * simply report back that it has been reset and let caller
7986 	 * decide what to do.
7987 	 */
7988 	if (rv == ECONNRESET) {
7989 		DWARN(vswp, "%s (%lld) channel reset", __func__, ldcp->ldc_id);
7990 
7991 		/*
7992 		 * N.B - must never be holding the dlistrw lock when
7993 		 * we do a reset of the channel.
7994 		 */
7995 		if (handle_reset) {
7996 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
7997 		}
7998 	}
7999 
8000 	return (rv);
8001 }
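
/*
 * A minimal usage sketch (hypothetical caller): the handshake paths
 * above pass handle_reset == B_TRUE so that a reset immediately
 * triggers connection teardown, while a caller which cannot tolerate
 * re-entry into the connection-event code would pass B_FALSE and
 * examine the return value itself:
 *
 *	if (vsw_send_msg(ldcp, &msg, sizeof (msg), B_FALSE) ==
 *	    ECONNRESET) {
 *		... caller decides how to recover from the reset ...
 *	}
 */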
8002 
8003 /*
8004  * Add an entry into FDB, for the given mac address and port_id.
8005  * Returns 0 on success, 1 on failure.
8006  *
8007  * Lock protecting FDB must be held by calling process.
8008  */
8009 static int
8010 vsw_add_fdb(vsw_t *vswp, vsw_port_t *port)
8011 {
8012 	uint64_t	addr = 0;
8013 
8014 	D1(vswp, "%s: enter", __func__);
8015 
8016 	KEY_HASH(addr, port->p_macaddr);
8017 
8018 	D2(vswp, "%s: key = 0x%llx", __func__, addr);
8019 
8020 	/*
8021 	 * Note: duplicate keys will be rejected by mod_hash.
8022 	 */
8023 	if (mod_hash_insert(vswp->fdb, (mod_hash_key_t)addr,
8024 	    (mod_hash_val_t)port) != 0) {
8025 		DERR(vswp, "%s: unable to add entry into fdb.", __func__);
8026 		return (1);
8027 	}
8028 
8029 	D1(vswp, "%s: exit", __func__);
8030 	return (0);
8031 }
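
/*
 * KEY_HASH (see vsw.h) is assumed to fold the 6-byte ethernet
 * address into a single uint64_t mod_hash key, conceptually:
 *
 *	key = 0;
 *	for (i = 0; i < ETHERADDRL; i++)
 *		key = (key << 8) | ea.ether_addr_octet[i];
 *
 * so the same address always yields the same key for the insert,
 * lookup and delete operations below.
 */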
8032 
8033 /*
8034  * Remove an entry from FDB.
8035  * Returns 0 on success, 1 on failure.
8036  */
8037 static int
8038 vsw_del_fdb(vsw_t *vswp, vsw_port_t *port)
8039 {
8040 	uint64_t	addr = 0;
8041 
8042 	D1(vswp, "%s: enter", __func__);
8043 
8044 	KEY_HASH(addr, port->p_macaddr);
8045 
8046 	D2(vswp, "%s: key = 0x%llx", __func__, addr);
8047 
8048 	(void) mod_hash_destroy(vswp->fdb, (mod_hash_key_t)addr);
8049 
8050 	D1(vswp, "%s: exit", __func__);
8051 
8052 	return (0);
8053 }
8054 
8055 /*
8056  * Search fdb for a given mac address.
8057  * Returns pointer to the entry if found, else returns NULL.
8058  */
8059 static vsw_port_t *
8060 vsw_lookup_fdb(vsw_t *vswp, struct ether_header *ehp)
8061 {
8062 	uint64_t	key = 0;
8063 	vsw_port_t	*port = NULL;
8064 
8065 	D1(vswp, "%s: enter", __func__);
8066 
8067 	KEY_HASH(key, ehp->ether_dhost);
8068 
8069 	D2(vswp, "%s: key = 0x%llx", __func__, key);
8070 
8071 	if (mod_hash_find(vswp->fdb, (mod_hash_key_t)key,
8072 	    (mod_hash_val_t *)&port) != 0) {
8073 		D2(vswp, "%s: no port found", __func__);
8074 		return (NULL);
8075 	}
8076 
8077 	D1(vswp, "%s: exit", __func__);
8078 
8079 	return (port);
8080 }
8081 
8082 /*
8083  * Add or remove multicast address(es).
8084  *
8085  * Returns 0 on success, 1 on failure.
8086  */
8087 static int
8088 vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port)
8089 {
8090 	mcst_addr_t		*mcst_p = NULL;
8091 	vsw_t			*vswp = port->p_vswp;
8092 	uint64_t		addr = 0x0;
8093 	int			i;
8094 
8095 	D1(vswp, "%s: enter", __func__);
8096 
8097 	D2(vswp, "%s: %d addresses", __func__, mcst_pkt->count);
8098 
8099 	for (i = 0; i < mcst_pkt->count; i++) {
8100 		/*
8101 		 * Convert address into form that can be used
8102 		 * as hash table key.
8103 		 */
8104 		KEY_HASH(addr, mcst_pkt->mca[i]);
8105 
8106 		/*
8107 		 * Add or delete the specified address/port combination.
8108 		 */
8109 		if (mcst_pkt->set == 0x1) {
8110 			D3(vswp, "%s: adding multicast address 0x%llx for "
8111 			    "port %ld", __func__, addr, port->p_instance);
8112 			if (vsw_add_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
8113 				/*
8114 				 * Update the list of multicast
8115 				 * addresses contained within the
8116 				 * port structure to include this new
8117 				 * one.
8118 				 */
8119 				mcst_p = kmem_zalloc(sizeof (mcst_addr_t),
8120 				    KM_NOSLEEP);
8121 				if (mcst_p == NULL) {
8122 					DERR(vswp, "%s: unable to alloc mem",
8123 					    __func__);
8124 					(void) vsw_del_mcst(vswp,
8125 					    VSW_VNETPORT, addr, port);
8126 					return (1);
8127 				}
8128 
8129 				mcst_p->nextp = NULL;
8130 				mcst_p->addr = addr;
8131 				ether_copy(&mcst_pkt->mca[i], &mcst_p->mca);
8132 
8133 				/*
8134 				 * Program the address into HW. If the addr
8135 				 * has already been programmed then the MAC
8136 				 * just increments a ref counter (which is
8137 				 * used when the address is being deleted)
8138 				 */
8139 				mutex_enter(&vswp->mac_lock);
8140 				if (vswp->mh != NULL) {
8141 					if (mac_multicst_add(vswp->mh,
8142 					    (uchar_t *)&mcst_pkt->mca[i])) {
8143 						mutex_exit(&vswp->mac_lock);
8144 						cmn_err(CE_WARN, "!vsw%d: "
8145 						    "unable to add multicast "
8146 						    "address: %s\n",
8147 						    vswp->instance,
8148 						    ether_sprintf((void *)
8149 						    &mcst_p->mca));
8150 						(void) vsw_del_mcst(vswp,
8151 						    VSW_VNETPORT, addr, port);
8152 						kmem_free(mcst_p,
8153 						    sizeof (*mcst_p));
8154 						return (1);
8155 					}
8156 					mcst_p->mac_added = B_TRUE;
8157 				}
8158 				mutex_exit(&vswp->mac_lock);
8159 
8160 				mutex_enter(&port->mca_lock);
8161 				mcst_p->nextp = port->mcap;
8162 				port->mcap = mcst_p;
8163 				mutex_exit(&port->mca_lock);
8164 
8165 			} else {
8166 				DERR(vswp, "%s: error adding multicast "
8167 				    "address 0x%llx for port %ld",
8168 				    __func__, addr, port->p_instance);
8169 				return (1);
8170 			}
8171 		} else {
8172 			/*
8173 			 * Delete an entry from the multicast hash
8174 			 * table and update the address list
8175 			 * appropriately.
8176 			 */
8177 			if (vsw_del_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
8178 				D3(vswp, "%s: deleting multicast address "
8179 				    "0x%llx for port %ld", __func__, addr,
8180 				    port->p_instance);
8181 
8182 				mcst_p = vsw_del_addr(VSW_VNETPORT, port, addr);
8183 				ASSERT(mcst_p != NULL);
8184 
8185 				/*
8186 				 * Remove the address from HW. The address
8187 				 * will actually only be removed once the ref
8188 				 * count within the MAC layer has dropped to
8189 				 * zero. I.e. we can safely call this fn even
8190 				 * if other ports are interested in this
8191 				 * address.
8192 				 */
8193 				mutex_enter(&vswp->mac_lock);
8194 				if (vswp->mh != NULL && mcst_p->mac_added) {
8195 					if (mac_multicst_remove(vswp->mh,
8196 					    (uchar_t *)&mcst_pkt->mca[i])) {
8197 						mutex_exit(&vswp->mac_lock);
8198 						cmn_err(CE_WARN, "!vsw%d: "
8199 						    "unable to remove mcast "
8200 						    "address: %s\n",
8201 						    vswp->instance,
8202 						    ether_sprintf((void *)
8203 						    &mcst_p->mca));
8204 						kmem_free(mcst_p,
8205 						    sizeof (*mcst_p));
8206 						return (1);
8207 					}
8208 					mcst_p->mac_added = B_FALSE;
8209 				}
8210 				mutex_exit(&vswp->mac_lock);
8211 				kmem_free(mcst_p, sizeof (*mcst_p));
8212 
8213 			} else {
8214 				DERR(vswp, "%s: error deleting multicast "
8215 				    "addr 0x%llx for port %ld",
8216 				    __func__, addr, port->p_instance);
8217 				return (1);
8218 			}
8219 		}
8220 	}
8221 	D1(vswp, "%s: exit", __func__);
8222 	return (0);
8223 }
8224 
8225 /*
8226  * Add a new multicast entry.
8227  *
8228  * Search hash table based on address. If match found then
8229  * update associated val (which is chain of ports), otherwise
8230  * create new key/val (addr/port) pair and insert into table.
8231  */
8232 static int
8233 vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
8234 {
8235 	int		dup = 0;
8236 	int		rv = 0;
8237 	mfdb_ent_t	*ment = NULL;
8238 	mfdb_ent_t	*tmp_ent = NULL;
8239 	mfdb_ent_t	*new_ent = NULL;
8240 	void		*tgt = NULL;
8241 
8242 	if (devtype == VSW_VNETPORT) {
8243 		/*
8244 		 * Being invoked from a vnet.
8245 		 */
8246 		ASSERT(arg != NULL);
8247 		tgt = arg;
8248 		D2(NULL, "%s: port %d : address 0x%llx", __func__,
8249 		    ((vsw_port_t *)arg)->p_instance, addr);
8250 	} else {
8251 		/*
8252 		 * We are being invoked via the m_multicst mac entry
8253 		 * point.
8254 		 */
8255 		D2(NULL, "%s: address 0x%llx", __func__, addr);
8256 		tgt = (void *)vswp;
8257 	}
8258 
8259 	WRITE_ENTER(&vswp->mfdbrw);
8260 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
8261 	    (mod_hash_val_t *)&ment) != 0) {
8262 
8263 		/* address not currently in table */
8264 		ment = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
8265 		ment->d_addr = (void *)tgt;
8266 		ment->d_type = devtype;
8267 		ment->nextp = NULL;
8268 
8269 		if (mod_hash_insert(vswp->mfdb, (mod_hash_key_t)addr,
8270 		    (mod_hash_val_t)ment) != 0) {
8271 			DERR(vswp, "%s: hash table insertion failed", __func__);
8272 			kmem_free(ment, sizeof (mfdb_ent_t));
8273 			rv = 1;
8274 		} else {
8275 			D2(vswp, "%s: added initial entry for 0x%llx to "
8276 			    "table", __func__, addr);
8277 		}
8278 	} else {
8279 		/*
8280 		 * Address in table. Check to see if specified port
8281 		 * is already associated with the address. If not add
8282 		 * it now.
8283 		 */
8284 		tmp_ent = ment;
8285 		while (tmp_ent != NULL) {
8286 			if (tmp_ent->d_addr == (void *)tgt) {
8287 				if (devtype == VSW_VNETPORT) {
8288 					DERR(vswp, "%s: duplicate port entry "
8289 					    "found for portid %ld and key "
8290 					    "0x%llx", __func__,
8291 					    ((vsw_port_t *)arg)->p_instance,
8292 					    addr);
8293 				} else {
8294 					DERR(vswp, "%s: duplicate entry found "
8295 					    "for key 0x%llx", __func__, addr);
8296 				}
8297 				rv = 1;
8298 				dup = 1;
8299 				break;
8300 			}
8301 			tmp_ent = tmp_ent->nextp;
8302 		}
8303 
8304 		/*
8305 		 * Port not on list so add it to end now.
8306 		 */
8307 		if (dup == 0) {
8308 			D2(vswp, "%s: added entry for 0x%llx to table",
8309 			    __func__, addr);
8310 			new_ent = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
8311 			new_ent->d_addr = (void *)tgt;
8312 			new_ent->d_type = devtype;
8313 			new_ent->nextp = NULL;
8314 
8315 			tmp_ent = ment;
8316 			while (tmp_ent->nextp != NULL)
8317 				tmp_ent = tmp_ent->nextp;
8318 
8319 			tmp_ent->nextp = new_ent;
8320 		}
8321 	}
8322 
8323 	RW_EXIT(&vswp->mfdbrw);
8324 	return (rv);
8325 }
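
/*
 * The resulting mFDB layout is a hash table keyed by multicast
 * address, each value being the head of a singly-linked chain of
 * mfdb_ent_t entries, one per interested destination; a destination
 * is either a vnet port (VSW_VNETPORT) or this vsw instance itself
 * (VSW_LOCALDEV). The multicast forwarding loop walks this chain to
 * replicate a frame to every registered listener.
 */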
8326 
8327 /*
8328  * Remove a multicast entry from the hashtable.
8329  *
8330  * Search hash table based on address. If match found, scan
8331  * list of ports associated with address. If specified port
8332  * found remove it from list.
8333  */
8334 static int
8335 vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
8336 {
8337 	mfdb_ent_t	*ment = NULL;
8338 	mfdb_ent_t	*curr_p, *prev_p;
8339 	void		*tgt = NULL;
8340 
8341 	D1(vswp, "%s: enter", __func__);
8342 
8343 	if (devtype == VSW_VNETPORT) {
8344 		tgt = (vsw_port_t *)arg;
8345 		D2(vswp, "%s: removing port %d from mFDB for address"
8346 		    " 0x%llx", __func__, ((vsw_port_t *)tgt)->p_instance, addr);
8347 	} else {
8348 		D2(vswp, "%s: removing entry", __func__);
8349 		tgt = (void *)vswp;
8350 	}
8351 
8352 	WRITE_ENTER(&vswp->mfdbrw);
8353 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
8354 	    (mod_hash_val_t *)&ment) != 0) {
8355 		D2(vswp, "%s: address 0x%llx not in table", __func__, addr);
8356 		RW_EXIT(&vswp->mfdbrw);
8357 		return (1);
8358 	}
8359 
8360 	prev_p = curr_p = ment;
8361 
8362 	while (curr_p != NULL) {
8363 		if (curr_p->d_addr == (void *)tgt) {
8364 			if (devtype == VSW_VNETPORT) {
8365 				D2(vswp, "%s: port %d found", __func__,
8366 				    ((vsw_port_t *)tgt)->p_instance);
8367 			} else {
8368 				D2(vswp, "%s: instance found", __func__);
8369 			}
8370 
8371 			if (prev_p == curr_p) {
8372 				/*
8373 				 * head of list, if no other element is in
8374 				 * list then destroy this entry, otherwise
8375 				 * just replace it with updated value.
8376 				 */
8377 				ment = curr_p->nextp;
8378 				if (ment == NULL) {
8379 					(void) mod_hash_destroy(vswp->mfdb,
8380 					    (mod_hash_key_t)addr);
8381 				} else {
8382 					(void) mod_hash_replace(vswp->mfdb,
8383 					    (mod_hash_key_t)addr,
8384 					    (mod_hash_val_t)ment);
8385 				}
8386 			} else {
8387 				/*
8388 				 * Not head of list, no need to do
8389 				 * replacement, just adjust list pointers.
8390 				 */
8391 				prev_p->nextp = curr_p->nextp;
8392 			}
8393 			break;
8394 		}
8395 
8396 		prev_p = curr_p;
8397 		curr_p = curr_p->nextp;
8398 	}
8399 
8400 	RW_EXIT(&vswp->mfdbrw);
8401 
8402 	D1(vswp, "%s: exit", __func__);
8403 
8404 	if (curr_p == NULL)
8405 		return (1);
8406 	kmem_free(curr_p, sizeof (mfdb_ent_t));
8407 	return (0);
8408 }
8409 
8410 /*
8411  * Port is being deleted, but has registered an interest in one
8412  * or more multicast groups. Using the list of addresses maintained
8413  * within the port structure find the appropriate entry in the hash
8414  * table and remove this port from the list of interested ports.
8415  */
8416 static void
8417 vsw_del_mcst_port(vsw_port_t *port)
8418 {
8419 	mcst_addr_t	*mcap = NULL;
8420 	vsw_t		*vswp = port->p_vswp;
8421 
8422 	D1(vswp, "%s: enter", __func__);
8423 
8424 	mutex_enter(&port->mca_lock);
8425 
8426 	while ((mcap = port->mcap) != NULL) {
8427 
8428 		port->mcap = mcap->nextp;
8429 
8430 		mutex_exit(&port->mca_lock);
8431 
8432 		(void) vsw_del_mcst(vswp, VSW_VNETPORT,
8433 		    mcap->addr, port);
8434 
8435 		/*
8436 		 * Remove the address from HW. The address
8437 		 * will actually only be removed once the ref
8438 		 * count within the MAC layer has dropped to
8439 		 * zero. I.e. we can safely call this fn even
8440 		 * if other ports are interested in this
8441 		 * address.
8442 		 */
8443 		mutex_enter(&vswp->mac_lock);
8444 		if (vswp->mh != NULL && mcap->mac_added) {
8445 			(void) mac_multicst_remove(vswp->mh,
8446 			    (uchar_t *)&mcap->mca);
8447 		}
8448 		mutex_exit(&vswp->mac_lock);
8449 
8450 		kmem_free(mcap, sizeof (*mcap));
8451 
8452 		mutex_enter(&port->mca_lock);
8453 
8454 	}
8455 
8456 	mutex_exit(&port->mca_lock);
8457 
8458 	D1(vswp, "%s: exit", __func__);
8459 }
8460 
8461 /*
8462  * This vsw instance is detaching, but has registered an interest in one
8463  * or more multicast groups. Using the list of addresses maintained
8464  * within the vsw structure find the appropriate entry in the hash
8465  * table and remove this instance from the list of interested ports.
8466  */
8467 static void
8468 vsw_del_mcst_vsw(vsw_t *vswp)
8469 {
8470 	mcst_addr_t	*next_p = NULL;
8471 
8472 	D1(vswp, "%s: enter", __func__);
8473 
8474 	mutex_enter(&vswp->mca_lock);
8475 
8476 	while (vswp->mcap != NULL) {
8477 		DERR(vswp, "%s: deleting addr 0x%llx",
8478 		    __func__, vswp->mcap->addr);
8479 		(void) vsw_del_mcst(vswp, VSW_LOCALDEV, vswp->mcap->addr, NULL);
8480 
8481 		next_p = vswp->mcap->nextp;
8482 		kmem_free(vswp->mcap, sizeof (mcst_addr_t));
8483 		vswp->mcap = next_p;
8484 	}
8485 
8486 	vswp->mcap = NULL;
8487 	mutex_exit(&vswp->mca_lock);
8488 
8489 	D1(vswp, "%s: exit", __func__);
8490 }
8491 
8492 /*
8493  * Remove the specified address from the list of addresses maintained
8494  * by the given port or vsw instance (selected by devtype).
8495  */
8496 static mcst_addr_t *
8497 vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr)
8498 {
8499 	vsw_t		*vswp = NULL;
8500 	vsw_port_t	*port = NULL;
8501 	mcst_addr_t	*prev_p = NULL;
8502 	mcst_addr_t	*curr_p = NULL;
8503 
8504 	D1(NULL, "%s: enter : devtype %d : addr 0x%llx",
8505 	    __func__, devtype, addr);
8506 
8507 	if (devtype == VSW_VNETPORT) {
8508 		port = (vsw_port_t *)arg;
8509 		mutex_enter(&port->mca_lock);
8510 		prev_p = curr_p = port->mcap;
8511 	} else {
8512 		vswp = (vsw_t *)arg;
8513 		mutex_enter(&vswp->mca_lock);
8514 		prev_p = curr_p = vswp->mcap;
8515 	}
8516 
8517 	while (curr_p != NULL) {
8518 		if (curr_p->addr == addr) {
8519 			D2(NULL, "%s: address found", __func__);
8520 			/* match found */
8521 			if (prev_p == curr_p) {
8522 				/* list head */
8523 				if (devtype == VSW_VNETPORT)
8524 					port->mcap = curr_p->nextp;
8525 				else
8526 					vswp->mcap = curr_p->nextp;
8527 			} else {
8528 				prev_p->nextp = curr_p->nextp;
8529 			}
8530 			break;
8531 		} else {
8532 			prev_p = curr_p;
8533 			curr_p = curr_p->nextp;
8534 		}
8535 	}
8536 
8537 	if (devtype == VSW_VNETPORT)
8538 		mutex_exit(&port->mca_lock);
8539 	else
8540 		mutex_exit(&vswp->mca_lock);
8541 
8542 	D1(NULL, "%s: exit", __func__);
8543 
8544 	return (curr_p);
8545 }
8546 
8547 /*
8548  * Creates a descriptor ring (dring) and links it into the
8549  * list of outbound drings for this channel.
8550  *
8551  * Returns NULL if creation failed.
8552  */
8553 static dring_info_t *
8554 vsw_create_dring(vsw_ldc_t *ldcp)
8555 {
8556 	vsw_private_desc_t	*priv_addr = NULL;
8557 	vsw_t			*vswp = ldcp->ldc_vswp;
8558 	ldc_mem_info_t		minfo;
8559 	dring_info_t		*dp, *tp;
8560 	int			i;
8561 
8562 	dp = (dring_info_t *)kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
8563 
8564 	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
8565 
8566 	/* create public section of ring */
8567 	if ((ldc_mem_dring_create(VSW_RING_NUM_EL,
8568 	    VSW_PUB_SIZE, &dp->handle)) != 0) {
8569 
8570 		DERR(vswp, "vsw_create_dring(%lld): ldc dring create "
8571 		    "failed", ldcp->ldc_id);
8572 		goto create_fail_exit;
8573 	}
8574 
8575 	ASSERT(dp->handle != NULL);
8576 
8577 	/*
8578 	 * Get the base address of the public section of the ring.
8579 	 */
8580 	if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
8581 		DERR(vswp, "vsw_create_dring(%lld): dring info failed\n",
8582 		    ldcp->ldc_id);
8583 		goto dring_fail_exit;
8584 	} else {
8585 		ASSERT(minfo.vaddr != 0);
8586 		dp->pub_addr = minfo.vaddr;
8587 	}
8588 
8589 	dp->num_descriptors = VSW_RING_NUM_EL;
8590 	dp->descriptor_size = VSW_PUB_SIZE;
8591 	dp->options = VIO_TX_DRING;
8592 	dp->ncookies = 1;	/* guaranteed by ldc */
8593 
8594 	/*
8595 	 * create private portion of ring
8596 	 */
8597 	dp->priv_addr = (vsw_private_desc_t *)kmem_zalloc(
8598 	    (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL), KM_SLEEP);
8599 
8600 	if (vsw_setup_ring(ldcp, dp)) {
8601 		DERR(vswp, "%s: unable to setup ring", __func__);
8602 		goto dring_fail_exit;
8603 	}
8604 
8605 	/* haven't used any descriptors yet */
8606 	dp->end_idx = 0;
8607 	dp->last_ack_recv = -1;
8608 
8609 	/* bind dring to the channel */
8610 	if ((ldc_mem_dring_bind(ldcp->ldc_handle, dp->handle,
8611 	    LDC_SHADOW_MAP, LDC_MEM_RW,
8612 	    &dp->cookie[0], &dp->ncookies)) != 0) {
8613 		DERR(vswp, "vsw_create_dring: unable to bind to channel "
8614 		    "%lld", ldcp->ldc_id);
8615 		goto dring_fail_exit;
8616 	}
8617 
8618 	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
8619 	dp->restart_reqd = B_TRUE;
8620 
8621 	/*
8622 	 * Only ever create rings for outgoing lane. Link it onto
8623 	 * end of list.
8624 	 */
8625 	WRITE_ENTER(&ldcp->lane_out.dlistrw);
8626 	if (ldcp->lane_out.dringp == NULL) {
8627 		D2(vswp, "vsw_create_dring: adding first outbound ring");
8628 		ldcp->lane_out.dringp = dp;
8629 	} else {
8630 		tp = ldcp->lane_out.dringp;
8631 		while (tp->next != NULL)
8632 			tp = tp->next;
8633 
8634 		tp->next = dp;
8635 	}
8636 	RW_EXIT(&ldcp->lane_out.dlistrw);
8637 
8638 	return (dp);
8639 
8640 dring_fail_exit:
8641 	(void) ldc_mem_dring_destroy(dp->handle);
8642 
8643 create_fail_exit:
8644 	if (dp->priv_addr != NULL) {
8645 		priv_addr = dp->priv_addr;
8646 		for (i = 0; i < VSW_RING_NUM_EL; i++) {
8647 			if (priv_addr->memhandle != NULL)
8648 				(void) ldc_mem_free_handle(
8649 				    priv_addr->memhandle);
8650 			priv_addr++;
8651 		}
8652 		kmem_free(dp->priv_addr,
8653 		    (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL));
8654 	}
8655 	mutex_destroy(&dp->dlock);
8656 
8657 	kmem_free(dp, sizeof (dring_info_t));
8658 	return (NULL);
8659 }
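
/*
 * Note: the ring is bound with LDC_SHADOW_MAP, which is understood
 * to mean the importing peer works on a shadow copy kept in sync by
 * LDC through explicit copy operations, rather than mapping our
 * memory directly.
 */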
8660 
8661 /*
8662  * Create a ring consisting of just a private portion and link
8663  * it into the list of rings for the outbound lane.
8664  *
8665  * This type of ring is used primarily for temporary data
8666  * storage (i.e. as data buffers).
8667  */
8668 void
8669 vsw_create_privring(vsw_ldc_t *ldcp)
8670 {
8671 	dring_info_t		*dp, *tp;
8672 	vsw_t			*vswp = ldcp->ldc_vswp;
8673 
8674 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
8675 
8676 	dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
8677 
8678 	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
8679 
8680 	/* no public section */
8681 	dp->pub_addr = NULL;
8682 
8683 	dp->priv_addr = kmem_zalloc(
8684 	    (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL), KM_SLEEP);
8685 
8686 	dp->num_descriptors = VSW_RING_NUM_EL;
8687 
8688 	if (vsw_setup_ring(ldcp, dp)) {
8689 		DERR(vswp, "%s: setup of ring failed", __func__);
8690 		kmem_free(dp->priv_addr,
8691 		    (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL));
8692 		mutex_destroy(&dp->dlock);
8693 		kmem_free(dp, sizeof (dring_info_t));
8694 		return;
8695 	}
8696 
8697 	/* haven't used any descriptors yet */
8698 	dp->end_idx = 0;
8699 
8700 	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
8701 	dp->restart_reqd = B_TRUE;
8702 
8703 	/*
8704 	 * Only ever create rings for outgoing lane. Link it onto
8705 	 * end of list.
8706 	 */
8707 	WRITE_ENTER(&ldcp->lane_out.dlistrw);
8708 	if (ldcp->lane_out.dringp == NULL) {
8709 		D2(vswp, "%s: adding first outbound privring", __func__);
8710 		ldcp->lane_out.dringp = dp;
8711 	} else {
8712 		tp = ldcp->lane_out.dringp;
8713 		while (tp->next != NULL)
8714 			tp = tp->next;
8715 
8716 		tp->next = dp;
8717 	}
8718 	RW_EXIT(&ldcp->lane_out.dlistrw);
8719 
8720 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
8721 }
8722 
8723 /*
8724  * Setup the descriptors in the dring. Returns 0 on success, 1 on
8725  * failure.
8726  */
8727 int
8728 vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp)
8729 {
8730 	vnet_public_desc_t	*pub_addr = NULL;
8731 	vsw_private_desc_t	*priv_addr = NULL;
8732 	vsw_t			*vswp = ldcp->ldc_vswp;
8733 	uint64_t		*tmpp;
8734 	uint64_t		offset = 0;
8735 	uint32_t		ncookies = 0;
8736 	static char		*name = "vsw_setup_ring";
8737 	int			i, j, nc, rv;
8738 
8739 	priv_addr = dp->priv_addr;
8740 	pub_addr = dp->pub_addr;
8741 
8742 	/* public section may be null but private should never be */
8743 	ASSERT(priv_addr != NULL);
8744 
8745 	/*
8746 	 * Allocate the region of memory which will be used to hold
8747 	 * the data the descriptors will refer to.
8748 	 */
8749 	dp->data_sz = (VSW_RING_NUM_EL * VSW_RING_EL_DATA_SZ);
8750 	dp->data_addr = kmem_alloc(dp->data_sz, KM_SLEEP);
8751 
8752 	D2(vswp, "%s: allocated %lld bytes at 0x%llx\n", name,
8753 	    dp->data_sz, dp->data_addr);
8754 
8755 	tmpp = (uint64_t *)dp->data_addr;
8756 	offset = VSW_RING_EL_DATA_SZ / sizeof (*tmpp);
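	/*
	 * offset is in uint64_t units since tmpp is advanced by
	 * pointer arithmetic below; e.g. assuming VSW_RING_EL_DATA_SZ
	 * were 2048, offset would be 256 and each descriptor's data
	 * buffer would start 2048 bytes after the previous one.
	 */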
8757 
8758 	/*
8759 	 * Initialise some of the private and public (if they exist)
8760 	 * descriptor fields.
8761 	 */
8762 	for (i = 0; i < VSW_RING_NUM_EL; i++) {
8763 		mutex_init(&priv_addr->dstate_lock, NULL, MUTEX_DRIVER, NULL);
8764 
8765 		if ((ldc_mem_alloc_handle(ldcp->ldc_handle,
8766 		    &priv_addr->memhandle)) != 0) {
8767 			DERR(vswp, "%s: alloc mem handle failed", name);
8768 			goto setup_ring_cleanup;
8769 		}
8770 
8771 		priv_addr->datap = (void *)tmpp;
8772 
8773 		rv = ldc_mem_bind_handle(priv_addr->memhandle,
8774 		    (caddr_t)priv_addr->datap, VSW_RING_EL_DATA_SZ,
8775 		    LDC_SHADOW_MAP, LDC_MEM_R|LDC_MEM_W,
8776 		    &(priv_addr->memcookie[0]), &ncookies);
8777 		if (rv != 0) {
8778 			DERR(vswp, "%s(%lld): ldc_mem_bind_handle failed "
8779 			    "(rv %d)", name, ldcp->ldc_id, rv);
8780 			goto setup_ring_cleanup;
8781 		}
8782 		priv_addr->bound = 1;
8783 
8784 		D2(vswp, "%s: %d: memcookie 0 : addr 0x%llx : size 0x%llx",
8785 		    name, i, priv_addr->memcookie[0].addr,
8786 		    priv_addr->memcookie[0].size);
8787 
8788 		if (ncookies > (uint32_t)VSW_MAX_COOKIES) {
8789 			DERR(vswp, "%s(%lld) ldc_mem_bind_handle returned "
8790 			    "invalid num of cookies (%d) for size 0x%llx",
8791 			    name, ldcp->ldc_id, ncookies, VSW_RING_EL_DATA_SZ);
8792 
8793 			goto setup_ring_cleanup;
8794 		} else {
8795 			for (j = 1; j < ncookies; j++) {
8796 				rv = ldc_mem_nextcookie(priv_addr->memhandle,
8797 				    &(priv_addr->memcookie[j]));
8798 				if (rv != 0) {
8799 					DERR(vswp, "%s: ldc_mem_nextcookie "
8800 					    "failed rv (%d)", name, rv);
8801 					goto setup_ring_cleanup;
8802 				}
8803 				D3(vswp, "%s: memcookie %d : addr 0x%llx : "
8804 				    "size 0x%llx", name, j,
8805 				    priv_addr->memcookie[j].addr,
8806 				    priv_addr->memcookie[j].size);
8807 			}
8808 
8809 		}
8810 		priv_addr->ncookies = ncookies;
8811 		priv_addr->dstate = VIO_DESC_FREE;
8812 
8813 		if (pub_addr != NULL) {
8814 
8815 			/* link pub and private sides */
8816 			priv_addr->descp = pub_addr;
8817 
8818 			pub_addr->ncookies = priv_addr->ncookies;
8819 
8820 			for (nc = 0; nc < pub_addr->ncookies; nc++) {
8821 				bcopy(&priv_addr->memcookie[nc],
8822 				    &pub_addr->memcookie[nc],
8823 				    sizeof (ldc_mem_cookie_t));
8824 			}
8825 
8826 			pub_addr->hdr.dstate = VIO_DESC_FREE;
8827 			pub_addr++;
8828 		}
8829 
8830 		/*
8831 		 * move to next element in the dring and the next
8832 		 * position in the data buffer.
8833 		 */
8834 		priv_addr++;
8835 		tmpp += offset;
8836 	}
8837 
8838 	return (0);
8839 
8840 setup_ring_cleanup:
8841 	priv_addr = dp->priv_addr;
8842 
8843 	for (j = 0; j < i; j++) {
8844 		(void) ldc_mem_unbind_handle(priv_addr->memhandle);
8845 		(void) ldc_mem_free_handle(priv_addr->memhandle);
8846 
8847 		mutex_destroy(&priv_addr->dstate_lock);
8848 
8849 		priv_addr++;
8850 	}
8851 	kmem_free(dp->data_addr, dp->data_sz);
8852 
8853 	return (1);
8854 }
8855 
8856 /*
8857  * Searches the private section of a ring for a free descriptor,
8858  * starting at the location of the last free descriptor found
8859  * previously.
8860  *
8861  * Returns 0 if a free descriptor is available (updating the state
8862  * of the private descriptor to VIO_DESC_READY); otherwise returns 1.
8863  *
8864  * FUTURE: might need to return contiguous range of descriptors
8865  * as dring info msg assumes all will be contiguous.
8866  */
8867 static int
8868 vsw_dring_find_free_desc(dring_info_t *dringp,
8869 		vsw_private_desc_t **priv_p, int *idx)
8870 {
8871 	vsw_private_desc_t	*addr = NULL;
8872 	int			num = VSW_RING_NUM_EL;
8873 	int			ret = 1;
8874 
8875 	D1(NULL, "%s enter\n", __func__);
8876 
8877 	ASSERT(dringp->priv_addr != NULL);
8878 
8879 	D2(NULL, "%s: searching ring, dringp 0x%llx : start pos %lld",
8880 	    __func__, dringp, dringp->end_idx);
8881 
8882 	addr = (vsw_private_desc_t *)dringp->priv_addr + dringp->end_idx;
8883 
8884 	mutex_enter(&addr->dstate_lock);
8885 	if (addr->dstate == VIO_DESC_FREE) {
8886 		addr->dstate = VIO_DESC_READY;
8887 		*priv_p = addr;
8888 		*idx = dringp->end_idx;
8889 		dringp->end_idx = (dringp->end_idx + 1) % num;
8890 		ret = 0;
8891 
8892 	}
8893 	mutex_exit(&addr->dstate_lock);
8894 
8895 	/* ring full */
8896 	if (ret == 1) {
8897 		D2(NULL, "%s: no desc free: started at %d", __func__,
8898 		    dringp->end_idx);
8899 	}
8900 
8901 	D1(NULL, "%s: exit\n", __func__);
8902 
8903 	return (ret);
8904 }
8905 
8906 /*
8907  * Map from a dring identifier to the ring itself. Returns
8908  * pointer to ring or NULL if no match found.
8909  *
8910  * Should be called with dlistrw rwlock held as reader.
8911  */
8912 static dring_info_t *
8913 vsw_ident2dring(lane_t *lane, uint64_t ident)
8914 {
8915 	dring_info_t	*dp = NULL;
8916 
8917 	if ((dp = lane->dringp) == NULL) {
8918 		return (NULL);
8919 	} else {
8920 		if (dp->ident == ident)
8921 			return (dp);
8922 
8923 		while (dp != NULL) {
8924 			if (dp->ident == ident)
8925 				break;
8926 			dp = dp->next;
8927 		}
8928 	}
8929 
8930 	return (dp);
8931 }
8932 
8933 /*
8934  * Set the default lane attributes. These are copied into
8935  * the attr msg we send to our peer. If they are not acceptable
8936  * then (currently) the handshake ends.
8937  */
8938 static void
8939 vsw_set_lane_attr(vsw_t *vswp, lane_t *lp)
8940 {
8941 	bzero(lp, sizeof (lane_t));
8942 
8943 	READ_ENTER(&vswp->if_lockrw);
8944 	ether_copy(&(vswp->if_addr), &(lp->addr));
8945 	RW_EXIT(&vswp->if_lockrw);
8946 
8947 	lp->mtu = VSW_MTU;
8948 	lp->addr_type = ADDR_TYPE_MAC;
8949 	lp->xfer_mode = VIO_DRING_MODE;
8950 	lp->ack_freq = 0;	/* for shared mode */
8951 
8952 	mutex_enter(&lp->seq_lock);
8953 	lp->seq_num = VNET_ISS;
8954 	mutex_exit(&lp->seq_lock);
8955 }
8956 
8957 /*
8958  * Verify that the attributes are acceptable.
8959  *
8960  * FUTURE: If some attributes are not acceptable, change them
8961  * to our desired values.
8962  */
8963 static int
8964 vsw_check_attr(vnet_attr_msg_t *pkt, vsw_port_t *port)
8965 {
8966 	int	ret = 0;
8967 
8968 	D1(NULL, "vsw_check_attr enter\n");
8969 
8970 	/*
8971 	 * Note we currently only support in-band descriptors
8972 	 * and descriptor rings, not packet based transfer (VIO_PKT_MODE)
8973 	 */
8974 	if ((pkt->xfer_mode != VIO_DESC_MODE) &&
8975 	    (pkt->xfer_mode != VIO_DRING_MODE)) {
8976 		D2(NULL, "vsw_check_attr: unknown mode %x\n", pkt->xfer_mode);
8977 		ret = 1;
8978 	}
8979 
8980 	/* Only support MAC addresses at the moment. */
8981 	if ((pkt->addr_type != ADDR_TYPE_MAC) || (pkt->addr == 0)) {
8982 		D2(NULL, "vsw_check_attr: invalid addr_type %x, "
8983 		    "or address 0x%llx\n", pkt->addr_type, pkt->addr);
8984 		ret = 1;
8985 	}
8986 
8987 	/*
8988 	 * MAC address supplied by device should match that stored
8989 	 * in the vsw-port OBP node. Need to decide what to do if they
8990 	 * don't match, for the moment just warn but don't fail.
8991 	 */
8992 	if (bcmp(&pkt->addr, &port->p_macaddr, ETHERADDRL) != 0) {
8993 		DERR(NULL, "vsw_check_attr: device supplied address "
8994 		    "0x%llx doesn't match node address 0x%llx\n",
8995 		    pkt->addr, port->p_macaddr);
8996 	}
8997 
8998 	/*
8999  * Ack freq only makes sense in pkt mode; in shared
9000 	 * mode the ring descriptors say whether or not to
9001 	 * send back an ACK.
9002 	 */
9003 	if ((pkt->xfer_mode == VIO_DRING_MODE) &&
9004 	    (pkt->ack_freq > 0)) {
9005 		D2(NULL, "vsw_check_attr: non-zero ack freq "
9006 		    "in SHM mode\n");
9007 		ret = 1;
9008 	}
9009 
9010 	/*
9011 	 * Note: for the moment we only support ETHER
9012 	 * frames. This may change in the future.
9013 	 */
9014 	if ((pkt->mtu > VSW_MTU) || (pkt->mtu <= 0)) {
9015 		D2(NULL, "vsw_check_attr: invalid MTU (0x%llx)\n",
9016 		    pkt->mtu);
9017 		ret = 1;
9018 	}
9019 
9020 	D1(NULL, "vsw_check_attr exit\n");
9021 
9022 	return (ret);
9023 }
9024 
9025 /*
9026  * Returns 1 if there is a problem, 0 otherwise.
9027  */
9028 static int
9029 vsw_check_dring_info(vio_dring_reg_msg_t *pkt)
9030 {
9031 	int	ret = 0;
9034 
9035 	D1(NULL, "vsw_check_dring_info enter\n");
9036 
9037 	if ((pkt->num_descriptors == 0) ||
9038 	    (pkt->descriptor_size == 0) ||
9039 	    (pkt->ncookies != 1)) {
9040 		DERR(NULL, "vsw_check_dring_info: invalid dring msg");
9041 		ret = 1;
9042 	}
9043 
9044 	D1(NULL, "vsw_check_dring_info exit\n");
9045 
9046 	return (ret);
9047 }
9048 
9049 /*
9050  * Returns 1 if two memory cookies match. Otherwise returns 0.
9051  */
9052 static int
9053 vsw_mem_cookie_match(ldc_mem_cookie_t *m1, ldc_mem_cookie_t *m2)
9054 {
9055 	if ((m1->addr != m2->addr) ||
9056 	    (m1->size != m2->size)) {
9057 		return (0);
9058 	} else {
9059 		return (1);
9060 	}
9061 }
9062 
9063 /*
9064  * Returns 1 if ring described in reg message matches that
9065  * described by dring_info structure. Otherwise returns 0.
9066  */
9067 static int
9068 vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg)
9069 {
9070 	if ((msg->descriptor_size != dp->descriptor_size) ||
9071 	    (msg->num_descriptors != dp->num_descriptors) ||
9072 	    (msg->ncookies != dp->ncookies) ||
9073 	    !(vsw_mem_cookie_match(&msg->cookie[0], &dp->cookie[0]))) {
9074 		return (0);
9075 	} else {
9076 		return (1);
9077 	}
9078 
9079 }
9080 
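/*
 * Note: ebuf must have room for at least 18 bytes
 * ("xx:xx:xx:xx:xx:xx" plus the terminating NUL), since each octet
 * may expand to two hex digits.
 */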
9081 static caddr_t
9082 vsw_print_ethaddr(uint8_t *a, char *ebuf)
9083 {
9084 	(void) sprintf(ebuf, "%x:%x:%x:%x:%x:%x",
9085 	    a[0], a[1], a[2], a[3], a[4], a[5]);
9086 	return (ebuf);
9087 }
9088 
9089 /*
9090  * Free all the resources associated with the specified lane
9091  * (INBOUND or OUTBOUND) of the channel.
9092  */
9093 static void
9094 vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir)
9095 {
9096 	dring_info_t		*dp, *dpp;
9097 	lane_t			*lp = NULL;
9098 	int			rv = 0;
9099 
9100 	ASSERT(ldcp != NULL);
9101 
9102 	D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id);
9103 
9104 	if (dir == INBOUND) {
9105 		D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane"
9106 		    " of channel %lld", __func__, ldcp->ldc_id);
9107 		lp = &ldcp->lane_in;
9108 	} else {
9109 		D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane"
9110 		    " of channel %lld", __func__, ldcp->ldc_id);
9111 		lp = &ldcp->lane_out;
9112 	}
9113 
9114 	lp->lstate = VSW_LANE_INACTIV;
9115 	mutex_enter(&lp->seq_lock);
9116 	lp->seq_num = VNET_ISS;
9117 	mutex_exit(&lp->seq_lock);
9118 	if (lp->dringp) {
9119 		if (dir == INBOUND) {
9120 			WRITE_ENTER(&lp->dlistrw);
9121 			dp = lp->dringp;
9122 			while (dp != NULL) {
9123 				dpp = dp->next;
9124 				if (dp->handle != NULL)
9125 					(void) ldc_mem_dring_unmap(dp->handle);
9126 				kmem_free(dp, sizeof (dring_info_t));
9127 				dp = dpp;
9128 			}
9129 			RW_EXIT(&lp->dlistrw);
9130 		} else {
9131 			/*
9132 			 * unbind, destroy exported dring, free dring struct
9133 			 */
9134 			WRITE_ENTER(&lp->dlistrw);
9135 			dp = lp->dringp;
9136 			rv = vsw_free_ring(dp);
9137 			RW_EXIT(&lp->dlistrw);
9138 		}
9139 		if (rv == 0) {
9140 			lp->dringp = NULL;
9141 		}
9142 	}
9143 
9144 	D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id);
9145 }
9146 
9147 /*
9148  * Free ring and all associated resources.
9149  *
9150  * Should be called with dlistrw rwlock held as writer.
9151  */
9152 static int
9153 vsw_free_ring(dring_info_t *dp)
9154 {
9155 	vsw_private_desc_t	*paddr = NULL;
9156 	dring_info_t		*dpp;
9157 	int			i, rv = 1;
9158 
9159 	while (dp != NULL) {
9160 		mutex_enter(&dp->dlock);
9161 		dpp = dp->next;
9162 		if (dp->priv_addr != NULL) {
9163 			/*
9164 			 * First unbind and free the memory handles
9165 			 * stored in each descriptor within the ring.
9166 			 */
9167 			for (i = 0; i < VSW_RING_NUM_EL; i++) {
9168 				paddr = (vsw_private_desc_t *)
9169 				    dp->priv_addr + i;
9170 				if (paddr->memhandle != NULL) {
9171 					if (paddr->bound == 1) {
9172 						rv = ldc_mem_unbind_handle(
9173 						    paddr->memhandle);
9174 
9175 						if (rv != 0) {
9176 							DERR(NULL, "error "
9177 							    "unbinding handle for "
9178 							    "ring 0x%llx at pos %d",
9179 							    dp, i);
9180 							mutex_exit(&dp->dlock);
9181 							return (rv);
9182 						}
9183 						paddr->bound = 0;
9184 					}
9185 
9186 					rv = ldc_mem_free_handle(
9187 					    paddr->memhandle);
9188 					if (rv != 0) {
9189 						DERR(NULL, "error freeing "
9190 						    "handle for ring 0x%llx "
9191 						    "at pos %d", dp, i);
9192 						mutex_exit(&dp->dlock);
9193 						return (rv);
9194 					}
9195 					paddr->memhandle = NULL;
9196 				}
9197 				mutex_destroy(&paddr->dstate_lock);
9198 			}
9199 			kmem_free(dp->priv_addr,
9200 			    (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL));
9201 		}
9202 
9203 		/*
9204 		 * Now unbind and destroy the ring itself.
9205 		 */
9206 		if (dp->handle != NULL) {
9207 			(void) ldc_mem_dring_unbind(dp->handle);
9208 			(void) ldc_mem_dring_destroy(dp->handle);
9209 		}
9210 
9211 		if (dp->data_addr != NULL) {
9212 			kmem_free(dp->data_addr, dp->data_sz);
9213 		}
9214 
9215 		mutex_exit(&dp->dlock);
9216 		mutex_destroy(&dp->dlock);
9217 		mutex_destroy(&dp->restart_lock);
9218 		kmem_free(dp, sizeof (dring_info_t));
9219 
9220 		dp = dpp;
9221 	}
9222 	return (0);
9223 }
9224 
9225 /*
9226  * Debugging routines
9227  */
9228 static void
9229 display_state(void)
9230 {
9231 	vsw_t		*vswp;
9232 	vsw_port_list_t	*plist;
9233 	vsw_port_t 	*port;
9234 	vsw_ldc_list_t	*ldcl;
9235 	vsw_ldc_t 	*ldcp;
9236 
9237 	cmn_err(CE_NOTE, "***** system state *****");
9238 
9239 	for (vswp = vsw_head; vswp; vswp = vswp->next) {
9240 		plist = &vswp->plist;
9241 		READ_ENTER(&plist->lockrw);
9242 		cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n",
9243 		    vswp->instance, plist->num_ports);
9244 
9245 		for (port = plist->head; port != NULL; port = port->p_next) {
9246 			ldcl = &port->p_ldclist;
9247 			cmn_err(CE_CONT, "port %d : %d ldcs attached\n",
9248 			    port->p_instance, ldcl->num_ldcs);
9249 			READ_ENTER(&ldcl->lockrw);
9250 			ldcp = ldcl->head;
9251 			for (; ldcp != NULL; ldcp = ldcp->ldc_next) {
9252 				cmn_err(CE_CONT, "chan %lu : dev %d : "
9253 				    "status %d : phase %u\n",
9254 				    ldcp->ldc_id, ldcp->dev_class,
9255 				    ldcp->ldc_status, ldcp->hphase);
9256 				cmn_err(CE_CONT, "chan %lu : lsession %lu : "
9257 				    "psession %lu\n", ldcp->ldc_id,
9258 				    ldcp->local_session, ldcp->peer_session);
9259 
9260 				cmn_err(CE_CONT, "Inbound lane:\n");
9261 				display_lane(&ldcp->lane_in);
9262 				cmn_err(CE_CONT, "Outbound lane:\n");
9263 				display_lane(&ldcp->lane_out);
9264 			}
9265 			RW_EXIT(&ldcl->lockrw);
9266 		}
9267 		RW_EXIT(&plist->lockrw);
9268 	}
9269 	cmn_err(CE_NOTE, "***** system state *****");
9270 }
9271 
9272 static void
9273 display_lane(lane_t *lp)
9274 {
9275 	dring_info_t	*drp;
9276 
9277 	cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n",
9278 	    lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu);
9279 	cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n",
9280 	    lp->addr_type, lp->addr, lp->xfer_mode);
9281 	cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp);
9282 
9283 	cmn_err(CE_CONT, "Dring info:\n");
9284 	for (drp = lp->dringp; drp != NULL; drp = drp->next) {
9285 		cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n",
9286 		    drp->num_descriptors, drp->descriptor_size);
9287 		cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->handle);
9288 		cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n",
9289 		    (uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr);
9290 		cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n",
9291 		    drp->ident, drp->end_idx);
9292 		display_ring(drp);
9293 	}
9294 }
9295 
9296 static void
9297 display_ring(dring_info_t *dringp)
9298 {
9299 	uint64_t		i;
9300 	uint64_t		priv_count = 0;
9301 	uint64_t		pub_count = 0;
9302 	vnet_public_desc_t	*pub_addr = NULL;
9303 	vsw_private_desc_t	*priv_addr = NULL;
9304 
9305 	for (i = 0; i < VSW_RING_NUM_EL; i++) {
9306 		if (dringp->pub_addr != NULL) {
9307 			pub_addr = (vnet_public_desc_t *)dringp->pub_addr + i;
9308 
9309 			if (pub_addr->hdr.dstate == VIO_DESC_FREE)
9310 				pub_count++;
9311 		}
9312 
9313 		if (dringp->priv_addr != NULL) {
9314 			priv_addr = (vsw_private_desc_t *)dringp->priv_addr + i;
9315 
9316 			if (priv_addr->dstate == VIO_DESC_FREE)
9317 				priv_count++;
9318 		}
9319 	}
9320 	cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n",
9321 	    i, priv_count, pub_count);
9322 }
9323 
9324 static void
9325 dump_flags(uint64_t state)
9326 {
9327 	int	i;
9328 
9329 	typedef struct flag_name {
9330 		int	flag_val;
9331 		char	*flag_name;
9332 	} flag_name_t;
9333 
9334 	flag_name_t	flags[] = {
9335 		VSW_VER_INFO_SENT, "VSW_VER_INFO_SENT",
9336 		VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV",
9337 		VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV",
9338 		VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT",
9339 		VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV",
9340 		VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT",
9341 		VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT",
9342 		VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV",
9343 		VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT",
9344 		VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV",
9345 		VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT",
9346 		VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV",
9347 		VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT",
9348 		VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV",
9349 		VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT",
9350 		VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV",
9351 		VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT",
9352 		VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV",
9353 		VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT",
9354 		VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV",
9355 		VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT",
9356 		VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV",
9357 		VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT",
9358 		VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV",
9359 		VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT",
9360 		VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV",
9361 		VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT",
9362 		VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV",
9363 		VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT",
9364 		VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV",
9365 		VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE"};
9366 
9367 	DERR(NULL, "DUMP_FLAGS: %llx\n", state);
9368 	for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) {
9369 		if (state & flags[i].flag_val)
9370 			DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name);
9371 	}
9372 }
9373