xref: /titanic_50/usr/src/uts/sun4v/io/vsw.c (revision d577a05052763490983ba19d548a252fed6f75d9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/errno.h>
31 #include <sys/debug.h>
32 #include <sys/time.h>
33 #include <sys/sysmacros.h>
34 #include <sys/systm.h>
35 #include <sys/user.h>
36 #include <sys/stropts.h>
37 #include <sys/stream.h>
38 #include <sys/strlog.h>
39 #include <sys/strsubr.h>
40 #include <sys/cmn_err.h>
41 #include <sys/cpu.h>
42 #include <sys/kmem.h>
43 #include <sys/conf.h>
44 #include <sys/ddi.h>
45 #include <sys/sunddi.h>
46 #include <sys/ksynch.h>
47 #include <sys/stat.h>
48 #include <sys/kstat.h>
49 #include <sys/vtrace.h>
50 #include <sys/strsun.h>
51 #include <sys/dlpi.h>
52 #include <sys/ethernet.h>
53 #include <net/if.h>
54 #include <sys/varargs.h>
55 #include <sys/machsystm.h>
56 #include <sys/modctl.h>
57 #include <sys/modhash.h>
58 #include <sys/mac.h>
59 #include <sys/mac_ether.h>
60 #include <sys/taskq.h>
61 #include <sys/note.h>
62 #include <sys/mach_descrip.h>
63 #include <sys/mac.h>
64 #include <sys/mdeg.h>
65 #include <sys/ldc.h>
66 #include <sys/vsw_fdb.h>
67 #include <sys/vsw.h>
68 #include <sys/vio_mailbox.h>
69 #include <sys/vnet_mailbox.h>
70 #include <sys/vnet_common.h>
71 #include <sys/vio_util.h>
72 #include <sys/sdt.h>
73 
74 /*
75  * Function prototypes.
76  */
77 static	int vsw_attach(dev_info_t *, ddi_attach_cmd_t);
78 static	int vsw_detach(dev_info_t *, ddi_detach_cmd_t);
79 static	int vsw_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
80 static	int vsw_get_md_physname(vsw_t *, md_t *, mde_cookie_t, char *);
81 static	int vsw_get_md_smodes(vsw_t *, md_t *, mde_cookie_t, uint8_t *, int *);
82 static	int vsw_get_physaddr(vsw_t *);
83 static	int vsw_setup_switching(vsw_t *);
84 static	int vsw_setup_layer2(vsw_t *);
85 static	int vsw_setup_layer3(vsw_t *);
86 
87 /* MAC Ring table functions. */
88 static void vsw_mac_ring_tbl_init(vsw_t *vswp);
89 static void vsw_mac_ring_tbl_destroy(vsw_t *vswp);
90 static void vsw_queue_worker(vsw_mac_ring_t *rrp);
91 static void vsw_queue_stop(vsw_queue_t *vqp);
92 static vsw_queue_t *vsw_queue_create();
93 static void vsw_queue_destroy(vsw_queue_t *vqp);
94 
95 /* MAC layer routines */
96 static mac_resource_handle_t vsw_mac_ring_add_cb(void *arg,
97 		mac_resource_t *mrp);
98 static	int vsw_get_hw_maddr(vsw_t *);
99 static	int vsw_set_hw(vsw_t *, vsw_port_t *, int);
100 static	int vsw_set_hw_addr(vsw_t *, mac_multi_addr_t *);
101 static	int vsw_set_hw_promisc(vsw_t *, vsw_port_t *, int);
102 static	int vsw_unset_hw(vsw_t *, vsw_port_t *, int);
103 static	int vsw_unset_hw_addr(vsw_t *, int);
104 static	int vsw_unset_hw_promisc(vsw_t *, vsw_port_t *, int);
105 static void vsw_reconfig_hw(vsw_t *);
106 static int vsw_prog_if(vsw_t *);
107 static int vsw_prog_ports(vsw_t *);
108 static int vsw_mac_attach(vsw_t *vswp);
109 static void vsw_mac_detach(vsw_t *vswp);
110 
111 static void vsw_rx_queue_cb(void *, mac_resource_handle_t, mblk_t *);
112 static void vsw_rx_cb(void *, mac_resource_handle_t, mblk_t *);
113 static mblk_t *vsw_tx_msg(vsw_t *, mblk_t *);
114 static int vsw_mac_register(vsw_t *);
115 static int vsw_mac_unregister(vsw_t *);
116 static int vsw_m_stat(void *, uint_t, uint64_t *);
117 static void vsw_m_stop(void *arg);
118 static int vsw_m_start(void *arg);
119 static int vsw_m_unicst(void *arg, const uint8_t *);
120 static int vsw_m_multicst(void *arg, boolean_t, const uint8_t *);
121 static int vsw_m_promisc(void *arg, boolean_t);
122 static mblk_t *vsw_m_tx(void *arg, mblk_t *);
123 
124 /* MDEG routines */
125 static	int vsw_mdeg_register(vsw_t *vswp);
126 static	void vsw_mdeg_unregister(vsw_t *vswp);
127 static	int vsw_mdeg_cb(void *cb_argp, mdeg_result_t *);
128 static	int vsw_port_mdeg_cb(void *cb_argp, mdeg_result_t *);
129 static	void vsw_get_initial_md_properties(vsw_t *vswp, md_t *, mde_cookie_t);
130 static	void vsw_update_md_prop(vsw_t *, md_t *, mde_cookie_t);
131 
132 /* Port add/deletion routines */
133 static	int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
134 static	int vsw_port_attach(vsw_t *vswp, int p_instance,
135 	uint64_t *ldcids, int nids, struct ether_addr *macaddr);
136 static	int vsw_detach_ports(vsw_t *vswp);
137 static	int vsw_port_detach(vsw_t *vswp, int p_instance);
138 static	int vsw_port_delete(vsw_port_t *port);
139 static	int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id);
140 static	int vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id);
141 static	int vsw_init_ldcs(vsw_port_t *port);
142 static	int vsw_uninit_ldcs(vsw_port_t *port);
143 static	int vsw_ldc_init(vsw_ldc_t *ldcp);
144 static	int vsw_ldc_uninit(vsw_ldc_t *ldcp);
145 static	int vsw_drain_ldcs(vsw_port_t *port);
146 static	int vsw_drain_port_taskq(vsw_port_t *port);
147 static	void vsw_marker_task(void *);
148 static	vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);
149 static	int vsw_plist_del_node(vsw_t *, vsw_port_t *port);
150 
151 /* Interrupt routines */
152 static	uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg);
153 
154 /* Handshake routines */
155 static	void vsw_ldc_reinit(vsw_ldc_t *);
156 static	void vsw_process_conn_evt(vsw_ldc_t *, uint16_t);
157 static	void vsw_conn_task(void *);
158 static	int vsw_check_flag(vsw_ldc_t *, int, uint64_t);
159 static	void vsw_next_milestone(vsw_ldc_t *);
160 static	int vsw_supported_version(vio_ver_msg_t *);
161 
162 /* Data processing routines */
163 static void vsw_process_pkt(void *);
164 static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t);
165 static void vsw_process_ctrl_pkt(void *);
166 static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *);
167 static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *);
168 static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *);
169 static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *);
170 static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *);
171 static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *);
172 static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t);
173 static void vsw_process_data_dring_pkt(vsw_ldc_t *, void *);
174 static void vsw_process_data_raw_pkt(vsw_ldc_t *, void *);
175 static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *);
176 static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t);
177 
178 /* Switching/data transmit routines */
179 static	void vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
180 	    vsw_port_t *port, mac_resource_handle_t);
181 static	void vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
182 	    vsw_port_t *port, mac_resource_handle_t);
183 static	int vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller,
184 	    vsw_port_t *port);
185 static	int vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller,
186 	    vsw_port_t *port);
187 static	int vsw_portsend(vsw_port_t *, mblk_t *);
188 static	int vsw_dringsend(vsw_ldc_t *, mblk_t *);
189 static	int vsw_descrsend(vsw_ldc_t *, mblk_t *);
190 
191 /* Packet creation routines */
192 static void vsw_send_ver(void *);
193 static void vsw_send_attr(vsw_ldc_t *);
194 static vio_dring_reg_msg_t *vsw_create_dring_info_pkt(vsw_ldc_t *);
195 static void vsw_send_dring_info(vsw_ldc_t *);
196 static void vsw_send_rdx(vsw_ldc_t *);
197 
198 static int vsw_send_msg(vsw_ldc_t *, void *, int, boolean_t);
199 
200 /* Forwarding database (FDB) routines */
201 static	int vsw_add_fdb(vsw_t *vswp, vsw_port_t *port);
202 static	int vsw_del_fdb(vsw_t *vswp, vsw_port_t *port);
203 static	vsw_port_t *vsw_lookup_fdb(vsw_t *vswp, struct ether_header *);
204 static	int vsw_add_rem_mcst(vnet_mcast_msg_t *, vsw_port_t *);
205 static	int vsw_add_mcst(vsw_t *, uint8_t, uint64_t, void *);
206 static	int vsw_del_mcst(vsw_t *, uint8_t, uint64_t, void *);
207 static	void vsw_del_addr(uint8_t, void *, uint64_t);
208 static	void vsw_del_mcst_port(vsw_port_t *);
209 static	void vsw_del_mcst_vsw(vsw_t *);
210 
211 /* Dring routines */
212 static dring_info_t *vsw_create_dring(vsw_ldc_t *);
213 static void vsw_create_privring(vsw_ldc_t *);
214 static int vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp);
215 static int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **,
216     int *);
217 static dring_info_t *vsw_ident2dring(lane_t *, uint64_t);
218 
219 static void vsw_set_lane_attr(vsw_t *, lane_t *);
220 static int vsw_check_attr(vnet_attr_msg_t *, vsw_port_t *);
221 static int vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg);
222 static int vsw_mem_cookie_match(ldc_mem_cookie_t *, ldc_mem_cookie_t *);
223 static int vsw_check_dring_info(vio_dring_reg_msg_t *);
224 
225 /* Misc support routines */
226 static	caddr_t vsw_print_ethaddr(uint8_t *addr, char *ebuf);
227 static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t);
228 static int vsw_free_ring(dring_info_t *);
229 
230 /* Debugging routines */
231 static void dump_flags(uint64_t);
232 static void display_state(void);
233 static void display_lane(lane_t *);
234 static void display_ring(dring_info_t *);
235 
/*
 * Global (non-static) tunables with their compiled-in defaults.
 * NOTE(review): presumably intended to be patched via /etc/system or a
 * debugger; the consumers of these values are outside this part of the
 * file - confirm before changing defaults.
 */
int	vsw_num_handshakes = VNET_NUM_HANDSHAKES; /* # of handshake attempts */
int	vsw_wretries = 100;		/* # of write attempts */
int	vsw_chain_len = 150;		/* max # of mblks in msg chain */
int	vsw_desc_delay = 0;		/* delay in us */
int	vsw_read_attempts = 5;		/* # of reads of descriptor */

/* Default mblk size and mblk count, taken from VSW_MBLK_SIZE/VSW_NUM_MBLKS. */
uint32_t	vsw_mblk_size = VSW_MBLK_SIZE;
uint32_t	vsw_num_mblks = VSW_NUM_MBLKS;
244 
/*
 * Table of the vsw_m_*() MAC entry points declared above. The initializer
 * is positional, so the order of the entries must match the member order
 * of mac_callbacks_t.
 */
static	mac_callbacks_t	vsw_m_callbacks = {
	0,			/* presumably the optional-callback flags */
	vsw_m_stat,		/* statistics */
	vsw_m_start,		/* start */
	vsw_m_stop,		/* stop */
	vsw_m_promisc,		/* promiscuous mode on/off */
	vsw_m_multicst,		/* multicast address add/remove */
	vsw_m_unicst,		/* unicast address */
	vsw_m_tx,		/* transmit */
	NULL,			/* remaining entries are not provided */
	NULL,
	NULL
};
258 
/*
 * Character/block entry points. Open and close are wired to nulldev,
 * every data-path entry point is wired to nodev, and property
 * operations are delegated to ddi_prop_op.
 */
static	struct	cb_ops	vsw_cb_ops = {
	nulldev,			/* cb_open */
	nulldev,			/* cb_close */
	nodev,				/* cb_strategy */
	nodev,				/* cb_print */
	nodev,				/* cb_dump */
	nodev,				/* cb_read */
	nodev,				/* cb_write */
	nodev,				/* cb_ioctl */
	nodev,				/* cb_devmap */
	nodev,				/* cb_mmap */
	nodev,				/* cb_segmap */
	nochpoll,			/* cb_chpoll */
	ddi_prop_op,			/* cb_prop_op */
	NULL,				/* cb_stream */
	D_MP,				/* cb_flag */
	CB_REV,				/* rev */
	nodev,				/* int (*cb_aread)() */
	nodev				/* int (*cb_awrite)() */
};
279 
/*
 * Device operations for the driver: supplies the getinfo/attach/detach
 * entry points defined in this file and points at vsw_cb_ops. This
 * structure is also handed to mac_init_ops()/mac_fini_ops() in
 * _init()/_fini().
 */
static	struct	dev_ops	vsw_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* devo_refcnt */
	vsw_getinfo,		/* devo_getinfo */
	nulldev,		/* devo_identify */
	nulldev,		/* devo_probe */
	vsw_attach,		/* devo_attach */
	vsw_detach,		/* devo_detach */
	nodev,			/* devo_reset */
	&vsw_cb_ops,		/* devo_cb_ops */
	(struct bus_ops *)NULL,	/* devo_bus_ops */
	ddi_power		/* devo_power */
};
293 
extern	struct	mod_ops	mod_driverops;

/*
 * Module linkage element describing this module as a device driver:
 * the driver mod_ops, a descriptive name string and the dev_ops above.
 */
static struct modldrv vswmodldrv = {
	&mod_driverops,
	"sun4v Virtual Switch",
	&vsw_ops,
};
300 
/*
 * Acquire/release both per-channel locks. LDC_ENTER_LOCK takes
 * ldc_cblock first and then ldc_txlock; LDC_EXIT_LOCK drops them in the
 * reverse order.
 *
 * Each macro expands to two complete statements (including the trailing
 * semicolons), so it must not be used as the unbraced body of an
 * if/else/loop: only the first mutex call would be governed by the
 * condition.
 */
#define	LDC_ENTER_LOCK(ldcp)	\
				mutex_enter(&((ldcp)->ldc_cblock));\
				mutex_enter(&((ldcp)->ldc_txlock));
#define	LDC_EXIT_LOCK(ldcp)	\
				mutex_exit(&((ldcp)->ldc_txlock));\
				mutex_exit(&((ldcp)->ldc_cblock));
307 
308 /* Driver soft state ptr  */
309 static void	*vsw_state;
310 
311 /*
312  * Linked list of "vsw_t" structures - one per instance.
313  */
314 vsw_t		*vsw_head = NULL;
315 krwlock_t	vsw_rw;
316 
317 /*
318  * Property names
319  */
320 static char vdev_propname[] = "virtual-device";
321 static char vsw_propname[] = "virtual-network-switch";
322 static char physdev_propname[] = "vsw-phys-dev";
323 static char smode_propname[] = "vsw-switch-mode";
324 static char macaddr_propname[] = "local-mac-address";
325 static char remaddr_propname[] = "remote-mac-address";
326 static char ldcids_propname[] = "ldc-ids";
327 static char chan_propname[] = "channel-endpoint";
328 static char id_propname[] = "id";
329 static char reg_propname[] = "reg";
330 
331 /* supported versions */
332 static	ver_sup_t	vsw_versions[] = { {1, 0} };
333 
334 /*
335  * Matching criteria passed to the MDEG to register interest
336  * in changes to 'virtual-device-port' nodes identified by their
337  * 'id' property.
338  */
339 static md_prop_match_t vport_prop_match[] = {
340 	{ MDET_PROP_VAL,    "id"   },
341 	{ MDET_LIST_END,    NULL    }
342 };
343 
344 static mdeg_node_match_t vport_match = { "virtual-device-port",
345 						vport_prop_match };
346 
347 /*
348  * Matching criteria passed to the MDEG to register interest
349  * in changes to 'virtual-device' nodes (i.e. vsw nodes) identified
350  * by their 'name' and 'cfg-handle' properties.
351  */
352 static md_prop_match_t vdev_prop_match[] = {
353 	{ MDET_PROP_STR,    "name"   },
354 	{ MDET_PROP_VAL,    "cfg-handle" },
355 	{ MDET_LIST_END,    NULL    }
356 };
357 
358 static mdeg_node_match_t vdev_match = { "virtual-device",
359 						vdev_prop_match };
360 
361 
362 /*
363  * Specification of an MD node passed to the MDEG to filter any
364  * 'vport' nodes that do not belong to the specified node. This
365  * template is copied for each vsw instance and filled in with
366  * the appropriate 'cfg-handle' value before being passed to the MDEG.
367  */
static mdeg_prop_spec_t vsw_prop_template[] = {
	{ MDET_PROP_STR,    "name",		vsw_propname },
	{ MDET_PROP_VAL,    "cfg-handle",	NULL	},
	{ MDET_LIST_END,    NULL,		NULL	}
};

/*
 * Store 'val' in the ps_val member of entry [1] of a property spec array,
 * i.e. the "cfg-handle" slot of a copy of vsw_prop_template.
 *
 * The expansion already ends in a semicolon, so an invocation followed
 * by ';' yields an extra empty statement; avoid using it as the unbraced
 * body of an if that has an else.
 */
#define	VSW_SET_MDEG_PROP_INST(specp, val)	(specp)[1].ps_val = (val);
375 
376 /*
377  * From /etc/system enable/disable thread per ring. This is a mode
 * selection that is done at vsw driver attach time.
379  */
380 boolean_t vsw_multi_ring_enable = B_FALSE;
381 int vsw_mac_rx_rings = VSW_MAC_RX_RINGS;
382 
383 /*
384  * Print debug messages - set to 0x1f to enable all msgs
385  * or 0x0 to turn all off.
386  */
387 int vswdbg = 0x0;
388 
389 /*
390  * debug levels:
391  * 0x01:	Function entry/exit tracing
392  * 0x02:	Internal function messages
393  * 0x04:	Verbose internal messages
394  * 0x08:	Warning messages
395  * 0x10:	Error messages
396  */
397 
398 static void
399 vswdebug(vsw_t *vswp, const char *fmt, ...)
400 {
401 	char buf[512];
402 	va_list ap;
403 
404 	va_start(ap, fmt);
405 	(void) vsprintf(buf, fmt, ap);
406 	va_end(ap);
407 
408 	if (vswp == NULL)
409 		cmn_err(CE_CONT, "%s\n", buf);
410 	else
411 		cmn_err(CE_CONT, "vsw%d: %s\n", vswp->instance, buf);
412 }
413 
414 /*
415  * For the moment the state dump routines have their own
416  * private flag.
417  */
/*
 * Compile-time switch for the state dump helpers below. With DUMP_STATE
 * set to 0 all of DUMP_TAG, DUMP_TAG_PTR, DUMP_FLAGS and DISPLAY_STATE
 * expand to nothing.
 */
#define	DUMP_STATE	0

#if DUMP_STATE

/* Print the msgtype/subtype/subtype_env fields of a tag passed by value. */
#define	DUMP_TAG(tag) \
{			\
	D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype); \
	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype);	\
	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env);	\
}

/* As DUMP_TAG, but for a pointer to a tag. */
#define	DUMP_TAG_PTR(tag) \
{			\
	D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype); \
	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype);	\
	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env);	\
}

/* Note: DUMP_FLAGS carries its own trailing semicolon. */
#define	DUMP_FLAGS(flags) dump_flags(flags);
#define	DISPLAY_STATE()	display_state()

#else

#define	DUMP_TAG(tag)
#define	DUMP_TAG_PTR(tag)
#define	DUMP_FLAGS(state)
#define	DISPLAY_STATE()

#endif	/* DUMP_STATE */
447 
/*
 * Debug print macros. Each one expands to a bare "if (cond) vswdebug",
 * so "D1(vswp, fmt, ...);" becomes "if (cond) vswdebug(vswp, fmt, ...);".
 * In DEBUG builds the condition tests the matching bit of vswdbg; in
 * non-DEBUG builds the condition is the constant 0.
 *
 * Because the expansion is an unterminated 'if', never use these macros
 * as the unbraced body of an if that is followed by an else: the else
 * would bind to the macro's hidden 'if'. Always brace such bodies.
 */
#ifdef DEBUG

#define	D1		\
if (vswdbg & 0x01)	\
	vswdebug

#define	D2		\
if (vswdbg & 0x02)	\
	vswdebug

#define	D3		\
if (vswdbg & 0x04)	\
	vswdebug

#define	DWARN		\
if (vswdbg & 0x08)	\
	vswdebug

#define	DERR		\
if (vswdbg & 0x10)	\
	vswdebug

#else

#define	DERR		if (0)	vswdebug
#define	DWARN		if (0)	vswdebug
#define	D1		if (0)	vswdebug
#define	D2		if (0)	vswdebug
#define	D3		if (0)	vswdebug

#endif	/* DEBUG */
479 
/*
 * Module linkage passed to mod_install(), mod_remove() and mod_info();
 * it lists the single driver linkage element, vswmodldrv.
 */
static struct modlinkage modlinkage = {
	MODREV_1,
	&vswmodldrv,
	NULL
};
485 
486 int
487 _init(void)
488 {
489 	int status;
490 
491 	rw_init(&vsw_rw, NULL, RW_DRIVER, NULL);
492 
493 	status = ddi_soft_state_init(&vsw_state, sizeof (vsw_t), 1);
494 	if (status != 0) {
495 		return (status);
496 	}
497 
498 	mac_init_ops(&vsw_ops, "vsw");
499 	status = mod_install(&modlinkage);
500 	if (status != 0) {
501 		ddi_soft_state_fini(&vsw_state);
502 	}
503 	return (status);
504 }
505 
506 int
507 _fini(void)
508 {
509 	int status;
510 
511 	status = mod_remove(&modlinkage);
512 	if (status != 0)
513 		return (status);
514 	mac_fini_ops(&vsw_ops);
515 	ddi_soft_state_fini(&vsw_state);
516 
517 	rw_destroy(&vsw_rw);
518 
519 	return (status);
520 }
521 
522 int
523 _info(struct modinfo *modinfop)
524 {
525 	return (mod_info(&modlinkage, modinfop));
526 }
527 
528 static int
529 vsw_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
530 {
531 	vsw_t		*vswp;
532 	int		instance;
533 	char		hashname[MAXNAMELEN];
534 	char		qname[TASKQ_NAMELEN];
535 	enum		{ PROG_init = 0x00,
536 				PROG_if_lock = 0x01,
537 				PROG_fdb = 0x02,
538 				PROG_mfdb = 0x04,
539 				PROG_report_dev = 0x08,
540 				PROG_plist = 0x10,
541 				PROG_taskq = 0x20}
542 			progress;
543 
544 	progress = PROG_init;
545 
546 	switch (cmd) {
547 	case DDI_ATTACH:
548 		break;
549 	case DDI_RESUME:
550 		/* nothing to do for this non-device */
551 		return (DDI_SUCCESS);
552 	case DDI_PM_RESUME:
553 	default:
554 		return (DDI_FAILURE);
555 	}
556 
557 	instance = ddi_get_instance(dip);
558 	if (ddi_soft_state_zalloc(vsw_state, instance) != DDI_SUCCESS) {
559 		DERR(NULL, "vsw%d: ddi_soft_state_zalloc failed", instance);
560 		return (DDI_FAILURE);
561 	}
562 	vswp = ddi_get_soft_state(vsw_state, instance);
563 
564 	if (vswp == NULL) {
565 		DERR(NULL, "vsw%d: ddi_get_soft_state failed", instance);
566 		goto vsw_attach_fail;
567 	}
568 
569 	vswp->dip = dip;
570 	vswp->instance = instance;
571 	ddi_set_driver_private(dip, (caddr_t)vswp);
572 
573 	mutex_init(&vswp->hw_lock, NULL, MUTEX_DRIVER, NULL);
574 	mutex_init(&vswp->mac_lock, NULL, MUTEX_DRIVER, NULL);
575 	rw_init(&vswp->if_lockrw, NULL, RW_DRIVER, NULL);
576 	progress |= PROG_if_lock;
577 
578 	/* setup the unicast forwarding database  */
579 	(void) snprintf(hashname, MAXNAMELEN, "vsw_unicst_table-%d",
580 	    vswp->instance);
581 	D2(vswp, "creating unicast hash table (%s)...", hashname);
582 	vswp->fdb = mod_hash_create_ptrhash(hashname, VSW_NCHAINS,
583 	    mod_hash_null_valdtor, sizeof (void *));
584 
585 	progress |= PROG_fdb;
586 
587 	/* setup the multicast fowarding database */
588 	(void) snprintf(hashname, MAXNAMELEN, "vsw_mcst_table-%d",
589 	    vswp->instance);
590 	D2(vswp, "creating multicast hash table %s)...", hashname);
591 	rw_init(&vswp->mfdbrw, NULL, RW_DRIVER, NULL);
592 	vswp->mfdb = mod_hash_create_ptrhash(hashname, VSW_NCHAINS,
593 	    mod_hash_null_valdtor, sizeof (void *));
594 
595 	progress |= PROG_mfdb;
596 
597 	/*
598 	 * create lock protecting list of multicast addresses
599 	 * which could come via m_multicst() entry point when plumbed.
600 	 */
601 	mutex_init(&vswp->mca_lock, NULL, MUTEX_DRIVER, NULL);
602 	vswp->mcap = NULL;
603 
604 	ddi_report_dev(vswp->dip);
605 
606 	progress |= PROG_report_dev;
607 
608 	WRITE_ENTER(&vsw_rw);
609 	vswp->next = vsw_head;
610 	vsw_head = vswp;
611 	RW_EXIT(&vsw_rw);
612 
613 	/* setup the port list */
614 	rw_init(&vswp->plist.lockrw, NULL, RW_DRIVER, NULL);
615 	vswp->plist.head = NULL;
616 
617 	progress |= PROG_plist;
618 
619 	/*
620 	 * Create the taskq which will process all the VIO
621 	 * control messages.
622 	 */
623 	(void) snprintf(qname, TASKQ_NAMELEN, "vsw_taskq%d", vswp->instance);
624 	if ((vswp->taskq_p = ddi_taskq_create(vswp->dip, qname, 1,
625 	    TASKQ_DEFAULTPRI, 0)) == NULL) {
626 		cmn_err(CE_WARN, "!vsw%d: Unable to create task queue",
627 		    vswp->instance);
628 		goto vsw_attach_fail;
629 	}
630 
631 	progress |= PROG_taskq;
632 
633 	/* prevent auto-detaching */
634 	if (ddi_prop_update_int(DDI_DEV_T_NONE, vswp->dip,
635 	    DDI_NO_AUTODETACH, 1) != DDI_SUCCESS) {
636 		cmn_err(CE_NOTE, "!Unable to set \"%s\" property for "
637 		    "instance %u", DDI_NO_AUTODETACH, instance);
638 	}
639 
640 	/*
641 	 * Now we have everything setup, register an interest in
642 	 * specific MD nodes.
643 	 *
644 	 * The callback is invoked in 2 cases, firstly if upon mdeg
645 	 * registration there are existing nodes which match our specified
646 	 * criteria, and secondly if the MD is changed (and again, there
647 	 * are nodes which we are interested in present within it. Note
648 	 * that our callback will be invoked even if our specified nodes
649 	 * have not actually changed).
650 	 *
651 	 * Until the callback is invoked we cannot switch any pkts as
652 	 * we don't know basic information such as what mode we are
653 	 * operating in. However we expect the callback to be invoked
654 	 * immediately upon registration as this driver should only
655 	 * be attaching if there are vsw nodes in the MD.
656 	 */
657 	if (vsw_mdeg_register(vswp))
658 		goto vsw_attach_fail;
659 
660 	return (DDI_SUCCESS);
661 
662 vsw_attach_fail:
663 	DERR(NULL, "vsw_attach: failed");
664 
665 	if (progress & PROG_taskq)
666 		ddi_taskq_destroy(vswp->taskq_p);
667 
668 	if (progress & PROG_plist)
669 		rw_destroy(&vswp->plist.lockrw);
670 
671 	if (progress & PROG_report_dev) {
672 		ddi_remove_minor_node(dip, NULL);
673 		mutex_destroy(&vswp->mca_lock);
674 	}
675 
676 	if (progress & PROG_mfdb) {
677 		mod_hash_destroy_hash(vswp->mfdb);
678 		vswp->mfdb = NULL;
679 		rw_destroy(&vswp->mfdbrw);
680 	}
681 
682 	if (progress & PROG_fdb) {
683 		mod_hash_destroy_hash(vswp->fdb);
684 		vswp->fdb = NULL;
685 	}
686 
687 	if (progress & PROG_if_lock) {
688 		rw_destroy(&vswp->if_lockrw);
689 		mutex_destroy(&vswp->mac_lock);
690 		mutex_destroy(&vswp->hw_lock);
691 	}
692 
693 	ddi_soft_state_free(vsw_state, instance);
694 	return (DDI_FAILURE);
695 }
696 
697 static int
698 vsw_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
699 {
700 	vio_mblk_pool_t		*poolp, *npoolp;
701 	vsw_t			**vswpp, *vswp;
702 	int 			instance;
703 
704 	instance = ddi_get_instance(dip);
705 	vswp = ddi_get_soft_state(vsw_state, instance);
706 
707 	if (vswp == NULL) {
708 		return (DDI_FAILURE);
709 	}
710 
711 	switch (cmd) {
712 	case DDI_DETACH:
713 		break;
714 	case DDI_SUSPEND:
715 	case DDI_PM_SUSPEND:
716 	default:
717 		return (DDI_FAILURE);
718 	}
719 
720 	D2(vswp, "detaching instance %d", instance);
721 
722 	if (vswp->if_state & VSW_IF_REG) {
723 		if (vsw_mac_unregister(vswp) != 0) {
724 			cmn_err(CE_WARN, "!vsw%d: Unable to detach from "
725 			    "MAC layer", vswp->instance);
726 			return (DDI_FAILURE);
727 		}
728 	}
729 
730 	vsw_mdeg_unregister(vswp);
731 
732 	/* remove mac layer callback */
733 	mutex_enter(&vswp->mac_lock);
734 	if ((vswp->mh != NULL) && (vswp->mrh != NULL)) {
735 		mac_rx_remove(vswp->mh, vswp->mrh);
736 		vswp->mrh = NULL;
737 	}
738 	mutex_exit(&vswp->mac_lock);
739 
740 	if (vsw_detach_ports(vswp) != 0) {
741 		cmn_err(CE_WARN, "!vsw%d: Unable to detach ports",
742 		    vswp->instance);
743 		return (DDI_FAILURE);
744 	}
745 
746 	rw_destroy(&vswp->if_lockrw);
747 
748 	mutex_destroy(&vswp->hw_lock);
749 
750 	/*
751 	 * Now that the ports have been deleted, stop and close
752 	 * the physical device.
753 	 */
754 	mutex_enter(&vswp->mac_lock);
755 	if (vswp->mh != NULL) {
756 		if (vswp->mstarted)
757 			mac_stop(vswp->mh);
758 		if (vswp->mresources)
759 			mac_resource_set(vswp->mh, NULL, NULL);
760 		mac_close(vswp->mh);
761 
762 		vswp->mh = NULL;
763 		vswp->txinfo = NULL;
764 	}
765 	mutex_exit(&vswp->mac_lock);
766 	mutex_destroy(&vswp->mac_lock);
767 
768 	/*
769 	 * Destroy any free pools that may still exist.
770 	 */
771 	poolp = vswp->rxh;
772 	while (poolp != NULL) {
773 		npoolp = vswp->rxh = poolp->nextp;
774 		if (vio_destroy_mblks(poolp) != 0) {
775 			vswp->rxh = poolp;
776 			return (DDI_FAILURE);
777 		}
778 		poolp = npoolp;
779 	}
780 
781 	/*
782 	 * Remove this instance from any entries it may be on in
783 	 * the hash table by using the list of addresses maintained
784 	 * in the vsw_t structure.
785 	 */
786 	vsw_del_mcst_vsw(vswp);
787 
788 	vswp->mcap = NULL;
789 	mutex_destroy(&vswp->mca_lock);
790 
791 	/*
792 	 * By now any pending tasks have finished and the underlying
793 	 * ldc's have been destroyed, so its safe to delete the control
794 	 * message taskq.
795 	 */
796 	if (vswp->taskq_p != NULL)
797 		ddi_taskq_destroy(vswp->taskq_p);
798 
799 	/*
800 	 * At this stage all the data pointers in the hash table
801 	 * should be NULL, as all the ports have been removed and will
802 	 * have deleted themselves from the port lists which the data
803 	 * pointers point to. Hence we can destroy the table using the
804 	 * default destructors.
805 	 */
806 	D2(vswp, "vsw_detach: destroying hash tables..");
807 	mod_hash_destroy_hash(vswp->fdb);
808 	vswp->fdb = NULL;
809 
810 	WRITE_ENTER(&vswp->mfdbrw);
811 	mod_hash_destroy_hash(vswp->mfdb);
812 	vswp->mfdb = NULL;
813 	RW_EXIT(&vswp->mfdbrw);
814 	rw_destroy(&vswp->mfdbrw);
815 
816 	ddi_remove_minor_node(dip, NULL);
817 
818 	rw_destroy(&vswp->plist.lockrw);
819 	WRITE_ENTER(&vsw_rw);
820 	for (vswpp = &vsw_head; *vswpp; vswpp = &(*vswpp)->next) {
821 		if (*vswpp == vswp) {
822 			*vswpp = vswp->next;
823 			break;
824 		}
825 	}
826 	RW_EXIT(&vsw_rw);
827 	ddi_soft_state_free(vsw_state, instance);
828 
829 	return (DDI_SUCCESS);
830 }
831 
832 static int
833 vsw_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
834 {
835 	_NOTE(ARGUNUSED(dip))
836 
837 	vsw_t	*vswp = NULL;
838 	dev_t	dev = (dev_t)arg;
839 	int	instance;
840 
841 	instance = getminor(dev);
842 
843 	switch (infocmd) {
844 	case DDI_INFO_DEVT2DEVINFO:
845 		if ((vswp = ddi_get_soft_state(vsw_state, instance)) == NULL) {
846 			*result = NULL;
847 			return (DDI_FAILURE);
848 		}
849 		*result = vswp->dip;
850 		return (DDI_SUCCESS);
851 
852 	case DDI_INFO_DEVT2INSTANCE:
853 		*result = (void *)(uintptr_t)instance;
854 		return (DDI_SUCCESS);
855 
856 	default:
857 		*result = NULL;
858 		return (DDI_FAILURE);
859 	}
860 }
861 
862 /*
863  * Get the value of the "vsw-phys-dev" property in the specified
864  * node. This property is the name of the physical device that
865  * the virtual switch will use to talk to the outside world.
866  *
867  * Note it is valid for this property to be NULL (but the property
868  * itself must exist). Callers of this routine should verify that
869  * the value returned is what they expected (i.e. either NULL or non NULL).
870  *
871  * On success returns value of the property in region pointed to by
872  * the 'name' argument, and with return value of 0. Otherwise returns 1.
873  */
874 static int
875 vsw_get_md_physname(vsw_t *vswp, md_t *mdp, mde_cookie_t node, char *name)
876 {
877 	int	len = 0;
878 	char	*physname = NULL;
879 	char	*dev;
880 
881 	if (md_get_prop_data(mdp, node, physdev_propname,
882 	    (uint8_t **)(&physname), &len) != 0) {
883 		cmn_err(CE_WARN, "!vsw%d: Unable to get name(s) of physical "
884 		    "device(s) from MD", vswp->instance);
885 		return (1);
886 	} else if ((strlen(physname) + 1) > LIFNAMSIZ) {
887 		cmn_err(CE_WARN, "!vsw%d: %s is too long a device name",
888 		    vswp->instance, physname);
889 		return (1);
890 	} else {
891 		(void) strncpy(name, physname, strlen(physname) + 1);
892 		D2(vswp, "%s: using first device specified (%s)",
893 		    __func__, physname);
894 	}
895 
896 #ifdef DEBUG
897 	/*
898 	 * As a temporary measure to aid testing we check to see if there
899 	 * is a vsw.conf file present. If there is we use the value of the
900 	 * vsw_physname property in the file as the name of the physical
901 	 * device, overriding the value from the MD.
902 	 *
903 	 * There may be multiple devices listed, but for the moment
904 	 * we just use the first one.
905 	 */
906 	if (ddi_prop_lookup_string(DDI_DEV_T_ANY, vswp->dip, 0,
907 	    "vsw_physname", &dev) == DDI_PROP_SUCCESS) {
908 		if ((strlen(dev) + 1) > LIFNAMSIZ) {
909 			cmn_err(CE_WARN, "vsw%d: %s is too long a device name",
910 			    vswp->instance, dev);
911 			ddi_prop_free(dev);
912 			return (1);
913 		} else {
914 			cmn_err(CE_NOTE, "vsw%d: Using device name (%s) from "
915 			    "config file", vswp->instance, dev);
916 
917 			(void) strncpy(name, dev, strlen(dev) + 1);
918 		}
919 
920 		ddi_prop_free(dev);
921 	}
922 #endif
923 
924 	return (0);
925 }
926 
927 /*
928  * Read the 'vsw-switch-mode' property from the specified MD node.
929  *
930  * Returns 0 on success and the number of modes found in 'found',
931  * otherwise returns 1.
932  */
933 static int
934 vsw_get_md_smodes(vsw_t *vswp, md_t *mdp, mde_cookie_t node,
935 						uint8_t *modes, int *found)
936 {
937 	int		len = 0;
938 	int		smode_num = 0;
939 	char		*smode = NULL;
940 	char		*curr_mode = NULL;
941 
942 	D1(vswp, "%s: enter", __func__);
943 
944 	/*
945 	 * Get the switch-mode property. The modes are listed in
946 	 * decreasing order of preference, i.e. prefered mode is
947 	 * first item in list.
948 	 */
949 	len = 0;
950 	smode_num = 0;
951 	if (md_get_prop_data(mdp, node, smode_propname,
952 	    (uint8_t **)(&smode), &len) != 0) {
953 		/*
954 		 * Unable to get switch-mode property from MD, nothing
955 		 * more we can do.
956 		 */
957 		cmn_err(CE_WARN, "!vsw%d: Unable to get switch mode property"
958 		    " from the MD", vswp->instance);
959 		*found = 0;
960 		return (1);
961 	}
962 
963 	curr_mode = smode;
964 	/*
965 	 * Modes of operation:
966 	 * 'switched'	 - layer 2 switching, underlying HW in
967 	 *			programmed mode.
968 	 * 'promiscuous' - layer 2 switching, underlying HW in
969 	 *			promiscuous mode.
970 	 * 'routed'	 - layer 3 (i.e. IP) routing, underlying HW
971 	 *			in non-promiscuous mode.
972 	 */
973 	while ((curr_mode < (smode + len)) && (smode_num < NUM_SMODES)) {
974 		D2(vswp, "%s: curr_mode = [%s]", __func__, curr_mode);
975 		if (strcmp(curr_mode, "switched") == 0) {
976 			modes[smode_num++] = VSW_LAYER2;
977 		} else if (strcmp(curr_mode, "promiscuous") == 0) {
978 			modes[smode_num++] = VSW_LAYER2_PROMISC;
979 		} else if (strcmp(curr_mode, "routed") == 0) {
980 			modes[smode_num++] = VSW_LAYER3;
981 		} else {
982 			cmn_err(CE_WARN, "!vsw%d: Unknown switch mode %s, "
983 			    "setting to default switched mode",
984 			    vswp->instance, curr_mode);
985 			modes[smode_num++] = VSW_LAYER2;
986 		}
987 		curr_mode += strlen(curr_mode) + 1;
988 	}
989 	*found = smode_num;
990 
991 	D2(vswp, "%s: %d modes found", __func__, smode_num);
992 
993 	D1(vswp, "%s: exit", __func__);
994 
995 	return (0);
996 }
997 
998 /*
999  * Get the mac address of the physical device.
1000  *
1001  * Returns 0 on success, 1 on failure.
1002  */
1003 static int
1004 vsw_get_physaddr(vsw_t *vswp)
1005 {
1006 	mac_handle_t	mh;
1007 	char		drv[LIFNAMSIZ];
1008 	uint_t		ddi_instance;
1009 
1010 	D1(vswp, "%s: enter", __func__);
1011 
1012 	if (ddi_parse(vswp->physname, drv, &ddi_instance) != DDI_SUCCESS)
1013 		return (1);
1014 
1015 	if (mac_open(vswp->physname, ddi_instance, &mh) != 0) {
1016 		cmn_err(CE_WARN, "!vsw%d: mac_open %s failed",
1017 		    vswp->instance, vswp->physname);
1018 		return (1);
1019 	}
1020 
1021 	READ_ENTER(&vswp->if_lockrw);
1022 	mac_unicst_get(mh, vswp->if_addr.ether_addr_octet);
1023 	RW_EXIT(&vswp->if_lockrw);
1024 
1025 	mac_close(mh);
1026 
1027 	vswp->mdprops |= VSW_DEV_MACADDR;
1028 
1029 	D1(vswp, "%s: exit", __func__);
1030 
1031 	return (0);
1032 }
1033 
1034 /*
1035  * Check to see if the card supports the setting of multiple unicst
1036  * addresses.
1037  *
1038  * Returns 0 if card supports the programming of multiple unicast addresses,
1039  * otherwise returns 1.
1040  */
1041 static int
1042 vsw_get_hw_maddr(vsw_t *vswp)
1043 {
1044 	D1(vswp, "%s: enter", __func__);
1045 
1046 	mutex_enter(&vswp->mac_lock);
1047 	if (vswp->mh == NULL) {
1048 		mutex_exit(&vswp->mac_lock);
1049 		return (1);
1050 	}
1051 
1052 	if (!mac_capab_get(vswp->mh, MAC_CAPAB_MULTIADDRESS, &vswp->maddr)) {
1053 		cmn_err(CE_WARN, "!vsw%d: device (%s) does not support "
1054 		    "setting multiple unicast addresses", vswp->instance,
1055 		    vswp->physname);
1056 		mutex_exit(&vswp->mac_lock);
1057 		return (1);
1058 	}
1059 	mutex_exit(&vswp->mac_lock);
1060 
1061 	D2(vswp, "%s: %d addrs : %d free", __func__,
1062 	    vswp->maddr.maddr_naddr, vswp->maddr.maddr_naddrfree);
1063 
1064 	D1(vswp, "%s: exit", __func__);
1065 
1066 	return (0);
1067 }
1068 
1069 /*
1070  * Setup the required switching mode.
1071  *
1072  * Returns 0 on success, 1 on failure.
1073  */
1074 static int
1075 vsw_setup_switching(vsw_t *vswp)
1076 {
1077 	int	i, rv = 1;
1078 
1079 	D1(vswp, "%s: enter", __func__);
1080 
1081 	/* select best switching mode */
1082 	for (i = 0; i < vswp->smode_num; i++) {
1083 		vswp->smode_idx = i;
1084 		switch (vswp->smode[i]) {
1085 		case VSW_LAYER2:
1086 		case VSW_LAYER2_PROMISC:
1087 			rv = vsw_setup_layer2(vswp);
1088 			break;
1089 
1090 		case VSW_LAYER3:
1091 			rv = vsw_setup_layer3(vswp);
1092 			break;
1093 
1094 		default:
1095 			DERR(vswp, "unknown switch mode");
1096 			rv = 1;
1097 			break;
1098 		}
1099 
1100 		if (rv == 0)
1101 			break;
1102 	}
1103 
1104 	if (rv == 1) {
1105 		cmn_err(CE_WARN, "!vsw%d: Unable to setup specified "
1106 		    "switching mode", vswp->instance);
1107 		return (rv);
1108 	}
1109 
1110 	D2(vswp, "%s: Operating in mode %d", __func__,
1111 	    vswp->smode[vswp->smode_idx]);
1112 
1113 	D1(vswp, "%s: exit", __func__);
1114 
1115 	return (0);
1116 }
1117 
1118 /*
1119  * Setup for layer 2 switching.
1120  *
1121  * Returns 0 on success, 1 on failure.
1122  */
1123 static int
1124 vsw_setup_layer2(vsw_t *vswp)
1125 {
1126 	D1(vswp, "%s: enter", __func__);
1127 
1128 	vswp->vsw_switch_frame = vsw_switch_l2_frame;
1129 
1130 	/*
1131 	 * Attempt to link into the MAC layer so we can get
1132 	 * and send packets out over the physical adapter.
1133 	 */
1134 	if (vswp->mdprops & VSW_MD_PHYSNAME) {
1135 		if (vsw_mac_attach(vswp) != 0) {
1136 			/*
1137 			 * Registration with the MAC layer has failed,
1138 			 * so return 1 so that can fall back to next
1139 			 * prefered switching method.
1140 			 */
1141 			cmn_err(CE_WARN, "!vsw%d: Unable to join as MAC layer "
1142 			    "client", vswp->instance);
1143 			return (1);
1144 		}
1145 
1146 		if (vswp->smode[vswp->smode_idx] == VSW_LAYER2) {
1147 			/*
1148 			 * Verify that underlying device can support multiple
1149 			 * unicast mac addresses.
1150 			 */
1151 			if (vsw_get_hw_maddr(vswp) != 0) {
1152 				cmn_err(CE_WARN, "!vsw%d: Unable to setup "
1153 				    "layer2 switching", vswp->instance);
1154 				vsw_mac_detach(vswp);
1155 				return (1);
1156 			}
1157 		}
1158 
1159 	} else {
1160 		/*
1161 		 * No physical device name found in MD which is
1162 		 * required for layer 2.
1163 		 */
1164 		cmn_err(CE_WARN, "!vsw%d: no physical device name specified",
1165 		    vswp->instance);
1166 		return (1);
1167 	}
1168 
1169 	D1(vswp, "%s: exit", __func__);
1170 
1171 	return (0);
1172 }
1173 
1174 static int
1175 vsw_setup_layer3(vsw_t *vswp)
1176 {
1177 	D1(vswp, "%s: enter", __func__);
1178 
1179 	D2(vswp, "%s: operating in layer 3 mode", __func__);
1180 	vswp->vsw_switch_frame = vsw_switch_l3_frame;
1181 
1182 	D1(vswp, "%s: exit", __func__);
1183 
1184 	return (0);
1185 }
1186 
1187 /*
1188  * Link into the MAC layer to gain access to the services provided by
1189  * the underlying physical device driver (which should also have
1190  * registered with the MAC layer).
1191  *
1192  * Only when in layer 2 mode.
1193  */
1194 static int
1195 vsw_mac_attach(vsw_t *vswp)
1196 {
1197 	char	drv[LIFNAMSIZ];
1198 	uint_t	ddi_instance;
1199 
1200 	D1(vswp, "%s: enter", __func__);
1201 
1202 	ASSERT(vswp->mh == NULL);
1203 	ASSERT(vswp->mrh == NULL);
1204 	ASSERT(vswp->mstarted == B_FALSE);
1205 	ASSERT(vswp->mresources == B_FALSE);
1206 
1207 	ASSERT(vswp->mdprops & VSW_MD_PHYSNAME);
1208 
1209 	mutex_enter(&vswp->mac_lock);
1210 	if (ddi_parse(vswp->physname, drv, &ddi_instance) != DDI_SUCCESS) {
1211 		cmn_err(CE_WARN, "!vsw%d: invalid device name: %s",
1212 		    vswp->instance, vswp->physname);
1213 		goto mac_fail_exit;
1214 	}
1215 
1216 	if ((mac_open(vswp->physname, ddi_instance, &vswp->mh)) != 0) {
1217 		cmn_err(CE_WARN, "!vsw%d: mac_open %s failed",
1218 		    vswp->instance, vswp->physname);
1219 		goto mac_fail_exit;
1220 	}
1221 
1222 	ASSERT(vswp->mh != NULL);
1223 
1224 	D2(vswp, "vsw_mac_attach: using device %s", vswp->physname);
1225 
1226 	if (vsw_multi_ring_enable) {
1227 		/*
1228 		 * Initialize the ring table.
1229 		 */
1230 		vsw_mac_ring_tbl_init(vswp);
1231 
1232 		/*
1233 		 * Register our rx callback function.
1234 		 */
1235 		vswp->mrh = mac_rx_add(vswp->mh,
1236 		    vsw_rx_queue_cb, (void *)vswp);
1237 		ASSERT(vswp->mrh != NULL);
1238 
1239 		/*
1240 		 * Register our mac resource callback.
1241 		 */
1242 		mac_resource_set(vswp->mh, vsw_mac_ring_add_cb, (void *)vswp);
1243 		vswp->mresources = B_TRUE;
1244 
1245 		/*
1246 		 * Get the ring resources available to us from
1247 		 * the mac below us.
1248 		 */
1249 		mac_resources(vswp->mh);
1250 	} else {
1251 		/*
1252 		 * Just register our rx callback function
1253 		 */
1254 		vswp->mrh = mac_rx_add(vswp->mh, vsw_rx_cb, (void *)vswp);
1255 		ASSERT(vswp->mrh != NULL);
1256 	}
1257 
1258 	/* Get the MAC tx fn */
1259 	vswp->txinfo = mac_tx_get(vswp->mh);
1260 
1261 	/* start the interface */
1262 	if (mac_start(vswp->mh) != 0) {
1263 		cmn_err(CE_WARN, "!vsw%d: Could not start mac interface",
1264 		    vswp->instance);
1265 		goto mac_fail_exit;
1266 	}
1267 
1268 	mutex_exit(&vswp->mac_lock);
1269 
1270 	vswp->mstarted = B_TRUE;
1271 
1272 	D1(vswp, "%s: exit", __func__);
1273 	return (0);
1274 
1275 mac_fail_exit:
1276 	mutex_exit(&vswp->mac_lock);
1277 	vsw_mac_detach(vswp);
1278 
1279 	D1(vswp, "%s: exit", __func__);
1280 	return (1);
1281 }
1282 
1283 static void
1284 vsw_mac_detach(vsw_t *vswp)
1285 {
1286 	D1(vswp, "vsw_mac_detach: enter");
1287 
1288 	ASSERT(vswp != NULL);
1289 
1290 	if (vsw_multi_ring_enable) {
1291 		vsw_mac_ring_tbl_destroy(vswp);
1292 	}
1293 
1294 	mutex_enter(&vswp->mac_lock);
1295 
1296 	if (vswp->mh != NULL) {
1297 		if (vswp->mstarted)
1298 			mac_stop(vswp->mh);
1299 		if (vswp->mrh != NULL)
1300 			mac_rx_remove(vswp->mh, vswp->mrh);
1301 		if (vswp->mresources)
1302 			mac_resource_set(vswp->mh, NULL, NULL);
1303 		mac_close(vswp->mh);
1304 	}
1305 
1306 	vswp->mrh = NULL;
1307 	vswp->mh = NULL;
1308 	vswp->txinfo = NULL;
1309 	vswp->mstarted = B_FALSE;
1310 
1311 	mutex_exit(&vswp->mac_lock);
1312 
1313 	D1(vswp, "vsw_mac_detach: exit");
1314 }
1315 
1316 /*
1317  * Depending on the mode specified, the capabilites and capacity
1318  * of the underlying device setup the physical device.
1319  *
1320  * If in layer 3 mode, then do nothing.
1321  *
1322  * If in layer 2 programmed mode attempt to program the unicast address
1323  * associated with the port into the physical device. If this is not
1324  * possible due to resource exhaustion or simply because the device does
1325  * not support multiple unicast addresses then if required fallback onto
1326  * putting the card into promisc mode.
1327  *
1328  * If in promisc mode then simply set the card into promisc mode.
1329  *
1330  * Returns 0 success, 1 on failure.
1331  */
1332 static int
1333 vsw_set_hw(vsw_t *vswp, vsw_port_t *port, int type)
1334 {
1335 	mac_multi_addr_t	mac_addr;
1336 	int			err;
1337 
1338 	D1(vswp, "%s: enter", __func__);
1339 
1340 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
1341 	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
1342 
1343 	if (vswp->smode[vswp->smode_idx] == VSW_LAYER3)
1344 		return (0);
1345 
1346 	if (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC) {
1347 		return (vsw_set_hw_promisc(vswp, port, type));
1348 	}
1349 
1350 	/*
1351 	 * Attempt to program the unicast address into the HW.
1352 	 */
1353 	mac_addr.mma_addrlen = ETHERADDRL;
1354 	if (type == VSW_VNETPORT) {
1355 		ASSERT(port != NULL);
1356 		ether_copy(&port->p_macaddr, &mac_addr.mma_addr);
1357 	} else {
1358 		READ_ENTER(&vswp->if_lockrw);
1359 		/*
1360 		 * Don't program if the interface is not UP. This
1361 		 * is possible if the address has just been changed
1362 		 * in the MD node, but the interface has not yet been
1363 		 * plumbed.
1364 		 */
1365 		if (!(vswp->if_state & VSW_IF_UP)) {
1366 			RW_EXIT(&vswp->if_lockrw);
1367 			return (0);
1368 		}
1369 		ether_copy(&vswp->if_addr, &mac_addr.mma_addr);
1370 		RW_EXIT(&vswp->if_lockrw);
1371 	}
1372 
1373 	err = vsw_set_hw_addr(vswp, &mac_addr);
1374 	if (err != 0) {
1375 		/*
1376 		 * Mark that attempt should be made to re-config sometime
1377 		 * in future if a port is deleted.
1378 		 */
1379 		vswp->recfg_reqd = B_TRUE;
1380 
1381 		/*
1382 		 * Only 1 mode specified, nothing more to do.
1383 		 */
1384 		if (vswp->smode_num == 1)
1385 			return (err);
1386 
1387 		/*
1388 		 * If promiscuous was next mode specified try to
1389 		 * set the card into that mode.
1390 		 */
1391 		if ((vswp->smode_idx <= (vswp->smode_num - 2)) &&
1392 		    (vswp->smode[vswp->smode_idx + 1] ==
1393 		    VSW_LAYER2_PROMISC)) {
1394 			vswp->smode_idx += 1;
1395 			return (vsw_set_hw_promisc(vswp, port, type));
1396 		}
1397 		return (err);
1398 	}
1399 
1400 	if (type == VSW_VNETPORT) {
1401 		port->addr_slot = mac_addr.mma_slot;
1402 		port->addr_set = VSW_ADDR_HW;
1403 	} else {
1404 		vswp->addr_slot = mac_addr.mma_slot;
1405 		vswp->addr_set = VSW_ADDR_HW;
1406 	}
1407 
1408 	D2(vswp, "programmed addr %x:%x:%x:%x:%x:%x into slot %d "
1409 	    "of device %s",
1410 	    mac_addr.mma_addr[0], mac_addr.mma_addr[1],
1411 	    mac_addr.mma_addr[2], mac_addr.mma_addr[3],
1412 	    mac_addr.mma_addr[4], mac_addr.mma_addr[5],
1413 	    mac_addr.mma_slot, vswp->physname);
1414 
1415 	D1(vswp, "%s: exit", __func__);
1416 
1417 	return (0);
1418 }
1419 
1420 /*
1421  * If in layer 3 mode do nothing.
1422  *
1423  * If in layer 2 switched mode remove the address from the physical
1424  * device.
1425  *
1426  * If in layer 2 promiscuous mode disable promisc mode.
1427  *
1428  * Returns 0 on success.
1429  */
1430 static int
1431 vsw_unset_hw(vsw_t *vswp, vsw_port_t *port, int type)
1432 {
1433 	mac_addr_slot_t	slot;
1434 	int		rv;
1435 
1436 	D1(vswp, "%s: enter", __func__);
1437 
1438 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
1439 
1440 	if (vswp->smode[vswp->smode_idx] == VSW_LAYER3)
1441 		return (0);
1442 
1443 	switch (type) {
1444 	case VSW_VNETPORT:
1445 		ASSERT(port != NULL);
1446 
1447 		if (port->addr_set == VSW_ADDR_PROMISC) {
1448 			return (vsw_unset_hw_promisc(vswp, port, type));
1449 
1450 		} else if (port->addr_set == VSW_ADDR_HW) {
1451 			slot = port->addr_slot;
1452 			if ((rv = vsw_unset_hw_addr(vswp, slot)) == 0)
1453 				port->addr_set = VSW_ADDR_UNSET;
1454 		}
1455 
1456 		break;
1457 
1458 	case VSW_LOCALDEV:
1459 		if (vswp->addr_set == VSW_ADDR_PROMISC) {
1460 			return (vsw_unset_hw_promisc(vswp, NULL, type));
1461 
1462 		} else if (vswp->addr_set == VSW_ADDR_HW) {
1463 			slot = vswp->addr_slot;
1464 			if ((rv = vsw_unset_hw_addr(vswp, slot)) == 0)
1465 				vswp->addr_set = VSW_ADDR_UNSET;
1466 		}
1467 
1468 		break;
1469 
1470 	default:
1471 		/* should never happen */
1472 		DERR(vswp, "%s: unknown type %d", __func__, type);
1473 		ASSERT(0);
1474 		return (1);
1475 	}
1476 
1477 	D1(vswp, "%s: exit", __func__);
1478 	return (rv);
1479 }
1480 
1481 /*
1482  * Attempt to program a unicast address into HW.
1483  *
1484  * Returns 0 on sucess, 1 on failure.
1485  */
1486 static int
1487 vsw_set_hw_addr(vsw_t *vswp, mac_multi_addr_t *mac)
1488 {
1489 	void	*mah;
1490 	int	rv;
1491 
1492 	D1(vswp, "%s: enter", __func__);
1493 
1494 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
1495 
1496 	if (vswp->maddr.maddr_handle == NULL)
1497 		return (1);
1498 
1499 	mah = vswp->maddr.maddr_handle;
1500 
1501 	rv = vswp->maddr.maddr_add(mah, mac);
1502 
1503 	if (rv == 0)
1504 		return (0);
1505 
1506 	/*
1507 	 * Its okay for the add to fail because we have exhausted
1508 	 * all the resouces in the hardware device. Any other error
1509 	 * we want to flag.
1510 	 */
1511 	if (rv != ENOSPC) {
1512 		cmn_err(CE_WARN, "!vsw%d: error programming "
1513 		    "address %x:%x:%x:%x:%x:%x into HW "
1514 		    "err (%d)", vswp->instance,
1515 		    mac->mma_addr[0], mac->mma_addr[1],
1516 		    mac->mma_addr[2], mac->mma_addr[3],
1517 		    mac->mma_addr[4], mac->mma_addr[5], rv);
1518 	}
1519 	D1(vswp, "%s: exit", __func__);
1520 	return (1);
1521 }
1522 
1523 /*
1524  * Remove a unicast mac address which has previously been programmed
1525  * into HW.
1526  *
1527  * Returns 0 on sucess, 1 on failure.
1528  */
1529 static int
1530 vsw_unset_hw_addr(vsw_t *vswp, int slot)
1531 {
1532 	void	*mah;
1533 	int	rv;
1534 
1535 	D1(vswp, "%s: enter", __func__);
1536 
1537 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
1538 	ASSERT(slot >= 0);
1539 
1540 	if (vswp->maddr.maddr_handle == NULL)
1541 		return (1);
1542 
1543 	mah = vswp->maddr.maddr_handle;
1544 
1545 	rv = vswp->maddr.maddr_remove(mah, slot);
1546 	if (rv != 0) {
1547 		cmn_err(CE_WARN, "!vsw%d: unable to remove address "
1548 		    "from slot %d in device %s (err %d)",
1549 		    vswp->instance, slot, vswp->physname, rv);
1550 		return (1);
1551 	}
1552 
1553 	D2(vswp, "removed addr from slot %d in device %s",
1554 	    slot, vswp->physname);
1555 
1556 	D1(vswp, "%s: exit", __func__);
1557 	return (0);
1558 }
1559 
1560 /*
1561  * Set network card into promisc mode.
1562  *
1563  * Returns 0 on success, 1 on failure.
1564  */
1565 static int
1566 vsw_set_hw_promisc(vsw_t *vswp, vsw_port_t *port, int type)
1567 {
1568 	D1(vswp, "%s: enter", __func__);
1569 
1570 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
1571 	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
1572 
1573 	mutex_enter(&vswp->mac_lock);
1574 	if (vswp->mh == NULL) {
1575 		mutex_exit(&vswp->mac_lock);
1576 		return (1);
1577 	}
1578 
1579 	if (vswp->promisc_cnt++ == 0) {
1580 		if (mac_promisc_set(vswp->mh, B_TRUE, MAC_DEVPROMISC) != 0) {
1581 			vswp->promisc_cnt--;
1582 			mutex_exit(&vswp->mac_lock);
1583 			return (1);
1584 		}
1585 		cmn_err(CE_NOTE, "!vsw%d: switching device %s into "
1586 		    "promiscuous mode", vswp->instance, vswp->physname);
1587 	}
1588 	mutex_exit(&vswp->mac_lock);
1589 
1590 	if (type == VSW_VNETPORT) {
1591 		ASSERT(port != NULL);
1592 		port->addr_set = VSW_ADDR_PROMISC;
1593 	} else {
1594 		vswp->addr_set = VSW_ADDR_PROMISC;
1595 	}
1596 
1597 	D1(vswp, "%s: exit", __func__);
1598 
1599 	return (0);
1600 }
1601 
1602 /*
1603  * Turn off promiscuous mode on network card.
1604  *
1605  * Returns 0 on success, 1 on failure.
1606  */
1607 static int
1608 vsw_unset_hw_promisc(vsw_t *vswp, vsw_port_t *port, int type)
1609 {
1610 	vsw_port_list_t 	*plist = &vswp->plist;
1611 
1612 	D2(vswp, "%s: enter", __func__);
1613 
1614 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
1615 	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
1616 
1617 	mutex_enter(&vswp->mac_lock);
1618 	if (vswp->mh == NULL) {
1619 		mutex_exit(&vswp->mac_lock);
1620 		return (1);
1621 	}
1622 
1623 	if (--vswp->promisc_cnt == 0) {
1624 		if (mac_promisc_set(vswp->mh, B_FALSE, MAC_DEVPROMISC) != 0) {
1625 			vswp->promisc_cnt++;
1626 			mutex_exit(&vswp->mac_lock);
1627 			return (1);
1628 		}
1629 
1630 		/*
1631 		 * We are exiting promisc mode either because we were
1632 		 * only in promisc mode because we had failed over from
1633 		 * switched mode due to HW resource issues, or the user
1634 		 * wanted the card in promisc mode for all the ports and
1635 		 * the last port is now being deleted. Tweak the message
1636 		 * accordingly.
1637 		 */
1638 		if (plist->num_ports != 0) {
1639 			cmn_err(CE_NOTE, "!vsw%d: switching device %s back to "
1640 			    "programmed mode", vswp->instance, vswp->physname);
1641 		} else {
1642 			cmn_err(CE_NOTE, "!vsw%d: switching device %s out of "
1643 			    "promiscuous mode", vswp->instance, vswp->physname);
1644 		}
1645 	}
1646 	mutex_exit(&vswp->mac_lock);
1647 
1648 	if (type == VSW_VNETPORT) {
1649 		ASSERT(port != NULL);
1650 		ASSERT(port->addr_set == VSW_ADDR_PROMISC);
1651 		port->addr_set = VSW_ADDR_UNSET;
1652 	} else {
1653 		ASSERT(vswp->addr_set == VSW_ADDR_PROMISC);
1654 		vswp->addr_set = VSW_ADDR_UNSET;
1655 	}
1656 
1657 	D1(vswp, "%s: exit", __func__);
1658 	return (0);
1659 }
1660 
1661 /*
1662  * Determine whether or not we are operating in our prefered
1663  * mode and if not whether the physical resources now allow us
1664  * to operate in it.
1665  *
1666  * If a port is being removed should only be invoked after port has been
1667  * removed from the port list.
1668  */
1669 static void
1670 vsw_reconfig_hw(vsw_t *vswp)
1671 {
1672 	int			s_idx;
1673 
1674 	D1(vswp, "%s: enter", __func__);
1675 
1676 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
1677 
1678 	if (vswp->maddr.maddr_handle == NULL) {
1679 		return;
1680 	}
1681 
1682 	/*
1683 	 * If we are in layer 2 (i.e. switched) or would like to be
1684 	 * in layer 2 then check if any ports or the vswitch itself
1685 	 * need to be programmed into the HW.
1686 	 *
1687 	 * This can happen in two cases - switched was specified as
1688 	 * the prefered mode of operation but we exhausted the HW
1689 	 * resources and so failed over to the next specifed mode,
1690 	 * or switched was the only mode specified so after HW
1691 	 * resources were exhausted there was nothing more we
1692 	 * could do.
1693 	 */
1694 	if (vswp->smode_idx > 0)
1695 		s_idx = vswp->smode_idx - 1;
1696 	else
1697 		s_idx = vswp->smode_idx;
1698 
1699 	if (vswp->smode[s_idx] != VSW_LAYER2) {
1700 		return;
1701 	}
1702 
1703 	D2(vswp, "%s: attempting reconfig..", __func__);
1704 
1705 	/*
1706 	 * First, attempt to set the vswitch mac address into HW,
1707 	 * if required.
1708 	 */
1709 	if (vsw_prog_if(vswp)) {
1710 		return;
1711 	}
1712 
1713 	/*
1714 	 * Next, attempt to set any ports which have not yet been
1715 	 * programmed into HW.
1716 	 */
1717 	if (vsw_prog_ports(vswp)) {
1718 		return;
1719 	}
1720 
1721 	/*
1722 	 * By now we know that have programmed all desired ports etc
1723 	 * into HW, so safe to mark reconfiguration as complete.
1724 	 */
1725 	vswp->recfg_reqd = B_FALSE;
1726 
1727 	vswp->smode_idx = s_idx;
1728 
1729 	D1(vswp, "%s: exit", __func__);
1730 }
1731 
1732 /*
1733  * Check to see if vsw itself is plumbed, and if so whether or not
1734  * its mac address should be written into HW.
1735  *
1736  * Returns 0 if could set address, or didn't have to set it.
1737  * Returns 1 if failed to set address.
1738  */
1739 static int
1740 vsw_prog_if(vsw_t *vswp)
1741 {
1742 	mac_multi_addr_t	addr;
1743 
1744 	D1(vswp, "%s: enter", __func__);
1745 
1746 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
1747 
1748 	READ_ENTER(&vswp->if_lockrw);
1749 	if ((vswp->if_state & VSW_IF_UP) &&
1750 	    (vswp->addr_set != VSW_ADDR_HW)) {
1751 
1752 		addr.mma_addrlen = ETHERADDRL;
1753 		ether_copy(&vswp->if_addr, &addr.mma_addr);
1754 
1755 		if (vsw_set_hw_addr(vswp, &addr) != 0) {
1756 			RW_EXIT(&vswp->if_lockrw);
1757 			return (1);
1758 		}
1759 
1760 		vswp->addr_slot = addr.mma_slot;
1761 
1762 		/*
1763 		 * If previously when plumbed had had to place
1764 		 * interface into promisc mode, now reverse that.
1765 		 *
1766 		 * Note that interface will only actually be set into
1767 		 * non-promisc mode when last port/interface has been
1768 		 * programmed into HW.
1769 		 */
1770 		if (vswp->addr_set == VSW_ADDR_PROMISC)
1771 			(void) vsw_unset_hw_promisc(vswp, NULL, VSW_LOCALDEV);
1772 
1773 		vswp->addr_set = VSW_ADDR_HW;
1774 	}
1775 	RW_EXIT(&vswp->if_lockrw);
1776 
1777 	D1(vswp, "%s: exit", __func__);
1778 	return (0);
1779 }
1780 
1781 /*
1782  * Scan the port list for any ports which have not yet been set
1783  * into HW. For those found attempt to program their mac addresses
1784  * into the physical device.
1785  *
1786  * Returns 0 if able to program all required ports (can be 0) into HW.
1787  * Returns 1 if failed to set at least one mac address.
1788  */
1789 static int
1790 vsw_prog_ports(vsw_t *vswp)
1791 {
1792 	mac_multi_addr_t	addr;
1793 	vsw_port_list_t		*plist = &vswp->plist;
1794 	vsw_port_t		*tp;
1795 	int			rv = 0;
1796 
1797 	D1(vswp, "%s: enter", __func__);
1798 
1799 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
1800 
1801 	READ_ENTER(&plist->lockrw);
1802 	for (tp = plist->head; tp != NULL; tp = tp->p_next) {
1803 		if (tp->addr_set != VSW_ADDR_HW) {
1804 			addr.mma_addrlen = ETHERADDRL;
1805 			ether_copy(&tp->p_macaddr, &addr.mma_addr);
1806 
1807 			if (vsw_set_hw_addr(vswp, &addr) != 0) {
1808 				rv = 1;
1809 				break;
1810 			}
1811 
1812 			tp->addr_slot = addr.mma_slot;
1813 
1814 			/*
1815 			 * If when this port had first attached we had
1816 			 * had to place the interface into promisc mode,
1817 			 * then now reverse that.
1818 			 *
1819 			 * Note that the interface will not actually
1820 			 * change to non-promisc mode until all ports
1821 			 * have been programmed.
1822 			 */
1823 			if (tp->addr_set == VSW_ADDR_PROMISC)
1824 				(void) vsw_unset_hw_promisc(vswp,
1825 				    tp, VSW_VNETPORT);
1826 
1827 			tp->addr_set = VSW_ADDR_HW;
1828 		}
1829 	}
1830 	RW_EXIT(&plist->lockrw);
1831 
1832 	D1(vswp, "%s: exit", __func__);
1833 	return (rv);
1834 }
1835 
1836 static void
1837 vsw_mac_ring_tbl_entry_init(vsw_t *vswp, vsw_mac_ring_t *ringp)
1838 {
1839 	ringp->ring_state = VSW_MAC_RING_FREE;
1840 	ringp->ring_arg = NULL;
1841 	ringp->ring_blank = NULL;
1842 	ringp->ring_vqp = NULL;
1843 	ringp->ring_vswp = vswp;
1844 }
1845 
1846 static void
1847 vsw_mac_ring_tbl_init(vsw_t *vswp)
1848 {
1849 	int		i;
1850 
1851 	mutex_init(&vswp->mac_ring_lock, NULL, MUTEX_DRIVER, NULL);
1852 
1853 	vswp->mac_ring_tbl_sz = vsw_mac_rx_rings;
1854 	vswp->mac_ring_tbl  =
1855 	    kmem_alloc(vsw_mac_rx_rings * sizeof (vsw_mac_ring_t), KM_SLEEP);
1856 
1857 	for (i = 0; i < vswp->mac_ring_tbl_sz; i++)
1858 		vsw_mac_ring_tbl_entry_init(vswp, &vswp->mac_ring_tbl[i]);
1859 }
1860 
1861 static void
1862 vsw_mac_ring_tbl_destroy(vsw_t *vswp)
1863 {
1864 	int		i;
1865 	vsw_mac_ring_t	*ringp;
1866 
1867 	mutex_enter(&vswp->mac_ring_lock);
1868 	for (i = 0; i < vswp->mac_ring_tbl_sz; i++) {
1869 		ringp = &vswp->mac_ring_tbl[i];
1870 
1871 		if (ringp->ring_state != VSW_MAC_RING_FREE) {
1872 			/*
1873 			 * Destroy the queue.
1874 			 */
1875 			vsw_queue_stop(ringp->ring_vqp);
1876 			vsw_queue_destroy(ringp->ring_vqp);
1877 
1878 			/*
1879 			 * Re-initialize the structure.
1880 			 */
1881 			vsw_mac_ring_tbl_entry_init(vswp, ringp);
1882 		}
1883 	}
1884 	mutex_exit(&vswp->mac_ring_lock);
1885 
1886 	mutex_destroy(&vswp->mac_ring_lock);
1887 	kmem_free(vswp->mac_ring_tbl,
1888 	    vswp->mac_ring_tbl_sz * sizeof (vsw_mac_ring_t));
1889 	vswp->mac_ring_tbl_sz = 0;
1890 }
1891 
1892 /*
1893  * Handle resource add callbacks from the driver below.
1894  */
1895 static mac_resource_handle_t
1896 vsw_mac_ring_add_cb(void *arg, mac_resource_t *mrp)
1897 {
1898 	vsw_t		*vswp = (vsw_t *)arg;
1899 	mac_rx_fifo_t	*mrfp = (mac_rx_fifo_t *)mrp;
1900 	vsw_mac_ring_t	*ringp;
1901 	vsw_queue_t	*vqp;
1902 	int		i;
1903 
1904 	ASSERT(vswp != NULL);
1905 	ASSERT(mrp != NULL);
1906 	ASSERT(vswp->mac_ring_tbl != NULL);
1907 
1908 	D1(vswp, "%s: enter", __func__);
1909 
1910 	/*
1911 	 * Check to make sure we have the correct resource type.
1912 	 */
1913 	if (mrp->mr_type != MAC_RX_FIFO)
1914 		return (NULL);
1915 
1916 	/*
1917 	 * Find a open entry in the ring table.
1918 	 */
1919 	mutex_enter(&vswp->mac_ring_lock);
1920 	for (i = 0; i < vswp->mac_ring_tbl_sz; i++) {
1921 		ringp = &vswp->mac_ring_tbl[i];
1922 
1923 		/*
1924 		 * Check for an empty slot, if found, then setup queue
1925 		 * and thread.
1926 		 */
1927 		if (ringp->ring_state == VSW_MAC_RING_FREE) {
1928 			/*
1929 			 * Create the queue for this ring.
1930 			 */
1931 			vqp = vsw_queue_create();
1932 
1933 			/*
1934 			 * Initialize the ring data structure.
1935 			 */
1936 			ringp->ring_vqp = vqp;
1937 			ringp->ring_arg = mrfp->mrf_arg;
1938 			ringp->ring_blank = mrfp->mrf_blank;
1939 			ringp->ring_state = VSW_MAC_RING_INUSE;
1940 
1941 			/*
1942 			 * Create the worker thread.
1943 			 */
1944 			vqp->vq_worker = thread_create(NULL, 0,
1945 			    vsw_queue_worker, ringp, 0, &p0,
1946 			    TS_RUN, minclsyspri);
1947 			if (vqp->vq_worker == NULL) {
1948 				vsw_queue_destroy(vqp);
1949 				vsw_mac_ring_tbl_entry_init(vswp, ringp);
1950 				ringp = NULL;
1951 			}
1952 
1953 			if (ringp != NULL) {
1954 				/*
1955 				 * Make sure thread get's running state for
1956 				 * this ring.
1957 				 */
1958 				mutex_enter(&vqp->vq_lock);
1959 				while ((vqp->vq_state != VSW_QUEUE_RUNNING) &&
1960 				    (vqp->vq_state != VSW_QUEUE_DRAINED)) {
1961 					cv_wait(&vqp->vq_cv, &vqp->vq_lock);
1962 				}
1963 
1964 				/*
1965 				 * If the thread is not running, cleanup.
1966 				 */
1967 				if (vqp->vq_state == VSW_QUEUE_DRAINED) {
1968 					vsw_queue_destroy(vqp);
1969 					vsw_mac_ring_tbl_entry_init(vswp,
1970 					    ringp);
1971 					ringp = NULL;
1972 				}
1973 				mutex_exit(&vqp->vq_lock);
1974 			}
1975 
1976 			mutex_exit(&vswp->mac_ring_lock);
1977 			D1(vswp, "%s: exit", __func__);
1978 			return ((mac_resource_handle_t)ringp);
1979 		}
1980 	}
1981 	mutex_exit(&vswp->mac_ring_lock);
1982 
1983 	/*
1984 	 * No slots in the ring table available.
1985 	 */
1986 	D1(vswp, "%s: exit", __func__);
1987 	return (NULL);
1988 }
1989 
1990 static void
1991 vsw_queue_stop(vsw_queue_t *vqp)
1992 {
1993 	mutex_enter(&vqp->vq_lock);
1994 
1995 	if (vqp->vq_state == VSW_QUEUE_RUNNING) {
1996 		vqp->vq_state = VSW_QUEUE_STOP;
1997 		cv_signal(&vqp->vq_cv);
1998 
1999 		while (vqp->vq_state != VSW_QUEUE_DRAINED)
2000 			cv_wait(&vqp->vq_cv, &vqp->vq_lock);
2001 	}
2002 
2003 	vqp->vq_state = VSW_QUEUE_STOPPED;
2004 
2005 	mutex_exit(&vqp->vq_lock);
2006 }
2007 
2008 static vsw_queue_t *
2009 vsw_queue_create()
2010 {
2011 	vsw_queue_t *vqp;
2012 
2013 	vqp = kmem_zalloc(sizeof (vsw_queue_t), KM_SLEEP);
2014 
2015 	mutex_init(&vqp->vq_lock, NULL, MUTEX_DRIVER, NULL);
2016 	cv_init(&vqp->vq_cv, NULL, CV_DRIVER, NULL);
2017 	vqp->vq_first = NULL;
2018 	vqp->vq_last = NULL;
2019 	vqp->vq_state = VSW_QUEUE_STOPPED;
2020 
2021 	return (vqp);
2022 }
2023 
2024 static void
2025 vsw_queue_destroy(vsw_queue_t *vqp)
2026 {
2027 	cv_destroy(&vqp->vq_cv);
2028 	mutex_destroy(&vqp->vq_lock);
2029 	kmem_free(vqp, sizeof (vsw_queue_t));
2030 }
2031 
2032 static void
2033 vsw_queue_worker(vsw_mac_ring_t *rrp)
2034 {
2035 	mblk_t		*mp;
2036 	vsw_queue_t	*vqp = rrp->ring_vqp;
2037 	vsw_t		*vswp = rrp->ring_vswp;
2038 
2039 	mutex_enter(&vqp->vq_lock);
2040 
2041 	ASSERT(vqp->vq_state == VSW_QUEUE_STOPPED);
2042 
2043 	/*
2044 	 * Set the state to running, since the thread is now active.
2045 	 */
2046 	vqp->vq_state = VSW_QUEUE_RUNNING;
2047 	cv_signal(&vqp->vq_cv);
2048 
2049 	while (vqp->vq_state == VSW_QUEUE_RUNNING) {
2050 		/*
2051 		 * Wait for work to do or the state has changed
2052 		 * to not running.
2053 		 */
2054 		while ((vqp->vq_state == VSW_QUEUE_RUNNING) &&
2055 		    (vqp->vq_first == NULL)) {
2056 			cv_wait(&vqp->vq_cv, &vqp->vq_lock);
2057 		}
2058 
2059 		/*
2060 		 * Process packets that we received from the interface.
2061 		 */
2062 		if (vqp->vq_first != NULL) {
2063 			mp = vqp->vq_first;
2064 
2065 			vqp->vq_first = NULL;
2066 			vqp->vq_last = NULL;
2067 
2068 			mutex_exit(&vqp->vq_lock);
2069 
2070 			/* switch the chain of packets received */
2071 			vswp->vsw_switch_frame(vswp, mp,
2072 			    VSW_PHYSDEV, NULL, NULL);
2073 
2074 			mutex_enter(&vqp->vq_lock);
2075 		}
2076 	}
2077 
2078 	/*
2079 	 * We are drained and signal we are done.
2080 	 */
2081 	vqp->vq_state = VSW_QUEUE_DRAINED;
2082 	cv_signal(&vqp->vq_cv);
2083 
2084 	/*
2085 	 * Exit lock and drain the remaining packets.
2086 	 */
2087 	mutex_exit(&vqp->vq_lock);
2088 
2089 	/*
2090 	 * Exit the thread
2091 	 */
2092 	thread_exit();
2093 }
2094 
2095 /*
2096  * static void
2097  * vsw_rx_queue_cb() - Receive callback routine when
2098  *	vsw_multi_ring_enable is non-zero.  Queue the packets
2099  *	to a packet queue for a worker thread to process.
2100  */
2101 static void
2102 vsw_rx_queue_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
2103 {
2104 	vsw_mac_ring_t	*ringp = (vsw_mac_ring_t *)mrh;
2105 	vsw_t		*vswp = (vsw_t *)arg;
2106 	vsw_queue_t	*vqp;
2107 	mblk_t		*bp, *last;
2108 
2109 	ASSERT(mrh != NULL);
2110 	ASSERT(vswp != NULL);
2111 	ASSERT(mp != NULL);
2112 
2113 	D1(vswp, "%s: enter", __func__);
2114 
2115 	/*
2116 	 * Find the last element in the mblk chain.
2117 	 */
2118 	bp = mp;
2119 	do {
2120 		last = bp;
2121 		bp = bp->b_next;
2122 	} while (bp != NULL);
2123 
2124 	/* Get the queue for the packets */
2125 	vqp = ringp->ring_vqp;
2126 
2127 	/*
2128 	 * Grab the lock such we can queue the packets.
2129 	 */
2130 	mutex_enter(&vqp->vq_lock);
2131 
2132 	if (vqp->vq_state != VSW_QUEUE_RUNNING) {
2133 		freemsg(mp);
2134 		mutex_exit(&vqp->vq_lock);
2135 		goto vsw_rx_queue_cb_exit;
2136 	}
2137 
2138 	/*
2139 	 * Add the mblk chain to the queue.  If there
2140 	 * is some mblks in the queue, then add the new
2141 	 * chain to the end.
2142 	 */
2143 	if (vqp->vq_first == NULL)
2144 		vqp->vq_first = mp;
2145 	else
2146 		vqp->vq_last->b_next = mp;
2147 
2148 	vqp->vq_last = last;
2149 
2150 	/*
2151 	 * Signal the worker thread that there is work to
2152 	 * do.
2153 	 */
2154 	cv_signal(&vqp->vq_cv);
2155 
2156 	/*
2157 	 * Let go of the lock and exit.
2158 	 */
2159 	mutex_exit(&vqp->vq_lock);
2160 
2161 vsw_rx_queue_cb_exit:
2162 	D1(vswp, "%s: exit", __func__);
2163 }
2164 
2165 /*
2166  * receive callback routine. Invoked by MAC layer when there
2167  * are pkts being passed up from physical device.
2168  *
2169  * PERF: It may be more efficient when the card is in promisc
2170  * mode to check the dest address of the pkts here (against
2171  * the FDB) rather than checking later. Needs to be investigated.
2172  */
2173 static void
2174 vsw_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
2175 {
2176 	_NOTE(ARGUNUSED(mrh))
2177 
2178 	vsw_t		*vswp = (vsw_t *)arg;
2179 
2180 	ASSERT(vswp != NULL);
2181 
2182 	D1(vswp, "vsw_rx_cb: enter");
2183 
2184 	/* switch the chain of packets received */
2185 	vswp->vsw_switch_frame(vswp, mp, VSW_PHYSDEV, NULL, NULL);
2186 
2187 	D1(vswp, "vsw_rx_cb: exit");
2188 }
2189 
2190 /*
2191  * Send a message out over the physical device via the MAC layer.
2192  *
2193  * Returns any mblks that it was unable to transmit.
2194  */
2195 static mblk_t *
2196 vsw_tx_msg(vsw_t *vswp, mblk_t *mp)
2197 {
2198 	const mac_txinfo_t	*mtp;
2199 	mblk_t			*nextp;
2200 
2201 	mutex_enter(&vswp->mac_lock);
2202 	if (vswp->mh == NULL) {
2203 		DERR(vswp, "vsw_tx_msg: dropping pkts: no tx routine avail");
2204 		mutex_exit(&vswp->mac_lock);
2205 		return (mp);
2206 	} else {
2207 		for (;;) {
2208 			nextp = mp->b_next;
2209 			mp->b_next = NULL;
2210 
2211 			mtp = vswp->txinfo;
2212 
2213 			if ((mp = mtp->mt_fn(mtp->mt_arg, mp)) != NULL) {
2214 				mp->b_next = nextp;
2215 				break;
2216 			}
2217 
2218 			if ((mp = nextp) == NULL)
2219 				break;
2220 		}
2221 	}
2222 	mutex_exit(&vswp->mac_lock);
2223 
2224 	return (mp);
2225 }
2226 
2227 /*
2228  * Register with the MAC layer as a network device, so we
2229  * can be plumbed if necessary.
2230  */
2231 static int
2232 vsw_mac_register(vsw_t *vswp)
2233 {
2234 	mac_register_t	*macp;
2235 	int		rv;
2236 
2237 	D1(vswp, "%s: enter", __func__);
2238 
2239 	if ((macp = mac_alloc(MAC_VERSION)) == NULL)
2240 		return (EINVAL);
2241 	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
2242 	macp->m_driver = vswp;
2243 	macp->m_dip = vswp->dip;
2244 	macp->m_src_addr = (uint8_t *)&vswp->if_addr;
2245 	macp->m_callbacks = &vsw_m_callbacks;
2246 	macp->m_min_sdu = 0;
2247 	macp->m_max_sdu = ETHERMTU;
2248 	rv = mac_register(macp, &vswp->if_mh);
2249 	mac_free(macp);
2250 	if (rv == 0)
2251 		vswp->if_state |= VSW_IF_REG;
2252 
2253 	D1(vswp, "%s: exit", __func__);
2254 
2255 	return (rv);
2256 }
2257 
2258 static int
2259 vsw_mac_unregister(vsw_t *vswp)
2260 {
2261 	int		rv = 0;
2262 
2263 	D1(vswp, "%s: enter", __func__);
2264 
2265 	WRITE_ENTER(&vswp->if_lockrw);
2266 
2267 	if (vswp->if_state & VSW_IF_REG) {
2268 		rv = mac_unregister(vswp->if_mh);
2269 		if (rv != 0) {
2270 			DWARN(vswp, "%s: unable to unregister from MAC "
2271 			    "framework", __func__);
2272 
2273 			RW_EXIT(&vswp->if_lockrw);
2274 			D1(vswp, "%s: fail exit", __func__);
2275 			return (rv);
2276 		}
2277 
2278 		/* mark i/f as down and unregistered */
2279 		vswp->if_state &= ~(VSW_IF_UP | VSW_IF_REG);
2280 	}
2281 	RW_EXIT(&vswp->if_lockrw);
2282 
2283 	D1(vswp, "%s: exit", __func__);
2284 
2285 	return (rv);
2286 }
2287 
2288 static int
2289 vsw_m_stat(void *arg, uint_t stat, uint64_t *val)
2290 {
2291 	vsw_t			*vswp = (vsw_t *)arg;
2292 
2293 	D1(vswp, "%s: enter", __func__);
2294 
2295 	mutex_enter(&vswp->mac_lock);
2296 	if (vswp->mh == NULL) {
2297 		mutex_exit(&vswp->mac_lock);
2298 		return (EINVAL);
2299 	}
2300 
2301 	/* return stats from underlying device */
2302 	*val = mac_stat_get(vswp->mh, stat);
2303 
2304 	mutex_exit(&vswp->mac_lock);
2305 
2306 	return (0);
2307 }
2308 
2309 static void
2310 vsw_m_stop(void *arg)
2311 {
2312 	vsw_t		*vswp = (vsw_t *)arg;
2313 
2314 	D1(vswp, "%s: enter", __func__);
2315 
2316 	WRITE_ENTER(&vswp->if_lockrw);
2317 	vswp->if_state &= ~VSW_IF_UP;
2318 	RW_EXIT(&vswp->if_lockrw);
2319 
2320 	mutex_enter(&vswp->hw_lock);
2321 
2322 	(void) vsw_unset_hw(vswp, NULL, VSW_LOCALDEV);
2323 
2324 	if (vswp->recfg_reqd)
2325 		vsw_reconfig_hw(vswp);
2326 
2327 	mutex_exit(&vswp->hw_lock);
2328 
2329 	D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state);
2330 }
2331 
2332 static int
2333 vsw_m_start(void *arg)
2334 {
2335 	vsw_t		*vswp = (vsw_t *)arg;
2336 
2337 	D1(vswp, "%s: enter", __func__);
2338 
2339 	WRITE_ENTER(&vswp->if_lockrw);
2340 	vswp->if_state |= VSW_IF_UP;
2341 	RW_EXIT(&vswp->if_lockrw);
2342 
2343 	mutex_enter(&vswp->hw_lock);
2344 	(void) vsw_set_hw(vswp, NULL, VSW_LOCALDEV);
2345 	mutex_exit(&vswp->hw_lock);
2346 
2347 	D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state);
2348 	return (0);
2349 }
2350 
2351 /*
2352  * Change the local interface address.
2353  *
2354  * Note: we don't support this entry point. The local
2355  * mac address of the switch can only be changed via its
2356  * MD node properties.
2357  */
2358 static int
2359 vsw_m_unicst(void *arg, const uint8_t *macaddr)
2360 {
2361 	_NOTE(ARGUNUSED(arg, macaddr))
2362 
2363 	return (DDI_FAILURE);
2364 }
2365 
2366 static int
2367 vsw_m_multicst(void *arg, boolean_t add, const uint8_t *mca)
2368 {
2369 	vsw_t		*vswp = (vsw_t *)arg;
2370 	mcst_addr_t	*mcst_p = NULL;
2371 	uint64_t	addr = 0x0;
2372 	int		i, ret = 0;
2373 
2374 	D1(vswp, "%s: enter", __func__);
2375 
2376 	/*
2377 	 * Convert address into form that can be used
2378 	 * as hash table key.
2379 	 */
2380 	for (i = 0; i < ETHERADDRL; i++) {
2381 		addr = (addr << 8) | mca[i];
2382 	}
2383 
2384 	D2(vswp, "%s: addr = 0x%llx", __func__, addr);
2385 
2386 	if (add) {
2387 		D2(vswp, "%s: adding multicast", __func__);
2388 		if (vsw_add_mcst(vswp, VSW_LOCALDEV, addr, NULL) == 0) {
2389 			/*
2390 			 * Update the list of multicast addresses
2391 			 * contained within the vsw_t structure to
2392 			 * include this new one.
2393 			 */
2394 			mcst_p = kmem_zalloc(sizeof (mcst_addr_t), KM_NOSLEEP);
2395 			if (mcst_p == NULL) {
2396 				DERR(vswp, "%s unable to alloc mem", __func__);
2397 				return (1);
2398 			}
2399 			mcst_p->addr = addr;
2400 
2401 			mutex_enter(&vswp->mca_lock);
2402 			mcst_p->nextp = vswp->mcap;
2403 			vswp->mcap = mcst_p;
2404 			mutex_exit(&vswp->mca_lock);
2405 
2406 			/*
2407 			 * Call into the underlying driver to program the
2408 			 * address into HW.
2409 			 */
2410 			mutex_enter(&vswp->mac_lock);
2411 			if (vswp->mh != NULL) {
2412 				ret = mac_multicst_add(vswp->mh, mca);
2413 				if (ret != 0) {
2414 					cmn_err(CE_WARN, "!vsw%d: unable to "
2415 					    "add multicast address",
2416 					    vswp->instance);
2417 					mutex_exit(&vswp->mac_lock);
2418 					goto vsw_remove_addr;
2419 				}
2420 			}
2421 			mutex_exit(&vswp->mac_lock);
2422 		} else {
2423 			cmn_err(CE_WARN, "!vsw%d: unable to add multicast "
2424 			    "address", vswp->instance);
2425 		}
2426 		return (ret);
2427 	}
2428 
2429 vsw_remove_addr:
2430 
2431 	D2(vswp, "%s: removing multicast", __func__);
2432 	/*
2433 	 * Remove the address from the hash table..
2434 	 */
2435 	if (vsw_del_mcst(vswp, VSW_LOCALDEV, addr, NULL) == 0) {
2436 
2437 		/*
2438 		 * ..and then from the list maintained in the
2439 		 * vsw_t structure.
2440 		 */
2441 		vsw_del_addr(VSW_LOCALDEV, vswp, addr);
2442 
2443 		mutex_enter(&vswp->mac_lock);
2444 		if (vswp->mh != NULL)
2445 			(void) mac_multicst_remove(vswp->mh, mca);
2446 		mutex_exit(&vswp->mac_lock);
2447 	}
2448 
2449 	D1(vswp, "%s: exit", __func__);
2450 
2451 	return (0);
2452 }
2453 
2454 static int
2455 vsw_m_promisc(void *arg, boolean_t on)
2456 {
2457 	vsw_t		*vswp = (vsw_t *)arg;
2458 
2459 	D1(vswp, "%s: enter", __func__);
2460 
2461 	WRITE_ENTER(&vswp->if_lockrw);
2462 	if (on)
2463 		vswp->if_state |= VSW_IF_PROMISC;
2464 	else
2465 		vswp->if_state &= ~VSW_IF_PROMISC;
2466 	RW_EXIT(&vswp->if_lockrw);
2467 
2468 	D1(vswp, "%s: exit", __func__);
2469 
2470 	return (0);
2471 }
2472 
2473 static mblk_t *
2474 vsw_m_tx(void *arg, mblk_t *mp)
2475 {
2476 	vsw_t		*vswp = (vsw_t *)arg;
2477 
2478 	D1(vswp, "%s: enter", __func__);
2479 
2480 	vswp->vsw_switch_frame(vswp, mp, VSW_LOCALDEV, NULL, NULL);
2481 
2482 	D1(vswp, "%s: exit", __func__);
2483 
2484 	return (NULL);
2485 }
2486 
2487 /*
2488  * Register for machine description (MD) updates.
2489  *
2490  * Returns 0 on success, 1 on failure.
2491  */
2492 static int
2493 vsw_mdeg_register(vsw_t *vswp)
2494 {
2495 	mdeg_prop_spec_t	*pspecp;
2496 	mdeg_node_spec_t	*inst_specp;
2497 	mdeg_handle_t		mdeg_hdl, mdeg_port_hdl;
2498 	size_t			templatesz;
2499 	int			inst, rv;
2500 
2501 	D1(vswp, "%s: enter", __func__);
2502 
2503 	/*
2504 	 * In each 'virtual-device' node in the MD there is a
2505 	 * 'cfg-handle' property which is the MD's concept of
2506 	 * an instance number (this may be completely different from
2507 	 * the device drivers instance #). OBP reads that value and
2508 	 * stores it in the 'reg' property of the appropriate node in
2509 	 * the device tree. So we use the 'reg' value when registering
2510 	 * with the mdeg framework, to ensure we get events for the
2511 	 * correct nodes.
2512 	 */
2513 	inst = ddi_prop_get_int(DDI_DEV_T_ANY, vswp->dip,
2514 	    DDI_PROP_DONTPASS, reg_propname, -1);
2515 	if (inst == -1) {
2516 		cmn_err(CE_WARN, "!vsw%d: Unable to read %s property from "
2517 		    "OBP device tree", vswp->instance, reg_propname);
2518 		return (1);
2519 	}
2520 
2521 	D2(vswp, "%s: instance %d registering with mdeg", __func__, inst);
2522 
2523 	/*
2524 	 * Allocate and initialize a per-instance copy
2525 	 * of the global property spec array that will
2526 	 * uniquely identify this vsw instance.
2527 	 */
2528 	templatesz = sizeof (vsw_prop_template);
2529 	pspecp = kmem_zalloc(templatesz, KM_SLEEP);
2530 
2531 	bcopy(vsw_prop_template, pspecp, templatesz);
2532 
2533 	VSW_SET_MDEG_PROP_INST(pspecp, inst);
2534 
2535 	/* initialize the complete prop spec structure */
2536 	inst_specp = kmem_zalloc(sizeof (mdeg_node_spec_t), KM_SLEEP);
2537 	inst_specp->namep = "virtual-device";
2538 	inst_specp->specp = pspecp;
2539 
2540 	/*
2541 	 * Register an interest in 'virtual-device' nodes with a
2542 	 * 'name' property of 'virtual-network-switch'
2543 	 */
2544 	rv = mdeg_register(inst_specp, &vdev_match, vsw_mdeg_cb,
2545 	    (void *)vswp, &mdeg_hdl);
2546 	if (rv != MDEG_SUCCESS) {
2547 		DERR(vswp, "%s: mdeg_register failed (%d) for vsw node",
2548 		    __func__, rv);
2549 		goto mdeg_reg_fail;
2550 	}
2551 
2552 	/*
2553 	 * Register an interest in 'vsw-port' nodes.
2554 	 */
2555 	rv = mdeg_register(inst_specp, &vport_match, vsw_port_mdeg_cb,
2556 	    (void *)vswp, &mdeg_port_hdl);
2557 	if (rv != MDEG_SUCCESS) {
2558 		DERR(vswp, "%s: mdeg_register failed (%d)\n", __func__, rv);
2559 		(void) mdeg_unregister(mdeg_hdl);
2560 		goto mdeg_reg_fail;
2561 	}
2562 
2563 	/* save off data that will be needed later */
2564 	vswp->inst_spec = inst_specp;
2565 	vswp->mdeg_hdl = mdeg_hdl;
2566 	vswp->mdeg_port_hdl = mdeg_port_hdl;
2567 
2568 	D1(vswp, "%s: exit", __func__);
2569 	return (0);
2570 
2571 mdeg_reg_fail:
2572 	cmn_err(CE_WARN, "!vsw%d: Unable to register MDEG callbacks",
2573 	    vswp->instance);
2574 	kmem_free(pspecp, templatesz);
2575 	kmem_free(inst_specp, sizeof (mdeg_node_spec_t));
2576 
2577 	vswp->mdeg_hdl = NULL;
2578 	vswp->mdeg_port_hdl = NULL;
2579 
2580 	return (1);
2581 }
2582 
2583 static void
2584 vsw_mdeg_unregister(vsw_t *vswp)
2585 {
2586 	D1(vswp, "vsw_mdeg_unregister: enter");
2587 
2588 	if (vswp->mdeg_hdl != NULL)
2589 		(void) mdeg_unregister(vswp->mdeg_hdl);
2590 
2591 	if (vswp->mdeg_port_hdl != NULL)
2592 		(void) mdeg_unregister(vswp->mdeg_port_hdl);
2593 
2594 	if (vswp->inst_spec != NULL) {
2595 		if (vswp->inst_spec->specp != NULL) {
2596 			(void) kmem_free(vswp->inst_spec->specp,
2597 			    sizeof (vsw_prop_template));
2598 			vswp->inst_spec->specp = NULL;
2599 		}
2600 
2601 		(void) kmem_free(vswp->inst_spec, sizeof (mdeg_node_spec_t));
2602 		vswp->inst_spec = NULL;
2603 	}
2604 
2605 	D1(vswp, "vsw_mdeg_unregister: exit");
2606 }
2607 
2608 /*
2609  * Mdeg callback invoked for the vsw node itself.
2610  */
2611 static int
2612 vsw_mdeg_cb(void *cb_argp, mdeg_result_t *resp)
2613 {
2614 	vsw_t		*vswp;
2615 	int		idx;
2616 	md_t		*mdp;
2617 	mde_cookie_t	node;
2618 	uint64_t	inst;
2619 	char		*node_name = NULL;
2620 
2621 	if (resp == NULL)
2622 		return (MDEG_FAILURE);
2623 
2624 	vswp = (vsw_t *)cb_argp;
2625 
2626 	D1(vswp, "%s: added %d : removed %d : curr matched %d"
2627 	    " : prev matched %d", __func__, resp->added.nelem,
2628 	    resp->removed.nelem, resp->match_curr.nelem,
2629 	    resp->match_prev.nelem);
2630 
2631 	/*
2632 	 * Expect 'added' to be non-zero if virtual-network-switch
2633 	 * nodes exist in the MD when the driver attaches.
2634 	 */
2635 	for (idx = 0; idx < resp->added.nelem; idx++) {
2636 		mdp = resp->added.mdp;
2637 		node = resp->added.mdep[idx];
2638 
2639 		if (md_get_prop_str(mdp, node, "name", &node_name) != 0) {
2640 			DERR(vswp, "%s: unable to get node name for "
2641 			    "node(%d) 0x%lx", __func__, idx, node);
2642 			continue;
2643 		}
2644 
2645 		if (md_get_prop_val(mdp, node, "cfg-handle", &inst)) {
2646 			DERR(vswp, "%s: prop(cfg-handle) not found port(%d)",
2647 			    __func__, idx);
2648 			continue;
2649 		}
2650 
2651 		D2(vswp, "%s: added node(%d) 0x%lx with name %s "
2652 		    "and inst %d", __func__, idx, node, node_name, inst);
2653 
2654 		vsw_get_initial_md_properties(vswp, mdp, node);
2655 	}
2656 
2657 	/*
2658 	 * A non-zero 'match' value indicates that the MD has been
2659 	 * updated and that a virtual-network-switch node is present
2660 	 * which may or may not have been updated. It is up to the clients
2661 	 * to examine their own nodes and determine if they have changed.
2662 	 */
2663 	for (idx = 0; idx < resp->match_curr.nelem; idx++) {
2664 		mdp = resp->match_curr.mdp;
2665 		node = resp->match_curr.mdep[idx];
2666 
2667 		if (md_get_prop_str(mdp, node, "name", &node_name) != 0) {
2668 			DERR(vswp, "%s: unable to get node name for "
2669 			    "node(%d) 0x%lx", __func__, idx, node);
2670 			continue;
2671 		}
2672 
2673 		if (md_get_prop_val(mdp, node, "cfg-handle", &inst)) {
2674 			DERR(vswp, "%s: prop(cfg-handle) not found port(%d)",
2675 			    __func__, idx);
2676 			continue;
2677 		}
2678 
2679 		D2(vswp, "%s: changed node(%d) 0x%lx with name %s "
2680 		    "and inst %d", __func__, idx, node, node_name, inst);
2681 
2682 		vsw_update_md_prop(vswp, mdp, node);
2683 	}
2684 
2685 	return (MDEG_SUCCESS);
2686 }
2687 
2688 /*
2689  * Mdeg callback invoked for changes to the vsw-port nodes
2690  * under the vsw node.
2691  */
2692 static int
2693 vsw_port_mdeg_cb(void *cb_argp, mdeg_result_t *resp)
2694 {
2695 	vsw_t		*vswp;
2696 	int		idx;
2697 	md_t		*mdp;
2698 	mde_cookie_t	node;
2699 	uint64_t	inst;
2700 
2701 	if ((resp == NULL) || (cb_argp == NULL))
2702 		return (MDEG_FAILURE);
2703 
2704 	vswp = (vsw_t *)cb_argp;
2705 
2706 	D2(vswp, "%s: added %d : removed %d : curr matched %d"
2707 	    " : prev matched %d", __func__, resp->added.nelem,
2708 	    resp->removed.nelem, resp->match_curr.nelem,
2709 	    resp->match_prev.nelem);
2710 
2711 	/* process added ports */
2712 	for (idx = 0; idx < resp->added.nelem; idx++) {
2713 		mdp = resp->added.mdp;
2714 		node = resp->added.mdep[idx];
2715 
2716 		D2(vswp, "%s: adding node(%d) 0x%lx", __func__, idx, node);
2717 
2718 		if (vsw_port_add(vswp, mdp, &node) != 0) {
2719 			cmn_err(CE_WARN, "!vsw%d: Unable to add new port "
2720 			    "(0x%lx)", vswp->instance, node);
2721 		}
2722 	}
2723 
2724 	/* process removed ports */
2725 	for (idx = 0; idx < resp->removed.nelem; idx++) {
2726 		mdp = resp->removed.mdp;
2727 		node = resp->removed.mdep[idx];
2728 
2729 		if (md_get_prop_val(mdp, node, id_propname, &inst)) {
2730 			DERR(vswp, "%s: prop(%s) not found in port(%d)",
2731 			    __func__, id_propname, idx);
2732 			continue;
2733 		}
2734 
2735 		D2(vswp, "%s: removing node(%d) 0x%lx", __func__, idx, node);
2736 
2737 		if (vsw_port_detach(vswp, inst) != 0) {
2738 			cmn_err(CE_WARN, "!vsw%d: Unable to remove port %ld",
2739 			    vswp->instance, inst);
2740 		}
2741 	}
2742 
2743 	/*
2744 	 * Currently no support for updating already active ports.
2745 	 * So, ignore the match_curr and match_priv arrays for now.
2746 	 */
2747 
2748 	D1(vswp, "%s: exit", __func__);
2749 
2750 	return (MDEG_SUCCESS);
2751 }
2752 
2753 /*
2754  * Read the initial start-of-day values from the specified MD node.
2755  */
2756 static void
2757 vsw_get_initial_md_properties(vsw_t *vswp, md_t *mdp, mde_cookie_t node)
2758 {
2759 	int		i;
2760 	uint64_t 	macaddr = 0;
2761 
2762 	D1(vswp, "%s: enter", __func__);
2763 
2764 	if (vsw_get_md_physname(vswp, mdp, node, vswp->physname) == 0) {
2765 		/*
2766 		 * Note it is valid for the physname property to
2767 		 * be NULL so check actual name length to determine
2768 		 * if we have a actual device name.
2769 		 */
2770 		if (strlen(vswp->physname) > 0)
2771 			vswp->mdprops |= VSW_MD_PHYSNAME;
2772 	} else {
2773 		cmn_err(CE_WARN, "!vsw%d: Unable to read name of physical "
2774 		    "device from MD", vswp->instance);
2775 		return;
2776 	}
2777 
2778 	/* mac address for vswitch device itself */
2779 	if (md_get_prop_val(mdp, node, macaddr_propname, &macaddr) != 0) {
2780 		cmn_err(CE_WARN, "!vsw%d: Unable to get MAC address from MD",
2781 		    vswp->instance);
2782 
2783 		/*
2784 		 * Fallback to using the mac address of the physical
2785 		 * device.
2786 		 */
2787 		if (vsw_get_physaddr(vswp) == 0) {
2788 			cmn_err(CE_NOTE, "!vsw%d: Using MAC address from "
2789 			    "physical device (%s)", vswp->instance,
2790 			    vswp->physname);
2791 		} else {
2792 			cmn_err(CE_WARN, "!vsw%d: Unable to get MAC address"
2793 			    "from device %s", vswp->instance, vswp->physname);
2794 		}
2795 	} else {
2796 		WRITE_ENTER(&vswp->if_lockrw);
2797 		for (i = ETHERADDRL - 1; i >= 0; i--) {
2798 			vswp->if_addr.ether_addr_octet[i] = macaddr & 0xFF;
2799 			macaddr >>= 8;
2800 		}
2801 		RW_EXIT(&vswp->if_lockrw);
2802 		vswp->mdprops |= VSW_MD_MACADDR;
2803 	}
2804 
2805 	if (vsw_get_md_smodes(vswp, mdp, node, vswp->smode, &vswp->smode_num)) {
2806 		cmn_err(CE_WARN, "vsw%d: Unable to read %s property from "
2807 		    "MD, defaulting to programmed mode", vswp->instance,
2808 		    smode_propname);
2809 
2810 		for (i = 0; i < NUM_SMODES; i++)
2811 			vswp->smode[i] = VSW_LAYER2;
2812 
2813 		vswp->smode_num = NUM_SMODES;
2814 	} else {
2815 		ASSERT(vswp->smode_num != 0);
2816 		vswp->mdprops |= VSW_MD_SMODE;
2817 	}
2818 
2819 	/*
2820 	 * Unable to setup any switching mode, nothing more
2821 	 * we can do.
2822 	 */
2823 	if (vsw_setup_switching(vswp))
2824 		return;
2825 
2826 	WRITE_ENTER(&vswp->if_lockrw);
2827 	vswp->if_state &= ~VSW_IF_UP;
2828 	RW_EXIT(&vswp->if_lockrw);
2829 	if (vswp->mdprops & (VSW_MD_MACADDR | VSW_DEV_MACADDR)) {
2830 		if (vsw_mac_register(vswp) != 0) {
2831 			/*
2832 			 * Treat this as a non-fatal error as we may be
2833 			 * able to operate in some other mode.
2834 			 */
2835 			cmn_err(CE_WARN, "vsw%d: Unable to register as "
2836 			    "provider with MAC layer", vswp->instance);
2837 		}
2838 	}
2839 
2840 	D1(vswp, "%s: exit", __func__);
2841 }
2842 
2843 /*
2844  * Check to see if the relevant properties in the specified node have
2845  * changed, and if so take the appropriate action.
2846  *
2847  * If any of the properties are missing or invalid we don't take
2848  * any action, as this function should only be invoked when modifications
2849  * have been made to what we assume is a working configuration, which
2850  * we leave active.
2851  *
2852  * Note it is legal for this routine to be invoked even if none of the
2853  * properties in the port node within the MD have actually changed.
2854  */
2855 static void
2856 vsw_update_md_prop(vsw_t *vswp, md_t *mdp, mde_cookie_t node)
2857 {
2858 	char		physname[LIFNAMSIZ];
2859 	char		drv[LIFNAMSIZ];
2860 	uint_t		ddi_instance;
2861 	uint8_t		new_smode[NUM_SMODES];
2862 	int		i, smode_num = 0;
2863 	uint64_t 	macaddr = 0;
2864 	vsw_port_list_t *plist = &vswp->plist;
2865 	vsw_port_t	*port = NULL;
2866 	enum		{MD_init = 0x1,
2867 				MD_physname = 0x2,
2868 				MD_macaddr = 0x4,
2869 				MD_smode = 0x8} updated;
2870 
2871 	updated = MD_init;
2872 
2873 	D1(vswp, "%s: enter", __func__);
2874 
2875 	/*
2876 	 * Check if name of physical device in MD has changed.
2877 	 */
2878 	if (vsw_get_md_physname(vswp, mdp, node, (char *)&physname) == 0) {
2879 		/*
2880 		 * Do basic sanity check on new device name/instance,
2881 		 * if its non NULL. It is valid for the device name to
2882 		 * have changed from a non NULL to a NULL value, i.e.
2883 		 * the vsw is being changed to 'routed' mode.
2884 		 */
2885 		if ((strlen(physname) != 0) &&
2886 		    (ddi_parse(physname, drv, &ddi_instance) != DDI_SUCCESS)) {
2887 			cmn_err(CE_WARN, "!vsw%d: new device name %s is not"
2888 			    " a valid device name/instance",
2889 			    vswp->instance, physname);
2890 			goto fail_reconf;
2891 		}
2892 
2893 		if (strcmp(physname, vswp->physname)) {
2894 			D2(vswp, "%s: device name changed from %s to %s",
2895 			    __func__, vswp->physname, physname);
2896 
2897 			updated |= MD_physname;
2898 		} else {
2899 			D2(vswp, "%s: device name unchanged at %s",
2900 			    __func__, vswp->physname);
2901 		}
2902 	} else {
2903 		cmn_err(CE_WARN, "!vsw%d: Unable to read name of physical "
2904 		    "device from updated MD.", vswp->instance);
2905 		goto fail_reconf;
2906 	}
2907 
2908 	/*
2909 	 * Check if MAC address has changed.
2910 	 */
2911 	if (md_get_prop_val(mdp, node, macaddr_propname, &macaddr) != 0) {
2912 		cmn_err(CE_WARN, "!vsw%d: Unable to get MAC address from MD",
2913 		    vswp->instance);
2914 		goto fail_reconf;
2915 	} else {
2916 		READ_ENTER(&vswp->if_lockrw);
2917 		for (i = ETHERADDRL - 1; i >= 0; i--) {
2918 			if (vswp->if_addr.ether_addr_octet[i] !=
2919 			    (macaddr & 0xFF)) {
2920 				D2(vswp, "%s: octet[%d] 0x%x != 0x%x",
2921 				    __func__, i,
2922 				    vswp->if_addr.ether_addr_octet[i],
2923 				    (macaddr & 0xFF));
2924 				updated |= MD_macaddr;
2925 				break;
2926 			}
2927 			macaddr >>= 8;
2928 		}
2929 		RW_EXIT(&vswp->if_lockrw);
2930 	}
2931 
2932 	/*
2933 	 * Check if switching modes have changed.
2934 	 */
2935 	if (vsw_get_md_smodes(vswp, mdp, node, new_smode, &smode_num)) {
2936 		cmn_err(CE_WARN, "!vsw%d: Unable to read %s property from MD",
2937 		    vswp->instance, smode_propname);
2938 		goto fail_reconf;
2939 	} else {
2940 		ASSERT(smode_num != 0);
2941 		if (smode_num != vswp->smode_num) {
2942 			D2(vswp, "%s: number of modes changed from %d to %d",
2943 			    __func__, vswp->smode_num, smode_num);
2944 		}
2945 
2946 		for (i = 0; i < smode_num; i++) {
2947 			if (new_smode[i] != vswp->smode[i]) {
2948 				D2(vswp, "%s: mode changed from %d to %d",
2949 				    __func__, vswp->smode[i], new_smode[i]);
2950 				updated |= MD_smode;
2951 				break;
2952 			}
2953 		}
2954 	}
2955 
2956 	/*
2957 	 * Now make any changes which are needed...
2958 	 */
2959 
2960 	if (updated & (MD_physname | MD_smode)) {
2961 		/*
2962 		 * Disconnect all ports from the current card
2963 		 */
2964 		WRITE_ENTER(&plist->lockrw);
2965 		for (port = plist->head; port != NULL; port = port->p_next) {
2966 			/* Remove address if was programmed into HW. */
2967 			mutex_enter(&vswp->hw_lock);
2968 			if (vsw_unset_hw(vswp, port, VSW_VNETPORT)) {
2969 				mutex_exit(&vswp->hw_lock);
2970 				RW_EXIT(&plist->lockrw);
2971 				goto fail_update;
2972 			}
2973 			mutex_exit(&vswp->hw_lock);
2974 		}
2975 		RW_EXIT(&plist->lockrw);
2976 
2977 		/*
2978 		 * Stop, detach the old device..
2979 		 */
2980 		vsw_mac_detach(vswp);
2981 
2982 		/*
2983 		 * Update phys name.
2984 		 */
2985 		if (updated & MD_physname) {
2986 			cmn_err(CE_NOTE, "!vsw%d: changing from %s to %s",
2987 			    vswp->instance, vswp->physname, physname);
2988 			(void) strncpy(vswp->physname,
2989 			    physname, strlen(physname) + 1);
2990 
2991 			if (strlen(vswp->physname) > 0)
2992 				vswp->mdprops |= VSW_MD_PHYSNAME;
2993 		}
2994 
2995 		/*
2996 		 * Update array with the new switch mode values.
2997 		 */
2998 		if (updated & MD_smode) {
2999 			for (i = 0; i < smode_num; i++)
3000 				vswp->smode[i] = new_smode[i];
3001 
3002 			vswp->smode_num = smode_num;
3003 			vswp->smode_idx = 0;
3004 		}
3005 
3006 		/*
3007 		 * ..and attach, start the new device.
3008 		 */
3009 		if (vsw_setup_switching(vswp))
3010 			goto fail_update;
3011 
3012 		/*
3013 		 * Connect ports to new card.
3014 		 */
3015 		WRITE_ENTER(&plist->lockrw);
3016 		for (port = plist->head; port != NULL; port = port->p_next) {
3017 			mutex_enter(&vswp->hw_lock);
3018 			if (vsw_set_hw(vswp, port, VSW_VNETPORT)) {
3019 				mutex_exit(&vswp->hw_lock);
3020 				RW_EXIT(&plist->lockrw);
3021 				goto fail_update;
3022 			}
3023 			mutex_exit(&vswp->hw_lock);
3024 		}
3025 		RW_EXIT(&plist->lockrw);
3026 	}
3027 
3028 	if (updated & MD_macaddr) {
3029 		cmn_err(CE_NOTE, "!vsw%d: changing mac address to 0x%lx",
3030 		    vswp->instance, macaddr);
3031 
3032 		WRITE_ENTER(&vswp->if_lockrw);
3033 		for (i = ETHERADDRL - 1; i >= 0; i--) {
3034 			vswp->if_addr.ether_addr_octet[i] = macaddr & 0xFF;
3035 			macaddr >>= 8;
3036 		}
3037 		RW_EXIT(&vswp->if_lockrw);
3038 
3039 		/*
3040 		 * Remove old address from HW (if programmed) and set
3041 		 * new address.
3042 		 */
3043 		mutex_enter(&vswp->hw_lock);
3044 		(void) vsw_unset_hw(vswp, NULL, VSW_LOCALDEV);
3045 		(void) vsw_set_hw(vswp, NULL, VSW_LOCALDEV);
3046 		mutex_exit(&vswp->hw_lock);
3047 
3048 		/*
3049 		 * Notify the MAC layer of the changed address.
3050 		 */
3051 		mac_unicst_update(vswp->if_mh, (uint8_t *)&vswp->if_addr);
3052 	}
3053 
3054 	return;
3055 
3056 fail_reconf:
3057 	cmn_err(CE_WARN, "!vsw%d: configuration unchanged", vswp->instance);
3058 	return;
3059 
3060 fail_update:
3061 	cmn_err(CE_WARN, "!vsw%d: update of configuration failed",
3062 	    vswp->instance);
3063 }
3064 
3065 /*
3066  * Add a new port to the system.
3067  *
3068  * Returns 0 on success, 1 on failure.
3069  */
3070 int
3071 vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node)
3072 {
3073 	uint64_t		ldc_id;
3074 	uint8_t			*addrp;
3075 	int			i, addrsz;
3076 	int			num_nodes = 0, nchan = 0;
3077 	int			listsz = 0;
3078 	mde_cookie_t		*listp = NULL;
3079 	struct ether_addr	ea;
3080 	uint64_t		macaddr;
3081 	uint64_t		inst = 0;
3082 	vsw_port_t		*port;
3083 
3084 	if (md_get_prop_val(mdp, *node, id_propname, &inst)) {
3085 		DWARN(vswp, "%s: prop(%s) not found", __func__,
3086 		    id_propname);
3087 		return (1);
3088 	}
3089 
3090 	/*
3091 	 * Find the channel endpoint node(s) (which should be under this
3092 	 * port node) which contain the channel id(s).
3093 	 */
3094 	if ((num_nodes = md_node_count(mdp)) <= 0) {
3095 		DERR(vswp, "%s: invalid number of nodes found (%d)",
3096 		    __func__, num_nodes);
3097 		return (1);
3098 	}
3099 
3100 	D2(vswp, "%s: %d nodes found", __func__, num_nodes);
3101 
3102 	/* allocate enough space for node list */
3103 	listsz = num_nodes * sizeof (mde_cookie_t);
3104 	listp = kmem_zalloc(listsz, KM_SLEEP);
3105 
3106 	nchan = md_scan_dag(mdp, *node, md_find_name(mdp, chan_propname),
3107 	    md_find_name(mdp, "fwd"), listp);
3108 
3109 	if (nchan <= 0) {
3110 		DWARN(vswp, "%s: no %s nodes found", __func__, chan_propname);
3111 		kmem_free(listp, listsz);
3112 		return (1);
3113 	}
3114 
3115 	D2(vswp, "%s: %d %s nodes found", __func__, nchan, chan_propname);
3116 
3117 	/* use property from first node found */
3118 	if (md_get_prop_val(mdp, listp[0], id_propname, &ldc_id)) {
3119 		DWARN(vswp, "%s: prop(%s) not found\n", __func__,
3120 		    id_propname);
3121 		kmem_free(listp, listsz);
3122 		return (1);
3123 	}
3124 
3125 	/* don't need list any more */
3126 	kmem_free(listp, listsz);
3127 
3128 	D2(vswp, "%s: ldc_id 0x%llx", __func__, ldc_id);
3129 
3130 	/* read mac-address property */
3131 	if (md_get_prop_data(mdp, *node, remaddr_propname,
3132 	    &addrp, &addrsz)) {
3133 		DWARN(vswp, "%s: prop(%s) not found",
3134 		    __func__, remaddr_propname);
3135 		return (1);
3136 	}
3137 
3138 	if (addrsz < ETHERADDRL) {
3139 		DWARN(vswp, "%s: invalid address size", __func__);
3140 		return (1);
3141 	}
3142 
3143 	macaddr = *((uint64_t *)addrp);
3144 	D2(vswp, "%s: remote mac address 0x%llx", __func__, macaddr);
3145 
3146 	for (i = ETHERADDRL - 1; i >= 0; i--) {
3147 		ea.ether_addr_octet[i] = macaddr & 0xFF;
3148 		macaddr >>= 8;
3149 	}
3150 
3151 	if (vsw_port_attach(vswp, (int)inst, &ldc_id, 1, &ea) != 0) {
3152 		DERR(vswp, "%s: failed to attach port", __func__);
3153 		return (1);
3154 	}
3155 
3156 	port = vsw_lookup_port(vswp, (int)inst);
3157 
3158 	/* just successfuly created the port, so it should exist */
3159 	ASSERT(port != NULL);
3160 
3161 	return (0);
3162 }
3163 
3164 /*
3165  * Attach the specified port.
3166  *
3167  * Returns 0 on success, 1 on failure.
3168  */
3169 static int
3170 vsw_port_attach(vsw_t *vswp, int p_instance, uint64_t *ldcids, int nids,
3171 struct ether_addr *macaddr)
3172 {
3173 	vsw_port_list_t		*plist = &vswp->plist;
3174 	vsw_port_t		*port, **prev_port;
3175 	int			i;
3176 
3177 	D1(vswp, "%s: enter : port %d", __func__, p_instance);
3178 
3179 	/* port already exists? */
3180 	READ_ENTER(&plist->lockrw);
3181 	for (port = plist->head; port != NULL; port = port->p_next) {
3182 		if (port->p_instance == p_instance) {
3183 			DWARN(vswp, "%s: port instance %d already attached",
3184 			    __func__, p_instance);
3185 			RW_EXIT(&plist->lockrw);
3186 			return (1);
3187 		}
3188 	}
3189 	RW_EXIT(&plist->lockrw);
3190 
3191 	port = kmem_zalloc(sizeof (vsw_port_t), KM_SLEEP);
3192 	port->p_vswp = vswp;
3193 	port->p_instance = p_instance;
3194 	port->p_ldclist.num_ldcs = 0;
3195 	port->p_ldclist.head = NULL;
3196 	port->addr_set = VSW_ADDR_UNSET;
3197 
3198 	rw_init(&port->p_ldclist.lockrw, NULL, RW_DRIVER, NULL);
3199 
3200 	mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL);
3201 	mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL);
3202 
3203 	mutex_init(&port->ref_lock, NULL, MUTEX_DRIVER, NULL);
3204 	cv_init(&port->ref_cv, NULL, CV_DRIVER, NULL);
3205 
3206 	mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL);
3207 	cv_init(&port->state_cv, NULL, CV_DRIVER, NULL);
3208 	port->state = VSW_PORT_INIT;
3209 
3210 	if (nids > VSW_PORT_MAX_LDCS) {
3211 		D2(vswp, "%s: using first of %d ldc ids", __func__, nids);
3212 		nids = VSW_PORT_MAX_LDCS;
3213 	}
3214 
3215 	D2(vswp, "%s: %d nids", __func__, nids);
3216 	for (i = 0; i < nids; i++) {
3217 		D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[i]);
3218 		if (vsw_ldc_attach(port, (uint64_t)ldcids[i]) != 0) {
3219 			DERR(vswp, "%s: ldc_attach failed", __func__);
3220 
3221 			rw_destroy(&port->p_ldclist.lockrw);
3222 
3223 			cv_destroy(&port->ref_cv);
3224 			mutex_destroy(&port->ref_lock);
3225 
3226 			cv_destroy(&port->state_cv);
3227 			mutex_destroy(&port->state_lock);
3228 
3229 			mutex_destroy(&port->tx_lock);
3230 			mutex_destroy(&port->mca_lock);
3231 			kmem_free(port, sizeof (vsw_port_t));
3232 			return (1);
3233 		}
3234 	}
3235 
3236 	ether_copy(macaddr, &port->p_macaddr);
3237 
3238 	WRITE_ENTER(&plist->lockrw);
3239 
3240 	/* create the fdb entry for this port/mac address */
3241 	(void) vsw_add_fdb(vswp, port);
3242 
3243 	mutex_enter(&vswp->hw_lock);
3244 	(void) vsw_set_hw(vswp, port, VSW_VNETPORT);
3245 	mutex_exit(&vswp->hw_lock);
3246 
3247 	/* link it into the list of ports for this vsw instance */
3248 	prev_port = (vsw_port_t **)(&plist->head);
3249 	port->p_next = *prev_port;
3250 	*prev_port = port;
3251 	plist->num_ports++;
3252 	RW_EXIT(&plist->lockrw);
3253 
3254 	/*
3255 	 * Initialise the port and any ldc's under it.
3256 	 */
3257 	(void) vsw_init_ldcs(port);
3258 
3259 	D1(vswp, "%s: exit", __func__);
3260 	return (0);
3261 }
3262 
3263 /*
3264  * Detach the specified port.
3265  *
3266  * Returns 0 on success, 1 on failure.
3267  */
3268 static int
3269 vsw_port_detach(vsw_t *vswp, int p_instance)
3270 {
3271 	vsw_port_t	*port = NULL;
3272 	vsw_port_list_t	*plist = &vswp->plist;
3273 
3274 	D1(vswp, "%s: enter: port id %d", __func__, p_instance);
3275 
3276 	WRITE_ENTER(&plist->lockrw);
3277 
3278 	if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) {
3279 		RW_EXIT(&plist->lockrw);
3280 		return (1);
3281 	}
3282 
3283 	if (vsw_plist_del_node(vswp, port)) {
3284 		RW_EXIT(&plist->lockrw);
3285 		return (1);
3286 	}
3287 
3288 	/* Remove the fdb entry for this port/mac address */
3289 	(void) vsw_del_fdb(vswp, port);
3290 
3291 	/* Remove any multicast addresses.. */
3292 	vsw_del_mcst_port(port);
3293 
3294 	/*
3295 	 * No longer need to hold writer lock on port list now
3296 	 * that we have unlinked the target port from the list.
3297 	 */
3298 	RW_EXIT(&plist->lockrw);
3299 
3300 	/* Remove address if was programmed into HW. */
3301 	mutex_enter(&vswp->hw_lock);
3302 	(void) vsw_unset_hw(vswp, port, VSW_VNETPORT);
3303 	if (vswp->recfg_reqd)
3304 		vsw_reconfig_hw(vswp);
3305 	mutex_exit(&vswp->hw_lock);
3306 
3307 	if (vsw_port_delete(port)) {
3308 		return (1);
3309 	}
3310 
3311 	D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance);
3312 	return (0);
3313 }
3314 
3315 /*
3316  * Detach all active ports.
3317  *
3318  * Returns 0 on success, 1 on failure.
3319  */
3320 static int
3321 vsw_detach_ports(vsw_t *vswp)
3322 {
3323 	vsw_port_list_t 	*plist = &vswp->plist;
3324 	vsw_port_t		*port = NULL;
3325 
3326 	D1(vswp, "%s: enter", __func__);
3327 
3328 	WRITE_ENTER(&plist->lockrw);
3329 
3330 	while ((port = plist->head) != NULL) {
3331 		if (vsw_plist_del_node(vswp, port)) {
3332 			DERR(vswp, "%s: Error deleting port %d"
3333 			    " from port list", __func__, port->p_instance);
3334 			RW_EXIT(&plist->lockrw);
3335 			return (1);
3336 		}
3337 
3338 		/* Remove address if was programmed into HW. */
3339 		mutex_enter(&vswp->hw_lock);
3340 		(void) vsw_unset_hw(vswp, port, VSW_VNETPORT);
3341 		mutex_exit(&vswp->hw_lock);
3342 
3343 		/* Remove the fdb entry for this port/mac address */
3344 		(void) vsw_del_fdb(vswp, port);
3345 
3346 		/* Remove any multicast addresses.. */
3347 		vsw_del_mcst_port(port);
3348 
3349 		/*
3350 		 * No longer need to hold the lock on the port list
3351 		 * now that we have unlinked the target port from the
3352 		 * list.
3353 		 */
3354 		RW_EXIT(&plist->lockrw);
3355 		if (vsw_port_delete(port)) {
3356 			DERR(vswp, "%s: Error deleting port %d",
3357 			    __func__, port->p_instance);
3358 			return (1);
3359 		}
3360 		WRITE_ENTER(&plist->lockrw);
3361 	}
3362 	RW_EXIT(&plist->lockrw);
3363 
3364 	D1(vswp, "%s: exit", __func__);
3365 
3366 	return (0);
3367 }
3368 
3369 /*
3370  * Delete the specified port.
3371  *
3372  * Returns 0 on success, 1 on failure.
3373  */
3374 static int
3375 vsw_port_delete(vsw_port_t *port)
3376 {
3377 	vsw_ldc_list_t 		*ldcl;
3378 	vsw_t			*vswp = port->p_vswp;
3379 
3380 	D1(vswp, "%s: enter : port id %d", __func__, port->p_instance);
3381 
3382 	(void) vsw_uninit_ldcs(port);
3383 
3384 	/*
3385 	 * Wait for any pending ctrl msg tasks which reference this
3386 	 * port to finish.
3387 	 */
3388 	if (vsw_drain_port_taskq(port))
3389 		return (1);
3390 
3391 	/*
3392 	 * Wait for port reference count to hit zero.
3393 	 */
3394 	mutex_enter(&port->ref_lock);
3395 	while (port->ref_cnt != 0)
3396 		cv_wait(&port->ref_cv, &port->ref_lock);
3397 	mutex_exit(&port->ref_lock);
3398 
3399 	/*
3400 	 * Wait for any active callbacks to finish
3401 	 */
3402 	if (vsw_drain_ldcs(port))
3403 		return (1);
3404 
3405 	ldcl = &port->p_ldclist;
3406 	WRITE_ENTER(&ldcl->lockrw);
3407 	while (ldcl->num_ldcs > 0) {
3408 		if (vsw_ldc_detach(port, ldcl->head->ldc_id) != 0) {
3409 			cmn_err(CE_WARN, "!vsw%d: unable to detach ldc %ld",
3410 			    vswp->instance, ldcl->head->ldc_id);
3411 			RW_EXIT(&ldcl->lockrw);
3412 			return (1);
3413 		}
3414 	}
3415 	RW_EXIT(&ldcl->lockrw);
3416 
3417 	rw_destroy(&port->p_ldclist.lockrw);
3418 
3419 	mutex_destroy(&port->mca_lock);
3420 	mutex_destroy(&port->tx_lock);
3421 	cv_destroy(&port->ref_cv);
3422 	mutex_destroy(&port->ref_lock);
3423 
3424 	cv_destroy(&port->state_cv);
3425 	mutex_destroy(&port->state_lock);
3426 
3427 	kmem_free(port, sizeof (vsw_port_t));
3428 
3429 	D1(vswp, "%s: exit", __func__);
3430 
3431 	return (0);
3432 }
3433 
3434 /*
3435  * Attach a logical domain channel (ldc) under a specified port.
3436  *
3437  * Returns 0 on success, 1 on failure.
3438  */
3439 static int
3440 vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id)
3441 {
3442 	vsw_t 		*vswp = port->p_vswp;
3443 	vsw_ldc_list_t *ldcl = &port->p_ldclist;
3444 	vsw_ldc_t 	*ldcp = NULL;
3445 	ldc_attr_t 	attr;
3446 	ldc_status_t	istatus;
3447 	int 		status = DDI_FAILURE;
3448 	int		rv;
3449 	enum		{ PROG_init = 0x0, PROG_mblks = 0x1,
3450 				PROG_callback = 0x2}
3451 			progress;
3452 
3453 	progress = PROG_init;
3454 
3455 	D1(vswp, "%s: enter", __func__);
3456 
3457 	ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP);
3458 	if (ldcp == NULL) {
3459 		DERR(vswp, "%s: kmem_zalloc failed", __func__);
3460 		return (1);
3461 	}
3462 	ldcp->ldc_id = ldc_id;
3463 
3464 	/* allocate pool of receive mblks */
3465 	rv = vio_create_mblks(vsw_num_mblks, vsw_mblk_size, &(ldcp->rxh));
3466 	if (rv) {
3467 		DWARN(vswp, "%s: unable to create free mblk pool for"
3468 		    " channel %ld (rv %d)", __func__, ldc_id, rv);
3469 		kmem_free(ldcp, sizeof (vsw_ldc_t));
3470 		return (1);
3471 	}
3472 
3473 	progress |= PROG_mblks;
3474 
3475 	mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL);
3476 	mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL);
3477 	mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL);
3478 	cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL);
3479 	rw_init(&ldcp->lane_in.dlistrw, NULL, RW_DRIVER, NULL);
3480 	rw_init(&ldcp->lane_out.dlistrw, NULL, RW_DRIVER, NULL);
3481 
3482 	/* required for handshake with peer */
3483 	ldcp->local_session = (uint64_t)ddi_get_lbolt();
3484 	ldcp->peer_session = 0;
3485 	ldcp->session_status = 0;
3486 
3487 	mutex_init(&ldcp->hss_lock, NULL, MUTEX_DRIVER, NULL);
3488 	ldcp->hss_id = 1;	/* Initial handshake session id */
3489 
3490 	/* only set for outbound lane, inbound set by peer */
3491 	mutex_init(&ldcp->lane_in.seq_lock, NULL, MUTEX_DRIVER, NULL);
3492 	mutex_init(&ldcp->lane_out.seq_lock, NULL, MUTEX_DRIVER, NULL);
3493 	vsw_set_lane_attr(vswp, &ldcp->lane_out);
3494 
3495 	attr.devclass = LDC_DEV_NT_SVC;
3496 	attr.instance = ddi_get_instance(vswp->dip);
3497 	attr.mode = LDC_MODE_UNRELIABLE;
3498 	attr.mtu = VSW_LDC_MTU;
3499 	status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle);
3500 	if (status != 0) {
3501 		DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)",
3502 		    __func__, ldc_id, status);
3503 		goto ldc_attach_fail;
3504 	}
3505 
3506 	status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp);
3507 	if (status != 0) {
3508 		DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)",
3509 		    __func__, ldc_id, status);
3510 		(void) ldc_fini(ldcp->ldc_handle);
3511 		goto ldc_attach_fail;
3512 	}
3513 
3514 	progress |= PROG_callback;
3515 
3516 	mutex_init(&ldcp->status_lock, NULL, MUTEX_DRIVER, NULL);
3517 
3518 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
3519 		DERR(vswp, "%s: ldc_status failed", __func__);
3520 		mutex_destroy(&ldcp->status_lock);
3521 		goto ldc_attach_fail;
3522 	}
3523 
3524 	ldcp->ldc_status = istatus;
3525 	ldcp->ldc_port = port;
3526 	ldcp->ldc_vswp = vswp;
3527 
3528 	/* link it into the list of channels for this port */
3529 	WRITE_ENTER(&ldcl->lockrw);
3530 	ldcp->ldc_next = ldcl->head;
3531 	ldcl->head = ldcp;
3532 	ldcl->num_ldcs++;
3533 	RW_EXIT(&ldcl->lockrw);
3534 
3535 	D1(vswp, "%s: exit", __func__);
3536 	return (0);
3537 
3538 ldc_attach_fail:
3539 	mutex_destroy(&ldcp->ldc_txlock);
3540 	mutex_destroy(&ldcp->ldc_cblock);
3541 
3542 	cv_destroy(&ldcp->drain_cv);
3543 
3544 	rw_destroy(&ldcp->lane_in.dlistrw);
3545 	rw_destroy(&ldcp->lane_out.dlistrw);
3546 
3547 	if (progress & PROG_callback) {
3548 		(void) ldc_unreg_callback(ldcp->ldc_handle);
3549 	}
3550 
3551 	if ((progress & PROG_mblks) && (ldcp->rxh != NULL)) {
3552 		if (vio_destroy_mblks(ldcp->rxh) != 0) {
3553 			/*
3554 			 * Something odd has happened, as the destroy
3555 			 * will only fail if some mblks have been allocated
3556 			 * from the pool already (which shouldn't happen)
3557 			 * and have not been returned.
3558 			 *
3559 			 * Add the pool pointer to a list maintained in
3560 			 * the device instance. Another attempt will be made
3561 			 * to free the pool when the device itself detaches.
3562 			 */
3563 			cmn_err(CE_WARN, "!vsw%d: Creation of ldc channel %ld "
3564 			    "failed and cannot destroy associated mblk "
3565 			    "pool", vswp->instance, ldc_id);
3566 			ldcp->rxh->nextp =  vswp->rxh;
3567 			vswp->rxh = ldcp->rxh;
3568 		}
3569 	}
3570 	mutex_destroy(&ldcp->drain_cv_lock);
3571 	mutex_destroy(&ldcp->hss_lock);
3572 
3573 	mutex_destroy(&ldcp->lane_in.seq_lock);
3574 	mutex_destroy(&ldcp->lane_out.seq_lock);
3575 	kmem_free(ldcp, sizeof (vsw_ldc_t));
3576 
3577 	return (1);
3578 }
3579 
3580 /*
3581  * Detach a logical domain channel (ldc) belonging to a
3582  * particular port.
3583  *
3584  * Returns 0 on success, 1 on failure.
3585  */
3586 static int
3587 vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id)
3588 {
3589 	vsw_t 		*vswp = port->p_vswp;
3590 	vsw_ldc_t 	*ldcp, *prev_ldcp;
3591 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
3592 	int 		rv;
3593 
3594 	prev_ldcp = ldcl->head;
3595 	for (; (ldcp = prev_ldcp) != NULL; prev_ldcp = ldcp->ldc_next) {
3596 		if (ldcp->ldc_id == ldc_id) {
3597 			break;
3598 		}
3599 	}
3600 
3601 	/* specified ldc id not found */
3602 	if (ldcp == NULL) {
3603 		DERR(vswp, "%s: ldcp = NULL", __func__);
3604 		return (1);
3605 	}
3606 
3607 	D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id);
3608 
3609 	/*
3610 	 * Before we can close the channel we must release any mapped
3611 	 * resources (e.g. drings).
3612 	 */
3613 	vsw_free_lane_resources(ldcp, INBOUND);
3614 	vsw_free_lane_resources(ldcp, OUTBOUND);
3615 
3616 	/*
3617 	 * If the close fails we are in serious trouble, as won't
3618 	 * be able to delete the parent port.
3619 	 */
3620 	if ((rv = ldc_close(ldcp->ldc_handle)) != 0) {
3621 		DERR(vswp, "%s: error %d closing channel %lld",
3622 		    __func__, rv, ldcp->ldc_id);
3623 		return (1);
3624 	}
3625 
3626 	(void) ldc_fini(ldcp->ldc_handle);
3627 
3628 	ldcp->ldc_status = LDC_INIT;
3629 	ldcp->ldc_handle = NULL;
3630 	ldcp->ldc_vswp = NULL;
3631 
3632 	if (ldcp->rxh != NULL) {
3633 		if (vio_destroy_mblks(ldcp->rxh)) {
3634 			/*
3635 			 * Mostly likely some mblks are still in use and
3636 			 * have not been returned to the pool. Add the pool
3637 			 * to the list maintained in the device instance.
3638 			 * Another attempt will be made to destroy the pool
3639 			 * when the device detaches.
3640 			 */
3641 			ldcp->rxh->nextp =  vswp->rxh;
3642 			vswp->rxh = ldcp->rxh;
3643 		}
3644 	}
3645 
3646 	/* unlink it from the list */
3647 	prev_ldcp = ldcp->ldc_next;
3648 	ldcl->num_ldcs--;
3649 
3650 	mutex_destroy(&ldcp->ldc_txlock);
3651 	mutex_destroy(&ldcp->ldc_cblock);
3652 	cv_destroy(&ldcp->drain_cv);
3653 	mutex_destroy(&ldcp->drain_cv_lock);
3654 	mutex_destroy(&ldcp->hss_lock);
3655 	mutex_destroy(&ldcp->lane_in.seq_lock);
3656 	mutex_destroy(&ldcp->lane_out.seq_lock);
3657 	mutex_destroy(&ldcp->status_lock);
3658 	rw_destroy(&ldcp->lane_in.dlistrw);
3659 	rw_destroy(&ldcp->lane_out.dlistrw);
3660 
3661 	kmem_free(ldcp, sizeof (vsw_ldc_t));
3662 
3663 	return (0);
3664 }
3665 
3666 /*
3667  * Open and attempt to bring up the channel. Note that channel
3668  * can only be brought up if peer has also opened channel.
3669  *
3670  * Returns 0 if can open and bring up channel, otherwise
3671  * returns 1.
3672  */
3673 static int
3674 vsw_ldc_init(vsw_ldc_t *ldcp)
3675 {
3676 	vsw_t 		*vswp = ldcp->ldc_vswp;
3677 	ldc_status_t	istatus = 0;
3678 	int		rv;
3679 
3680 	D1(vswp, "%s: enter", __func__);
3681 
3682 	LDC_ENTER_LOCK(ldcp);
3683 
3684 	/* don't start at 0 in case clients don't like that */
3685 	ldcp->next_ident = 1;
3686 
3687 	rv = ldc_open(ldcp->ldc_handle);
3688 	if (rv != 0) {
3689 		DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)",
3690 		    __func__, ldcp->ldc_id, rv);
3691 		LDC_EXIT_LOCK(ldcp);
3692 		return (1);
3693 	}
3694 
3695 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
3696 		DERR(vswp, "%s: unable to get status", __func__);
3697 		LDC_EXIT_LOCK(ldcp);
3698 		return (1);
3699 
3700 	} else if (istatus != LDC_OPEN && istatus != LDC_READY) {
3701 		DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY",
3702 		    __func__, ldcp->ldc_id, istatus);
3703 		LDC_EXIT_LOCK(ldcp);
3704 		return (1);
3705 	}
3706 
3707 	mutex_enter(&ldcp->status_lock);
3708 	ldcp->ldc_status = istatus;
3709 	mutex_exit(&ldcp->status_lock);
3710 
3711 	rv = ldc_up(ldcp->ldc_handle);
3712 	if (rv != 0) {
3713 		/*
3714 		 * Not a fatal error for ldc_up() to fail, as peer
3715 		 * end point may simply not be ready yet.
3716 		 */
3717 		D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__,
3718 		    ldcp->ldc_id, rv);
3719 		LDC_EXIT_LOCK(ldcp);
3720 		return (1);
3721 	}
3722 
3723 	/*
3724 	 * ldc_up() call is non-blocking so need to explicitly
3725 	 * check channel status to see if in fact the channel
3726 	 * is UP.
3727 	 */
3728 	mutex_enter(&ldcp->status_lock);
3729 	if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) {
3730 		DERR(vswp, "%s: unable to get status", __func__);
3731 		mutex_exit(&ldcp->status_lock);
3732 		LDC_EXIT_LOCK(ldcp);
3733 		return (1);
3734 
3735 	}
3736 
3737 	if (ldcp->ldc_status == LDC_UP) {
3738 		D2(vswp, "%s: channel %ld now UP (%ld)", __func__,
3739 		    ldcp->ldc_id, istatus);
3740 		mutex_exit(&ldcp->status_lock);
3741 		LDC_EXIT_LOCK(ldcp);
3742 
3743 		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
3744 		return (0);
3745 	}
3746 
3747 	mutex_exit(&ldcp->status_lock);
3748 	LDC_EXIT_LOCK(ldcp);
3749 
3750 	D1(vswp, "%s: exit", __func__);
3751 	return (0);
3752 }
3753 
3754 /* disable callbacks on the channel */
3755 static int
3756 vsw_ldc_uninit(vsw_ldc_t *ldcp)
3757 {
3758 	vsw_t	*vswp = ldcp->ldc_vswp;
3759 	int	rv;
3760 
3761 	D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id);
3762 
3763 	LDC_ENTER_LOCK(ldcp);
3764 
3765 	rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE);
3766 	if (rv != 0) {
3767 		DERR(vswp, "vsw_ldc_uninit(%lld): error disabling "
3768 		    "interrupts (rv = %d)\n", ldcp->ldc_id, rv);
3769 		LDC_EXIT_LOCK(ldcp);
3770 		return (1);
3771 	}
3772 
3773 	mutex_enter(&ldcp->status_lock);
3774 	ldcp->ldc_status = LDC_INIT;
3775 	mutex_exit(&ldcp->status_lock);
3776 
3777 	LDC_EXIT_LOCK(ldcp);
3778 
3779 	D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id);
3780 
3781 	return (0);
3782 }
3783 
3784 static int
3785 vsw_init_ldcs(vsw_port_t *port)
3786 {
3787 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
3788 	vsw_ldc_t	*ldcp;
3789 
3790 	READ_ENTER(&ldcl->lockrw);
3791 	ldcp =  ldcl->head;
3792 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
3793 		(void) vsw_ldc_init(ldcp);
3794 	}
3795 	RW_EXIT(&ldcl->lockrw);
3796 
3797 	return (0);
3798 }
3799 
3800 static int
3801 vsw_uninit_ldcs(vsw_port_t *port)
3802 {
3803 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
3804 	vsw_ldc_t	*ldcp;
3805 
3806 	D1(NULL, "vsw_uninit_ldcs: enter\n");
3807 
3808 	READ_ENTER(&ldcl->lockrw);
3809 	ldcp =  ldcl->head;
3810 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
3811 		(void) vsw_ldc_uninit(ldcp);
3812 	}
3813 	RW_EXIT(&ldcl->lockrw);
3814 
3815 	D1(NULL, "vsw_uninit_ldcs: exit\n");
3816 
3817 	return (0);
3818 }
3819 
3820 /*
3821  * Wait until the callback(s) associated with the ldcs under the specified
3822  * port have completed.
3823  *
3824  * Prior to this function being invoked each channel under this port
3825  * should have been quiesced via ldc_set_cb_mode(DISABLE).
3826  *
3827  * A short explaination of what we are doing below..
3828  *
3829  * The simplest approach would be to have a reference counter in
3830  * the ldc structure which is increment/decremented by the callbacks as
3831  * they use the channel. The drain function could then simply disable any
3832  * further callbacks and do a cv_wait for the ref to hit zero. Unfortunately
3833  * there is a tiny window here - before the callback is able to get the lock
3834  * on the channel it is interrupted and this function gets to execute. It
3835  * sees that the ref count is zero and believes its free to delete the
3836  * associated data structures.
3837  *
3838  * We get around this by taking advantage of the fact that before the ldc
3839  * framework invokes a callback it sets a flag to indicate that there is a
3840  * callback active (or about to become active). If when we attempt to
3841  * unregister a callback when this active flag is set then the unregister
3842  * will fail with EWOULDBLOCK.
3843  *
3844  * If the unregister fails we do a cv_timedwait. We will either be signaled
3845  * by the callback as it is exiting (note we have to wait a short period to
3846  * allow the callback to return fully to the ldc framework and it to clear
3847  * the active flag), or by the timer expiring. In either case we again attempt
3848  * the unregister. We repeat this until we can succesfully unregister the
3849  * callback.
3850  *
3851  * The reason we use a cv_timedwait rather than a simple cv_wait is to catch
3852  * the case where the callback has finished but the ldc framework has not yet
3853  * cleared the active flag. In this case we would never get a cv_signal.
3854  */
3855 static int
3856 vsw_drain_ldcs(vsw_port_t *port)
3857 {
3858 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
3859 	vsw_ldc_t	*ldcp;
3860 	vsw_t		*vswp = port->p_vswp;
3861 
3862 	D1(vswp, "%s: enter", __func__);
3863 
3864 	READ_ENTER(&ldcl->lockrw);
3865 
3866 	ldcp = ldcl->head;
3867 
3868 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
3869 		/*
3870 		 * If we can unregister the channel callback then we
3871 		 * know that there is no callback either running or
3872 		 * scheduled to run for this channel so move on to next
3873 		 * channel in the list.
3874 		 */
3875 		mutex_enter(&ldcp->drain_cv_lock);
3876 
3877 		/* prompt active callbacks to quit */
3878 		ldcp->drain_state = VSW_LDC_DRAINING;
3879 
3880 		if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) {
3881 			D2(vswp, "%s: unreg callback for chan %ld", __func__,
3882 			    ldcp->ldc_id);
3883 			mutex_exit(&ldcp->drain_cv_lock);
3884 			continue;
3885 		} else {
3886 			/*
3887 			 * If we end up here we know that either 1) a callback
3888 			 * is currently executing, 2) is about to start (i.e.
3889 			 * the ldc framework has set the active flag but
3890 			 * has not actually invoked the callback yet, or 3)
3891 			 * has finished and has returned to the ldc framework
3892 			 * but the ldc framework has not yet cleared the
3893 			 * active bit.
3894 			 *
3895 			 * Wait for it to finish.
3896 			 */
3897 			while (ldc_unreg_callback(ldcp->ldc_handle)
3898 			    == EWOULDBLOCK)
3899 				(void) cv_timedwait(&ldcp->drain_cv,
3900 				    &ldcp->drain_cv_lock, lbolt + hz);
3901 
3902 			mutex_exit(&ldcp->drain_cv_lock);
3903 			D2(vswp, "%s: unreg callback for chan %ld after "
3904 			    "timeout", __func__, ldcp->ldc_id);
3905 		}
3906 	}
3907 	RW_EXIT(&ldcl->lockrw);
3908 
3909 	D1(vswp, "%s: exit", __func__);
3910 	return (0);
3911 }
3912 
3913 /*
3914  * Wait until all tasks which reference this port have completed.
3915  *
3916  * Prior to this function being invoked each channel under this port
3917  * should have been quiesced via ldc_set_cb_mode(DISABLE).
3918  */
3919 static int
3920 vsw_drain_port_taskq(vsw_port_t *port)
3921 {
3922 	vsw_t		*vswp = port->p_vswp;
3923 
3924 	D1(vswp, "%s: enter", __func__);
3925 
3926 	/*
3927 	 * Mark the port as in the process of being detached, and
3928 	 * dispatch a marker task to the queue so we know when all
3929 	 * relevant tasks have completed.
3930 	 */
3931 	mutex_enter(&port->state_lock);
3932 	port->state = VSW_PORT_DETACHING;
3933 
3934 	if ((vswp->taskq_p == NULL) ||
3935 	    (ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task,
3936 	    port, DDI_NOSLEEP) != DDI_SUCCESS)) {
3937 		DERR(vswp, "%s: unable to dispatch marker task",
3938 		    __func__);
3939 		mutex_exit(&port->state_lock);
3940 		return (1);
3941 	}
3942 
3943 	/*
3944 	 * Wait for the marker task to finish.
3945 	 */
3946 	while (port->state != VSW_PORT_DETACHABLE)
3947 		cv_wait(&port->state_cv, &port->state_lock);
3948 
3949 	mutex_exit(&port->state_lock);
3950 
3951 	D1(vswp, "%s: exit", __func__);
3952 
3953 	return (0);
3954 }
3955 
3956 static void
3957 vsw_marker_task(void *arg)
3958 {
3959 	vsw_port_t	*port = arg;
3960 	vsw_t		*vswp = port->p_vswp;
3961 
3962 	D1(vswp, "%s: enter", __func__);
3963 
3964 	mutex_enter(&port->state_lock);
3965 
3966 	/*
3967 	 * No further tasks should be dispatched which reference
3968 	 * this port so ok to mark it as safe to detach.
3969 	 */
3970 	port->state = VSW_PORT_DETACHABLE;
3971 
3972 	cv_signal(&port->state_cv);
3973 
3974 	mutex_exit(&port->state_lock);
3975 
3976 	D1(vswp, "%s: exit", __func__);
3977 }
3978 
3979 static vsw_port_t *
3980 vsw_lookup_port(vsw_t *vswp, int p_instance)
3981 {
3982 	vsw_port_list_t *plist = &vswp->plist;
3983 	vsw_port_t	*port;
3984 
3985 	for (port = plist->head; port != NULL; port = port->p_next) {
3986 		if (port->p_instance == p_instance) {
3987 			D2(vswp, "vsw_lookup_port: found p_instance\n");
3988 			return (port);
3989 		}
3990 	}
3991 
3992 	return (NULL);
3993 }
3994 
3995 /*
3996  * Search for and remove the specified port from the port
3997  * list. Returns 0 if able to locate and remove port, otherwise
3998  * returns 1.
3999  */
4000 static int
4001 vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port)
4002 {
4003 	vsw_port_list_t *plist = &vswp->plist;
4004 	vsw_port_t	*curr_p, *prev_p;
4005 
4006 	if (plist->head == NULL)
4007 		return (1);
4008 
4009 	curr_p = prev_p = plist->head;
4010 
4011 	while (curr_p != NULL) {
4012 		if (curr_p == port) {
4013 			if (prev_p == curr_p) {
4014 				plist->head = curr_p->p_next;
4015 			} else {
4016 				prev_p->p_next = curr_p->p_next;
4017 			}
4018 			plist->num_ports--;
4019 			break;
4020 		} else {
4021 			prev_p = curr_p;
4022 			curr_p = curr_p->p_next;
4023 		}
4024 	}
4025 	return (0);
4026 }
4027 
4028 /*
4029  * Interrupt handler for ldc messages.
4030  */
4031 static uint_t
4032 vsw_ldc_cb(uint64_t event, caddr_t arg)
4033 {
4034 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
4035 	vsw_t 		*vswp = ldcp->ldc_vswp;
4036 
4037 	D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
4038 
4039 	mutex_enter(&ldcp->ldc_cblock);
4040 
4041 	mutex_enter(&ldcp->status_lock);
4042 	if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) {
4043 		mutex_exit(&ldcp->status_lock);
4044 		mutex_exit(&ldcp->ldc_cblock);
4045 		return (LDC_SUCCESS);
4046 	}
4047 	mutex_exit(&ldcp->status_lock);
4048 
4049 	if (event & LDC_EVT_UP) {
4050 		/*
4051 		 * Channel has come up.
4052 		 */
4053 		D2(vswp, "%s: id(%ld) event(%llx) UP: status(%ld)",
4054 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
4055 
4056 		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
4057 
4058 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
4059 	}
4060 
4061 	if (event & LDC_EVT_READ) {
4062 		/*
4063 		 * Data available for reading.
4064 		 */
4065 		D2(vswp, "%s: id(ld) event(%llx) data READ",
4066 		    __func__, ldcp->ldc_id, event);
4067 
4068 		vsw_process_pkt(ldcp);
4069 
4070 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
4071 
4072 		goto vsw_cb_exit;
4073 	}
4074 
4075 	if (event & (LDC_EVT_DOWN | LDC_EVT_RESET)) {
4076 		D2(vswp, "%s: id(%ld) event (%lx) DOWN/RESET: status(%ld)",
4077 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
4078 
4079 		vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
4080 	}
4081 
4082 	/*
4083 	 * Catch either LDC_EVT_WRITE which we don't support or any
4084 	 * unknown event.
4085 	 */
4086 	if (event &
4087 	    ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ)) {
4088 		DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)",
4089 		    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
4090 	}
4091 
4092 vsw_cb_exit:
4093 	mutex_exit(&ldcp->ldc_cblock);
4094 
4095 	/*
4096 	 * Let the drain function know we are finishing if it
4097 	 * is waiting.
4098 	 */
4099 	mutex_enter(&ldcp->drain_cv_lock);
4100 	if (ldcp->drain_state == VSW_LDC_DRAINING)
4101 		cv_signal(&ldcp->drain_cv);
4102 	mutex_exit(&ldcp->drain_cv_lock);
4103 
4104 	return (LDC_SUCCESS);
4105 }
4106 
4107 /*
4108  * Reinitialise data structures associated with the channel.
4109  */
4110 static void
4111 vsw_ldc_reinit(vsw_ldc_t *ldcp)
4112 {
4113 	vsw_t		*vswp = ldcp->ldc_vswp;
4114 	vsw_port_t	*port;
4115 	vsw_ldc_list_t	*ldcl;
4116 
4117 	D1(vswp, "%s: enter", __func__);
4118 
4119 	port = ldcp->ldc_port;
4120 	ldcl = &port->p_ldclist;
4121 
4122 	READ_ENTER(&ldcl->lockrw);
4123 
4124 	D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__,
4125 	    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
4126 
4127 	vsw_free_lane_resources(ldcp, INBOUND);
4128 	vsw_free_lane_resources(ldcp, OUTBOUND);
4129 	RW_EXIT(&ldcl->lockrw);
4130 
4131 	ldcp->lane_in.lstate = 0;
4132 	ldcp->lane_out.lstate = 0;
4133 
4134 	/*
4135 	 * Remove parent port from any multicast groups
4136 	 * it may have registered with. Client must resend
4137 	 * multicast add command after handshake completes.
4138 	 */
4139 	(void) vsw_del_fdb(vswp, port);
4140 
4141 	vsw_del_mcst_port(port);
4142 
4143 	ldcp->peer_session = 0;
4144 	ldcp->session_status = 0;
4145 	ldcp->hcnt = 0;
4146 	ldcp->hphase = VSW_MILESTONE0;
4147 
4148 	D1(vswp, "%s: exit", __func__);
4149 }
4150 
4151 /*
4152  * Process a connection event.
4153  *
4154  * Note - care must be taken to ensure that this function is
4155  * not called with the dlistrw lock held.
4156  */
4157 static void
4158 vsw_process_conn_evt(vsw_ldc_t *ldcp, uint16_t evt)
4159 {
4160 	vsw_t		*vswp = ldcp->ldc_vswp;
4161 	vsw_conn_evt_t	*conn = NULL;
4162 
4163 	D1(vswp, "%s: enter", __func__);
4164 
4165 	/*
4166 	 * Check if either a reset or restart event is pending
4167 	 * or in progress. If so just return.
4168 	 *
4169 	 * A VSW_CONN_RESET event originates either with a LDC_RESET_EVT
4170 	 * being received by the callback handler, or a ECONNRESET error
4171 	 * code being returned from a ldc_read() or ldc_write() call.
4172 	 *
4173 	 * A VSW_CONN_RESTART event occurs when some error checking code
4174 	 * decides that there is a problem with data from the channel,
4175 	 * and that the handshake should be restarted.
4176 	 */
4177 	if (((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART)) &&
4178 	    (ldstub((uint8_t *)&ldcp->reset_active)))
4179 		return;
4180 
4181 	/*
4182 	 * If it is an LDC_UP event we first check the recorded
4183 	 * state of the channel. If this is UP then we know that
4184 	 * the channel moving to the UP state has already been dealt
4185 	 * with and don't need to dispatch a  new task.
4186 	 *
4187 	 * The reason for this check is that when we do a ldc_up(),
4188 	 * depending on the state of the peer, we may or may not get
4189 	 * a LDC_UP event. As we can't depend on getting a LDC_UP evt
4190 	 * every time we do ldc_up() we explicitly check the channel
4191 	 * status to see has it come up (ldc_up() is asynch and will
4192 	 * complete at some undefined time), and take the appropriate
4193 	 * action.
4194 	 *
4195 	 * The flip side of this is that we may get a LDC_UP event
4196 	 * when we have already seen that the channel is up and have
4197 	 * dealt with that.
4198 	 */
4199 	mutex_enter(&ldcp->status_lock);
4200 	if (evt == VSW_CONN_UP) {
4201 		if ((ldcp->ldc_status == LDC_UP) || (ldcp->reset_active != 0)) {
4202 			mutex_exit(&ldcp->status_lock);
4203 			return;
4204 		}
4205 	}
4206 	mutex_exit(&ldcp->status_lock);
4207 
4208 	/*
4209 	 * The transaction group id allows us to identify and discard
4210 	 * any tasks which are still pending on the taskq and refer
4211 	 * to the handshake session we are about to restart or reset.
4212 	 * These stale messages no longer have any real meaning.
4213 	 */
4214 	mutex_enter(&ldcp->hss_lock);
4215 	ldcp->hss_id++;
4216 	mutex_exit(&ldcp->hss_lock);
4217 
4218 	ASSERT(vswp->taskq_p != NULL);
4219 
4220 	if ((conn = kmem_zalloc(sizeof (vsw_conn_evt_t), KM_NOSLEEP)) == NULL) {
4221 		cmn_err(CE_WARN, "!vsw%d: unable to allocate memory for"
4222 		    " connection event", vswp->instance);
4223 		goto err_exit;
4224 	}
4225 
4226 	conn->evt = evt;
4227 	conn->ldcp = ldcp;
4228 
4229 	if (ddi_taskq_dispatch(vswp->taskq_p, vsw_conn_task, conn,
4230 	    DDI_NOSLEEP) != DDI_SUCCESS) {
4231 		cmn_err(CE_WARN, "!vsw%d: Can't dispatch connection task",
4232 		    vswp->instance);
4233 
4234 		kmem_free(conn, sizeof (vsw_conn_evt_t));
4235 		goto err_exit;
4236 	}
4237 
4238 	D1(vswp, "%s: exit", __func__);
4239 	return;
4240 
4241 err_exit:
4242 	/*
4243 	 * Have mostly likely failed due to memory shortage. Clear the flag so
4244 	 * that future requests will at least be attempted and will hopefully
4245 	 * succeed.
4246 	 */
4247 	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
4248 		ldcp->reset_active = 0;
4249 }
4250 
4251 /*
4252  * Deal with events relating to a connection. Invoked from a taskq.
4253  */
4254 static void
4255 vsw_conn_task(void *arg)
4256 {
4257 	vsw_conn_evt_t	*conn = (vsw_conn_evt_t *)arg;
4258 	vsw_ldc_t	*ldcp = NULL;
4259 	vsw_t		*vswp = NULL;
4260 	uint16_t	evt;
4261 	ldc_status_t	curr_status;
4262 
4263 	ldcp = conn->ldcp;
4264 	evt = conn->evt;
4265 	vswp = ldcp->ldc_vswp;
4266 
4267 	D1(vswp, "%s: enter", __func__);
4268 
4269 	/* can safely free now have copied out data */
4270 	kmem_free(conn, sizeof (vsw_conn_evt_t));
4271 
4272 	mutex_enter(&ldcp->status_lock);
4273 	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
4274 		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
4275 		    "channel %ld", vswp->instance, ldcp->ldc_id);
4276 		mutex_exit(&ldcp->status_lock);
4277 		return;
4278 	}
4279 
4280 	/*
4281 	 * If we wish to restart the handshake on this channel, then if
4282 	 * the channel is UP we bring it DOWN to flush the underlying
4283 	 * ldc queue.
4284 	 */
4285 	if ((evt == VSW_CONN_RESTART) && (curr_status == LDC_UP))
4286 		(void) ldc_down(ldcp->ldc_handle);
4287 
4288 	/*
4289 	 * re-init all the associated data structures.
4290 	 */
4291 	vsw_ldc_reinit(ldcp);
4292 
4293 	/*
4294 	 * Bring the channel back up (note it does no harm to
4295 	 * do this even if the channel is already UP, Just
4296 	 * becomes effectively a no-op).
4297 	 */
4298 	(void) ldc_up(ldcp->ldc_handle);
4299 
4300 	/*
4301 	 * Check if channel is now UP. This will only happen if
4302 	 * peer has also done a ldc_up().
4303 	 */
4304 	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
4305 		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
4306 		    "channel %ld", vswp->instance, ldcp->ldc_id);
4307 		mutex_exit(&ldcp->status_lock);
4308 		return;
4309 	}
4310 
4311 	ldcp->ldc_status = curr_status;
4312 
4313 	/* channel UP so restart handshake by sending version info */
4314 	if (curr_status == LDC_UP) {
4315 		if (ldcp->hcnt++ > vsw_num_handshakes) {
4316 			cmn_err(CE_WARN, "!vsw%d: exceeded number of permitted"
4317 			    " handshake attempts (%d) on channel %ld",
4318 			    vswp->instance, ldcp->hcnt, ldcp->ldc_id);
4319 			mutex_exit(&ldcp->status_lock);
4320 			return;
4321 		}
4322 
4323 		if (ddi_taskq_dispatch(vswp->taskq_p, vsw_send_ver, ldcp,
4324 		    DDI_NOSLEEP) != DDI_SUCCESS) {
4325 			cmn_err(CE_WARN, "!vsw%d: Can't dispatch version task",
4326 			    vswp->instance);
4327 
4328 			/*
4329 			 * Don't count as valid restart attempt if couldn't
4330 			 * send version msg.
4331 			 */
4332 			if (ldcp->hcnt > 0)
4333 				ldcp->hcnt--;
4334 		}
4335 	}
4336 
4337 	/*
4338 	 * Mark that the process is complete by clearing the flag.
4339 	 *
4340 	 * Note is it possible that the taskq dispatch above may have failed,
4341 	 * most likely due to memory shortage. We still clear the flag so
4342 	 * future attempts will at least be attempted and will hopefully
4343 	 * succeed.
4344 	 */
4345 	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
4346 		ldcp->reset_active = 0;
4347 
4348 	mutex_exit(&ldcp->status_lock);
4349 
4350 	D1(vswp, "%s: exit", __func__);
4351 }
4352 
4353 /*
4354  * returns 0 if legal for event signified by flag to have
4355  * occured at the time it did. Otherwise returns 1.
4356  */
4357 int
4358 vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag)
4359 {
4360 	vsw_t		*vswp = ldcp->ldc_vswp;
4361 	uint64_t	state;
4362 	uint64_t	phase;
4363 
4364 	if (dir == INBOUND)
4365 		state = ldcp->lane_in.lstate;
4366 	else
4367 		state = ldcp->lane_out.lstate;
4368 
4369 	phase = ldcp->hphase;
4370 
4371 	switch (flag) {
4372 	case VSW_VER_INFO_RECV:
4373 		if (phase > VSW_MILESTONE0) {
4374 			DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV"
4375 			    " when in state %d\n", ldcp->ldc_id, phase);
4376 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4377 			return (1);
4378 		}
4379 		break;
4380 
4381 	case VSW_VER_ACK_RECV:
4382 	case VSW_VER_NACK_RECV:
4383 		if (!(state & VSW_VER_INFO_SENT)) {
4384 			DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK or "
4385 			    "VER_NACK when in state %d\n", ldcp->ldc_id, phase);
4386 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4387 			return (1);
4388 		} else
4389 			state &= ~VSW_VER_INFO_SENT;
4390 		break;
4391 
4392 	case VSW_ATTR_INFO_RECV:
4393 		if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) {
4394 			DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV"
4395 			    " when in state %d\n", ldcp->ldc_id, phase);
4396 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4397 			return (1);
4398 		}
4399 		break;
4400 
4401 	case VSW_ATTR_ACK_RECV:
4402 	case VSW_ATTR_NACK_RECV:
4403 		if (!(state & VSW_ATTR_INFO_SENT)) {
4404 			DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK"
4405 			    " or ATTR_NACK when in state %d\n",
4406 			    ldcp->ldc_id, phase);
4407 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4408 			return (1);
4409 		} else
4410 			state &= ~VSW_ATTR_INFO_SENT;
4411 		break;
4412 
4413 	case VSW_DRING_INFO_RECV:
4414 		if (phase < VSW_MILESTONE1) {
4415 			DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV"
4416 			    " when in state %d\n", ldcp->ldc_id, phase);
4417 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4418 			return (1);
4419 		}
4420 		break;
4421 
4422 	case VSW_DRING_ACK_RECV:
4423 	case VSW_DRING_NACK_RECV:
4424 		if (!(state & VSW_DRING_INFO_SENT)) {
4425 			DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK "
4426 			    " or DRING_NACK when in state %d\n",
4427 			    ldcp->ldc_id, phase);
4428 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4429 			return (1);
4430 		} else
4431 			state &= ~VSW_DRING_INFO_SENT;
4432 		break;
4433 
4434 	case VSW_RDX_INFO_RECV:
4435 		if (phase < VSW_MILESTONE3) {
4436 			DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV"
4437 			    " when in state %d\n", ldcp->ldc_id, phase);
4438 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4439 			return (1);
4440 		}
4441 		break;
4442 
4443 	case VSW_RDX_ACK_RECV:
4444 	case VSW_RDX_NACK_RECV:
4445 		if (!(state & VSW_RDX_INFO_SENT)) {
4446 			DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK or "
4447 			    "RDX_NACK when in state %d\n", ldcp->ldc_id, phase);
4448 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4449 			return (1);
4450 		} else
4451 			state &= ~VSW_RDX_INFO_SENT;
4452 		break;
4453 
4454 	case VSW_MCST_INFO_RECV:
4455 		if (phase < VSW_MILESTONE3) {
4456 			DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV"
4457 			    " when in state %d\n", ldcp->ldc_id, phase);
4458 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4459 			return (1);
4460 		}
4461 		break;
4462 
4463 	default:
4464 		DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)",
4465 		    ldcp->ldc_id, flag);
4466 		return (1);
4467 	}
4468 
4469 	if (dir == INBOUND)
4470 		ldcp->lane_in.lstate = state;
4471 	else
4472 		ldcp->lane_out.lstate = state;
4473 
4474 	D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id);
4475 
4476 	return (0);
4477 }
4478 
4479 void
4480 vsw_next_milestone(vsw_ldc_t *ldcp)
4481 {
4482 	vsw_t		*vswp = ldcp->ldc_vswp;
4483 
4484 	D1(vswp, "%s (chan %lld): enter (phase %ld)", __func__,
4485 	    ldcp->ldc_id, ldcp->hphase);
4486 
4487 	DUMP_FLAGS(ldcp->lane_in.lstate);
4488 	DUMP_FLAGS(ldcp->lane_out.lstate);
4489 
4490 	switch (ldcp->hphase) {
4491 
4492 	case VSW_MILESTONE0:
4493 		/*
4494 		 * If we haven't started to handshake with our peer,
4495 		 * start to do so now.
4496 		 */
4497 		if (ldcp->lane_out.lstate == 0) {
4498 			D2(vswp, "%s: (chan %lld) starting handshake "
4499 			    "with peer", __func__, ldcp->ldc_id);
4500 			vsw_process_conn_evt(ldcp, VSW_CONN_UP);
4501 		}
4502 
4503 		/*
4504 		 * Only way to pass this milestone is to have successfully
4505 		 * negotiated version info.
4506 		 */
4507 		if ((ldcp->lane_in.lstate & VSW_VER_ACK_SENT) &&
4508 		    (ldcp->lane_out.lstate & VSW_VER_ACK_RECV)) {
4509 
4510 			D2(vswp, "%s: (chan %lld) leaving milestone 0",
4511 			    __func__, ldcp->ldc_id);
4512 
4513 			/*
4514 			 * Next milestone is passed when attribute
4515 			 * information has been successfully exchanged.
4516 			 */
4517 			ldcp->hphase = VSW_MILESTONE1;
4518 			vsw_send_attr(ldcp);
4519 
4520 		}
4521 		break;
4522 
4523 	case VSW_MILESTONE1:
4524 		/*
4525 		 * Only way to pass this milestone is to have successfully
4526 		 * negotiated attribute information.
4527 		 */
4528 		if (ldcp->lane_in.lstate & VSW_ATTR_ACK_SENT) {
4529 
4530 			ldcp->hphase = VSW_MILESTONE2;
4531 
4532 			/*
4533 			 * If the peer device has said it wishes to
4534 			 * use descriptor rings then we send it our ring
4535 			 * info, otherwise we just set up a private ring
4536 			 * which we use an internal buffer
4537 			 */
4538 			if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE)
4539 				vsw_send_dring_info(ldcp);
4540 		}
4541 		break;
4542 
4543 	case VSW_MILESTONE2:
4544 		/*
4545 		 * If peer has indicated in its attribute message that
4546 		 * it wishes to use descriptor rings then the only way
4547 		 * to pass this milestone is for us to have received
4548 		 * valid dring info.
4549 		 *
4550 		 * If peer is not using descriptor rings then just fall
4551 		 * through.
4552 		 */
4553 		if ((ldcp->lane_in.xfer_mode == VIO_DRING_MODE) &&
4554 		    (!(ldcp->lane_in.lstate & VSW_DRING_ACK_SENT)))
4555 			break;
4556 
4557 		D2(vswp, "%s: (chan %lld) leaving milestone 2",
4558 		    __func__, ldcp->ldc_id);
4559 
4560 		ldcp->hphase = VSW_MILESTONE3;
4561 		vsw_send_rdx(ldcp);
4562 		break;
4563 
4564 	case VSW_MILESTONE3:
4565 		/*
4566 		 * Pass this milestone when all paramaters have been
4567 		 * successfully exchanged and RDX sent in both directions.
4568 		 *
4569 		 * Mark outbound lane as available to transmit data.
4570 		 */
4571 		if ((ldcp->lane_out.lstate & VSW_RDX_ACK_SENT) &&
4572 		    (ldcp->lane_in.lstate & VSW_RDX_ACK_RECV)) {
4573 
4574 			D2(vswp, "%s: (chan %lld) leaving milestone 3",
4575 			    __func__, ldcp->ldc_id);
4576 			D2(vswp, "%s: ** handshake complete (0x%llx : "
4577 			    "0x%llx) **", __func__, ldcp->lane_in.lstate,
4578 			    ldcp->lane_out.lstate);
4579 			ldcp->lane_out.lstate |= VSW_LANE_ACTIVE;
4580 			ldcp->hphase = VSW_MILESTONE4;
4581 			ldcp->hcnt = 0;
4582 			DISPLAY_STATE();
4583 		} else {
4584 			D2(vswp, "%s: still in milestone 3 (0x%llx : 0x%llx)",
4585 			    __func__, ldcp->lane_in.lstate,
4586 			    ldcp->lane_out.lstate);
4587 		}
4588 		break;
4589 
4590 	case VSW_MILESTONE4:
4591 		D2(vswp, "%s: (chan %lld) in milestone 4", __func__,
4592 		    ldcp->ldc_id);
4593 		break;
4594 
4595 	default:
4596 		DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__,
4597 		    ldcp->ldc_id, ldcp->hphase);
4598 	}
4599 
4600 	D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id,
4601 	    ldcp->hphase);
4602 }
4603 
4604 /*
4605  * Check if major version is supported.
4606  *
4607  * Returns 0 if finds supported major number, and if necessary
4608  * adjusts the minor field.
4609  *
4610  * Returns 1 if can't match major number exactly. Sets mjor/minor
4611  * to next lowest support values, or to zero if no other values possible.
4612  */
4613 static int
4614 vsw_supported_version(vio_ver_msg_t *vp)
4615 {
4616 	int	i;
4617 
4618 	D1(NULL, "vsw_supported_version: enter");
4619 
4620 	for (i = 0; i < VSW_NUM_VER; i++) {
4621 		if (vsw_versions[i].ver_major == vp->ver_major) {
4622 			/*
4623 			 * Matching or lower major version found. Update
4624 			 * minor number if necessary.
4625 			 */
4626 			if (vp->ver_minor > vsw_versions[i].ver_minor) {
4627 				D2(NULL, "%s: adjusting minor value from %d "
4628 				    "to %d", __func__, vp->ver_minor,
4629 				    vsw_versions[i].ver_minor);
4630 				vp->ver_minor = vsw_versions[i].ver_minor;
4631 			}
4632 
4633 			return (0);
4634 		}
4635 
4636 		if (vsw_versions[i].ver_major < vp->ver_major) {
4637 			if (vp->ver_minor > vsw_versions[i].ver_minor) {
4638 				D2(NULL, "%s: adjusting minor value from %d "
4639 				    "to %d", __func__, vp->ver_minor,
4640 				    vsw_versions[i].ver_minor);
4641 				vp->ver_minor = vsw_versions[i].ver_minor;
4642 			}
4643 			return (1);
4644 		}
4645 	}
4646 
4647 	/* No match was possible, zero out fields */
4648 	vp->ver_major = 0;
4649 	vp->ver_minor = 0;
4650 
4651 	D1(NULL, "vsw_supported_version: exit");
4652 
4653 	return (1);
4654 }
4655 
4656 /*
4657  * Main routine for processing messages received over LDC.
4658  */
4659 static void
4660 vsw_process_pkt(void *arg)
4661 {
4662 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
4663 	vsw_t 		*vswp = ldcp->ldc_vswp;
4664 	size_t		msglen;
4665 	vio_msg_tag_t	tag;
4666 	def_msg_t	dmsg;
4667 	int 		rv = 0;
4668 
4669 
4670 	D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
4671 
4672 	/*
4673 	 * If channel is up read messages until channel is empty.
4674 	 */
4675 	do {
4676 		msglen = sizeof (dmsg);
4677 		rv = ldc_read(ldcp->ldc_handle, (caddr_t)&dmsg, &msglen);
4678 
4679 		if (rv != 0) {
4680 			DERR(vswp, "%s :ldc_read err id(%lld) rv(%d) len(%d)\n",
4681 			    __func__, ldcp->ldc_id, rv, msglen);
4682 		}
4683 
4684 		/* channel has been reset */
4685 		if (rv == ECONNRESET) {
4686 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
4687 			break;
4688 		}
4689 
4690 		if (msglen == 0) {
4691 			D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__,
4692 			    ldcp->ldc_id);
4693 			break;
4694 		}
4695 
4696 		D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__,
4697 		    ldcp->ldc_id, msglen);
4698 
4699 		/*
4700 		 * Figure out what sort of packet we have gotten by
4701 		 * examining the msg tag, and then switch it appropriately.
4702 		 */
4703 		bcopy(&dmsg, &tag, sizeof (vio_msg_tag_t));
4704 
4705 		switch (tag.vio_msgtype) {
4706 		case VIO_TYPE_CTRL:
4707 			vsw_dispatch_ctrl_task(ldcp, &dmsg, tag);
4708 			break;
4709 		case VIO_TYPE_DATA:
4710 			vsw_process_data_pkt(ldcp, &dmsg, tag);
4711 			break;
4712 		case VIO_TYPE_ERR:
4713 			vsw_process_err_pkt(ldcp, &dmsg, tag);
4714 			break;
4715 		default:
4716 			DERR(vswp, "%s: Unknown tag(%lx) ", __func__,
4717 			    "id(%lx)\n", tag.vio_msgtype, ldcp->ldc_id);
4718 			break;
4719 		}
4720 	} while (msglen);
4721 
4722 	D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id);
4723 }
4724 
4725 /*
4726  * Dispatch a task to process a VIO control message.
4727  */
4728 static void
4729 vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t tag)
4730 {
4731 	vsw_ctrl_task_t		*ctaskp = NULL;
4732 	vsw_port_t		*port = ldcp->ldc_port;
4733 	vsw_t			*vswp = port->p_vswp;
4734 
4735 	D1(vswp, "%s: enter", __func__);
4736 
4737 	/*
4738 	 * We need to handle RDX ACK messages in-band as once they
4739 	 * are exchanged it is possible that we will get an
4740 	 * immediate (legitimate) data packet.
4741 	 */
4742 	if ((tag.vio_subtype_env == VIO_RDX) &&
4743 	    (tag.vio_subtype == VIO_SUBTYPE_ACK)) {
4744 
4745 		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_ACK_RECV))
4746 			return;
4747 
4748 		ldcp->lane_in.lstate |= VSW_RDX_ACK_RECV;
4749 		D2(vswp, "%s (%ld) handling RDX_ACK in place "
4750 		    "(ostate 0x%llx : hphase %d)", __func__,
4751 		    ldcp->ldc_id, ldcp->lane_in.lstate, ldcp->hphase);
4752 		vsw_next_milestone(ldcp);
4753 		return;
4754 	}
4755 
4756 	ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP);
4757 
4758 	if (ctaskp == NULL) {
4759 		DERR(vswp, "%s: unable to alloc space for ctrl msg", __func__);
4760 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4761 		return;
4762 	}
4763 
4764 	ctaskp->ldcp = ldcp;
4765 	bcopy((def_msg_t *)cpkt, &ctaskp->pktp, sizeof (def_msg_t));
4766 	mutex_enter(&ldcp->hss_lock);
4767 	ctaskp->hss_id = ldcp->hss_id;
4768 	mutex_exit(&ldcp->hss_lock);
4769 
4770 	/*
4771 	 * Dispatch task to processing taskq if port is not in
4772 	 * the process of being detached.
4773 	 */
4774 	mutex_enter(&port->state_lock);
4775 	if (port->state == VSW_PORT_INIT) {
4776 		if ((vswp->taskq_p == NULL) ||
4777 		    (ddi_taskq_dispatch(vswp->taskq_p, vsw_process_ctrl_pkt,
4778 		    ctaskp, DDI_NOSLEEP) != DDI_SUCCESS)) {
4779 			DERR(vswp, "%s: unable to dispatch task to taskq",
4780 			    __func__);
4781 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
4782 			mutex_exit(&port->state_lock);
4783 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4784 			return;
4785 		}
4786 	} else {
4787 		DWARN(vswp, "%s: port %d detaching, not dispatching "
4788 		    "task", __func__, port->p_instance);
4789 	}
4790 
4791 	mutex_exit(&port->state_lock);
4792 
4793 	D2(vswp, "%s: dispatched task to taskq for chan %d", __func__,
4794 	    ldcp->ldc_id);
4795 	D1(vswp, "%s: exit", __func__);
4796 }
4797 
4798 /*
4799  * Process a VIO ctrl message. Invoked from taskq.
4800  */
4801 static void
4802 vsw_process_ctrl_pkt(void *arg)
4803 {
4804 	vsw_ctrl_task_t	*ctaskp = (vsw_ctrl_task_t *)arg;
4805 	vsw_ldc_t	*ldcp = ctaskp->ldcp;
4806 	vsw_t 		*vswp = ldcp->ldc_vswp;
4807 	vio_msg_tag_t	tag;
4808 	uint16_t	env;
4809 
4810 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4811 
4812 	bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t));
4813 	env = tag.vio_subtype_env;
4814 
4815 	/* stale pkt check */
4816 	mutex_enter(&ldcp->hss_lock);
4817 	if (ctaskp->hss_id < ldcp->hss_id) {
4818 		DWARN(vswp, "%s: discarding stale packet belonging to earlier"
4819 		    " (%ld) handshake session", __func__, ctaskp->hss_id);
4820 		mutex_exit(&ldcp->hss_lock);
4821 		return;
4822 	}
4823 	mutex_exit(&ldcp->hss_lock);
4824 
4825 	/* session id check */
4826 	if (ldcp->session_status & VSW_PEER_SESSION) {
4827 		if (ldcp->peer_session != tag.vio_sid) {
4828 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
4829 			    __func__, ldcp->ldc_id, tag.vio_sid);
4830 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
4831 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4832 			return;
4833 		}
4834 	}
4835 
4836 	/*
4837 	 * Switch on vio_subtype envelope, then let lower routines
4838 	 * decide if its an INFO, ACK or NACK packet.
4839 	 */
4840 	switch (env) {
4841 	case VIO_VER_INFO:
4842 		vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp);
4843 		break;
4844 	case VIO_DRING_REG:
4845 		vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp);
4846 		break;
4847 	case VIO_DRING_UNREG:
4848 		vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp);
4849 		break;
4850 	case VIO_ATTR_INFO:
4851 		vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp);
4852 		break;
4853 	case VNET_MCAST_INFO:
4854 		vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp);
4855 		break;
4856 	case VIO_RDX:
4857 		vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp);
4858 		break;
4859 	default:
4860 		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env);
4861 	}
4862 
4863 	kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
4864 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4865 }
4866 
4867 /*
4868  * Version negotiation. We can end up here either because our peer
4869  * has responded to a handshake message we have sent it, or our peer
4870  * has initiated a handshake with us. If its the former then can only
4871  * be ACK or NACK, if its the later can only be INFO.
4872  *
4873  * If its an ACK we move to the next stage of the handshake, namely
4874  * attribute exchange. If its a NACK we see if we can specify another
4875  * version, if we can't we stop.
4876  *
4877  * If it is an INFO we reset all params associated with communication
4878  * in that direction over this channel (remember connection is
4879  * essentially 2 independent simplex channels).
4880  */
4881 void
4882 vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt)
4883 {
4884 	vio_ver_msg_t	*ver_pkt;
4885 	vsw_t 		*vswp = ldcp->ldc_vswp;
4886 
4887 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4888 
4889 	/*
4890 	 * We know this is a ctrl/version packet so
4891 	 * cast it into the correct structure.
4892 	 */
4893 	ver_pkt = (vio_ver_msg_t *)pkt;
4894 
4895 	switch (ver_pkt->tag.vio_subtype) {
4896 	case VIO_SUBTYPE_INFO:
4897 		D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n");
4898 
4899 		/*
4900 		 * Record the session id, which we will use from now
4901 		 * until we see another VER_INFO msg. Even then the
4902 		 * session id in most cases will be unchanged, execpt
4903 		 * if channel was reset.
4904 		 */
4905 		if ((ldcp->session_status & VSW_PEER_SESSION) &&
4906 		    (ldcp->peer_session != ver_pkt->tag.vio_sid)) {
4907 			DERR(vswp, "%s: updating session id for chan %lld "
4908 			    "from %llx to %llx", __func__, ldcp->ldc_id,
4909 			    ldcp->peer_session, ver_pkt->tag.vio_sid);
4910 		}
4911 
4912 		ldcp->peer_session = ver_pkt->tag.vio_sid;
4913 		ldcp->session_status |= VSW_PEER_SESSION;
4914 
4915 		/* Legal message at this time ? */
4916 		if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV))
4917 			return;
4918 
4919 		/*
4920 		 * First check the device class. Currently only expect
4921 		 * to be talking to a network device. In the future may
4922 		 * also talk to another switch.
4923 		 */
4924 		if (ver_pkt->dev_class != VDEV_NETWORK) {
4925 			DERR(vswp, "%s: illegal device class %d", __func__,
4926 			    ver_pkt->dev_class);
4927 
4928 			ver_pkt->tag.vio_sid = ldcp->local_session;
4929 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
4930 
4931 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
4932 
4933 			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
4934 			    sizeof (vio_ver_msg_t), B_TRUE);
4935 
4936 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
4937 			vsw_next_milestone(ldcp);
4938 			return;
4939 		} else {
4940 			ldcp->dev_class = ver_pkt->dev_class;
4941 		}
4942 
4943 		/*
4944 		 * Now check the version.
4945 		 */
4946 		if (vsw_supported_version(ver_pkt) == 0) {
4947 			/*
4948 			 * Support this major version and possibly
4949 			 * adjusted minor version.
4950 			 */
4951 
4952 			D2(vswp, "%s: accepted ver %d:%d", __func__,
4953 			    ver_pkt->ver_major, ver_pkt->ver_minor);
4954 
4955 			/* Store accepted values */
4956 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
4957 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
4958 
4959 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
4960 
4961 			ldcp->lane_in.lstate |= VSW_VER_ACK_SENT;
4962 		} else {
4963 			/*
4964 			 * NACK back with the next lower major/minor
4965 			 * pairing we support (if don't suuport any more
4966 			 * versions then they will be set to zero.
4967 			 */
4968 
4969 			D2(vswp, "%s: replying with ver %d:%d", __func__,
4970 			    ver_pkt->ver_major, ver_pkt->ver_minor);
4971 
4972 			/* Store updated values */
4973 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
4974 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
4975 
4976 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
4977 
4978 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
4979 		}
4980 
4981 		DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
4982 		ver_pkt->tag.vio_sid = ldcp->local_session;
4983 		(void) vsw_send_msg(ldcp, (void *)ver_pkt,
4984 		    sizeof (vio_ver_msg_t), B_TRUE);
4985 
4986 		vsw_next_milestone(ldcp);
4987 		break;
4988 
4989 	case VIO_SUBTYPE_ACK:
4990 		D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__);
4991 
4992 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV))
4993 			return;
4994 
4995 		/* Store updated values */
4996 		ldcp->lane_in.ver_major = ver_pkt->ver_major;
4997 		ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
4998 
4999 		ldcp->lane_out.lstate |= VSW_VER_ACK_RECV;
5000 		vsw_next_milestone(ldcp);
5001 
5002 		break;
5003 
5004 	case VIO_SUBTYPE_NACK:
5005 		D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__);
5006 
5007 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV))
5008 			return;
5009 
5010 		/*
5011 		 * If our peer sent us a NACK with the ver fields set to
5012 		 * zero then there is nothing more we can do. Otherwise see
5013 		 * if we support either the version suggested, or a lesser
5014 		 * one.
5015 		 */
5016 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
5017 			DERR(vswp, "%s: peer unable to negotiate any "
5018 			    "further.", __func__);
5019 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
5020 			vsw_next_milestone(ldcp);
5021 			return;
5022 		}
5023 
5024 		/*
5025 		 * Check to see if we support this major version or
5026 		 * a lower one. If we don't then maj/min will be set
5027 		 * to zero.
5028 		 */
5029 		(void) vsw_supported_version(ver_pkt);
5030 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
5031 			/* Nothing more we can do */
5032 			DERR(vswp, "%s: version negotiation failed.\n",
5033 			    __func__);
5034 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
5035 			vsw_next_milestone(ldcp);
5036 		} else {
5037 			/* found a supported major version */
5038 			ldcp->lane_out.ver_major = ver_pkt->ver_major;
5039 			ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
5040 
5041 			D2(vswp, "%s: resending with updated values (%x, %x)",
5042 			    __func__, ver_pkt->ver_major, ver_pkt->ver_minor);
5043 
5044 			ldcp->lane_out.lstate |= VSW_VER_INFO_SENT;
5045 			ver_pkt->tag.vio_sid = ldcp->local_session;
5046 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
5047 
5048 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
5049 
5050 			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
5051 			    sizeof (vio_ver_msg_t), B_TRUE);
5052 
5053 			vsw_next_milestone(ldcp);
5054 
5055 		}
5056 		break;
5057 
5058 	default:
5059 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
5060 		    ver_pkt->tag.vio_subtype);
5061 	}
5062 
5063 	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
5064 }
5065 
5066 /*
5067  * Process an attribute packet. We can end up here either because our peer
5068  * has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or our
5069  * peer has sent us an attribute INFO message
5070  *
5071  * If its an ACK we then move to the next stage of the handshake which
5072  * is to send our descriptor ring info to our peer. If its a NACK then
5073  * there is nothing more we can (currently) do.
5074  *
5075  * If we get a valid/acceptable INFO packet (and we have already negotiated
5076  * a version) we ACK back and set channel state to ATTR_RECV, otherwise we
5077  * NACK back and reset channel state to INACTIV.
5078  *
5079  * FUTURE: in time we will probably negotiate over attributes, but for
5080  * the moment unacceptable attributes are regarded as a fatal error.
5081  *
5082  */
5083 void
5084 vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt)
5085 {
5086 	vnet_attr_msg_t		*attr_pkt;
5087 	vsw_t			*vswp = ldcp->ldc_vswp;
5088 	vsw_port_t		*port = ldcp->ldc_port;
5089 	uint64_t		macaddr = 0;
5090 	int			i;
5091 
5092 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
5093 
5094 	/*
5095 	 * We know this is a ctrl/attr packet so
5096 	 * cast it into the correct structure.
5097 	 */
5098 	attr_pkt = (vnet_attr_msg_t *)pkt;
5099 
5100 	switch (attr_pkt->tag.vio_subtype) {
5101 	case VIO_SUBTYPE_INFO:
5102 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
5103 
5104 		if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV))
5105 			return;
5106 
5107 		/*
5108 		 * If the attributes are unacceptable then we NACK back.
5109 		 */
5110 		if (vsw_check_attr(attr_pkt, ldcp->ldc_port)) {
5111 
5112 			DERR(vswp, "%s (chan %d): invalid attributes",
5113 			    __func__, ldcp->ldc_id);
5114 
5115 			vsw_free_lane_resources(ldcp, INBOUND);
5116 
5117 			attr_pkt->tag.vio_sid = ldcp->local_session;
5118 			attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
5119 
5120 			DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
5121 			ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT;
5122 			(void) vsw_send_msg(ldcp, (void *)attr_pkt,
5123 			    sizeof (vnet_attr_msg_t), B_TRUE);
5124 
5125 			vsw_next_milestone(ldcp);
5126 			return;
5127 		}
5128 
5129 		/*
5130 		 * Otherwise store attributes for this lane and update
5131 		 * lane state.
5132 		 */
5133 		ldcp->lane_in.mtu = attr_pkt->mtu;
5134 		ldcp->lane_in.addr = attr_pkt->addr;
5135 		ldcp->lane_in.addr_type = attr_pkt->addr_type;
5136 		ldcp->lane_in.xfer_mode = attr_pkt->xfer_mode;
5137 		ldcp->lane_in.ack_freq = attr_pkt->ack_freq;
5138 
5139 		macaddr = ldcp->lane_in.addr;
5140 		for (i = ETHERADDRL - 1; i >= 0; i--) {
5141 			port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF;
5142 			macaddr >>= 8;
5143 		}
5144 
5145 		/* create the fdb entry for this port/mac address */
5146 		(void) vsw_add_fdb(vswp, port);
5147 
5148 		/* setup device specifc xmit routines */
5149 		mutex_enter(&port->tx_lock);
5150 		if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE) {
5151 			D2(vswp, "%s: mode = VIO_DRING_MODE", __func__);
5152 			port->transmit = vsw_dringsend;
5153 		} else if (ldcp->lane_in.xfer_mode == VIO_DESC_MODE) {
5154 			D2(vswp, "%s: mode = VIO_DESC_MODE", __func__);
5155 			vsw_create_privring(ldcp);
5156 			port->transmit = vsw_descrsend;
5157 		}
5158 		mutex_exit(&port->tx_lock);
5159 
5160 		attr_pkt->tag.vio_sid = ldcp->local_session;
5161 		attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
5162 
5163 		DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
5164 
5165 		ldcp->lane_in.lstate |= VSW_ATTR_ACK_SENT;
5166 
5167 		(void) vsw_send_msg(ldcp, (void *)attr_pkt,
5168 		    sizeof (vnet_attr_msg_t), B_TRUE);
5169 
5170 		vsw_next_milestone(ldcp);
5171 		break;
5172 
5173 	case VIO_SUBTYPE_ACK:
5174 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
5175 
5176 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV))
5177 			return;
5178 
5179 		ldcp->lane_out.lstate |= VSW_ATTR_ACK_RECV;
5180 		vsw_next_milestone(ldcp);
5181 		break;
5182 
5183 	case VIO_SUBTYPE_NACK:
5184 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
5185 
5186 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV))
5187 			return;
5188 
5189 		ldcp->lane_out.lstate |= VSW_ATTR_NACK_RECV;
5190 		vsw_next_milestone(ldcp);
5191 		break;
5192 
5193 	default:
5194 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
5195 		    attr_pkt->tag.vio_subtype);
5196 	}
5197 
5198 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
5199 }
5200 
5201 /*
5202  * Process a dring info packet. We can end up here either because our peer
5203  * has ACK/NACK'ed back to an earlier DRING msg we had sent it, or our
5204  * peer has sent us a dring INFO message.
5205  *
5206  * If we get a valid/acceptable INFO packet (and we have already negotiated
5207  * a version) we ACK back and update the lane state, otherwise we NACK back.
5208  *
5209  * FUTURE: nothing to stop client from sending us info on multiple dring's
5210  * but for the moment we will just use the first one we are given.
5211  *
5212  */
5213 void
5214 vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt)
5215 {
5216 	vio_dring_reg_msg_t	*dring_pkt;
5217 	vsw_t			*vswp = ldcp->ldc_vswp;
5218 	ldc_mem_info_t		minfo;
5219 	dring_info_t		*dp, *dbp;
5220 	int			dring_found = 0;
5221 
5222 	/*
5223 	 * We know this is a ctrl/dring packet so
5224 	 * cast it into the correct structure.
5225 	 */
5226 	dring_pkt = (vio_dring_reg_msg_t *)pkt;
5227 
5228 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
5229 
5230 	switch (dring_pkt->tag.vio_subtype) {
5231 	case VIO_SUBTYPE_INFO:
5232 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
5233 
5234 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
5235 			return;
5236 
5237 		/*
5238 		 * If the dring params are unacceptable then we NACK back.
5239 		 */
5240 		if (vsw_check_dring_info(dring_pkt)) {
5241 
5242 			DERR(vswp, "%s (%lld): invalid dring info",
5243 			    __func__, ldcp->ldc_id);
5244 
5245 			vsw_free_lane_resources(ldcp, INBOUND);
5246 
5247 			dring_pkt->tag.vio_sid = ldcp->local_session;
5248 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
5249 
5250 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
5251 
5252 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
5253 
5254 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
5255 			    sizeof (vio_dring_reg_msg_t), B_TRUE);
5256 
5257 			vsw_next_milestone(ldcp);
5258 			return;
5259 		}
5260 
5261 		/*
5262 		 * Otherwise, attempt to map in the dring using the
5263 		 * cookie. If that succeeds we send back a unique dring
5264 		 * identifier that the sending side will use in future
5265 		 * to refer to this descriptor ring.
5266 		 */
5267 		dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
5268 
5269 		dp->num_descriptors = dring_pkt->num_descriptors;
5270 		dp->descriptor_size = dring_pkt->descriptor_size;
5271 		dp->options = dring_pkt->options;
5272 		dp->ncookies = dring_pkt->ncookies;
5273 
5274 		/*
5275 		 * Note: should only get one cookie. Enforced in
5276 		 * the ldc layer.
5277 		 */
5278 		bcopy(&dring_pkt->cookie[0], &dp->cookie[0],
5279 		    sizeof (ldc_mem_cookie_t));
5280 
5281 		D2(vswp, "%s: num_desc %ld : desc_size %ld", __func__,
5282 		    dp->num_descriptors, dp->descriptor_size);
5283 		D2(vswp, "%s: options 0x%lx: ncookies %ld", __func__,
5284 		    dp->options, dp->ncookies);
5285 
5286 		if ((ldc_mem_dring_map(ldcp->ldc_handle, &dp->cookie[0],
5287 		    dp->ncookies, dp->num_descriptors, dp->descriptor_size,
5288 		    LDC_SHADOW_MAP, &(dp->handle))) != 0) {
5289 
5290 			DERR(vswp, "%s: dring_map failed\n", __func__);
5291 
5292 			kmem_free(dp, sizeof (dring_info_t));
5293 			vsw_free_lane_resources(ldcp, INBOUND);
5294 
5295 			dring_pkt->tag.vio_sid = ldcp->local_session;
5296 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
5297 
5298 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
5299 
5300 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
5301 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
5302 			    sizeof (vio_dring_reg_msg_t), B_TRUE);
5303 
5304 			vsw_next_milestone(ldcp);
5305 			return;
5306 		}
5307 
5308 		if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
5309 
5310 			DERR(vswp, "%s: dring_addr failed\n", __func__);
5311 
5312 			kmem_free(dp, sizeof (dring_info_t));
5313 			vsw_free_lane_resources(ldcp, INBOUND);
5314 
5315 			dring_pkt->tag.vio_sid = ldcp->local_session;
5316 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
5317 
5318 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
5319 
5320 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
5321 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
5322 			    sizeof (vio_dring_reg_msg_t), B_TRUE);
5323 
5324 			vsw_next_milestone(ldcp);
5325 			return;
5326 		} else {
5327 			/* store the address of the pub part of ring */
5328 			dp->pub_addr = minfo.vaddr;
5329 		}
5330 
5331 		/* no private section as we are importing */
5332 		dp->priv_addr = NULL;
5333 
5334 		/*
5335 		 * Using simple mono increasing int for ident at
5336 		 * the moment.
5337 		 */
5338 		dp->ident = ldcp->next_ident;
5339 		ldcp->next_ident++;
5340 
5341 		dp->end_idx = 0;
5342 		dp->next = NULL;
5343 
5344 		/*
5345 		 * Link it onto the end of the list of drings
5346 		 * for this lane.
5347 		 */
5348 		if (ldcp->lane_in.dringp == NULL) {
5349 			D2(vswp, "%s: adding first INBOUND dring", __func__);
5350 			ldcp->lane_in.dringp = dp;
5351 		} else {
5352 			dbp = ldcp->lane_in.dringp;
5353 
5354 			while (dbp->next != NULL)
5355 				dbp = dbp->next;
5356 
5357 			dbp->next = dp;
5358 		}
5359 
5360 		/* acknowledge it */
5361 		dring_pkt->tag.vio_sid = ldcp->local_session;
5362 		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
5363 		dring_pkt->dring_ident = dp->ident;
5364 
5365 		(void) vsw_send_msg(ldcp, (void *)dring_pkt,
5366 		    sizeof (vio_dring_reg_msg_t), B_TRUE);
5367 
5368 		ldcp->lane_in.lstate |= VSW_DRING_ACK_SENT;
5369 		vsw_next_milestone(ldcp);
5370 		break;
5371 
5372 	case VIO_SUBTYPE_ACK:
5373 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
5374 
5375 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV))
5376 			return;
5377 
5378 		/*
5379 		 * Peer is acknowledging our dring info and will have
5380 		 * sent us a dring identifier which we will use to
5381 		 * refer to this ring w.r.t. our peer.
5382 		 */
5383 		dp = ldcp->lane_out.dringp;
5384 		if (dp != NULL) {
5385 			/*
5386 			 * Find the ring this ident should be associated
5387 			 * with.
5388 			 */
5389 			if (vsw_dring_match(dp, dring_pkt)) {
5390 				dring_found = 1;
5391 
5392 			} else while (dp != NULL) {
5393 				if (vsw_dring_match(dp, dring_pkt)) {
5394 					dring_found = 1;
5395 					break;
5396 				}
5397 				dp = dp->next;
5398 			}
5399 
5400 			if (dring_found == 0) {
5401 				DERR(NULL, "%s: unrecognised ring cookie",
5402 				    __func__);
5403 				vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
5404 				return;
5405 			}
5406 
5407 		} else {
5408 			DERR(vswp, "%s: DRING ACK received but no drings "
5409 			    "allocated", __func__);
5410 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
5411 			return;
5412 		}
5413 
5414 		/* store ident */
5415 		dp->ident = dring_pkt->dring_ident;
5416 		ldcp->lane_out.lstate |= VSW_DRING_ACK_RECV;
5417 		vsw_next_milestone(ldcp);
5418 		break;
5419 
5420 	case VIO_SUBTYPE_NACK:
5421 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
5422 
5423 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV))
5424 			return;
5425 
5426 		ldcp->lane_out.lstate |= VSW_DRING_NACK_RECV;
5427 		vsw_next_milestone(ldcp);
5428 		break;
5429 
5430 	default:
5431 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
5432 		    dring_pkt->tag.vio_subtype);
5433 	}
5434 
5435 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
5436 }
5437 
5438 /*
5439  * Process a request from peer to unregister a dring.
5440  *
5441  * For the moment we just restart the handshake if our
5442  * peer endpoint attempts to unregister a dring.
5443  */
5444 void
5445 vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt)
5446 {
5447 	vsw_t			*vswp = ldcp->ldc_vswp;
5448 	vio_dring_unreg_msg_t	*dring_pkt;
5449 
5450 	/*
5451 	 * We know this is a ctrl/dring packet so
5452 	 * cast it into the correct structure.
5453 	 */
5454 	dring_pkt = (vio_dring_unreg_msg_t *)pkt;
5455 
5456 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
5457 
5458 	switch (dring_pkt->tag.vio_subtype) {
5459 	case VIO_SUBTYPE_INFO:
5460 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
5461 
5462 		DWARN(vswp, "%s: restarting handshake..", __func__);
5463 		break;
5464 
5465 	case VIO_SUBTYPE_ACK:
5466 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
5467 
5468 		DWARN(vswp, "%s: restarting handshake..", __func__);
5469 		break;
5470 
5471 	case VIO_SUBTYPE_NACK:
5472 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
5473 
5474 		DWARN(vswp, "%s: restarting handshake..", __func__);
5475 		break;
5476 
5477 	default:
5478 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
5479 		    dring_pkt->tag.vio_subtype);
5480 	}
5481 
5482 	vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
5483 
5484 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
5485 }
5486 
/*
 * Turn the multicast message 'pkt' into a NACK carrying our local
 * session id and send it back over 'ldcp'.
 *
 * Wrapped in do { } while (0) so that the macro behaves as a single
 * statement (safe in an unbraced if/else); invoke it with a trailing
 * semicolon. Arguments are parenthesized so any pointer expression
 * may be passed.
 */
#define	SND_MCST_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		(void) vsw_send_msg((ldcp), (void *)(pkt), \
		    sizeof (vnet_mcast_msg_t), B_TRUE); \
	_NOTE(CONSTCOND) \
	} while (0)
5492 
5493 /*
5494  * Process a multicast request from a vnet.
5495  *
5496  * Vnet's specify a multicast address that they are interested in. This
5497  * address is used as a key into the hash table which forms the multicast
5498  * forwarding database (mFDB).
5499  *
5500  * The table keys are the multicast addresses, while the table entries
5501  * are pointers to lists of ports which wish to receive packets for the
5502  * specified multicast address.
5503  *
5504  * When a multicast packet is being switched we use the address as a key
5505  * into the hash table, and then walk the appropriate port list forwarding
5506  * the pkt to each port in turn.
5507  *
5508  * If a vnet is no longer interested in a particular multicast grouping
5509  * we simply find the correct location in the hash table and then delete
5510  * the relevant port from the port list.
5511  *
5512  * To deal with the case whereby a port is being deleted without first
5513  * removing itself from the lists in the hash table, we maintain a list
5514  * of multicast addresses the port has registered an interest in, within
5515  * the port structure itself. We then simply walk that list of addresses
5516  * using them as keys into the hash table and remove the port from the
5517  * appropriate lists.
5518  */
5519 static void
5520 vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt)
5521 {
5522 	vnet_mcast_msg_t	*mcst_pkt;
5523 	vsw_port_t		*port = ldcp->ldc_port;
5524 	vsw_t			*vswp = ldcp->ldc_vswp;
5525 	int			i;
5526 
5527 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
5528 
5529 	/*
5530 	 * We know this is a ctrl/mcast packet so
5531 	 * cast it into the correct structure.
5532 	 */
5533 	mcst_pkt = (vnet_mcast_msg_t *)pkt;
5534 
5535 	switch (mcst_pkt->tag.vio_subtype) {
5536 	case VIO_SUBTYPE_INFO:
5537 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
5538 
5539 		/*
5540 		 * Check if in correct state to receive a multicast
5541 		 * message (i.e. handshake complete). If not reset
5542 		 * the handshake.
5543 		 */
5544 		if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV))
5545 			return;
5546 
5547 		/*
5548 		 * Before attempting to add or remove address check
5549 		 * that they are valid multicast addresses.
5550 		 * If not, then NACK back.
5551 		 */
5552 		for (i = 0; i < mcst_pkt->count; i++) {
5553 			if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) {
5554 				DERR(vswp, "%s: invalid multicast address",
5555 				    __func__);
5556 				SND_MCST_NACK(ldcp, mcst_pkt);
5557 				return;
5558 			}
5559 		}
5560 
5561 		/*
5562 		 * Now add/remove the addresses. If this fails we
5563 		 * NACK back.
5564 		 */
5565 		if (vsw_add_rem_mcst(mcst_pkt, port) != 0) {
5566 			SND_MCST_NACK(ldcp, mcst_pkt);
5567 			return;
5568 		}
5569 
5570 		mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
5571 		mcst_pkt->tag.vio_sid = ldcp->local_session;
5572 
5573 		DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt);
5574 
5575 		(void) vsw_send_msg(ldcp, (void *)mcst_pkt,
5576 		    sizeof (vnet_mcast_msg_t), B_TRUE);
5577 		break;
5578 
5579 	case VIO_SUBTYPE_ACK:
5580 		DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
5581 
5582 		/*
5583 		 * We shouldn't ever get a multicast ACK message as
5584 		 * at the moment we never request multicast addresses
5585 		 * to be set on some other device. This may change in
5586 		 * the future if we have cascading switches.
5587 		 */
5588 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV))
5589 			return;
5590 
5591 				/* Do nothing */
5592 		break;
5593 
5594 	case VIO_SUBTYPE_NACK:
5595 		DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
5596 
5597 		/*
5598 		 * We shouldn't get a multicast NACK packet for the
5599 		 * same reasons as we shouldn't get a ACK packet.
5600 		 */
5601 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV))
5602 			return;
5603 
5604 				/* Do nothing */
5605 		break;
5606 
5607 	default:
5608 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
5609 		    mcst_pkt->tag.vio_subtype);
5610 	}
5611 
5612 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
5613 }
5614 
5615 static void
5616 vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt)
5617 {
5618 	vio_rdx_msg_t	*rdx_pkt;
5619 	vsw_t		*vswp = ldcp->ldc_vswp;
5620 
5621 	/*
5622 	 * We know this is a ctrl/rdx packet so
5623 	 * cast it into the correct structure.
5624 	 */
5625 	rdx_pkt = (vio_rdx_msg_t *)pkt;
5626 
5627 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
5628 
5629 	switch (rdx_pkt->tag.vio_subtype) {
5630 	case VIO_SUBTYPE_INFO:
5631 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
5632 
5633 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_INFO_RECV))
5634 			return;
5635 
5636 		rdx_pkt->tag.vio_sid = ldcp->local_session;
5637 		rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
5638 
5639 		DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt);
5640 
5641 		ldcp->lane_out.lstate |= VSW_RDX_ACK_SENT;
5642 
5643 		(void) vsw_send_msg(ldcp, (void *)rdx_pkt,
5644 		    sizeof (vio_rdx_msg_t), B_TRUE);
5645 
5646 		vsw_next_milestone(ldcp);
5647 		break;
5648 
5649 	case VIO_SUBTYPE_ACK:
5650 		/*
5651 		 * Should be handled in-band by callback handler.
5652 		 */
5653 		DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__);
5654 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
5655 		break;
5656 
5657 	case VIO_SUBTYPE_NACK:
5658 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
5659 
5660 		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_NACK_RECV))
5661 			return;
5662 
5663 		ldcp->lane_in.lstate |= VSW_RDX_NACK_RECV;
5664 		vsw_next_milestone(ldcp);
5665 		break;
5666 
5667 	default:
5668 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
5669 		    rdx_pkt->tag.vio_subtype);
5670 	}
5671 
5672 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
5673 }
5674 
5675 static void
5676 vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t tag)
5677 {
5678 	uint16_t	env = tag.vio_subtype_env;
5679 	vsw_t		*vswp = ldcp->ldc_vswp;
5680 
5681 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
5682 
5683 	/* session id check */
5684 	if (ldcp->session_status & VSW_PEER_SESSION) {
5685 		if (ldcp->peer_session != tag.vio_sid) {
5686 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
5687 			    __func__, ldcp->ldc_id, tag.vio_sid);
5688 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
5689 			return;
5690 		}
5691 	}
5692 
5693 	/*
5694 	 * It is an error for us to be getting data packets
5695 	 * before the handshake has completed.
5696 	 */
5697 	if (ldcp->hphase != VSW_MILESTONE4) {
5698 		DERR(vswp, "%s: got data packet before handshake complete "
5699 		    "hphase %d (%x: %x)", __func__, ldcp->hphase,
5700 		    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
5701 		DUMP_FLAGS(ldcp->lane_in.lstate);
5702 		DUMP_FLAGS(ldcp->lane_out.lstate);
5703 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
5704 		return;
5705 	}
5706 
5707 	/*
5708 	 * Switch on vio_subtype envelope, then let lower routines
5709 	 * decide if its an INFO, ACK or NACK packet.
5710 	 */
5711 	if (env == VIO_DRING_DATA) {
5712 		vsw_process_data_dring_pkt(ldcp, dpkt);
5713 	} else if (env == VIO_PKT_DATA) {
5714 		vsw_process_data_raw_pkt(ldcp, dpkt);
5715 	} else if (env == VIO_DESC_DATA) {
5716 		vsw_process_data_ibnd_pkt(ldcp, dpkt);
5717 	} else {
5718 		DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env);
5719 	}
5720 
5721 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
5722 }
5723 
/*
 * Turn the dring data message 'pkt' into a NACK carrying our local
 * session id and send it back over 'ldcp'.
 *
 * Wrapped in do { } while (0) so that the macro behaves as a single
 * statement (safe in an unbraced if/else); invoke it with a trailing
 * semicolon. Arguments are parenthesized so any pointer expression
 * may be passed.
 */
#define	SND_DRING_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		(void) vsw_send_msg((ldcp), (void *)(pkt), \
		    sizeof (vio_dring_msg_t), B_TRUE); \
	_NOTE(CONSTCOND) \
	} while (0)
5729 
5730 static void
5731 vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt)
5732 {
5733 	vio_dring_msg_t		*dring_pkt;
5734 	vnet_public_desc_t	*pub_addr = NULL;
5735 	vsw_private_desc_t	*priv_addr = NULL;
5736 	dring_info_t		*dp = NULL;
5737 	vsw_t			*vswp = ldcp->ldc_vswp;
5738 	mblk_t			*mp = NULL;
5739 	mblk_t			*bp = NULL;
5740 	mblk_t			*bpt = NULL;
5741 	size_t			nbytes = 0;
5742 	size_t			off = 0;
5743 	uint64_t		ncookies = 0;
5744 	uint64_t		chain = 0;
5745 	uint64_t		j, len;
5746 	uint32_t		pos, start, datalen;
5747 	uint32_t		range_start, range_end;
5748 	int32_t			end, num, cnt = 0;
5749 	int			i, rv, msg_rv = 0;
5750 	boolean_t		ack_needed = B_FALSE;
5751 	boolean_t		prev_desc_ack = B_FALSE;
5752 	int			read_attempts = 0;
5753 
5754 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
5755 
5756 	/*
5757 	 * We know this is a data/dring packet so
5758 	 * cast it into the correct structure.
5759 	 */
5760 	dring_pkt = (vio_dring_msg_t *)dpkt;
5761 
5762 	/*
5763 	 * Switch on the vio_subtype. If its INFO then we need to
5764 	 * process the data. If its an ACK we need to make sure
5765 	 * it makes sense (i.e did we send an earlier data/info),
5766 	 * and if its a NACK then we maybe attempt a retry.
5767 	 */
5768 	switch (dring_pkt->tag.vio_subtype) {
5769 	case VIO_SUBTYPE_INFO:
5770 		D2(vswp, "%s(%lld): VIO_SUBTYPE_INFO", __func__, ldcp->ldc_id);
5771 
5772 		READ_ENTER(&ldcp->lane_in.dlistrw);
5773 		if ((dp = vsw_ident2dring(&ldcp->lane_in,
5774 		    dring_pkt->dring_ident)) == NULL) {
5775 			RW_EXIT(&ldcp->lane_in.dlistrw);
5776 
5777 			DERR(vswp, "%s(%lld): unable to find dring from "
5778 			    "ident 0x%llx", __func__, ldcp->ldc_id,
5779 			    dring_pkt->dring_ident);
5780 
5781 			SND_DRING_NACK(ldcp, dring_pkt);
5782 			return;
5783 		}
5784 
5785 		start = pos = dring_pkt->start_idx;
5786 		end = dring_pkt->end_idx;
5787 		len = dp->num_descriptors;
5788 
5789 		range_start = range_end = pos;
5790 
5791 		D2(vswp, "%s(%lld): start index %ld : end %ld\n",
5792 		    __func__, ldcp->ldc_id, start, end);
5793 
5794 		if (end == -1) {
5795 			num = -1;
5796 		} else if (end >= 0) {
5797 			num = end >= pos ? end - pos + 1: (len - pos + 1) + end;
5798 
5799 			/* basic sanity check */
5800 			if (end > len) {
5801 				RW_EXIT(&ldcp->lane_in.dlistrw);
5802 				DERR(vswp, "%s(%lld): endpoint %lld outside "
5803 				    "ring length %lld", __func__,
5804 				    ldcp->ldc_id, end, len);
5805 
5806 				SND_DRING_NACK(ldcp, dring_pkt);
5807 				return;
5808 			}
5809 		} else {
5810 			RW_EXIT(&ldcp->lane_in.dlistrw);
5811 			DERR(vswp, "%s(%lld): invalid endpoint %lld",
5812 			    __func__, ldcp->ldc_id, end);
5813 			SND_DRING_NACK(ldcp, dring_pkt);
5814 			return;
5815 		}
5816 
5817 		while (cnt != num) {
5818 vsw_recheck_desc:
5819 			if ((rv = ldc_mem_dring_acquire(dp->handle,
5820 			    pos, pos)) != 0) {
5821 				RW_EXIT(&ldcp->lane_in.dlistrw);
5822 				DERR(vswp, "%s(%lld): unable to acquire "
5823 				    "descriptor at pos %d: err %d",
5824 				    __func__, pos, ldcp->ldc_id, rv);
5825 				SND_DRING_NACK(ldcp, dring_pkt);
5826 				return;
5827 			}
5828 
5829 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + pos;
5830 
5831 			/*
5832 			 * When given a bounded range of descriptors
5833 			 * to process, its an error to hit a descriptor
5834 			 * which is not ready. In the non-bounded case
5835 			 * (end_idx == -1) this simply indicates we have
5836 			 * reached the end of the current active range.
5837 			 */
5838 			if (pub_addr->hdr.dstate != VIO_DESC_READY) {
5839 				/* unbound - no error */
5840 				if (end == -1) {
5841 					if (read_attempts == vsw_read_attempts)
5842 						break;
5843 
5844 					delay(drv_usectohz(vsw_desc_delay));
5845 					read_attempts++;
5846 					goto vsw_recheck_desc;
5847 				}
5848 
5849 				/* bounded - error - so NACK back */
5850 				RW_EXIT(&ldcp->lane_in.dlistrw);
5851 				DERR(vswp, "%s(%lld): descriptor not READY "
5852 				    "(%d)", __func__, ldcp->ldc_id,
5853 				    pub_addr->hdr.dstate);
5854 				SND_DRING_NACK(ldcp, dring_pkt);
5855 				return;
5856 			}
5857 
5858 			DTRACE_PROBE1(read_attempts, int, read_attempts);
5859 
5860 			range_end = pos;
5861 
5862 			/*
5863 			 * If we ACK'd the previous descriptor then now
5864 			 * record the new range start position for later
5865 			 * ACK's.
5866 			 */
5867 			if (prev_desc_ack) {
5868 				range_start = pos;
5869 
5870 				D2(vswp, "%s(%lld): updating range start to be "
5871 				    "%d", __func__, ldcp->ldc_id, range_start);
5872 
5873 				prev_desc_ack = B_FALSE;
5874 			}
5875 
5876 			/*
5877 			 * Data is padded to align on 8 byte boundary,
5878 			 * datalen is actual data length, i.e. minus that
5879 			 * padding.
5880 			 */
5881 			datalen = pub_addr->nbytes;
5882 
5883 			/*
5884 			 * Does peer wish us to ACK when we have finished
5885 			 * with this descriptor ?
5886 			 */
5887 			if (pub_addr->hdr.ack)
5888 				ack_needed = B_TRUE;
5889 
5890 			D2(vswp, "%s(%lld): processing desc %lld at pos"
5891 			    " 0x%llx : dstate 0x%lx : datalen 0x%lx",
5892 			    __func__, ldcp->ldc_id, pos, pub_addr,
5893 			    pub_addr->hdr.dstate, datalen);
5894 
5895 			/*
5896 			 * Mark that we are starting to process descriptor.
5897 			 */
5898 			pub_addr->hdr.dstate = VIO_DESC_ACCEPTED;
5899 
5900 			mp = vio_allocb(ldcp->rxh);
5901 			if (mp == NULL) {
5902 				/*
5903 				 * No free receive buffers available, so
5904 				 * fallback onto allocb(9F). Make sure that
5905 				 * we get a data buffer which is a multiple
5906 				 * of 8 as this is required by ldc_mem_copy.
5907 				 */
5908 				DTRACE_PROBE(allocb);
5909 				if ((mp = allocb(datalen + VNET_IPALIGN + 8,
5910 				    BPRI_MED)) == NULL) {
5911 					DERR(vswp, "%s(%ld): allocb failed",
5912 					    __func__, ldcp->ldc_id);
5913 					pub_addr->hdr.dstate = VIO_DESC_DONE;
5914 					(void) ldc_mem_dring_release(dp->handle,
5915 					    pos, pos);
5916 					break;
5917 				}
5918 			}
5919 
5920 			/*
5921 			 * Ensure that we ask ldc for an aligned
5922 			 * number of bytes.
5923 			 */
5924 			nbytes = datalen + VNET_IPALIGN;
5925 			if (nbytes & 0x7) {
5926 				off = 8 - (nbytes & 0x7);
5927 				nbytes += off;
5928 			}
5929 
5930 			ncookies = pub_addr->ncookies;
5931 			rv = ldc_mem_copy(ldcp->ldc_handle,
5932 			    (caddr_t)mp->b_rptr, 0, &nbytes,
5933 			    pub_addr->memcookie, ncookies, LDC_COPY_IN);
5934 
5935 			if (rv != 0) {
5936 				DERR(vswp, "%s(%d): unable to copy in data "
5937 				    "from %d cookies in desc %d (rv %d)",
5938 				    __func__, ldcp->ldc_id, ncookies, pos, rv);
5939 				freemsg(mp);
5940 
5941 				pub_addr->hdr.dstate = VIO_DESC_DONE;
5942 				(void) ldc_mem_dring_release(dp->handle,
5943 				    pos, pos);
5944 				break;
5945 			} else {
5946 				D2(vswp, "%s(%d): copied in %ld bytes"
5947 				    " using %d cookies", __func__,
5948 				    ldcp->ldc_id, nbytes, ncookies);
5949 			}
5950 
5951 			/* adjust the read pointer to skip over the padding */
5952 			mp->b_rptr += VNET_IPALIGN;
5953 
5954 			/* point to the actual end of data */
5955 			mp->b_wptr = mp->b_rptr + datalen;
5956 
5957 			/* build a chain of received packets */
5958 			if (bp == NULL) {
5959 				/* first pkt */
5960 				bp = mp;
5961 				bp->b_next = bp->b_prev = NULL;
5962 				bpt = bp;
5963 				chain = 1;
5964 			} else {
5965 				mp->b_next = NULL;
5966 				mp->b_prev = bpt;
5967 				bpt->b_next = mp;
5968 				bpt = mp;
5969 				chain++;
5970 			}
5971 
5972 			/* mark we are finished with this descriptor */
5973 			pub_addr->hdr.dstate = VIO_DESC_DONE;
5974 
5975 			(void) ldc_mem_dring_release(dp->handle, pos, pos);
5976 
5977 			/*
5978 			 * Send an ACK back to peer if requested.
5979 			 */
5980 			if (ack_needed) {
5981 				ack_needed = B_FALSE;
5982 
5983 				dring_pkt->start_idx = range_start;
5984 				dring_pkt->end_idx = range_end;
5985 
5986 				DERR(vswp, "%s(%lld): processed %d %d, ACK"
5987 				    " requested", __func__, ldcp->ldc_id,
5988 				    dring_pkt->start_idx, dring_pkt->end_idx);
5989 
5990 				dring_pkt->dring_process_state = VIO_DP_ACTIVE;
5991 				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
5992 				dring_pkt->tag.vio_sid = ldcp->local_session;
5993 
5994 				msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
5995 				    sizeof (vio_dring_msg_t), B_FALSE);
5996 
5997 				/*
5998 				 * Check if ACK was successfully sent. If not
5999 				 * we break and deal with that below.
6000 				 */
6001 				if (msg_rv != 0)
6002 					break;
6003 
6004 				prev_desc_ack = B_TRUE;
6005 				range_start = pos;
6006 			}
6007 
6008 			/* next descriptor */
6009 			pos = (pos + 1) % len;
6010 			cnt++;
6011 
6012 			/*
6013 			 * Break out of loop here and stop processing to
6014 			 * allow some other network device (or disk) to
6015 			 * get access to the cpu.
6016 			 */
6017 			if (chain > vsw_chain_len) {
6018 				D3(vswp, "%s(%lld): switching chain of %d "
6019 				    "msgs", __func__, ldcp->ldc_id, chain);
6020 				break;
6021 			}
6022 		}
6023 		RW_EXIT(&ldcp->lane_in.dlistrw);
6024 
6025 		/*
6026 		 * If when we attempted to send the ACK we found that the
6027 		 * channel had been reset then now handle this. We deal with
6028 		 * it here as we cannot reset the channel while holding the
6029 		 * dlistrw lock, and we don't want to acquire/release it
6030 		 * continuously in the above loop, as a channel reset should
6031 		 * be a rare event.
6032 		 */
6033 		if (msg_rv == ECONNRESET) {
6034 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
6035 			break;
6036 		}
6037 
6038 		/* send the chain of packets to be switched */
6039 		if (bp != NULL) {
6040 			D3(vswp, "%s(%lld): switching chain of %d msgs",
6041 			    __func__, ldcp->ldc_id, chain);
6042 			vswp->vsw_switch_frame(vswp, bp, VSW_VNETPORT,
6043 			    ldcp->ldc_port, NULL);
6044 		}
6045 
6046 		DTRACE_PROBE1(msg_cnt, int, cnt);
6047 
6048 		/*
6049 		 * We are now finished so ACK back with the state
6050 		 * set to STOPPING so our peer knows we are finished
6051 		 */
6052 		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
6053 		dring_pkt->tag.vio_sid = ldcp->local_session;
6054 
6055 		dring_pkt->dring_process_state = VIO_DP_STOPPED;
6056 
6057 		DTRACE_PROBE(stop_process_sent);
6058 
6059 		/*
6060 		 * We have not processed any more descriptors beyond
6061 		 * the last one we ACK'd.
6062 		 */
6063 		if (prev_desc_ack)
6064 			range_start = range_end;
6065 
6066 		dring_pkt->start_idx = range_start;
6067 		dring_pkt->end_idx = range_end;
6068 
6069 		D2(vswp, "%s(%lld) processed : %d : %d, now stopping",
6070 		    __func__, ldcp->ldc_id, dring_pkt->start_idx,
6071 		    dring_pkt->end_idx);
6072 
6073 		(void) vsw_send_msg(ldcp, (void *)dring_pkt,
6074 		    sizeof (vio_dring_msg_t), B_TRUE);
6075 		break;
6076 
6077 	case VIO_SUBTYPE_ACK:
6078 		D2(vswp, "%s(%lld): VIO_SUBTYPE_ACK", __func__, ldcp->ldc_id);
6079 		/*
6080 		 * Verify that the relevant descriptors are all
6081 		 * marked as DONE
6082 		 */
6083 		READ_ENTER(&ldcp->lane_out.dlistrw);
6084 		if ((dp = vsw_ident2dring(&ldcp->lane_out,
6085 		    dring_pkt->dring_ident)) == NULL) {
6086 			RW_EXIT(&ldcp->lane_out.dlistrw);
6087 			DERR(vswp, "%s: unknown ident in ACK", __func__);
6088 			return;
6089 		}
6090 
6091 		pub_addr = (vnet_public_desc_t *)dp->pub_addr;
6092 		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
6093 
6094 		start = end = 0;
6095 		start = dring_pkt->start_idx;
6096 		end = dring_pkt->end_idx;
6097 		len = dp->num_descriptors;
6098 
6099 		j = num = 0;
6100 		/* calculate # descriptors taking into a/c wrap around */
6101 		num = end >= start ? end - start + 1: (len - start + 1) + end;
6102 
6103 		D2(vswp, "%s(%lld): start index %ld : end %ld : num %ld\n",
6104 		    __func__, ldcp->ldc_id, start, end, num);
6105 
6106 		mutex_enter(&dp->dlock);
6107 		dp->last_ack_recv = end;
6108 		mutex_exit(&dp->dlock);
6109 
6110 		for (i = start; j < num; i = (i + 1) % len, j++) {
6111 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
6112 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
6113 
6114 			/*
6115 			 * If the last descriptor in a range has the ACK
6116 			 * bit set then we will get two messages from our
6117 			 * peer relating to it. The normal ACK msg and then
6118 			 * a subsequent STOP msg. The first message will have
6119 			 * resulted in the descriptor being reclaimed and
6120 			 * its state set to FREE so when we encounter a non
6121 			 * DONE descriptor we need to check to see if its
6122 			 * because we have just reclaimed it.
6123 			 */
6124 			mutex_enter(&priv_addr->dstate_lock);
6125 			if (pub_addr->hdr.dstate == VIO_DESC_DONE) {
6126 				/* clear all the fields */
6127 				bzero(priv_addr->datap, priv_addr->datalen);
6128 				priv_addr->datalen = 0;
6129 
6130 				pub_addr->hdr.dstate = VIO_DESC_FREE;
6131 				pub_addr->hdr.ack = 0;
6132 
6133 				priv_addr->dstate = VIO_DESC_FREE;
6134 				mutex_exit(&priv_addr->dstate_lock);
6135 
6136 				D3(vswp, "clearing descp %d : pub state "
6137 				    "0x%llx : priv state 0x%llx", i,
6138 				    pub_addr->hdr.dstate, priv_addr->dstate);
6139 
6140 			} else {
6141 				mutex_exit(&priv_addr->dstate_lock);
6142 
6143 				if (dring_pkt->dring_process_state !=
6144 				    VIO_DP_STOPPED) {
6145 					DERR(vswp, "%s: descriptor %lld at pos "
6146 					    " 0x%llx not DONE (0x%lx)\n",
6147 					    __func__, i, pub_addr,
6148 					    pub_addr->hdr.dstate);
6149 					RW_EXIT(&ldcp->lane_out.dlistrw);
6150 					return;
6151 				}
6152 			}
6153 		}
6154 
6155 		/*
6156 		 * If our peer is stopping processing descriptors then
6157 		 * we check to make sure it has processed all the descriptors
6158 		 * we have updated. If not then we send it a new message
6159 		 * to prompt it to restart.
6160 		 */
6161 		if (dring_pkt->dring_process_state == VIO_DP_STOPPED) {
6162 			DTRACE_PROBE(stop_process_recv);
6163 			D2(vswp, "%s(%lld): got stopping msg : %d : %d",
6164 			    __func__, ldcp->ldc_id, dring_pkt->start_idx,
6165 			    dring_pkt->end_idx);
6166 
6167 			/*
6168 			 * Check next descriptor in public section of ring.
6169 			 * If its marked as READY then we need to prompt our
6170 			 * peer to start processing the ring again.
6171 			 */
6172 			i = (end + 1) % len;
6173 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
6174 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
6175 
6176 			/*
6177 			 * Hold the restart lock across all of this to
6178 			 * make sure that its not possible for us to
6179 			 * decide that a msg needs to be sent in the future
6180 			 * but the sending code having already checked is
6181 			 * about to exit.
6182 			 */
6183 			mutex_enter(&dp->restart_lock);
6184 			mutex_enter(&priv_addr->dstate_lock);
6185 			if (pub_addr->hdr.dstate == VIO_DESC_READY) {
6186 
6187 				mutex_exit(&priv_addr->dstate_lock);
6188 
6189 				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
6190 				dring_pkt->tag.vio_sid = ldcp->local_session;
6191 
6192 				mutex_enter(&ldcp->lane_out.seq_lock);
6193 				dring_pkt->seq_num = ldcp->lane_out.seq_num++;
6194 				mutex_exit(&ldcp->lane_out.seq_lock);
6195 
6196 				dring_pkt->start_idx = (end + 1) % len;
6197 				dring_pkt->end_idx = -1;
6198 
6199 				D2(vswp, "%s(%lld) : sending restart msg:"
6200 				    " %d : %d", __func__, ldcp->ldc_id,
6201 				    dring_pkt->start_idx, dring_pkt->end_idx);
6202 
6203 				msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
6204 				    sizeof (vio_dring_msg_t), B_FALSE);
6205 
6206 			} else {
6207 				mutex_exit(&priv_addr->dstate_lock);
6208 				dp->restart_reqd = B_TRUE;
6209 			}
6210 			mutex_exit(&dp->restart_lock);
6211 		}
6212 		RW_EXIT(&ldcp->lane_out.dlistrw);
6213 
6214 		/* only do channel reset after dropping dlistrw lock */
6215 		if (msg_rv == ECONNRESET)
6216 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
6217 
6218 		break;
6219 
6220 	case VIO_SUBTYPE_NACK:
6221 		DWARN(vswp, "%s(%lld): VIO_SUBTYPE_NACK",
6222 		    __func__, ldcp->ldc_id);
6223 		/*
6224 		 * Something is badly wrong if we are getting NACK's
6225 		 * for our data pkts. So reset the channel.
6226 		 */
6227 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
6228 
6229 		break;
6230 
6231 	default:
6232 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
6233 		    ldcp->ldc_id, dring_pkt->tag.vio_subtype);
6234 	}
6235 
6236 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
6237 }
6238 
6239 /*
6240  * VIO_PKT_DATA (a.k.a raw data mode )
6241  *
6242  * Note - currently not supported. Do nothing.
6243  */
6244 static void
6245 vsw_process_data_raw_pkt(vsw_ldc_t *ldcp, void *dpkt)
6246 {
6247 	_NOTE(ARGUNUSED(dpkt))
6248 
6249 	D1(NULL, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
6250 	DERR(NULL, "%s (%lld): currently unsupported", __func__, ldcp->ldc_id);
6251 	D1(NULL, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
6252 }
6253 
6254 /*
6255  * Process an in-band descriptor message (most likely from
6256  * OBP).
6257  */
6258 static void
6259 vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt)
6260 {
6261 	vnet_ibnd_desc_t	*ibnd_desc;
6262 	dring_info_t		*dp = NULL;
6263 	vsw_private_desc_t	*priv_addr = NULL;
6264 	vsw_t			*vswp = ldcp->ldc_vswp;
6265 	mblk_t			*mp = NULL;
6266 	mblk_t			*nmp;
6267 	size_t			nbytes = 0;
6268 	size_t			off = 0;
6269 	uint64_t		idx = 0;
6270 	uint32_t		num = 1, len, datalen = 0;
6271 	uint64_t		ncookies = 0;
6272 	int			i, rv;
6273 	int			j = 0;
6274 
6275 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
6276 
6277 	ibnd_desc = (vnet_ibnd_desc_t *)pkt;
6278 
6279 	switch (ibnd_desc->hdr.tag.vio_subtype) {
6280 	case VIO_SUBTYPE_INFO:
6281 		D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
6282 
6283 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
6284 			return;
6285 
6286 		/*
6287 		 * Data is padded to align on a 8 byte boundary,
6288 		 * nbytes is actual data length, i.e. minus that
6289 		 * padding.
6290 		 */
6291 		datalen = ibnd_desc->nbytes;
6292 
6293 		D2(vswp, "%s(%lld): processing inband desc : "
6294 		    ": datalen 0x%lx", __func__, ldcp->ldc_id, datalen);
6295 
6296 		ncookies = ibnd_desc->ncookies;
6297 
6298 		/*
6299 		 * allocb(9F) returns an aligned data block. We
6300 		 * need to ensure that we ask ldc for an aligned
6301 		 * number of bytes also.
6302 		 */
6303 		nbytes = datalen;
6304 		if (nbytes & 0x7) {
6305 			off = 8 - (nbytes & 0x7);
6306 			nbytes += off;
6307 		}
6308 
6309 		mp = allocb(datalen, BPRI_MED);
6310 		if (mp == NULL) {
6311 			DERR(vswp, "%s(%lld): allocb failed",
6312 			    __func__, ldcp->ldc_id);
6313 			return;
6314 		}
6315 
6316 		rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr,
6317 		    0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies,
6318 		    LDC_COPY_IN);
6319 
6320 		if (rv != 0) {
6321 			DERR(vswp, "%s(%d): unable to copy in data from "
6322 			    "%d cookie(s)", __func__, ldcp->ldc_id, ncookies);
6323 			freemsg(mp);
6324 			return;
6325 		}
6326 
6327 		D2(vswp, "%s(%d): copied in %ld bytes using %d cookies",
6328 		    __func__, ldcp->ldc_id, nbytes, ncookies);
6329 
6330 		/*
6331 		 * Upper layer is expecting the IP header in the packet to
6332 		 * be 4-bytes aligned, but the OBP is sending packets that
6333 		 * are not aligned.  So, copy the data to another message
6334 		 * such that the alignment requirement is met.
6335 		 */
6336 		nmp = allocb(datalen + VNET_IPALIGN, BPRI_MED);
6337 		if (nmp == NULL) {
6338 			DERR(vswp, "%s(%lld): allocb failed",
6339 			    __func__, ldcp->ldc_id);
6340 			freemsg(mp);
6341 			return;
6342 		}
6343 		nmp->b_rptr += VNET_IPALIGN;
6344 		bcopy(mp->b_rptr, nmp->b_rptr, datalen);
6345 		freemsg(mp);
6346 
6347 		/* point to the actual end of data */
6348 		nmp->b_wptr = nmp->b_rptr + datalen;
6349 
6350 		/*
6351 		 * We ACK back every in-band descriptor message we process
6352 		 */
6353 		ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK;
6354 		ibnd_desc->hdr.tag.vio_sid = ldcp->local_session;
6355 		(void) vsw_send_msg(ldcp, (void *)ibnd_desc,
6356 		    sizeof (vnet_ibnd_desc_t), B_TRUE);
6357 
6358 		/* send the packet to be switched */
6359 		vswp->vsw_switch_frame(vswp, nmp, VSW_VNETPORT,
6360 		    ldcp->ldc_port, NULL);
6361 
6362 		break;
6363 
6364 	case VIO_SUBTYPE_ACK:
6365 		D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
6366 
6367 		/* Verify the ACK is valid */
6368 		idx = ibnd_desc->hdr.desc_handle;
6369 
6370 		if (idx >= VSW_RING_NUM_EL) {
6371 			cmn_err(CE_WARN, "!vsw%d: corrupted ACK received "
6372 			    "(idx %ld)", vswp->instance, idx);
6373 			return;
6374 		}
6375 
6376 		if ((dp = ldcp->lane_out.dringp) == NULL) {
6377 			DERR(vswp, "%s: no dring found", __func__);
6378 			return;
6379 		}
6380 
6381 		len = dp->num_descriptors;
6382 		/*
6383 		 * If the descriptor we are being ACK'ed for is not the
6384 		 * one we expected, then pkts were lost somewhere, either
6385 		 * when we tried to send a msg, or a previous ACK msg from
6386 		 * our peer. In either case we now reclaim the descriptors
6387 		 * in the range from the last ACK we received up to the
6388 		 * current ACK.
6389 		 */
6390 		if (idx != dp->last_ack_recv) {
6391 			DWARN(vswp, "%s: dropped pkts detected, (%ld, %ld)",
6392 			    __func__, dp->last_ack_recv, idx);
6393 			num = idx >= dp->last_ack_recv ?
6394 			    idx - dp->last_ack_recv + 1:
6395 			    (len - dp->last_ack_recv + 1) + idx;
6396 		}
6397 
6398 		/*
6399 		 * When we sent the in-band message to our peer we
6400 		 * marked the copy in our private ring as READY. We now
6401 		 * check that the descriptor we are being ACK'ed for is in
6402 		 * fact READY, i.e. it is one we have shared with our peer.
6403 		 *
6404 		 * If its not we flag an error, but still reset the descr
6405 		 * back to FREE.
6406 		 */
6407 		for (i = dp->last_ack_recv; j < num; i = (i + 1) % len, j++) {
6408 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
6409 			mutex_enter(&priv_addr->dstate_lock);
6410 			if (priv_addr->dstate != VIO_DESC_READY) {
6411 				DERR(vswp, "%s: (%ld) desc at index %ld not "
6412 				    "READY (0x%lx)", __func__,
6413 				    ldcp->ldc_id, idx, priv_addr->dstate);
6414 				DERR(vswp, "%s: bound %d: ncookies %ld : "
6415 				    "datalen %ld", __func__,
6416 				    priv_addr->bound, priv_addr->ncookies,
6417 				    priv_addr->datalen);
6418 			}
6419 			D2(vswp, "%s: (%lld) freeing descp at %lld", __func__,
6420 			    ldcp->ldc_id, idx);
6421 			/* release resources associated with sent msg */
6422 			bzero(priv_addr->datap, priv_addr->datalen);
6423 			priv_addr->datalen = 0;
6424 			priv_addr->dstate = VIO_DESC_FREE;
6425 			mutex_exit(&priv_addr->dstate_lock);
6426 		}
6427 		/* update to next expected value */
6428 		dp->last_ack_recv = (idx + 1) % dp->num_descriptors;
6429 
6430 		break;
6431 
6432 	case VIO_SUBTYPE_NACK:
6433 		DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
6434 
6435 		/*
6436 		 * We should only get a NACK if our peer doesn't like
6437 		 * something about a message we have sent it. If this
6438 		 * happens we just release the resources associated with
6439 		 * the message. (We are relying on higher layers to decide
		 * whether or not to resend.)
6441 		 */
6442 
6443 		/* limit check */
6444 		idx = ibnd_desc->hdr.desc_handle;
6445 
6446 		if (idx >= VSW_RING_NUM_EL) {
6447 			DERR(vswp, "%s: corrupted NACK received (idx %lld)",
6448 			    __func__, idx);
6449 			return;
6450 		}
6451 
6452 		if ((dp = ldcp->lane_out.dringp) == NULL) {
6453 			DERR(vswp, "%s: no dring found", __func__);
6454 			return;
6455 		}
6456 
6457 		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
6458 
6459 		/* move to correct location in ring */
6460 		priv_addr += idx;
6461 
6462 		/* release resources associated with sent msg */
6463 		mutex_enter(&priv_addr->dstate_lock);
6464 		bzero(priv_addr->datap, priv_addr->datalen);
6465 		priv_addr->datalen = 0;
6466 		priv_addr->dstate = VIO_DESC_FREE;
6467 		mutex_exit(&priv_addr->dstate_lock);
6468 
6469 		break;
6470 
6471 	default:
6472 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
6473 		    ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype);
6474 	}
6475 
6476 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
6477 }
6478 
6479 static void
6480 vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t tag)
6481 {
6482 	_NOTE(ARGUNUSED(epkt))
6483 
6484 	vsw_t		*vswp = ldcp->ldc_vswp;
6485 	uint16_t	env = tag.vio_subtype_env;
6486 
6487 	D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
6488 
6489 	/*
6490 	 * Error vio_subtypes have yet to be defined. So for
6491 	 * the moment we can't do anything.
6492 	 */
6493 	D2(vswp, "%s: (%x) vio_subtype env", __func__, env);
6494 
6495 	D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
6496 }
6497 
/*
 * Switch the given ethernet frame when operating in layer 2 mode.
 *
 * vswp: pointer to the vsw instance
 * mp: pointer to chain of ethernet frame(s) to be switched
 * caller: identifies the source of this frame as:
 *		1. VSW_VNETPORT - a vsw port (connected to a vnet).
 *		2. VSW_PHYSDEV - the physical ethernet device
 *		3. VSW_LOCALDEV - vsw configured as a virtual interface
 * arg: argument provided by the caller.
 *		1. for VNETPORT - pointer to the corresponding vsw_port_t.
 *		2. for PHYSDEV - NULL
 *		3. for LOCALDEV - pointer to this vsw_t(self)
 * mrh: MAC resource handle; passed through unchanged to mac_rx()
 *	whenever a frame is sent up the stack from this routine.
 *
 * Each frame in the chain is unlinked and handled on its own. For each
 * frame the checks are made in this order:
 *	1. destination is our own interface address,
 *	2. destination has an entry in the FDB,
 *	3. destination is broadcast, multicast or (otherwise) unicast.
 */
void
vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
			vsw_port_t *arg, mac_resource_handle_t mrh)
{
	struct ether_header	*ehp;
	vsw_port_t		*port = NULL;
	mblk_t			*bp, *ret_m;
	mblk_t			*nmp = NULL;
	vsw_port_list_t		*plist = &vswp->plist;

	D1(vswp, "%s: enter (caller %d)", __func__, caller);

	/*
	 * PERF: rather than breaking up the chain here, scan it
	 * to find all mblks heading to same destination and then
	 * pass that sub-chain to the lower transmit functions.
	 */

	/* process the chain of packets */
	bp = mp;
	while (bp) {
		/* detach the head frame; bp remembers the rest of the chain */
		mp = bp;
		bp = bp->b_next;
		mp->b_next = mp->b_prev = NULL;
		ehp = (struct ether_header *)mp->b_rptr;

		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
		    __func__, MBLKSIZE(mp), MBLKL(mp));

		READ_ENTER(&vswp->if_lockrw);
		if (ether_cmp(&ehp->ether_dhost, &vswp->if_addr) == 0) {
			/*
			 * If destination is VSW_LOCALDEV (vsw as an eth
			 * interface) and if the device is up & running,
			 * send the packet up the stack on this host.
			 * If the virtual interface is down, drop the packet.
			 *
			 * A frame that originated from the local interface
			 * itself is dropped rather than looped back.
			 * In every branch the lock is released before the
			 * frame is handed on or freed.
			 */
			if (caller != VSW_LOCALDEV) {
				if (vswp->if_state & VSW_IF_UP) {
					RW_EXIT(&vswp->if_lockrw);
					mac_rx(vswp->if_mh, mrh, mp);
				} else {
					RW_EXIT(&vswp->if_lockrw);
					/* Interface down, drop pkt */
					freemsg(mp);
				}
			} else {
				RW_EXIT(&vswp->if_lockrw);
				freemsg(mp);
			}
			continue;
		}
		RW_EXIT(&vswp->if_lockrw);

		READ_ENTER(&plist->lockrw);
		port = vsw_lookup_fdb(vswp, ehp);
		if (port) {
			/*
			 * Mark the port as in-use. The reference is taken
			 * while the port list lock is still held, and the
			 * list lock is dropped only afterwards.
			 */
			mutex_enter(&port->ref_lock);
			port->ref_cnt++;
			mutex_exit(&port->ref_lock);
			RW_EXIT(&plist->lockrw);

			/*
			 * If plumbed and in promisc mode then copy msg
			 * and send up the stack.
			 */
			READ_ENTER(&vswp->if_lockrw);
			if (VSW_U_P(vswp->if_state)) {
				RW_EXIT(&vswp->if_lockrw);
				nmp = copymsg(mp);
				if (nmp)
					mac_rx(vswp->if_mh, mrh, nmp);
			} else {
				RW_EXIT(&vswp->if_lockrw);
			}

			/*
			 * If the destination is in FDB, the packet
			 * should be forwarded to the corresponding
			 * vsw_port (connected to a vnet device -
			 * VSW_VNETPORT)
			 */
			(void) vsw_portsend(port, mp);

			/*
			 * Decrement use count in port and check if
			 * should wake delete thread.
			 */
			mutex_enter(&port->ref_lock);
			port->ref_cnt--;
			if (port->ref_cnt == 0)
				cv_signal(&port->ref_cv);
			mutex_exit(&port->ref_lock);
		} else {
			RW_EXIT(&plist->lockrw);
			/*
			 * Destination not in FDB.
			 *
			 * If the destination is broadcast or
			 * multicast forward the packet to all
			 * (VNETPORTs, PHYSDEV, LOCALDEV),
			 * except the caller.
			 */
			if (IS_BROADCAST(ehp)) {
				D3(vswp, "%s: BROADCAST pkt", __func__);
				(void) vsw_forward_all(vswp, mp, caller, arg);
			} else if (IS_MULTICAST(ehp)) {
				D3(vswp, "%s: MULTICAST pkt", __func__);
				(void) vsw_forward_grp(vswp, mp, caller, arg);
			} else {
				/*
				 * If the destination is unicast, and came
				 * from either a logical network device or
				 * the switch itself when it is plumbed, then
				 * send it out on the physical device and also
				 * up the stack if the logical interface is
				 * in promiscuous mode.
				 *
				 * NOTE:  The assumption here is that if we
				 * cannot find the destination in our fdb, its
				 * a unicast address, and came from either a
				 * vnet or down the stack (when plumbed) it
				 * must be destined for an ethernet device
				 * outside our ldoms.
				 *
				 * NOTE(review): there is no final else here,
				 * so a caller value other than the three
				 * handled below leaves mp neither forwarded
				 * nor freed - confirm no such caller exists.
				 */
				if (caller == VSW_VNETPORT) {
					READ_ENTER(&vswp->if_lockrw);
					if (VSW_U_P(vswp->if_state)) {
						RW_EXIT(&vswp->if_lockrw);
						nmp = copymsg(mp);
						if (nmp)
							mac_rx(vswp->if_mh,
							    mrh, nmp);
					} else {
						RW_EXIT(&vswp->if_lockrw);
					}
					/*
					 * A non-NULL return from vsw_tx_msg
					 * is treated as frame(s) that were
					 * not sent; they are dropped here.
					 */
					if ((ret_m = vsw_tx_msg(vswp, mp))
					    != NULL) {
						DERR(vswp, "%s: drop mblks to "
						    "phys dev", __func__);
						freemsg(ret_m);
					}

				} else if (caller == VSW_PHYSDEV) {
					/*
					 * Pkt seen because card in promisc
					 * mode. Send up stack if plumbed in
					 * promisc mode, else drop it.
					 */
					READ_ENTER(&vswp->if_lockrw);
					if (VSW_U_P(vswp->if_state)) {
						RW_EXIT(&vswp->if_lockrw);
						mac_rx(vswp->if_mh, mrh, mp);
					} else {
						RW_EXIT(&vswp->if_lockrw);
						freemsg(mp);
					}

				} else if (caller == VSW_LOCALDEV) {
					/*
					 * Pkt came down the stack, send out
					 * over physical device.
					 */
					if ((ret_m = vsw_tx_msg(vswp, mp))
					    != NULL) {
						DERR(vswp, "%s: drop mblks to "
						    "phys dev", __func__);
						freemsg(ret_m);
					}
				}
			}
		}
	}
	D1(vswp, "%s: exit\n", __func__);
}
6690 
6691 /*
6692  * Switch ethernet frame when in layer 3 mode (i.e. using IP
6693  * layer to do the routing).
6694  *
6695  * There is a large amount of overlap between this function and
6696  * vsw_switch_l2_frame. At some stage we need to revisit and refactor
6697  * both these functions.
6698  */
6699 void
6700 vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
6701 			vsw_port_t *arg, mac_resource_handle_t mrh)
6702 {
6703 	struct ether_header	*ehp;
6704 	vsw_port_t		*port = NULL;
6705 	mblk_t			*bp = NULL;
6706 	vsw_port_list_t		*plist = &vswp->plist;
6707 
6708 	D1(vswp, "%s: enter (caller %d)", __func__, caller);
6709 
6710 	/*
6711 	 * In layer 3 mode should only ever be switching packets
6712 	 * between IP layer and vnet devices. So make sure thats
6713 	 * who is invoking us.
6714 	 */
6715 	if ((caller != VSW_LOCALDEV) && (caller != VSW_VNETPORT)) {
6716 		DERR(vswp, "%s: unexpected caller (%d)", __func__, caller);
6717 		freemsgchain(mp);
6718 		return;
6719 	}
6720 
6721 	/* process the chain of packets */
6722 	bp = mp;
6723 	while (bp) {
6724 		mp = bp;
6725 		bp = bp->b_next;
6726 		mp->b_next = mp->b_prev = NULL;
6727 		ehp = (struct ether_header *)mp->b_rptr;
6728 
6729 		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
6730 		    __func__, MBLKSIZE(mp), MBLKL(mp));
6731 
6732 		READ_ENTER(&plist->lockrw);
6733 		port = vsw_lookup_fdb(vswp, ehp);
6734 		if (port) {
6735 			/*
6736 			 * Mark port as in-use.
6737 			 */
6738 			mutex_enter(&port->ref_lock);
6739 			port->ref_cnt++;
6740 			mutex_exit(&port->ref_lock);
6741 			RW_EXIT(&plist->lockrw);
6742 
6743 			D2(vswp, "%s: sending to target port", __func__);
6744 			(void) vsw_portsend(port, mp);
6745 
6746 			/*
6747 			 * Finished with port so decrement ref count and
6748 			 * check if should wake delete thread.
6749 			 */
6750 			mutex_enter(&port->ref_lock);
6751 			port->ref_cnt--;
6752 			if (port->ref_cnt == 0)
6753 				cv_signal(&port->ref_cv);
6754 			mutex_exit(&port->ref_lock);
6755 		} else {
6756 			RW_EXIT(&plist->lockrw);
6757 			/*
6758 			 * Destination not in FDB
6759 			 *
6760 			 * If the destination is broadcast or
6761 			 * multicast forward the packet to all
6762 			 * (VNETPORTs, PHYSDEV, LOCALDEV),
6763 			 * except the caller.
6764 			 */
6765 			if (IS_BROADCAST(ehp)) {
6766 				D2(vswp, "%s: BROADCAST pkt", __func__);
6767 				(void) vsw_forward_all(vswp, mp, caller, arg);
6768 			} else if (IS_MULTICAST(ehp)) {
6769 				D2(vswp, "%s: MULTICAST pkt", __func__);
6770 				(void) vsw_forward_grp(vswp, mp, caller, arg);
6771 			} else {
6772 				/*
6773 				 * Unicast pkt from vnet that we don't have
6774 				 * an FDB entry for, so must be destinded for
6775 				 * the outside world. Attempt to send up to the
6776 				 * IP layer to allow it to deal with it.
6777 				 */
6778 				if (caller == VSW_VNETPORT) {
6779 					READ_ENTER(&vswp->if_lockrw);
6780 					if (vswp->if_state & VSW_IF_UP) {
6781 						RW_EXIT(&vswp->if_lockrw);
6782 						D2(vswp, "%s: sending up",
6783 						    __func__);
6784 						mac_rx(vswp->if_mh, mrh, mp);
6785 					} else {
6786 						RW_EXIT(&vswp->if_lockrw);
6787 						/* Interface down, drop pkt */
6788 						D2(vswp, "%s I/F down",
6789 						    __func__);
6790 						freemsg(mp);
6791 					}
6792 				}
6793 			}
6794 		}
6795 	}
6796 
6797 	D1(vswp, "%s: exit", __func__);
6798 }
6799 
6800 /*
6801  * Forward the ethernet frame to all ports (VNETPORTs, PHYSDEV, LOCALDEV),
6802  * except the caller (port on which frame arrived).
6803  */
6804 static int
6805 vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
6806 {
6807 	vsw_port_list_t	*plist = &vswp->plist;
6808 	vsw_port_t	*portp;
6809 	mblk_t		*nmp = NULL;
6810 	mblk_t		*ret_m = NULL;
6811 	int		skip_port = 0;
6812 
6813 	D1(vswp, "vsw_forward_all: enter\n");
6814 
6815 	/*
6816 	 * Broadcast message from inside ldoms so send to outside
6817 	 * world if in either of layer 2 modes.
6818 	 */
6819 	if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) ||
6820 	    (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) &&
6821 	    ((caller == VSW_LOCALDEV) || (caller == VSW_VNETPORT))) {
6822 
6823 		nmp = dupmsg(mp);
6824 		if (nmp) {
6825 			if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) {
6826 				DERR(vswp, "%s: dropping pkt(s) "
6827 				    "consisting of %ld bytes of data for"
6828 				    " physical device", __func__, MBLKL(ret_m));
6829 				freemsg(ret_m);
6830 			}
6831 		}
6832 	}
6833 
6834 	if (caller == VSW_VNETPORT)
6835 		skip_port = 1;
6836 
6837 	/*
6838 	 * Broadcast message from other vnet (layer 2 or 3) or outside
6839 	 * world (layer 2 only), send up stack if plumbed.
6840 	 */
6841 	if ((caller == VSW_PHYSDEV) || (caller == VSW_VNETPORT)) {
6842 		READ_ENTER(&vswp->if_lockrw);
6843 		if (vswp->if_state & VSW_IF_UP) {
6844 			RW_EXIT(&vswp->if_lockrw);
6845 			nmp = copymsg(mp);
6846 			if (nmp)
6847 				mac_rx(vswp->if_mh, NULL, nmp);
6848 		} else {
6849 			RW_EXIT(&vswp->if_lockrw);
6850 		}
6851 	}
6852 
6853 	/* send it to all VNETPORTs */
6854 	READ_ENTER(&plist->lockrw);
6855 	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
6856 		D2(vswp, "vsw_forward_all: port %d", portp->p_instance);
6857 		/*
6858 		 * Caution ! - don't reorder these two checks as arg
6859 		 * will be NULL if the caller is PHYSDEV. skip_port is
6860 		 * only set if caller is VNETPORT.
6861 		 */
6862 		if ((skip_port) && (portp == arg))
6863 			continue;
6864 		else {
6865 			nmp = dupmsg(mp);
6866 			if (nmp) {
6867 				(void) vsw_portsend(portp, nmp);
6868 			} else {
6869 				DERR(vswp, "vsw_forward_all: nmp NULL");
6870 			}
6871 		}
6872 	}
6873 	RW_EXIT(&plist->lockrw);
6874 
6875 	freemsg(mp);
6876 
6877 	D1(vswp, "vsw_forward_all: exit\n");
6878 	return (0);
6879 }
6880 
6881 /*
6882  * Forward pkts to any devices or interfaces which have registered
6883  * an interest in them (i.e. multicast groups).
6884  */
6885 static int
6886 vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
6887 {
6888 	struct ether_header	*ehp = (struct ether_header *)mp->b_rptr;
6889 	mfdb_ent_t		*entp = NULL;
6890 	mfdb_ent_t		*tpp = NULL;
6891 	vsw_port_t 		*port;
6892 	uint64_t		key = 0;
6893 	mblk_t			*nmp = NULL;
6894 	mblk_t			*ret_m = NULL;
6895 	boolean_t		check_if = B_TRUE;
6896 
6897 	/*
6898 	 * Convert address to hash table key
6899 	 */
6900 	KEY_HASH(key, ehp->ether_dhost);
6901 
6902 	D1(vswp, "%s: key 0x%llx", __func__, key);
6903 
6904 	/*
6905 	 * If pkt came from either a vnet or down the stack (if we are
6906 	 * plumbed) and we are in layer 2 mode, then we send the pkt out
6907 	 * over the physical adapter, and then check to see if any other
6908 	 * vnets are interested in it.
6909 	 */
6910 	if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) ||
6911 	    (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) &&
6912 	    ((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV))) {
6913 		nmp = dupmsg(mp);
6914 		if (nmp) {
6915 			if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) {
6916 				DERR(vswp, "%s: dropping pkt(s) consisting of "
6917 				    "%ld bytes of data for physical device",
6918 				    __func__, MBLKL(ret_m));
6919 				freemsg(ret_m);
6920 			}
6921 		}
6922 	}
6923 
6924 	READ_ENTER(&vswp->mfdbrw);
6925 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)key,
6926 	    (mod_hash_val_t *)&entp) != 0) {
6927 		D3(vswp, "%s: no table entry found for addr 0x%llx",
6928 		    __func__, key);
6929 	} else {
6930 		/*
6931 		 * Send to list of devices associated with this address...
6932 		 */
6933 		for (tpp = entp; tpp != NULL; tpp = tpp->nextp) {
6934 
6935 			/* dont send to ourselves */
6936 			if ((caller == VSW_VNETPORT) &&
6937 			    (tpp->d_addr == (void *)arg)) {
6938 				port = (vsw_port_t *)tpp->d_addr;
6939 				D3(vswp, "%s: not sending to ourselves"
6940 				    " : port %d", __func__, port->p_instance);
6941 				continue;
6942 
6943 			} else if ((caller == VSW_LOCALDEV) &&
6944 			    (tpp->d_type == VSW_LOCALDEV)) {
6945 				D3(vswp, "%s: not sending back up stack",
6946 				    __func__);
6947 				continue;
6948 			}
6949 
6950 			if (tpp->d_type == VSW_VNETPORT) {
6951 				port = (vsw_port_t *)tpp->d_addr;
6952 				D3(vswp, "%s: sending to port %ld for addr "
6953 				    "0x%llx", __func__, port->p_instance, key);
6954 
6955 				nmp = dupmsg(mp);
6956 				if (nmp)
6957 					(void) vsw_portsend(port, nmp);
6958 			} else {
6959 				if (vswp->if_state & VSW_IF_UP) {
6960 					nmp = copymsg(mp);
6961 					if (nmp)
6962 						mac_rx(vswp->if_mh, NULL, nmp);
6963 					check_if = B_FALSE;
6964 					D3(vswp, "%s: sending up stack"
6965 					    " for addr 0x%llx", __func__, key);
6966 				}
6967 			}
6968 		}
6969 	}
6970 
6971 	RW_EXIT(&vswp->mfdbrw);
6972 
6973 	/*
6974 	 * If the pkt came from either a vnet or from physical device,
6975 	 * and if we havent already sent the pkt up the stack then we
6976 	 * check now if we can/should (i.e. the interface is plumbed
6977 	 * and in promisc mode).
6978 	 */
6979 	if ((check_if) &&
6980 	    ((caller == VSW_VNETPORT) || (caller == VSW_PHYSDEV))) {
6981 		READ_ENTER(&vswp->if_lockrw);
6982 		if (VSW_U_P(vswp->if_state)) {
6983 			RW_EXIT(&vswp->if_lockrw);
6984 			D3(vswp, "%s: (caller %d) finally sending up stack"
6985 			    " for addr 0x%llx", __func__, caller, key);
6986 			nmp = copymsg(mp);
6987 			if (nmp)
6988 				mac_rx(vswp->if_mh, NULL, nmp);
6989 		} else {
6990 			RW_EXIT(&vswp->if_lockrw);
6991 		}
6992 	}
6993 
6994 	freemsg(mp);
6995 
6996 	D1(vswp, "%s: exit", __func__);
6997 
6998 	return (0);
6999 }
7000 
7001 /* transmit the packet over the given port */
7002 static int
7003 vsw_portsend(vsw_port_t *port, mblk_t *mp)
7004 {
7005 	vsw_ldc_list_t 	*ldcl = &port->p_ldclist;
7006 	vsw_ldc_t 	*ldcp;
7007 	int		status = 0;
7008 
7009 
7010 	READ_ENTER(&ldcl->lockrw);
7011 	/*
7012 	 * Note for now, we have a single channel.
7013 	 */
7014 	ldcp = ldcl->head;
7015 	if (ldcp == NULL) {
7016 		DERR(port->p_vswp, "vsw_portsend: no ldc: dropping packet\n");
7017 		freemsg(mp);
7018 		RW_EXIT(&ldcl->lockrw);
7019 		return (1);
7020 	}
7021 
7022 	/*
7023 	 * Send the message out using the appropriate
7024 	 * transmit function which will free mblock when it
7025 	 * is finished with it.
7026 	 */
7027 	mutex_enter(&port->tx_lock);
7028 	if (port->transmit != NULL)
7029 		status = (*port->transmit)(ldcp, mp);
7030 	else {
7031 		freemsg(mp);
7032 	}
7033 	mutex_exit(&port->tx_lock);
7034 
7035 	RW_EXIT(&ldcl->lockrw);
7036 
7037 	return (status);
7038 }
7039 
/*
 * Send packet out via descriptor ring to a logical device.
 *
 * ldcp: channel to send on
 * mp: packet to send; always freed before returning, on success and
 *	on every failure path
 *
 * Returns LDC_TX_SUCCESS once the data has been copied into a
 * descriptor, LDC_TX_NORESOURCES if no free descriptor was found, or
 * LDC_TX_FAILURE if the lane/channel is not usable, there is no
 * outbound dring, or the packet is larger than ETHERMAX.
 */
static int
vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp)
{
	vio_dring_msg_t		dring_pkt;
	dring_info_t		*dp = NULL;
	vsw_private_desc_t	*priv_desc = NULL;
	vnet_public_desc_t	*pub = NULL;
	vsw_t			*vswp = ldcp->ldc_vswp;
	mblk_t			*bp;
	size_t			n, size;
	caddr_t			bufp;
	int			idx;
	int			status = LDC_TX_SUCCESS;

	D1(vswp, "%s(%lld): enter\n", __func__, ldcp->ldc_id);

	/* TODO: make test a macro */
	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
		    "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
		    ldcp->lane_out.lstate);
		freemsg(mp);
		return (LDC_TX_FAILURE);
	}

	/*
	 * Note - using first ring only, this may change
	 * in the future.
	 *
	 * The dring list lock is held as reader from here until just
	 * before the function returns or calls vsw_send_msg().
	 */
	READ_ENTER(&ldcp->lane_out.dlistrw);
	if ((dp = ldcp->lane_out.dringp) == NULL) {
		RW_EXIT(&ldcp->lane_out.dlistrw);
		DERR(vswp, "%s(%lld): no dring for outbound lane on"
		    " channel %d", __func__, ldcp->ldc_id, ldcp->ldc_id);
		freemsg(mp);
		return (LDC_TX_FAILURE);
	}

	/* reject anything larger than a maximum sized ethernet frame */
	size = msgsize(mp);
	if (size > (size_t)ETHERMAX) {
		RW_EXIT(&ldcp->lane_out.dlistrw);
		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
		    ldcp->ldc_id, size);
		freemsg(mp);
		return (LDC_TX_FAILURE);
	}

	/*
	 * Find a free descriptor
	 *
	 * Note: for the moment we are assuming that we will only
	 * have one dring going from the switch to each of its
	 * peers. This may change in the future.
	 */
	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
		D2(vswp, "%s(%lld): no descriptor available for ring "
		    "at 0x%llx", __func__, ldcp->ldc_id, dp);

		/* nothing more we can do */
		status = LDC_TX_NORESOURCES;
		goto vsw_dringsend_free_exit;
	} else {
		D2(vswp, "%s(%lld): free private descriptor found at pos %ld "
		    "addr 0x%llx\n", __func__, ldcp->ldc_id, idx, priv_desc);
	}

	/*
	 * copy data into the descriptor; the data is placed
	 * VNET_IPALIGN bytes into the descriptor's buffer and each
	 * mblk in the b_cont chain is appended in turn
	 */
	bufp = priv_desc->datap;
	bufp += VNET_IPALIGN;
	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
		n = MBLKL(bp);
		bcopy(bp->b_rptr, bufp, n);
		bufp += n;
	}

	/* the recorded length is never less than ETHERMIN */
	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;

	pub = priv_desc->descp;
	pub->nbytes = priv_desc->datalen;

	/* the descriptor is marked READY only after its data is in place */
	mutex_enter(&priv_desc->dstate_lock);
	pub->hdr.dstate = VIO_DESC_READY;
	mutex_exit(&priv_desc->dstate_lock);

	/*
	 * Determine whether or not we need to send a message to our
	 * peer prompting them to read our newly updated descriptor(s).
	 */
	mutex_enter(&dp->restart_lock);
	if (dp->restart_reqd) {
		/* clear the flag under the lock so only one prompt is sent */
		dp->restart_reqd = B_FALSE;
		mutex_exit(&dp->restart_lock);

		/*
		 * Send a vio_dring_msg to peer to prompt them to read
		 * the updated descriptor ring.
		 *
		 * NOTE(review): dring_pkt is a stack variable that is
		 * not zeroed; only the fields assigned below are given
		 * values before the whole structure is sent - confirm
		 * it has no other fields the peer looks at.
		 */
		dring_pkt.tag.vio_msgtype = VIO_TYPE_DATA;
		dring_pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
		dring_pkt.tag.vio_subtype_env = VIO_DRING_DATA;
		dring_pkt.tag.vio_sid = ldcp->local_session;

		/* Note - for now using first ring */
		dring_pkt.dring_ident = dp->ident;

		mutex_enter(&ldcp->lane_out.seq_lock);
		dring_pkt.seq_num = ldcp->lane_out.seq_num++;
		mutex_exit(&ldcp->lane_out.seq_lock);

		/*
		 * If last_ack_recv is -1 then we know we've not
		 * received any ack's yet, so this must be the first
		 * msg sent, so set the start to the beginning of the ring.
		 */
		mutex_enter(&dp->dlock);
		if (dp->last_ack_recv == -1) {
			dring_pkt.start_idx = 0;
		} else {
			dring_pkt.start_idx =
			    (dp->last_ack_recv + 1) % dp->num_descriptors;
		}
		dring_pkt.end_idx = -1;
		mutex_exit(&dp->dlock);

		D3(vswp, "%s(%lld): dring 0x%llx : ident 0x%llx\n", __func__,
		    ldcp->ldc_id, dp, dring_pkt.dring_ident);
		D3(vswp, "%s(%lld): start %lld : end %lld : seq %lld\n",
		    __func__, ldcp->ldc_id, dring_pkt.start_idx,
		    dring_pkt.end_idx, dring_pkt.seq_num);

		/* the dring list lock is dropped before sending the msg */
		RW_EXIT(&ldcp->lane_out.dlistrw);

		(void) vsw_send_msg(ldcp, (void *)&dring_pkt,
		    sizeof (vio_dring_msg_t), B_TRUE);

		/* free the message block */
		freemsg(mp);
		return (status);

	} else {
		/* no prompt needed; the descriptor update stands alone */
		mutex_exit(&dp->restart_lock);
		D2(vswp, "%s(%lld): updating descp %d", __func__,
		    ldcp->ldc_id, idx);
	}

vsw_dringsend_free_exit:

	/* reached with the dring list lock still held as reader */
	RW_EXIT(&ldcp->lane_out.dlistrw);

	/* free the message block */
	freemsg(mp);

	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
	return (status);
}
7199 
7200 /*
7201  * Send an in-band descriptor message over ldc.
7202  */
7203 static int
7204 vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp)
7205 {
7206 	vsw_t			*vswp = ldcp->ldc_vswp;
7207 	vnet_ibnd_desc_t	ibnd_msg;
7208 	vsw_private_desc_t	*priv_desc = NULL;
7209 	dring_info_t		*dp = NULL;
7210 	size_t			n, size = 0;
7211 	caddr_t			bufp;
7212 	mblk_t			*bp;
7213 	int			idx, i;
7214 	int			status = LDC_TX_SUCCESS;
7215 	static int		warn_msg = 1;
7216 
7217 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
7218 
7219 	ASSERT(mp != NULL);
7220 
7221 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
7222 	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
7223 		DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt",
7224 		    __func__, ldcp->ldc_id, ldcp->ldc_status,
7225 		    ldcp->lane_out.lstate);
7226 		freemsg(mp);
7227 		return (LDC_TX_FAILURE);
7228 	}
7229 
7230 	/*
7231 	 * only expect single dring to exist, which we use
7232 	 * as an internal buffer, rather than a transfer channel.
7233 	 */
7234 	READ_ENTER(&ldcp->lane_out.dlistrw);
7235 	if ((dp = ldcp->lane_out.dringp) == NULL) {
7236 		DERR(vswp, "%s(%lld): no dring for outbound lane",
7237 		    __func__, ldcp->ldc_id);
7238 		DERR(vswp, "%s(%lld) status(%d) state (0x%llx)", __func__,
7239 		    ldcp->ldc_id, ldcp->ldc_status, ldcp->lane_out.lstate);
7240 		RW_EXIT(&ldcp->lane_out.dlistrw);
7241 		freemsg(mp);
7242 		return (LDC_TX_FAILURE);
7243 	}
7244 
7245 	size = msgsize(mp);
7246 	if (size > (size_t)ETHERMAX) {
7247 		RW_EXIT(&ldcp->lane_out.dlistrw);
7248 		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
7249 		    ldcp->ldc_id, size);
7250 		freemsg(mp);
7251 		return (LDC_TX_FAILURE);
7252 	}
7253 
7254 	/*
7255 	 * Find a free descriptor in our buffer ring
7256 	 */
7257 	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
7258 		RW_EXIT(&ldcp->lane_out.dlistrw);
7259 		if (warn_msg) {
7260 			DERR(vswp, "%s(%lld): no descriptor available for ring "
7261 			    "at 0x%llx", __func__, ldcp->ldc_id, dp);
7262 			warn_msg = 0;
7263 		}
7264 
7265 		/* nothing more we can do */
7266 		status = LDC_TX_NORESOURCES;
7267 		goto vsw_descrsend_free_exit;
7268 	} else {
7269 		D2(vswp, "%s(%lld): free private descriptor found at pos "
7270 		    "%ld addr 0x%x\n", __func__, ldcp->ldc_id, idx, priv_desc);
7271 		warn_msg = 1;
7272 	}
7273 
7274 	/* copy data into the descriptor */
7275 	bufp = priv_desc->datap;
7276 	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
7277 		n = MBLKL(bp);
7278 		bcopy(bp->b_rptr, bufp, n);
7279 		bufp += n;
7280 	}
7281 
7282 	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
7283 
7284 	/* create and send the in-band descp msg */
7285 	ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA;
7286 	ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO;
7287 	ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA;
7288 	ibnd_msg.hdr.tag.vio_sid = ldcp->local_session;
7289 
7290 	mutex_enter(&ldcp->lane_out.seq_lock);
7291 	ibnd_msg.hdr.seq_num = ldcp->lane_out.seq_num++;
7292 	mutex_exit(&ldcp->lane_out.seq_lock);
7293 
7294 	/*
7295 	 * Copy the mem cookies describing the data from the
7296 	 * private region of the descriptor ring into the inband
7297 	 * descriptor.
7298 	 */
7299 	for (i = 0; i < priv_desc->ncookies; i++) {
7300 		bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i],
7301 		    sizeof (ldc_mem_cookie_t));
7302 	}
7303 
7304 	ibnd_msg.hdr.desc_handle = idx;
7305 	ibnd_msg.ncookies = priv_desc->ncookies;
7306 	ibnd_msg.nbytes = size;
7307 
7308 	RW_EXIT(&ldcp->lane_out.dlistrw);
7309 
7310 	(void) vsw_send_msg(ldcp, (void *)&ibnd_msg,
7311 	    sizeof (vnet_ibnd_desc_t), B_TRUE);
7312 
7313 vsw_descrsend_free_exit:
7314 
7315 	/* free the allocated message blocks */
7316 	freemsg(mp);
7317 
7318 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
7319 	return (status);
7320 }
7321 
7322 static void
7323 vsw_send_ver(void *arg)
7324 {
7325 	vsw_ldc_t	*ldcp = (vsw_ldc_t *)arg;
7326 	vsw_t		*vswp = ldcp->ldc_vswp;
7327 	lane_t		*lp = &ldcp->lane_out;
7328 	vio_ver_msg_t	ver_msg;
7329 
7330 	D1(vswp, "%s enter", __func__);
7331 
7332 	ver_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
7333 	ver_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
7334 	ver_msg.tag.vio_subtype_env = VIO_VER_INFO;
7335 	ver_msg.tag.vio_sid = ldcp->local_session;
7336 
7337 	ver_msg.ver_major = vsw_versions[0].ver_major;
7338 	ver_msg.ver_minor = vsw_versions[0].ver_minor;
7339 	ver_msg.dev_class = VDEV_NETWORK_SWITCH;
7340 
7341 	lp->lstate |= VSW_VER_INFO_SENT;
7342 	lp->ver_major = ver_msg.ver_major;
7343 	lp->ver_minor = ver_msg.ver_minor;
7344 
7345 	DUMP_TAG(ver_msg.tag);
7346 
7347 	(void) vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t), B_TRUE);
7348 
7349 	D1(vswp, "%s (%d): exit", __func__, ldcp->ldc_id);
7350 }
7351 
7352 static void
7353 vsw_send_attr(vsw_ldc_t *ldcp)
7354 {
7355 	vsw_t			*vswp = ldcp->ldc_vswp;
7356 	lane_t			*lp = &ldcp->lane_out;
7357 	vnet_attr_msg_t		attr_msg;
7358 
7359 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
7360 
7361 	/*
7362 	 * Subtype is set to INFO by default
7363 	 */
7364 	attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
7365 	attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
7366 	attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO;
7367 	attr_msg.tag.vio_sid = ldcp->local_session;
7368 
7369 	/* payload copied from default settings for lane */
7370 	attr_msg.mtu = lp->mtu;
7371 	attr_msg.addr_type = lp->addr_type;
7372 	attr_msg.xfer_mode = lp->xfer_mode;
7373 	attr_msg.ack_freq = lp->xfer_mode;
7374 
7375 	READ_ENTER(&vswp->if_lockrw);
7376 	bcopy(&(vswp->if_addr), &(attr_msg.addr), ETHERADDRL);
7377 	RW_EXIT(&vswp->if_lockrw);
7378 
7379 	ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT;
7380 
7381 	DUMP_TAG(attr_msg.tag);
7382 
7383 	(void) vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t), B_TRUE);
7384 
7385 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
7386 }
7387 
7388 /*
7389  * Create dring info msg (which also results in the creation of
7390  * a dring).
7391  */
7392 static vio_dring_reg_msg_t *
7393 vsw_create_dring_info_pkt(vsw_ldc_t *ldcp)
7394 {
7395 	vio_dring_reg_msg_t	*mp;
7396 	dring_info_t		*dp;
7397 	vsw_t			*vswp = ldcp->ldc_vswp;
7398 
7399 	D1(vswp, "vsw_create_dring_info_pkt enter\n");
7400 
7401 	/*
7402 	 * If we can't create a dring, obviously no point sending
7403 	 * a message.
7404 	 */
7405 	if ((dp = vsw_create_dring(ldcp)) == NULL)
7406 		return (NULL);
7407 
7408 	mp = kmem_zalloc(sizeof (vio_dring_reg_msg_t), KM_SLEEP);
7409 
7410 	mp->tag.vio_msgtype = VIO_TYPE_CTRL;
7411 	mp->tag.vio_subtype = VIO_SUBTYPE_INFO;
7412 	mp->tag.vio_subtype_env = VIO_DRING_REG;
7413 	mp->tag.vio_sid = ldcp->local_session;
7414 
7415 	/* payload */
7416 	mp->num_descriptors = dp->num_descriptors;
7417 	mp->descriptor_size = dp->descriptor_size;
7418 	mp->options = dp->options;
7419 	mp->ncookies = dp->ncookies;
7420 	bcopy(&dp->cookie[0], &mp->cookie[0], sizeof (ldc_mem_cookie_t));
7421 
7422 	mp->dring_ident = 0;
7423 
7424 	D1(vswp, "vsw_create_dring_info_pkt exit\n");
7425 
7426 	return (mp);
7427 }
7428 
7429 static void
7430 vsw_send_dring_info(vsw_ldc_t *ldcp)
7431 {
7432 	vio_dring_reg_msg_t	*dring_msg;
7433 	vsw_t			*vswp = ldcp->ldc_vswp;
7434 
7435 	D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id);
7436 
7437 	dring_msg = vsw_create_dring_info_pkt(ldcp);
7438 	if (dring_msg == NULL) {
7439 		cmn_err(CE_WARN, "!vsw%d: %s: error creating msg",
7440 		    vswp->instance, __func__);
7441 		return;
7442 	}
7443 
7444 	ldcp->lane_out.lstate |= VSW_DRING_INFO_SENT;
7445 
7446 	DUMP_TAG_PTR((vio_msg_tag_t *)dring_msg);
7447 
7448 	(void) vsw_send_msg(ldcp, dring_msg,
7449 	    sizeof (vio_dring_reg_msg_t), B_TRUE);
7450 
7451 	kmem_free(dring_msg, sizeof (vio_dring_reg_msg_t));
7452 
7453 	D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id);
7454 }
7455 
7456 static void
7457 vsw_send_rdx(vsw_ldc_t *ldcp)
7458 {
7459 	vsw_t		*vswp = ldcp->ldc_vswp;
7460 	vio_rdx_msg_t	rdx_msg;
7461 
7462 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
7463 
7464 	rdx_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
7465 	rdx_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
7466 	rdx_msg.tag.vio_subtype_env = VIO_RDX;
7467 	rdx_msg.tag.vio_sid = ldcp->local_session;
7468 
7469 	ldcp->lane_in.lstate |= VSW_RDX_INFO_SENT;
7470 
7471 	DUMP_TAG(rdx_msg.tag);
7472 
7473 	(void) vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t), B_TRUE);
7474 
7475 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
7476 }
7477 
7478 /*
7479  * Generic routine to send message out over ldc channel.
7480  *
7481  * It is possible that when we attempt to write over the ldc channel
7482  * that we get notified that it has been reset. Depending on the value
7483  * of the handle_reset flag we either handle that event here or simply
7484  * notify the caller that the channel was reset.
7485  */
7486 static int
7487 vsw_send_msg(vsw_ldc_t *ldcp, void *msgp, int size, boolean_t handle_reset)
7488 {
7489 	int		rv;
7490 	size_t		msglen = size;
7491 	vio_msg_tag_t	*tag = (vio_msg_tag_t *)msgp;
7492 	vsw_t		*vswp = ldcp->ldc_vswp;
7493 
7494 	D1(vswp, "vsw_send_msg (%lld) enter : sending %d bytes",
7495 	    ldcp->ldc_id, size);
7496 
7497 	D2(vswp, "send_msg: type 0x%llx", tag->vio_msgtype);
7498 	D2(vswp, "send_msg: stype 0x%llx", tag->vio_subtype);
7499 	D2(vswp, "send_msg: senv 0x%llx", tag->vio_subtype_env);
7500 
7501 	mutex_enter(&ldcp->ldc_txlock);
7502 	do {
7503 		msglen = size;
7504 		rv = ldc_write(ldcp->ldc_handle, (caddr_t)msgp, &msglen);
7505 	} while (rv == EWOULDBLOCK && --vsw_wretries > 0);
7506 
7507 	if ((rv != 0) || (msglen != size)) {
7508 		DERR(vswp, "vsw_send_msg:ldc_write failed: chan(%lld) rv(%d) "
7509 		    "size (%d) msglen(%d)\n", ldcp->ldc_id, rv, size, msglen);
7510 	}
7511 	mutex_exit(&ldcp->ldc_txlock);
7512 
7513 	/*
7514 	 * If channel has been reset we either handle it here or
7515 	 * simply report back that it has been reset and let caller
7516 	 * decide what to do.
7517 	 */
7518 	if (rv == ECONNRESET) {
7519 		DWARN(vswp, "%s (%lld) channel reset", __func__, ldcp->ldc_id);
7520 
7521 		/*
7522 		 * N.B - must never be holding the dlistrw lock when
7523 		 * we do a reset of the channel.
7524 		 */
7525 		if (handle_reset) {
7526 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
7527 		}
7528 	}
7529 
7530 	return (rv);
7531 }
7532 
7533 /*
7534  * Add an entry into FDB, for the given mac address and port_id.
7535  * Returns 0 on success, 1 on failure.
7536  *
7537  * Lock protecting FDB must be held by calling process.
7538  */
7539 static int
7540 vsw_add_fdb(vsw_t *vswp, vsw_port_t *port)
7541 {
7542 	uint64_t	addr = 0;
7543 
7544 	D1(vswp, "%s: enter", __func__);
7545 
7546 	KEY_HASH(addr, port->p_macaddr);
7547 
7548 	D2(vswp, "%s: key = 0x%llx", __func__, addr);
7549 
7550 	/*
7551 	 * Note: duplicate keys will be rejected by mod_hash.
7552 	 */
7553 	if (mod_hash_insert(vswp->fdb, (mod_hash_key_t)addr,
7554 	    (mod_hash_val_t)port) != 0) {
7555 		DERR(vswp, "%s: unable to add entry into fdb.", __func__);
7556 		return (1);
7557 	}
7558 
7559 	D1(vswp, "%s: exit", __func__);
7560 	return (0);
7561 }
7562 
7563 /*
7564  * Remove an entry from FDB.
7565  * Returns 0 on success, 1 on failure.
7566  */
7567 static int
7568 vsw_del_fdb(vsw_t *vswp, vsw_port_t *port)
7569 {
7570 	uint64_t	addr = 0;
7571 
7572 	D1(vswp, "%s: enter", __func__);
7573 
7574 	KEY_HASH(addr, port->p_macaddr);
7575 
7576 	D2(vswp, "%s: key = 0x%llx", __func__, addr);
7577 
7578 	(void) mod_hash_destroy(vswp->fdb, (mod_hash_val_t)addr);
7579 
7580 	D1(vswp, "%s: enter", __func__);
7581 
7582 	return (0);
7583 }
7584 
7585 /*
7586  * Search fdb for a given mac address.
7587  * Returns pointer to the entry if found, else returns NULL.
7588  */
7589 static vsw_port_t *
7590 vsw_lookup_fdb(vsw_t *vswp, struct ether_header *ehp)
7591 {
7592 	uint64_t	key = 0;
7593 	vsw_port_t	*port = NULL;
7594 
7595 	D1(vswp, "%s: enter", __func__);
7596 
7597 	KEY_HASH(key, ehp->ether_dhost);
7598 
7599 	D2(vswp, "%s: key = 0x%llx", __func__, key);
7600 
7601 	if (mod_hash_find(vswp->fdb, (mod_hash_key_t)key,
7602 	    (mod_hash_val_t *)&port) != 0) {
7603 		D2(vswp, "%s: no port found", __func__);
7604 		return (NULL);
7605 	}
7606 
7607 	D1(vswp, "%s: exit", __func__);
7608 
7609 	return (port);
7610 }
7611 
7612 /*
7613  * Add or remove multicast address(es).
7614  *
7615  * Returns 0 on success, 1 on failure.
7616  */
7617 static int
7618 vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port)
7619 {
7620 	mcst_addr_t		*mcst_p = NULL;
7621 	vsw_t			*vswp = port->p_vswp;
7622 	uint64_t		addr = 0x0;
7623 	int			i;
7624 
7625 	D1(vswp, "%s: enter", __func__);
7626 
7627 	D2(vswp, "%s: %d addresses", __func__, mcst_pkt->count);
7628 
7629 	mutex_enter(&vswp->mac_lock);
7630 	if (vswp->mh == NULL) {
7631 		mutex_exit(&vswp->mac_lock);
7632 		return (1);
7633 	}
7634 	mutex_exit(&vswp->mac_lock);
7635 
7636 	for (i = 0; i < mcst_pkt->count; i++) {
7637 		/*
7638 		 * Convert address into form that can be used
7639 		 * as hash table key.
7640 		 */
7641 		KEY_HASH(addr, mcst_pkt->mca[i]);
7642 
7643 		/*
7644 		 * Add or delete the specified address/port combination.
7645 		 */
7646 		if (mcst_pkt->set == 0x1) {
7647 			D3(vswp, "%s: adding multicast address 0x%llx for "
7648 			    "port %ld", __func__, addr, port->p_instance);
7649 			if (vsw_add_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
7650 				/*
7651 				 * Update the list of multicast
7652 				 * addresses contained within the
7653 				 * port structure to include this new
7654 				 * one.
7655 				 */
7656 				mcst_p = kmem_alloc(
7657 				    sizeof (mcst_addr_t), KM_NOSLEEP);
7658 				if (mcst_p == NULL) {
7659 					DERR(vswp, "%s: unable to alloc mem",
7660 					    __func__);
7661 					return (1);
7662 				}
7663 
7664 				mcst_p->nextp = NULL;
7665 				mcst_p->addr = addr;
7666 
7667 				mutex_enter(&port->mca_lock);
7668 				mcst_p->nextp = port->mcap;
7669 				port->mcap = mcst_p;
7670 				mutex_exit(&port->mca_lock);
7671 
7672 				/*
7673 				 * Program the address into HW. If the addr
7674 				 * has already been programmed then the MAC
7675 				 * just increments a ref counter (which is
7676 				 * used when the address is being deleted)
7677 				 */
7678 				mutex_enter(&vswp->mac_lock);
7679 				if ((vswp->mh == NULL) ||
7680 				    mac_multicst_add(vswp->mh,
7681 				    (uchar_t *)&mcst_pkt->mca[i])) {
7682 					mutex_exit(&vswp->mac_lock);
7683 					cmn_err(CE_WARN, "!vsw%d: unable to "
7684 					    "add multicast address",
7685 					    vswp->instance);
7686 					(void) vsw_del_mcst(vswp, VSW_VNETPORT,
7687 					    addr, port);
7688 					vsw_del_addr(VSW_VNETPORT, port, addr);
7689 					return (1);
7690 				}
7691 				mutex_exit(&vswp->mac_lock);
7692 
7693 			} else {
7694 				DERR(vswp, "%s: error adding multicast "
7695 				    "address 0x%llx for port %ld",
7696 				    __func__, addr, port->p_instance);
7697 				return (1);
7698 			}
7699 		} else {
7700 			/*
7701 			 * Delete an entry from the multicast hash
7702 			 * table and update the address list
7703 			 * appropriately.
7704 			 */
7705 			if (vsw_del_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
7706 				D3(vswp, "%s: deleting multicast address "
7707 				    "0x%llx for port %ld", __func__, addr,
7708 				    port->p_instance);
7709 
7710 				vsw_del_addr(VSW_VNETPORT, port, addr);
7711 
7712 				/*
7713 				 * Remove the address from HW. The address
7714 				 * will actually only be removed once the ref
7715 				 * count within the MAC layer has dropped to
7716 				 * zero. I.e. we can safely call this fn even
7717 				 * if other ports are interested in this
7718 				 * address.
7719 				 */
7720 				mutex_enter(&vswp->mac_lock);
7721 				if ((vswp->mh == NULL) ||
7722 				    mac_multicst_remove(vswp->mh,
7723 				    (uchar_t *)&mcst_pkt->mca[i])) {
7724 					mutex_exit(&vswp->mac_lock);
7725 					cmn_err(CE_WARN, "!vsw%d: unable to "
7726 					    "remove multicast address",
7727 					    vswp->instance);
7728 					return (1);
7729 				}
7730 				mutex_exit(&vswp->mac_lock);
7731 
7732 			} else {
7733 				DERR(vswp, "%s: error deleting multicast "
7734 				    "addr 0x%llx for port %ld",
7735 				    __func__, addr, port->p_instance);
7736 				return (1);
7737 			}
7738 		}
7739 	}
7740 	D1(vswp, "%s: exit", __func__);
7741 	return (0);
7742 }
7743 
7744 /*
7745  * Add a new multicast entry.
7746  *
7747  * Search hash table based on address. If match found then
7748  * update associated val (which is chain of ports), otherwise
7749  * create new key/val (addr/port) pair and insert into table.
7750  */
7751 static int
7752 vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
7753 {
7754 	int		dup = 0;
7755 	int		rv = 0;
7756 	mfdb_ent_t	*ment = NULL;
7757 	mfdb_ent_t	*tmp_ent = NULL;
7758 	mfdb_ent_t	*new_ent = NULL;
7759 	void		*tgt = NULL;
7760 
7761 	if (devtype == VSW_VNETPORT) {
7762 		/*
7763 		 * Being invoked from a vnet.
7764 		 */
7765 		ASSERT(arg != NULL);
7766 		tgt = arg;
7767 		D2(NULL, "%s: port %d : address 0x%llx", __func__,
7768 		    ((vsw_port_t *)arg)->p_instance, addr);
7769 	} else {
7770 		/*
7771 		 * We are being invoked via the m_multicst mac entry
7772 		 * point.
7773 		 */
7774 		D2(NULL, "%s: address 0x%llx", __func__, addr);
7775 		tgt = (void *)vswp;
7776 	}
7777 
7778 	WRITE_ENTER(&vswp->mfdbrw);
7779 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
7780 	    (mod_hash_val_t *)&ment) != 0) {
7781 
7782 		/* address not currently in table */
7783 		ment = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
7784 		ment->d_addr = (void *)tgt;
7785 		ment->d_type = devtype;
7786 		ment->nextp = NULL;
7787 
7788 		if (mod_hash_insert(vswp->mfdb, (mod_hash_key_t)addr,
7789 		    (mod_hash_val_t)ment) != 0) {
7790 			DERR(vswp, "%s: hash table insertion failed", __func__);
7791 			kmem_free(ment, sizeof (mfdb_ent_t));
7792 			rv = 1;
7793 		} else {
7794 			D2(vswp, "%s: added initial entry for 0x%llx to "
7795 			    "table", __func__, addr);
7796 		}
7797 	} else {
7798 		/*
7799 		 * Address in table. Check to see if specified port
7800 		 * is already associated with the address. If not add
7801 		 * it now.
7802 		 */
7803 		tmp_ent = ment;
7804 		while (tmp_ent != NULL) {
7805 			if (tmp_ent->d_addr == (void *)tgt) {
7806 				if (devtype == VSW_VNETPORT) {
7807 					DERR(vswp, "%s: duplicate port entry "
7808 					    "found for portid %ld and key "
7809 					    "0x%llx", __func__,
7810 					    ((vsw_port_t *)arg)->p_instance,
7811 					    addr);
7812 				} else {
7813 					DERR(vswp, "%s: duplicate entry found"
7814 					    "for key 0x%llx", __func__, addr);
7815 				}
7816 				rv = 1;
7817 				dup = 1;
7818 				break;
7819 			}
7820 			tmp_ent = tmp_ent->nextp;
7821 		}
7822 
7823 		/*
7824 		 * Port not on list so add it to end now.
7825 		 */
7826 		if (0 == dup) {
7827 			D2(vswp, "%s: added entry for 0x%llx to table",
7828 			    __func__, addr);
7829 			new_ent = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
7830 			new_ent->d_addr = (void *)tgt;
7831 			new_ent->d_type = devtype;
7832 			new_ent->nextp = NULL;
7833 
7834 			tmp_ent = ment;
7835 			while (tmp_ent->nextp != NULL)
7836 				tmp_ent = tmp_ent->nextp;
7837 
7838 			tmp_ent->nextp = new_ent;
7839 		}
7840 	}
7841 
7842 	RW_EXIT(&vswp->mfdbrw);
7843 	return (rv);
7844 }
7845 
7846 /*
7847  * Remove a multicast entry from the hashtable.
7848  *
7849  * Search hash table based on address. If match found, scan
7850  * list of ports associated with address. If specified port
7851  * found remove it from list.
7852  */
7853 static int
7854 vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
7855 {
7856 	mfdb_ent_t	*ment = NULL;
7857 	mfdb_ent_t	*curr_p, *prev_p;
7858 	void		*tgt = NULL;
7859 
7860 	D1(vswp, "%s: enter", __func__);
7861 
7862 	if (devtype == VSW_VNETPORT) {
7863 		tgt = (vsw_port_t *)arg;
7864 		D2(vswp, "%s: removing port %d from mFDB for address"
7865 		    " 0x%llx", __func__, ((vsw_port_t *)tgt)->p_instance, addr);
7866 	} else {
7867 		D2(vswp, "%s: removing entry", __func__);
7868 		tgt = (void *)vswp;
7869 	}
7870 
7871 	WRITE_ENTER(&vswp->mfdbrw);
7872 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
7873 	    (mod_hash_val_t *)&ment) != 0) {
7874 		D2(vswp, "%s: address 0x%llx not in table", __func__, addr);
7875 		RW_EXIT(&vswp->mfdbrw);
7876 		return (1);
7877 	}
7878 
7879 	prev_p = curr_p = ment;
7880 
7881 	while (curr_p != NULL) {
7882 		if (curr_p->d_addr == (void *)tgt) {
7883 			if (devtype == VSW_VNETPORT) {
7884 				D2(vswp, "%s: port %d found", __func__,
7885 				    ((vsw_port_t *)tgt)->p_instance);
7886 			} else {
7887 				D2(vswp, "%s: instance found", __func__);
7888 			}
7889 
7890 			if (prev_p == curr_p) {
7891 				/*
7892 				 * head of list, if no other element is in
7893 				 * list then destroy this entry, otherwise
7894 				 * just replace it with updated value.
7895 				 */
7896 				ment = curr_p->nextp;
7897 				kmem_free(curr_p, sizeof (mfdb_ent_t));
7898 				if (ment == NULL) {
7899 					(void) mod_hash_destroy(vswp->mfdb,
7900 					    (mod_hash_val_t)addr);
7901 				} else {
7902 					(void) mod_hash_replace(vswp->mfdb,
7903 					    (mod_hash_key_t)addr,
7904 					    (mod_hash_val_t)ment);
7905 				}
7906 			} else {
7907 				/*
7908 				 * Not head of list, no need to do
7909 				 * replacement, just adjust list pointers.
7910 				 */
7911 				prev_p->nextp = curr_p->nextp;
7912 				kmem_free(curr_p, sizeof (mfdb_ent_t));
7913 			}
7914 			break;
7915 		}
7916 
7917 		prev_p = curr_p;
7918 		curr_p = curr_p->nextp;
7919 	}
7920 
7921 	RW_EXIT(&vswp->mfdbrw);
7922 
7923 	D1(vswp, "%s: exit", __func__);
7924 
7925 	return (0);
7926 }
7927 
7928 /*
7929  * Port is being deleted, but has registered an interest in one
7930  * or more multicast groups. Using the list of addresses maintained
7931  * within the port structure find the appropriate entry in the hash
7932  * table and remove this port from the list of interested ports.
7933  */
7934 static void
7935 vsw_del_mcst_port(vsw_port_t *port)
7936 {
7937 	mcst_addr_t	*mcst_p = NULL;
7938 	vsw_t		*vswp = port->p_vswp;
7939 
7940 	D1(vswp, "%s: enter", __func__);
7941 
7942 	mutex_enter(&port->mca_lock);
7943 	while (port->mcap != NULL) {
7944 		(void) vsw_del_mcst(vswp, VSW_VNETPORT,
7945 		    port->mcap->addr, port);
7946 
7947 		mcst_p = port->mcap->nextp;
7948 		kmem_free(port->mcap, sizeof (mcst_addr_t));
7949 		port->mcap = mcst_p;
7950 	}
7951 	mutex_exit(&port->mca_lock);
7952 
7953 	D1(vswp, "%s: exit", __func__);
7954 }
7955 
7956 /*
7957  * This vsw instance is detaching, but has registered an interest in one
7958  * or more multicast groups. Using the list of addresses maintained
7959  * within the vsw structure find the appropriate entry in the hash
7960  * table and remove this instance from the list of interested ports.
7961  */
7962 static void
7963 vsw_del_mcst_vsw(vsw_t *vswp)
7964 {
7965 	mcst_addr_t	*next_p = NULL;
7966 
7967 	D1(vswp, "%s: enter", __func__);
7968 
7969 	mutex_enter(&vswp->mca_lock);
7970 
7971 	while (vswp->mcap != NULL) {
7972 		DERR(vswp, "%s: deleting addr 0x%llx",
7973 		    __func__, vswp->mcap->addr);
7974 		(void) vsw_del_mcst(vswp, VSW_LOCALDEV, vswp->mcap->addr, NULL);
7975 
7976 		next_p = vswp->mcap->nextp;
7977 		kmem_free(vswp->mcap, sizeof (mcst_addr_t));
7978 		vswp->mcap = next_p;
7979 	}
7980 
7981 	vswp->mcap = NULL;
7982 	mutex_exit(&vswp->mca_lock);
7983 
7984 	D1(vswp, "%s: exit", __func__);
7985 }
7986 
7987 
7988 /*
7989  * Remove the specified address from the list of address maintained
7990  * in this port node.
7991  */
7992 static void
7993 vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr)
7994 {
7995 	vsw_t		*vswp = NULL;
7996 	vsw_port_t	*port = NULL;
7997 	mcst_addr_t	*prev_p = NULL;
7998 	mcst_addr_t	*curr_p = NULL;
7999 
8000 	D1(NULL, "%s: enter : devtype %d : addr 0x%llx",
8001 	    __func__, devtype, addr);
8002 
8003 	if (devtype == VSW_VNETPORT) {
8004 		port = (vsw_port_t *)arg;
8005 		mutex_enter(&port->mca_lock);
8006 		prev_p = curr_p = port->mcap;
8007 	} else {
8008 		vswp = (vsw_t *)arg;
8009 		mutex_enter(&vswp->mca_lock);
8010 		prev_p = curr_p = vswp->mcap;
8011 	}
8012 
8013 	while (curr_p != NULL) {
8014 		if (curr_p->addr == addr) {
8015 			D2(NULL, "%s: address found", __func__);
8016 			/* match found */
8017 			if (prev_p == curr_p) {
8018 				/* list head */
8019 				if (devtype == VSW_VNETPORT)
8020 					port->mcap = curr_p->nextp;
8021 				else
8022 					vswp->mcap = curr_p->nextp;
8023 			} else {
8024 				prev_p->nextp = curr_p->nextp;
8025 			}
8026 			kmem_free(curr_p, sizeof (mcst_addr_t));
8027 			break;
8028 		} else {
8029 			prev_p = curr_p;
8030 			curr_p = curr_p->nextp;
8031 		}
8032 	}
8033 
8034 	if (devtype == VSW_VNETPORT)
8035 		mutex_exit(&port->mca_lock);
8036 	else
8037 		mutex_exit(&vswp->mca_lock);
8038 
8039 	D1(NULL, "%s: exit", __func__);
8040 }
8041 
/*
 * Creates a descriptor ring (dring) and links it into the
 * list of outbound drings for this channel.
 *
 * The ring consists of a public section (created through
 * ldc_mem_dring_create() and bound to the channel) and a private
 * section (an array of vsw_private_desc_t) which is populated by
 * vsw_setup_ring().
 *
 * Returns a pointer to the new ring, or NULL if creation failed.
 */
static dring_info_t *
vsw_create_dring(vsw_ldc_t *ldcp)
{
	vsw_private_desc_t	*priv_addr = NULL;
	vsw_t			*vswp = ldcp->ldc_vswp;
	ldc_mem_info_t		minfo;
	dring_info_t		*dp, *tp;
	int			i;

	/* zeroed, so priv_addr etc. start out NULL for the error paths */
	dp = (dring_info_t *)kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);

	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);

	/* create public section of ring */
	if ((ldc_mem_dring_create(VSW_RING_NUM_EL,
	    VSW_PUB_SIZE, &dp->handle)) != 0) {

		DERR(vswp, "vsw_create_dring(%lld): ldc dring create "
		    "failed", ldcp->ldc_id);
		/* no dring exists yet, so skip the dring destroy step */
		goto create_fail_exit;
	}

	ASSERT(dp->handle != NULL);

	/*
	 * Get the base address of the public section of the ring.
	 */
	if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
		DERR(vswp, "vsw_create_dring(%lld): dring info failed\n",
		    ldcp->ldc_id);
		goto dring_fail_exit;
	} else {
		ASSERT(minfo.vaddr != 0);
		dp->pub_addr = minfo.vaddr;
	}

	dp->num_descriptors = VSW_RING_NUM_EL;
	dp->descriptor_size = VSW_PUB_SIZE;
	dp->options = VIO_TX_DRING;
	dp->ncookies = 1;	/* guaranteed by ldc */

	/*
	 * create private portion of ring
	 */
	dp->priv_addr = (vsw_private_desc_t *)kmem_zalloc(
	    (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL), KM_SLEEP);

	/*
	 * NOTE(review): when vsw_setup_ring() fails it has already
	 * released the handles of the descriptors it completed; confirm
	 * that the handle loop under create_fail_exit cannot then see
	 * those stale handles and free them a second time.
	 */
	if (vsw_setup_ring(ldcp, dp)) {
		DERR(vswp, "%s: unable to setup ring", __func__);
		goto dring_fail_exit;
	}

	/* haven't used any descriptors yet */
	dp->end_idx = 0;
	dp->last_ack_recv = -1;

	/*
	 * bind dring to the channel
	 *
	 * NOTE(review): if the bind fails after vsw_setup_ring() has
	 * succeeded, the error path below frees the memory handles
	 * but does not unbind them, free dp->data_addr or destroy the
	 * per-descriptor dstate_lock mutexes - confirm whether this
	 * leaks.
	 */
	if ((ldc_mem_dring_bind(ldcp->ldc_handle, dp->handle,
	    LDC_SHADOW_MAP, LDC_MEM_RW,
	    &dp->cookie[0], &dp->ncookies)) != 0) {
		DERR(vswp, "vsw_create_dring: unable to bind to channel "
		    "%lld", ldcp->ldc_id);
		goto dring_fail_exit;
	}

	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
	dp->restart_reqd = B_TRUE;

	/*
	 * Only ever create rings for outgoing lane. Link it onto
	 * end of list.
	 */
	WRITE_ENTER(&ldcp->lane_out.dlistrw);
	if (ldcp->lane_out.dringp == NULL) {
		D2(vswp, "vsw_create_dring: adding first outbound ring");
		ldcp->lane_out.dringp = dp;
	} else {
		/* walk to the current tail and append */
		tp = ldcp->lane_out.dringp;
		while (tp->next != NULL)
			tp = tp->next;

		tp->next = dp;
	}
	RW_EXIT(&ldcp->lane_out.dlistrw);

	return (dp);

	/* failure after the public dring was created: destroy it first */
dring_fail_exit:
	(void) ldc_mem_dring_destroy(dp->handle);

	/* common failure tail: release private section, lock and ring */
create_fail_exit:
	if (dp->priv_addr != NULL) {
		priv_addr = dp->priv_addr;
		for (i = 0; i < VSW_RING_NUM_EL; i++) {
			if (priv_addr->memhandle != NULL)
				(void) ldc_mem_free_handle(
				    priv_addr->memhandle);
			priv_addr++;
		}
		kmem_free(dp->priv_addr,
		    (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL));
	}
	mutex_destroy(&dp->dlock);

	kmem_free(dp, sizeof (dring_info_t));
	return (NULL);
}
8155 
8156 /*
8157  * Create a ring consisting of just a private portion and link
8158  * it into the list of rings for the outbound lane.
8159  *
8160  * These type of rings are used primarily for temporary data
8161  * storage (i.e. as data buffers).
8162  */
8163 void
8164 vsw_create_privring(vsw_ldc_t *ldcp)
8165 {
8166 	dring_info_t		*dp, *tp;
8167 	vsw_t			*vswp = ldcp->ldc_vswp;
8168 
8169 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
8170 
8171 	dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
8172 
8173 	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
8174 
8175 	/* no public section */
8176 	dp->pub_addr = NULL;
8177 
8178 	dp->priv_addr = kmem_zalloc(
8179 	    (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL), KM_SLEEP);
8180 
8181 	dp->num_descriptors = VSW_RING_NUM_EL;
8182 
8183 	if (vsw_setup_ring(ldcp, dp)) {
8184 		DERR(vswp, "%s: setup of ring failed", __func__);
8185 		kmem_free(dp->priv_addr,
8186 		    (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL));
8187 		mutex_destroy(&dp->dlock);
8188 		kmem_free(dp, sizeof (dring_info_t));
8189 		return;
8190 	}
8191 
8192 	/* haven't used any descriptors yet */
8193 	dp->end_idx = 0;
8194 
8195 	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
8196 	dp->restart_reqd = B_TRUE;
8197 
8198 	/*
8199 	 * Only ever create rings for outgoing lane. Link it onto
8200 	 * end of list.
8201 	 */
8202 	WRITE_ENTER(&ldcp->lane_out.dlistrw);
8203 	if (ldcp->lane_out.dringp == NULL) {
8204 		D2(vswp, "%s: adding first outbound privring", __func__);
8205 		ldcp->lane_out.dringp = dp;
8206 	} else {
8207 		tp = ldcp->lane_out.dringp;
8208 		while (tp->next != NULL)
8209 			tp = tp->next;
8210 
8211 		tp->next = dp;
8212 	}
8213 	RW_EXIT(&ldcp->lane_out.dlistrw);
8214 
8215 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
8216 }
8217 
8218 /*
8219  * Setup the descriptors in the dring. Returns 0 on success, 1 on
8220  * failure.
8221  */
8222 int
8223 vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp)
8224 {
8225 	vnet_public_desc_t	*pub_addr = NULL;
8226 	vsw_private_desc_t	*priv_addr = NULL;
8227 	vsw_t			*vswp = ldcp->ldc_vswp;
8228 	uint64_t		*tmpp;
8229 	uint64_t		offset = 0;
8230 	uint32_t		ncookies = 0;
8231 	static char		*name = "vsw_setup_ring";
8232 	int			i, j, nc, rv;
8233 
8234 	priv_addr = dp->priv_addr;
8235 	pub_addr = dp->pub_addr;
8236 
8237 	/* public section may be null but private should never be */
8238 	ASSERT(priv_addr != NULL);
8239 
8240 	/*
8241 	 * Allocate the region of memory which will be used to hold
8242 	 * the data the descriptors will refer to.
8243 	 */
8244 	dp->data_sz = (VSW_RING_NUM_EL * VSW_RING_EL_DATA_SZ);
8245 	dp->data_addr = kmem_alloc(dp->data_sz, KM_SLEEP);
8246 
8247 	D2(vswp, "%s: allocated %lld bytes at 0x%llx\n", name,
8248 	    dp->data_sz, dp->data_addr);
8249 
8250 	tmpp = (uint64_t *)dp->data_addr;
8251 	offset = VSW_RING_EL_DATA_SZ / sizeof (tmpp);
8252 
8253 	/*
8254 	 * Initialise some of the private and public (if they exist)
8255 	 * descriptor fields.
8256 	 */
8257 	for (i = 0; i < VSW_RING_NUM_EL; i++) {
8258 		mutex_init(&priv_addr->dstate_lock, NULL, MUTEX_DRIVER, NULL);
8259 
8260 		if ((ldc_mem_alloc_handle(ldcp->ldc_handle,
8261 		    &priv_addr->memhandle)) != 0) {
8262 			DERR(vswp, "%s: alloc mem handle failed", name);
8263 			goto setup_ring_cleanup;
8264 		}
8265 
8266 		priv_addr->datap = (void *)tmpp;
8267 
8268 		rv = ldc_mem_bind_handle(priv_addr->memhandle,
8269 		    (caddr_t)priv_addr->datap, VSW_RING_EL_DATA_SZ,
8270 		    LDC_SHADOW_MAP, LDC_MEM_R|LDC_MEM_W,
8271 		    &(priv_addr->memcookie[0]), &ncookies);
8272 		if (rv != 0) {
8273 			DERR(vswp, "%s(%lld): ldc_mem_bind_handle failed "
8274 			    "(rv %d)", name, ldcp->ldc_id, rv);
8275 			goto setup_ring_cleanup;
8276 		}
8277 		priv_addr->bound = 1;
8278 
8279 		D2(vswp, "%s: %d: memcookie 0 : addr 0x%llx : size 0x%llx",
8280 		    name, i, priv_addr->memcookie[0].addr,
8281 		    priv_addr->memcookie[0].size);
8282 
8283 		if (ncookies >= (uint32_t)(VSW_MAX_COOKIES + 1)) {
8284 			DERR(vswp, "%s(%lld) ldc_mem_bind_handle returned "
8285 			    "invalid num of cookies (%d) for size 0x%llx",
8286 			    name, ldcp->ldc_id, ncookies, VSW_RING_EL_DATA_SZ);
8287 
8288 			goto setup_ring_cleanup;
8289 		} else {
8290 			for (j = 1; j < ncookies; j++) {
8291 				rv = ldc_mem_nextcookie(priv_addr->memhandle,
8292 				    &(priv_addr->memcookie[j]));
8293 				if (rv != 0) {
8294 					DERR(vswp, "%s: ldc_mem_nextcookie "
8295 					    "failed rv (%d)", name, rv);
8296 					goto setup_ring_cleanup;
8297 				}
8298 				D3(vswp, "%s: memcookie %d : addr 0x%llx : "
8299 				    "size 0x%llx", name, j,
8300 				    priv_addr->memcookie[j].addr,
8301 				    priv_addr->memcookie[j].size);
8302 			}
8303 
8304 		}
8305 		priv_addr->ncookies = ncookies;
8306 		priv_addr->dstate = VIO_DESC_FREE;
8307 
8308 		if (pub_addr != NULL) {
8309 
8310 			/* link pub and private sides */
8311 			priv_addr->descp = pub_addr;
8312 
8313 			pub_addr->ncookies = priv_addr->ncookies;
8314 
8315 			for (nc = 0; nc < pub_addr->ncookies; nc++) {
8316 				bcopy(&priv_addr->memcookie[nc],
8317 				    &pub_addr->memcookie[nc],
8318 				    sizeof (ldc_mem_cookie_t));
8319 			}
8320 
8321 			pub_addr->hdr.dstate = VIO_DESC_FREE;
8322 			pub_addr++;
8323 		}
8324 
8325 		/*
8326 		 * move to next element in the dring and the next
8327 		 * position in the data buffer.
8328 		 */
8329 		priv_addr++;
8330 		tmpp += offset;
8331 	}
8332 
8333 	return (0);
8334 
8335 setup_ring_cleanup:
8336 	priv_addr = dp->priv_addr;
8337 
8338 	for (j = 0; j < i; j++) {
8339 		(void) ldc_mem_unbind_handle(priv_addr->memhandle);
8340 		(void) ldc_mem_free_handle(priv_addr->memhandle);
8341 
8342 		mutex_destroy(&priv_addr->dstate_lock);
8343 
8344 		priv_addr++;
8345 	}
8346 	kmem_free(dp->data_addr, dp->data_sz);
8347 
8348 	return (1);
8349 }
8350 
8351 /*
8352  * Searches the private section of a ring for a free descriptor,
8353  * starting at the location of the last free descriptor found
8354  * previously.
8355  *
8356  * Returns 0 if free descriptor is available, and updates state
8357  * of private descriptor to VIO_DESC_READY,  otherwise returns 1.
8358  *
8359  * FUTURE: might need to return contiguous range of descriptors
8360  * as dring info msg assumes all will be contiguous.
8361  */
8362 static int
8363 vsw_dring_find_free_desc(dring_info_t *dringp,
8364 		vsw_private_desc_t **priv_p, int *idx)
8365 {
8366 	vsw_private_desc_t	*addr = NULL;
8367 	int			num = VSW_RING_NUM_EL;
8368 	int			ret = 1;
8369 
8370 	D1(NULL, "%s enter\n", __func__);
8371 
8372 	ASSERT(dringp->priv_addr != NULL);
8373 
8374 	D2(NULL, "%s: searching ring, dringp 0x%llx : start pos %lld",
8375 	    __func__, dringp, dringp->end_idx);
8376 
8377 	addr = (vsw_private_desc_t *)dringp->priv_addr + dringp->end_idx;
8378 
8379 	mutex_enter(&addr->dstate_lock);
8380 	if (addr->dstate == VIO_DESC_FREE) {
8381 		addr->dstate = VIO_DESC_READY;
8382 		*priv_p = addr;
8383 		*idx = dringp->end_idx;
8384 		dringp->end_idx = (dringp->end_idx + 1) % num;
8385 		ret = 0;
8386 
8387 	}
8388 	mutex_exit(&addr->dstate_lock);
8389 
8390 	/* ring full */
8391 	if (ret == 1) {
8392 		D2(NULL, "%s: no desp free: started at %d", __func__,
8393 		    dringp->end_idx);
8394 	}
8395 
8396 	D1(NULL, "%s: exit\n", __func__);
8397 
8398 	return (ret);
8399 }
8400 
8401 /*
8402  * Map from a dring identifier to the ring itself. Returns
8403  * pointer to ring or NULL if no match found.
8404  *
8405  * Should be called with dlistrw rwlock held as reader.
8406  */
8407 static dring_info_t *
8408 vsw_ident2dring(lane_t *lane, uint64_t ident)
8409 {
8410 	dring_info_t	*dp = NULL;
8411 
8412 	if ((dp = lane->dringp) == NULL) {
8413 		return (NULL);
8414 	} else {
8415 		if (dp->ident == ident)
8416 			return (dp);
8417 
8418 		while (dp != NULL) {
8419 			if (dp->ident == ident)
8420 				break;
8421 			dp = dp->next;
8422 		}
8423 	}
8424 
8425 	return (dp);
8426 }
8427 
8428 /*
8429  * Set the default lane attributes. These are copied into
8430  * the attr msg we send to our peer. If they are not acceptable
8431  * then (currently) the handshake ends.
8432  */
8433 static void
8434 vsw_set_lane_attr(vsw_t *vswp, lane_t *lp)
8435 {
8436 	bzero(lp, sizeof (lane_t));
8437 
8438 	READ_ENTER(&vswp->if_lockrw);
8439 	ether_copy(&(vswp->if_addr), &(lp->addr));
8440 	RW_EXIT(&vswp->if_lockrw);
8441 
8442 	lp->mtu = VSW_MTU;
8443 	lp->addr_type = ADDR_TYPE_MAC;
8444 	lp->xfer_mode = VIO_DRING_MODE;
8445 	lp->ack_freq = 0;	/* for shared mode */
8446 
8447 	mutex_enter(&lp->seq_lock);
8448 	lp->seq_num = VNET_ISS;
8449 	mutex_exit(&lp->seq_lock);
8450 }
8451 
8452 /*
8453  * Verify that the attributes are acceptable.
8454  *
8455  * FUTURE: If some attributes are not acceptable, change them
8456  * our desired values.
8457  */
8458 static int
8459 vsw_check_attr(vnet_attr_msg_t *pkt, vsw_port_t *port)
8460 {
8461 	int	ret = 0;
8462 
8463 	D1(NULL, "vsw_check_attr enter\n");
8464 
8465 	/*
8466 	 * Note we currently only support in-band descriptors
8467 	 * and descriptor rings, not packet based transfer (VIO_PKT_MODE)
8468 	 */
8469 	if ((pkt->xfer_mode != VIO_DESC_MODE) &&
8470 	    (pkt->xfer_mode != VIO_DRING_MODE)) {
8471 		D2(NULL, "vsw_check_attr: unknown mode %x\n", pkt->xfer_mode);
8472 		ret = 1;
8473 	}
8474 
8475 	/* Only support MAC addresses at moment. */
8476 	if ((pkt->addr_type != ADDR_TYPE_MAC) || (pkt->addr == 0)) {
8477 		D2(NULL, "vsw_check_attr: invalid addr_type %x, "
8478 		    "or address 0x%llx\n", pkt->addr_type, pkt->addr);
8479 		ret = 1;
8480 	}
8481 
8482 	/*
8483 	 * MAC address supplied by device should match that stored
8484 	 * in the vsw-port OBP node. Need to decide what to do if they
8485 	 * don't match, for the moment just warn but don't fail.
8486 	 */
8487 	if (bcmp(&pkt->addr, &port->p_macaddr, ETHERADDRL) != 0) {
8488 		DERR(NULL, "vsw_check_attr: device supplied address "
8489 		    "0x%llx doesn't match node address 0x%llx\n",
8490 		    pkt->addr, port->p_macaddr);
8491 	}
8492 
8493 	/*
8494 	 * Ack freq only makes sense in pkt mode, in shared
8495 	 * mode the ring descriptors say whether or not to
8496 	 * send back an ACK.
8497 	 */
8498 	if ((pkt->xfer_mode == VIO_DRING_MODE) &&
8499 	    (pkt->ack_freq > 0)) {
8500 		D2(NULL, "vsw_check_attr: non zero ack freq "
8501 		    " in SHM mode\n");
8502 		ret = 1;
8503 	}
8504 
8505 	/*
8506 	 * Note: for the moment we only support ETHER
8507 	 * frames. This may change in the future.
8508 	 */
8509 	if ((pkt->mtu > VSW_MTU) || (pkt->mtu <= 0)) {
8510 		D2(NULL, "vsw_check_attr: invalid MTU (0x%llx)\n",
8511 		    pkt->mtu);
8512 		ret = 1;
8513 	}
8514 
8515 	D1(NULL, "vsw_check_attr exit\n");
8516 
8517 	return (ret);
8518 }
8519 
8520 /*
8521  * Returns 1 if there is a problem, 0 otherwise.
8522  */
8523 static int
8524 vsw_check_dring_info(vio_dring_reg_msg_t *pkt)
8525 {
8526 	_NOTE(ARGUNUSED(pkt))
8527 
8528 	int	ret = 0;
8529 
8530 	D1(NULL, "vsw_check_dring_info enter\n");
8531 
8532 	if ((pkt->num_descriptors == 0) ||
8533 	    (pkt->descriptor_size == 0) ||
8534 	    (pkt->ncookies != 1)) {
8535 		DERR(NULL, "vsw_check_dring_info: invalid dring msg");
8536 		ret = 1;
8537 	}
8538 
8539 	D1(NULL, "vsw_check_dring_info exit\n");
8540 
8541 	return (ret);
8542 }
8543 
8544 /*
8545  * Returns 1 if two memory cookies match. Otherwise returns 0.
8546  */
8547 static int
8548 vsw_mem_cookie_match(ldc_mem_cookie_t *m1, ldc_mem_cookie_t *m2)
8549 {
8550 	if ((m1->addr != m2->addr) ||
8551 	    (m2->size != m2->size)) {
8552 		return (0);
8553 	} else {
8554 		return (1);
8555 	}
8556 }
8557 
8558 /*
8559  * Returns 1 if ring described in reg message matches that
8560  * described by dring_info structure. Otherwise returns 0.
8561  */
8562 static int
8563 vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg)
8564 {
8565 	if ((msg->descriptor_size != dp->descriptor_size) ||
8566 	    (msg->num_descriptors != dp->num_descriptors) ||
8567 	    (msg->ncookies != dp->ncookies) ||
8568 	    !(vsw_mem_cookie_match(&msg->cookie[0], &dp->cookie[0]))) {
8569 		return (0);
8570 	} else {
8571 		return (1);
8572 	}
8573 
8574 }
8575 
/*
 * Format the six octets at 'a' into 'ebuf' as colon separated hex
 * values (no zero padding, e.g. "0:1a:2b:3c:4d:ff").
 *
 * No length is passed in, so the caller must supply a buffer large
 * enough for the longest result, six two digit octets plus five
 * separators and the terminating NUL.
 *
 * Returns ebuf so that the call can be used directly as an argument.
 */
static caddr_t
vsw_print_ethaddr(uint8_t *a, char *ebuf)
{
	const uint8_t	*octet = a;

	(void) sprintf(ebuf, "%x:%x:%x:%x:%x:%x", octet[0], octet[1],
	    octet[2], octet[3], octet[4], octet[5]);

	return (ebuf);
}
8583 
8584 /*
8585  * Reset and free all the resources associated with
8586  * the channel.
8587  */
8588 static void
8589 vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir)
8590 {
8591 	dring_info_t		*dp, *dpp;
8592 	lane_t			*lp = NULL;
8593 	int			rv = 0;
8594 
8595 	ASSERT(ldcp != NULL);
8596 
8597 	D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id);
8598 
8599 	if (dir == INBOUND) {
8600 		D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane"
8601 		    " of channel %lld", __func__, ldcp->ldc_id);
8602 		lp = &ldcp->lane_in;
8603 	} else {
8604 		D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane"
8605 		    " of channel %lld", __func__, ldcp->ldc_id);
8606 		lp = &ldcp->lane_out;
8607 	}
8608 
8609 	lp->lstate = VSW_LANE_INACTIV;
8610 	mutex_enter(&lp->seq_lock);
8611 	lp->seq_num = VNET_ISS;
8612 	mutex_exit(&lp->seq_lock);
8613 	if (lp->dringp) {
8614 		if (dir == INBOUND) {
8615 			WRITE_ENTER(&lp->dlistrw);
8616 			dp = lp->dringp;
8617 			while (dp != NULL) {
8618 				dpp = dp->next;
8619 				if (dp->handle != NULL)
8620 					(void) ldc_mem_dring_unmap(dp->handle);
8621 				kmem_free(dp, sizeof (dring_info_t));
8622 				dp = dpp;
8623 			}
8624 			RW_EXIT(&lp->dlistrw);
8625 		} else {
8626 			/*
8627 			 * unbind, destroy exported dring, free dring struct
8628 			 */
8629 			WRITE_ENTER(&lp->dlistrw);
8630 			dp = lp->dringp;
8631 			rv = vsw_free_ring(dp);
8632 			RW_EXIT(&lp->dlistrw);
8633 		}
8634 		if (rv == 0) {
8635 			lp->dringp = NULL;
8636 		}
8637 	}
8638 
8639 	D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id);
8640 }
8641 
8642 /*
8643  * Free ring and all associated resources.
8644  *
8645  * Should be called with dlistrw rwlock held as writer.
8646  */
8647 static int
8648 vsw_free_ring(dring_info_t *dp)
8649 {
8650 	vsw_private_desc_t	*paddr = NULL;
8651 	dring_info_t		*dpp;
8652 	int			i, rv = 1;
8653 
8654 	while (dp != NULL) {
8655 		mutex_enter(&dp->dlock);
8656 		dpp = dp->next;
8657 		if (dp->priv_addr != NULL) {
8658 			/*
8659 			 * First unbind and free the memory handles
8660 			 * stored in each descriptor within the ring.
8661 			 */
8662 			for (i = 0; i < VSW_RING_NUM_EL; i++) {
8663 				paddr = (vsw_private_desc_t *)
8664 				    dp->priv_addr + i;
8665 				if (paddr->memhandle != NULL) {
8666 					if (paddr->bound == 1) {
8667 						rv = ldc_mem_unbind_handle(
8668 						    paddr->memhandle);
8669 
8670 						if (rv != 0) {
8671 							DERR(NULL, "error "
8672 							"unbinding handle for "
8673 							"ring 0x%llx at pos %d",
8674 							    dp, i);
8675 							mutex_exit(&dp->dlock);
8676 							return (rv);
8677 						}
8678 						paddr->bound = 0;
8679 					}
8680 
8681 					rv = ldc_mem_free_handle(
8682 					    paddr->memhandle);
8683 					if (rv != 0) {
8684 						DERR(NULL, "error freeing "
8685 						    "handle for ring 0x%llx "
8686 						    "at pos %d", dp, i);
8687 						mutex_exit(&dp->dlock);
8688 						return (rv);
8689 					}
8690 					paddr->memhandle = NULL;
8691 				}
8692 				mutex_destroy(&paddr->dstate_lock);
8693 			}
8694 			kmem_free(dp->priv_addr,
8695 			    (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL));
8696 		}
8697 
8698 		/*
8699 		 * Now unbind and destroy the ring itself.
8700 		 */
8701 		if (dp->handle != NULL) {
8702 			(void) ldc_mem_dring_unbind(dp->handle);
8703 			(void) ldc_mem_dring_destroy(dp->handle);
8704 		}
8705 
8706 		if (dp->data_addr != NULL) {
8707 			kmem_free(dp->data_addr, dp->data_sz);
8708 		}
8709 
8710 		mutex_exit(&dp->dlock);
8711 		mutex_destroy(&dp->dlock);
8712 		mutex_destroy(&dp->restart_lock);
8713 		kmem_free(dp, sizeof (dring_info_t));
8714 
8715 		dp = dpp;
8716 	}
8717 	return (0);
8718 }
8719 
8720 /*
8721  * Debugging routines
8722  */
8723 static void
8724 display_state(void)
8725 {
8726 	vsw_t		*vswp;
8727 	vsw_port_list_t	*plist;
8728 	vsw_port_t 	*port;
8729 	vsw_ldc_list_t	*ldcl;
8730 	vsw_ldc_t 	*ldcp;
8731 
8732 	cmn_err(CE_NOTE, "***** system state *****");
8733 
8734 	for (vswp = vsw_head; vswp; vswp = vswp->next) {
8735 		plist = &vswp->plist;
8736 		READ_ENTER(&plist->lockrw);
8737 		cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n",
8738 		    vswp->instance, plist->num_ports);
8739 
8740 		for (port = plist->head; port != NULL; port = port->p_next) {
8741 			ldcl = &port->p_ldclist;
8742 			cmn_err(CE_CONT, "port %d : %d ldcs attached\n",
8743 			    port->p_instance, ldcl->num_ldcs);
8744 			READ_ENTER(&ldcl->lockrw);
8745 			ldcp = ldcl->head;
8746 			for (; ldcp != NULL; ldcp = ldcp->ldc_next) {
8747 				cmn_err(CE_CONT, "chan %lu : dev %d : "
8748 				    "status %d : phase %u\n",
8749 				    ldcp->ldc_id, ldcp->dev_class,
8750 				    ldcp->ldc_status, ldcp->hphase);
8751 				cmn_err(CE_CONT, "chan %lu : lsession %lu : "
8752 				    "psession %lu\n", ldcp->ldc_id,
8753 				    ldcp->local_session, ldcp->peer_session);
8754 
8755 				cmn_err(CE_CONT, "Inbound lane:\n");
8756 				display_lane(&ldcp->lane_in);
8757 				cmn_err(CE_CONT, "Outbound lane:\n");
8758 				display_lane(&ldcp->lane_out);
8759 			}
8760 			RW_EXIT(&ldcl->lockrw);
8761 		}
8762 		RW_EXIT(&plist->lockrw);
8763 	}
8764 	cmn_err(CE_NOTE, "***** system state *****");
8765 }
8766 
8767 static void
8768 display_lane(lane_t *lp)
8769 {
8770 	dring_info_t	*drp;
8771 
8772 	cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n",
8773 	    lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu);
8774 	cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n",
8775 	    lp->addr_type, lp->addr, lp->xfer_mode);
8776 	cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp);
8777 
8778 	cmn_err(CE_CONT, "Dring info:\n");
8779 	for (drp = lp->dringp; drp != NULL; drp = drp->next) {
8780 		cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n",
8781 		    drp->num_descriptors, drp->descriptor_size);
8782 		cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->handle);
8783 		cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n",
8784 		    (uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr);
8785 		cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n",
8786 		    drp->ident, drp->end_idx);
8787 		display_ring(drp);
8788 	}
8789 }
8790 
8791 static void
8792 display_ring(dring_info_t *dringp)
8793 {
8794 	uint64_t		i;
8795 	uint64_t		priv_count = 0;
8796 	uint64_t		pub_count = 0;
8797 	vnet_public_desc_t	*pub_addr = NULL;
8798 	vsw_private_desc_t	*priv_addr = NULL;
8799 
8800 	for (i = 0; i < VSW_RING_NUM_EL; i++) {
8801 		if (dringp->pub_addr != NULL) {
8802 			pub_addr = (vnet_public_desc_t *)dringp->pub_addr + i;
8803 
8804 			if (pub_addr->hdr.dstate == VIO_DESC_FREE)
8805 				pub_count++;
8806 		}
8807 
8808 		if (dringp->priv_addr != NULL) {
8809 			priv_addr = (vsw_private_desc_t *)dringp->priv_addr + i;
8810 
8811 			if (priv_addr->dstate == VIO_DESC_FREE)
8812 				priv_count++;
8813 		}
8814 	}
8815 	cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n",
8816 	    i, priv_count, pub_count);
8817 }
8818 
8819 static void
8820 dump_flags(uint64_t state)
8821 {
8822 	int	i;
8823 
8824 	typedef struct flag_name {
8825 		int	flag_val;
8826 		char	*flag_name;
8827 	} flag_name_t;
8828 
8829 	flag_name_t	flags[] = {
8830 		VSW_VER_INFO_SENT, "VSW_VER_INFO_SENT",
8831 		VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV",
8832 		VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV",
8833 		VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT",
8834 		VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV",
8835 		VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT",
8836 		VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT",
8837 		VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV",
8838 		VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT",
8839 		VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV",
8840 		VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT",
8841 		VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV",
8842 		VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT",
8843 		VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV",
8844 		VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT",
8845 		VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV",
8846 		VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT",
8847 		VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV",
8848 		VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT",
8849 		VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV",
8850 		VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT",
8851 		VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV",
8852 		VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT",
8853 		VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV",
8854 		VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT",
8855 		VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV",
8856 		VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT",
8857 		VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV",
8858 		VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT",
8859 		VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV",
8860 		VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE"};
8861 
8862 	DERR(NULL, "DUMP_FLAGS: %llx\n", state);
8863 	for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) {
8864 		if (state & flags[i].flag_val)
8865 			DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name);
8866 	}
8867 }
8868