xref: /illumos-gate/usr/src/uts/sun4v/io/vsw.c (revision 3893cb7fe5bfa1c9a4f7954517a917367f6cf081)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/errno.h>
31 #include <sys/debug.h>
32 #include <sys/time.h>
33 #include <sys/sysmacros.h>
34 #include <sys/systm.h>
35 #include <sys/user.h>
36 #include <sys/stropts.h>
37 #include <sys/stream.h>
38 #include <sys/strlog.h>
39 #include <sys/strsubr.h>
40 #include <sys/cmn_err.h>
41 #include <sys/cpu.h>
42 #include <sys/kmem.h>
43 #include <sys/conf.h>
44 #include <sys/ddi.h>
45 #include <sys/sunddi.h>
46 #include <sys/ksynch.h>
47 #include <sys/stat.h>
48 #include <sys/kstat.h>
49 #include <sys/vtrace.h>
50 #include <sys/strsun.h>
51 #include <sys/dlpi.h>
52 #include <sys/ethernet.h>
53 #include <net/if.h>
54 #include <sys/varargs.h>
55 #include <sys/machsystm.h>
56 #include <sys/modctl.h>
57 #include <sys/modhash.h>
58 #include <sys/mac.h>
59 #include <sys/mac_ether.h>
60 #include <sys/taskq.h>
61 #include <sys/note.h>
62 #include <sys/mach_descrip.h>
63 #include <sys/mac.h>
64 #include <sys/mdeg.h>
65 #include <sys/ldc.h>
66 #include <sys/vsw_fdb.h>
67 #include <sys/vsw.h>
68 #include <sys/vio_mailbox.h>
69 #include <sys/vnet_mailbox.h>
70 #include <sys/vnet_common.h>
71 #include <sys/vio_util.h>
72 #include <sys/sdt.h>
73 
74 /*
75  * Function prototypes.
76  */
77 static	int vsw_attach(dev_info_t *, ddi_attach_cmd_t);
78 static	int vsw_detach(dev_info_t *, ddi_detach_cmd_t);
79 static	int vsw_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
80 static	int vsw_get_md_physname(vsw_t *, md_t *, mde_cookie_t, char *);
81 static	int vsw_get_md_smodes(vsw_t *, md_t *, mde_cookie_t, uint8_t *, int *);
82 static	int vsw_get_physaddr(vsw_t *);
83 static	int vsw_setup_switching(vsw_t *);
84 static	int vsw_setup_layer2(vsw_t *);
85 static	int vsw_setup_layer3(vsw_t *);
86 
87 /* MAC Ring table functions. */
88 static void vsw_mac_ring_tbl_init(vsw_t *vswp);
89 static void vsw_mac_ring_tbl_destroy(vsw_t *vswp);
90 static void vsw_queue_worker(vsw_mac_ring_t *rrp);
91 static void vsw_queue_stop(vsw_queue_t *vqp);
92 static vsw_queue_t *vsw_queue_create();
93 static void vsw_queue_destroy(vsw_queue_t *vqp);
94 
95 /* MAC layer routines */
96 static mac_resource_handle_t vsw_mac_ring_add_cb(void *arg,
97 		mac_resource_t *mrp);
98 static	int vsw_get_hw_maddr(vsw_t *);
99 static	int vsw_set_hw(vsw_t *, vsw_port_t *, int);
100 static	int vsw_set_hw_addr(vsw_t *, mac_multi_addr_t *);
101 static	int vsw_set_hw_promisc(vsw_t *, vsw_port_t *, int);
102 static	int vsw_unset_hw(vsw_t *, vsw_port_t *, int);
103 static	int vsw_unset_hw_addr(vsw_t *, int);
104 static	int vsw_unset_hw_promisc(vsw_t *, vsw_port_t *, int);
105 static void vsw_reconfig_hw(vsw_t *);
106 static int vsw_prog_if(vsw_t *);
107 static int vsw_prog_ports(vsw_t *);
108 static int vsw_mac_attach(vsw_t *vswp);
109 static void vsw_mac_detach(vsw_t *vswp);
110 
111 static void vsw_rx_queue_cb(void *, mac_resource_handle_t, mblk_t *);
112 static void vsw_rx_cb(void *, mac_resource_handle_t, mblk_t *);
113 static mblk_t *vsw_tx_msg(vsw_t *, mblk_t *);
114 static int vsw_mac_register(vsw_t *);
115 static int vsw_mac_unregister(vsw_t *);
116 static int vsw_m_stat(void *, uint_t, uint64_t *);
117 static void vsw_m_stop(void *arg);
118 static int vsw_m_start(void *arg);
119 static int vsw_m_unicst(void *arg, const uint8_t *);
120 static int vsw_m_multicst(void *arg, boolean_t, const uint8_t *);
121 static int vsw_m_promisc(void *arg, boolean_t);
122 static mblk_t *vsw_m_tx(void *arg, mblk_t *);
123 
124 /* MDEG routines */
125 static	int vsw_mdeg_register(vsw_t *vswp);
126 static	void vsw_mdeg_unregister(vsw_t *vswp);
127 static	int vsw_mdeg_cb(void *cb_argp, mdeg_result_t *);
128 static	int vsw_port_mdeg_cb(void *cb_argp, mdeg_result_t *);
129 static	void vsw_get_initial_md_properties(vsw_t *vswp, md_t *, mde_cookie_t);
130 static	void vsw_update_md_prop(vsw_t *, md_t *, mde_cookie_t);
131 
132 /* Port add/deletion routines */
133 static	int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
134 static	int vsw_port_attach(vsw_t *vswp, int p_instance,
135 	uint64_t *ldcids, int nids, struct ether_addr *macaddr);
136 static	int vsw_detach_ports(vsw_t *vswp);
137 static	int vsw_port_detach(vsw_t *vswp, int p_instance);
138 static	int vsw_port_delete(vsw_port_t *port);
139 static	int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id);
140 static	int vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id);
141 static	int vsw_init_ldcs(vsw_port_t *port);
142 static	int vsw_uninit_ldcs(vsw_port_t *port);
143 static	int vsw_ldc_init(vsw_ldc_t *ldcp);
144 static	int vsw_ldc_uninit(vsw_ldc_t *ldcp);
145 static	int vsw_drain_ldcs(vsw_port_t *port);
146 static	int vsw_drain_port_taskq(vsw_port_t *port);
147 static	void vsw_marker_task(void *);
148 static	vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);
149 static	int vsw_plist_del_node(vsw_t *, vsw_port_t *port);
150 
151 /* Interrupt routines */
152 static	uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg);
153 
154 /* Handshake routines */
155 static	void vsw_ldc_reinit(vsw_ldc_t *);
156 static	void vsw_process_conn_evt(vsw_ldc_t *, uint16_t);
157 static	void vsw_conn_task(void *);
158 static	int vsw_check_flag(vsw_ldc_t *, int, uint64_t);
159 static	void vsw_next_milestone(vsw_ldc_t *);
160 static	int vsw_supported_version(vio_ver_msg_t *);
161 
162 /* Data processing routines */
163 static void vsw_process_pkt(void *);
164 static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t);
165 static void vsw_process_ctrl_pkt(void *);
166 static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *);
167 static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *);
168 static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *);
169 static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *);
170 static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *);
171 static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *);
172 static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t);
173 static void vsw_process_data_dring_pkt(vsw_ldc_t *, void *);
174 static void vsw_process_data_raw_pkt(vsw_ldc_t *, void *);
175 static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *);
176 static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t);
177 
178 /* Switching/data transmit routines */
179 static	void vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
180 	    vsw_port_t *port, mac_resource_handle_t);
181 static	void vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
182 	    vsw_port_t *port, mac_resource_handle_t);
183 static	int vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller,
184 	    vsw_port_t *port);
185 static	int vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller,
186 	    vsw_port_t *port);
187 static	int vsw_portsend(vsw_port_t *, mblk_t *);
188 static	int vsw_dringsend(vsw_ldc_t *, mblk_t *);
189 static	int vsw_descrsend(vsw_ldc_t *, mblk_t *);
190 
191 /* Packet creation routines */
192 static void vsw_send_ver(void *);
193 static void vsw_send_attr(vsw_ldc_t *);
194 static vio_dring_reg_msg_t *vsw_create_dring_info_pkt(vsw_ldc_t *);
195 static void vsw_send_dring_info(vsw_ldc_t *);
196 static void vsw_send_rdx(vsw_ldc_t *);
197 
198 static int vsw_send_msg(vsw_ldc_t *, void *, int, boolean_t);
199 
200 /* Forwarding database (FDB) routines */
201 static	int vsw_add_fdb(vsw_t *vswp, vsw_port_t *port);
202 static	int vsw_del_fdb(vsw_t *vswp, vsw_port_t *port);
203 static	vsw_port_t *vsw_lookup_fdb(vsw_t *vswp, struct ether_header *);
204 static	int vsw_add_rem_mcst(vnet_mcast_msg_t *, vsw_port_t *);
205 static	int vsw_add_mcst(vsw_t *, uint8_t, uint64_t, void *);
206 static	int vsw_del_mcst(vsw_t *, uint8_t, uint64_t, void *);
207 static	void vsw_del_addr(uint8_t, void *, uint64_t);
208 static	void vsw_del_mcst_port(vsw_port_t *);
209 static	void vsw_del_mcst_vsw(vsw_t *);
210 
211 /* Dring routines */
212 static dring_info_t *vsw_create_dring(vsw_ldc_t *);
213 static void vsw_create_privring(vsw_ldc_t *);
214 static int vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp);
215 static int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **,
216     int *);
217 static dring_info_t *vsw_ident2dring(lane_t *, uint64_t);
218 
219 static void vsw_set_lane_attr(vsw_t *, lane_t *);
220 static int vsw_check_attr(vnet_attr_msg_t *, vsw_port_t *);
221 static int vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg);
222 static int vsw_mem_cookie_match(ldc_mem_cookie_t *, ldc_mem_cookie_t *);
223 static int vsw_check_dring_info(vio_dring_reg_msg_t *);
224 
225 /* Misc support routines */
226 static	caddr_t vsw_print_ethaddr(uint8_t *addr, char *ebuf);
227 static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t);
228 static int vsw_free_ring(dring_info_t *);
229 
230 /* Debugging routines */
231 static void dump_flags(uint64_t);
232 static void display_state(void);
233 static void display_lane(lane_t *);
234 static void display_ring(dring_info_t *);
235 
236 int	vsw_num_handshakes = VNET_NUM_HANDSHAKES; /* # of handshake attempts */
237 int	vsw_wretries = 100;		/* # of write attempts */
238 int	vsw_chain_len = 150;		/* max # of mblks in msg chain */
239 int	vsw_desc_delay = 0;		/* delay in us */
240 int	vsw_read_attempts = 5;		/* # of reads of descriptor */
241 
242 uint32_t	vsw_mblk_size = VSW_MBLK_SIZE;
243 uint32_t	vsw_num_mblks = VSW_NUM_MBLKS;
244 
245 static	mac_callbacks_t	vsw_m_callbacks = {
246 	0,
247 	vsw_m_stat,
248 	vsw_m_start,
249 	vsw_m_stop,
250 	vsw_m_promisc,
251 	vsw_m_multicst,
252 	vsw_m_unicst,
253 	vsw_m_tx,
254 	NULL,
255 	NULL,
256 	NULL
257 };
258 
259 static	struct	cb_ops	vsw_cb_ops = {
260 	nulldev,			/* cb_open */
261 	nulldev,			/* cb_close */
262 	nodev,				/* cb_strategy */
263 	nodev,				/* cb_print */
264 	nodev,				/* cb_dump */
265 	nodev,				/* cb_read */
266 	nodev,				/* cb_write */
267 	nodev,				/* cb_ioctl */
268 	nodev,				/* cb_devmap */
269 	nodev,				/* cb_mmap */
270 	nodev,				/* cb_segmap */
271 	nochpoll,			/* cb_chpoll */
272 	ddi_prop_op,			/* cb_prop_op */
273 	NULL,				/* cb_stream */
274 	D_MP,				/* cb_flag */
275 	CB_REV,				/* rev */
276 	nodev,				/* int (*cb_aread)() */
277 	nodev				/* int (*cb_awrite)() */
278 };
279 
280 static	struct	dev_ops	vsw_ops = {
281 	DEVO_REV,		/* devo_rev */
282 	0,			/* devo_refcnt */
283 	vsw_getinfo,		/* devo_getinfo */
284 	nulldev,		/* devo_identify */
285 	nulldev,		/* devo_probe */
286 	vsw_attach,		/* devo_attach */
287 	vsw_detach,		/* devo_detach */
288 	nodev,			/* devo_reset */
289 	&vsw_cb_ops,		/* devo_cb_ops */
290 	(struct bus_ops *)NULL,	/* devo_bus_ops */
291 	ddi_power		/* devo_power */
292 };
293 
294 extern	struct	mod_ops	mod_driverops;
295 static struct modldrv vswmodldrv = {
296 	&mod_driverops,
297 	"sun4v Virtual Switch %I%",
298 	&vsw_ops,
299 };
300 
/*
 * Acquire / release both per-channel locks of an ldc.
 *
 * LDC_ENTER_LOCK takes ldc_cblock first and then ldc_txlock;
 * LDC_EXIT_LOCK drops them in the opposite order.
 *
 * Each macro expands to a single statement (do/while (0)) so that it
 * behaves correctly as the body of an unbraced if/else. The invocation
 * must be terminated with a semicolon by the caller. Note that 'ldcp'
 * is evaluated twice.
 */
#define	LDC_ENTER_LOCK(ldcp)					\
	do {							\
		mutex_enter(&((ldcp)->ldc_cblock));		\
		mutex_enter(&((ldcp)->ldc_txlock));		\
		_NOTE(CONSTCOND)				\
	} while (0)
#define	LDC_EXIT_LOCK(ldcp)					\
	do {							\
		mutex_exit(&((ldcp)->ldc_txlock));		\
		mutex_exit(&((ldcp)->ldc_cblock));		\
		_NOTE(CONSTCOND)				\
	} while (0)
307 
308 /* Driver soft state ptr  */
309 static void	*vsw_state;
310 
311 /*
312  * Linked list of "vsw_t" structures - one per instance.
313  */
314 vsw_t		*vsw_head = NULL;
315 krwlock_t	vsw_rw;
316 
317 /*
318  * Property names
319  */
320 static char vdev_propname[] = "virtual-device";
321 static char vsw_propname[] = "virtual-network-switch";
322 static char physdev_propname[] = "vsw-phys-dev";
323 static char smode_propname[] = "vsw-switch-mode";
324 static char macaddr_propname[] = "local-mac-address";
325 static char remaddr_propname[] = "remote-mac-address";
326 static char ldcids_propname[] = "ldc-ids";
327 static char chan_propname[] = "channel-endpoint";
328 static char id_propname[] = "id";
329 static char reg_propname[] = "reg";
330 
331 /* supported versions */
332 static	ver_sup_t	vsw_versions[] = { {1, 0} };
333 
334 /*
335  * Matching criteria passed to the MDEG to register interest
336  * in changes to 'virtual-device-port' nodes identified by their
337  * 'id' property.
338  */
339 static md_prop_match_t vport_prop_match[] = {
340 	{ MDET_PROP_VAL,    "id"   },
341 	{ MDET_LIST_END,    NULL    }
342 };
343 
344 static mdeg_node_match_t vport_match = { "virtual-device-port",
345 						vport_prop_match };
346 
347 /*
348  * Matching criteria passed to the MDEG to register interest
349  * in changes to 'virtual-device' nodes (i.e. vsw nodes) identified
350  * by their 'name' and 'cfg-handle' properties.
351  */
352 static md_prop_match_t vdev_prop_match[] = {
353 	{ MDET_PROP_STR,    "name"   },
354 	{ MDET_PROP_VAL,    "cfg-handle" },
355 	{ MDET_LIST_END,    NULL    }
356 };
357 
358 static mdeg_node_match_t vdev_match = { "virtual-device",
359 						vdev_prop_match };
360 
361 
362 /*
363  * Specification of an MD node passed to the MDEG to filter any
364  * 'vport' nodes that do not belong to the specified node. This
365  * template is copied for each vsw instance and filled in with
366  * the appropriate 'cfg-handle' value before being passed to the MDEG.
367  */
368 static mdeg_prop_spec_t vsw_prop_template[] = {
369 	{ MDET_PROP_STR,    "name",		vsw_propname },
370 	{ MDET_PROP_VAL,    "cfg-handle",	NULL	},
371 	{ MDET_LIST_END,    NULL,		NULL	}
372 };
373 
374 #define	VSW_SET_MDEG_PROP_INST(specp, val)	(specp)[1].ps_val = (val);
375 
/*
 * From /etc/system enable/disable thread per ring. This is a mode
 * selection that is done at vsw driver attach time.
 */
380 boolean_t vsw_multi_ring_enable = B_FALSE;
381 int vsw_mac_rx_rings = VSW_MAC_RX_RINGS;
382 
383 /*
384  * Print debug messages - set to 0x1f to enable all msgs
385  * or 0x0 to turn all off.
386  */
387 int vswdbg = 0x0;
388 
389 /*
390  * debug levels:
391  * 0x01:	Function entry/exit tracing
392  * 0x02:	Internal function messages
393  * 0x04:	Verbose internal messages
394  * 0x08:	Warning messages
395  * 0x10:	Error messages
396  */
397 
398 static void
399 vswdebug(vsw_t *vswp, const char *fmt, ...)
400 {
401 	char buf[512];
402 	va_list ap;
403 
404 	va_start(ap, fmt);
405 	(void) vsprintf(buf, fmt, ap);
406 	va_end(ap);
407 
408 	if (vswp == NULL)
409 		cmn_err(CE_CONT, "%s\n", buf);
410 	else
411 		cmn_err(CE_CONT, "vsw%d: %s\n", vswp->instance, buf);
412 }
413 
414 /*
415  * For the moment the state dump routines have their own
416  * private flag.
417  */
418 #define	DUMP_STATE	0
419 
420 #if DUMP_STATE
421 
422 #define	DUMP_TAG(tag) \
423 {			\
424 	D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype); \
425 	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype);	\
426 	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env);	\
427 }
428 
429 #define	DUMP_TAG_PTR(tag) \
430 {			\
431 	D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype); \
432 	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype);	\
433 	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env);	\
434 }
435 
436 #define	DUMP_FLAGS(flags) dump_flags(flags);
437 #define	DISPLAY_STATE()	display_state()
438 
439 #else
440 
441 #define	DUMP_TAG(tag)
442 #define	DUMP_TAG_PTR(tag)
443 #define	DUMP_FLAGS(state)
444 #define	DISPLAY_STATE()
445 
446 #endif	/* DUMP_STATE */
447 
/*
 * Debug print macros. Each is used like a call to vswdebug():
 *
 *	D1(vswp, "%s: enter", __func__);
 *
 * Both variants below are written as "if (!enabled) { } else vswdebug"
 * rather than "if (enabled) vswdebug". The hidden 'if' therefore always
 * owns an 'else', so an 'else' that follows a D*() statement in the
 * caller binds to the caller's own 'if' and not to the macro's.
 */
#ifdef DEBUG

/* Print only when the 'mask' bit is set in the vswdbg tunable. */
#define	VSW_DBG(mask)			\
if (!(vswdbg & (mask))) {		\
} else					\
	vswdebug

#define	D1	VSW_DBG(0x01)	/* function entry/exit tracing */
#define	D2	VSW_DBG(0x02)	/* internal function messages */
#define	D3	VSW_DBG(0x04)	/* verbose internal messages */
#define	DWARN	VSW_DBG(0x08)	/* warning messages */
#define	DERR	VSW_DBG(0x10)	/* error messages */

#else

/*
 * Non-DEBUG build: the vswdebug() call is still compiled, so its
 * arguments stay type checked, but it is never executed.
 */
#define	VSW_DBG_OFF	if (1) { } else vswdebug

#define	DERR		VSW_DBG_OFF
#define	DWARN		VSW_DBG_OFF
#define	D1		VSW_DBG_OFF
#define	D2		VSW_DBG_OFF
#define	D3		VSW_DBG_OFF

#endif	/* DEBUG */
479 
480 static struct modlinkage modlinkage = {
481 	MODREV_1,
482 	&vswmodldrv,
483 	NULL
484 };
485 
486 int
487 _init(void)
488 {
489 	int status;
490 
491 	rw_init(&vsw_rw, NULL, RW_DRIVER, NULL);
492 
493 	status = ddi_soft_state_init(&vsw_state, sizeof (vsw_t), 1);
494 	if (status != 0) {
495 		return (status);
496 	}
497 
498 	mac_init_ops(&vsw_ops, "vsw");
499 	status = mod_install(&modlinkage);
500 	if (status != 0) {
501 		ddi_soft_state_fini(&vsw_state);
502 	}
503 	return (status);
504 }
505 
506 int
507 _fini(void)
508 {
509 	int status;
510 
511 	status = mod_remove(&modlinkage);
512 	if (status != 0)
513 		return (status);
514 	mac_fini_ops(&vsw_ops);
515 	ddi_soft_state_fini(&vsw_state);
516 
517 	rw_destroy(&vsw_rw);
518 
519 	return (status);
520 }
521 
522 int
523 _info(struct modinfo *modinfop)
524 {
525 	return (mod_info(&modlinkage, modinfop));
526 }
527 
528 static int
529 vsw_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
530 {
531 	vsw_t		*vswp;
532 	int		instance;
533 	char		hashname[MAXNAMELEN];
534 	char		qname[TASKQ_NAMELEN];
535 	enum		{ PROG_init = 0x00,
536 				PROG_if_lock = 0x01,
537 				PROG_fdb = 0x02,
538 				PROG_mfdb = 0x04,
539 				PROG_report_dev = 0x08,
540 				PROG_plist = 0x10,
541 				PROG_taskq = 0x20}
542 			progress;
543 
544 	progress = PROG_init;
545 
546 	switch (cmd) {
547 	case DDI_ATTACH:
548 		break;
549 	case DDI_RESUME:
550 		/* nothing to do for this non-device */
551 		return (DDI_SUCCESS);
552 	case DDI_PM_RESUME:
553 	default:
554 		return (DDI_FAILURE);
555 	}
556 
557 	instance = ddi_get_instance(dip);
558 	if (ddi_soft_state_zalloc(vsw_state, instance) != DDI_SUCCESS) {
559 		DERR(NULL, "vsw%d: ddi_soft_state_zalloc failed", instance);
560 		return (DDI_FAILURE);
561 	}
562 	vswp = ddi_get_soft_state(vsw_state, instance);
563 
564 	if (vswp == NULL) {
565 		DERR(NULL, "vsw%d: ddi_get_soft_state failed", instance);
566 		goto vsw_attach_fail;
567 	}
568 
569 	vswp->dip = dip;
570 	vswp->instance = instance;
571 	ddi_set_driver_private(dip, (caddr_t)vswp);
572 
573 	mutex_init(&vswp->hw_lock, NULL, MUTEX_DRIVER, NULL);
574 	mutex_init(&vswp->mac_lock, NULL, MUTEX_DRIVER, NULL);
575 	rw_init(&vswp->if_lockrw, NULL, RW_DRIVER, NULL);
576 	progress |= PROG_if_lock;
577 
578 	/* setup the unicast forwarding database  */
579 	(void) snprintf(hashname, MAXNAMELEN, "vsw_unicst_table-%d",
580 							vswp->instance);
581 	D2(vswp, "creating unicast hash table (%s)...", hashname);
582 	vswp->fdb = mod_hash_create_ptrhash(hashname, VSW_NCHAINS,
583 		mod_hash_null_valdtor, sizeof (void *));
584 
585 	progress |= PROG_fdb;
586 
587 	/* setup the multicast fowarding database */
588 	(void) snprintf(hashname, MAXNAMELEN, "vsw_mcst_table-%d",
589 							vswp->instance);
590 	D2(vswp, "creating multicast hash table %s)...", hashname);
591 	rw_init(&vswp->mfdbrw, NULL, RW_DRIVER, NULL);
592 	vswp->mfdb = mod_hash_create_ptrhash(hashname, VSW_NCHAINS,
593 			mod_hash_null_valdtor, sizeof (void *));
594 
595 	progress |= PROG_mfdb;
596 
597 	/*
598 	 * create lock protecting list of multicast addresses
599 	 * which could come via m_multicst() entry point when plumbed.
600 	 */
601 	mutex_init(&vswp->mca_lock, NULL, MUTEX_DRIVER, NULL);
602 	vswp->mcap = NULL;
603 
604 	ddi_report_dev(vswp->dip);
605 
606 	progress |= PROG_report_dev;
607 
608 	WRITE_ENTER(&vsw_rw);
609 	vswp->next = vsw_head;
610 	vsw_head = vswp;
611 	RW_EXIT(&vsw_rw);
612 
613 	/* setup the port list */
614 	rw_init(&vswp->plist.lockrw, NULL, RW_DRIVER, NULL);
615 	vswp->plist.head = NULL;
616 
617 	progress |= PROG_plist;
618 
619 	/*
620 	 * Create the taskq which will process all the VIO
621 	 * control messages.
622 	 */
623 	(void) snprintf(qname, TASKQ_NAMELEN, "vsw_taskq%d", vswp->instance);
624 	if ((vswp->taskq_p = ddi_taskq_create(vswp->dip, qname, 1,
625 					TASKQ_DEFAULTPRI, 0)) == NULL) {
626 		cmn_err(CE_WARN, "!vsw%d: Unable to create task queue",
627 			vswp->instance);
628 		goto vsw_attach_fail;
629 	}
630 
631 	progress |= PROG_taskq;
632 
633 	/* prevent auto-detaching */
634 	if (ddi_prop_update_int(DDI_DEV_T_NONE, vswp->dip,
635 				DDI_NO_AUTODETACH, 1) != DDI_SUCCESS) {
636 		cmn_err(CE_NOTE, "!Unable to set \"%s\" property for "
637 			"instance %u", DDI_NO_AUTODETACH, instance);
638 	}
639 
640 	/*
641 	 * Now we have everything setup, register an interest in
642 	 * specific MD nodes.
643 	 *
644 	 * The callback is invoked in 2 cases, firstly if upon mdeg
645 	 * registration there are existing nodes which match our specified
646 	 * criteria, and secondly if the MD is changed (and again, there
647 	 * are nodes which we are interested in present within it. Note
648 	 * that our callback will be invoked even if our specified nodes
649 	 * have not actually changed).
650 	 *
651 	 * Until the callback is invoked we cannot switch any pkts as
652 	 * we don't know basic information such as what mode we are
653 	 * operating in. However we expect the callback to be invoked
654 	 * immediately upon registration as this driver should only
655 	 * be attaching if there are vsw nodes in the MD.
656 	 */
657 	if (vsw_mdeg_register(vswp))
658 		goto vsw_attach_fail;
659 
660 	return (DDI_SUCCESS);
661 
662 vsw_attach_fail:
663 	DERR(NULL, "vsw_attach: failed");
664 
665 	if (progress & PROG_taskq)
666 		ddi_taskq_destroy(vswp->taskq_p);
667 
668 	if (progress & PROG_plist)
669 		rw_destroy(&vswp->plist.lockrw);
670 
671 	if (progress & PROG_report_dev) {
672 		ddi_remove_minor_node(dip, NULL);
673 		mutex_destroy(&vswp->mca_lock);
674 	}
675 
676 	if (progress & PROG_mfdb) {
677 		mod_hash_destroy_hash(vswp->mfdb);
678 		vswp->mfdb = NULL;
679 		rw_destroy(&vswp->mfdbrw);
680 	}
681 
682 	if (progress & PROG_fdb) {
683 		mod_hash_destroy_hash(vswp->fdb);
684 		vswp->fdb = NULL;
685 	}
686 
687 	if (progress & PROG_if_lock) {
688 		rw_destroy(&vswp->if_lockrw);
689 		mutex_destroy(&vswp->mac_lock);
690 		mutex_destroy(&vswp->hw_lock);
691 	}
692 
693 	ddi_soft_state_free(vsw_state, instance);
694 	return (DDI_FAILURE);
695 }
696 
697 static int
698 vsw_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
699 {
700 	vio_mblk_pool_t		*poolp, *npoolp;
701 	vsw_t			**vswpp, *vswp;
702 	int 			instance;
703 
704 	instance = ddi_get_instance(dip);
705 	vswp = ddi_get_soft_state(vsw_state, instance);
706 
707 	if (vswp == NULL) {
708 		return (DDI_FAILURE);
709 	}
710 
711 	switch (cmd) {
712 	case DDI_DETACH:
713 		break;
714 	case DDI_SUSPEND:
715 	case DDI_PM_SUSPEND:
716 	default:
717 		return (DDI_FAILURE);
718 	}
719 
720 	D2(vswp, "detaching instance %d", instance);
721 
722 	if (vswp->if_state & VSW_IF_REG) {
723 		if (vsw_mac_unregister(vswp) != 0) {
724 			cmn_err(CE_WARN, "!vsw%d: Unable to detach from "
725 				"MAC layer", vswp->instance);
726 			return (DDI_FAILURE);
727 		}
728 	}
729 
730 	vsw_mdeg_unregister(vswp);
731 
732 	/* remove mac layer callback */
733 	mutex_enter(&vswp->mac_lock);
734 	if ((vswp->mh != NULL) && (vswp->mrh != NULL)) {
735 		mac_rx_remove(vswp->mh, vswp->mrh);
736 		vswp->mrh = NULL;
737 	}
738 	mutex_exit(&vswp->mac_lock);
739 
740 	if (vsw_detach_ports(vswp) != 0) {
741 		cmn_err(CE_WARN, "!vsw%d: Unable to detach ports",
742 							vswp->instance);
743 		return (DDI_FAILURE);
744 	}
745 
746 	rw_destroy(&vswp->if_lockrw);
747 
748 	mutex_destroy(&vswp->hw_lock);
749 
750 	/*
751 	 * Now that the ports have been deleted, stop and close
752 	 * the physical device.
753 	 */
754 	mutex_enter(&vswp->mac_lock);
755 	if (vswp->mh != NULL) {
756 		if (vswp->mstarted)
757 			mac_stop(vswp->mh);
758 		if (vswp->mresources)
759 			mac_resource_set(vswp->mh, NULL, NULL);
760 		mac_close(vswp->mh);
761 
762 		vswp->mh = NULL;
763 		vswp->txinfo = NULL;
764 	}
765 	mutex_exit(&vswp->mac_lock);
766 	mutex_destroy(&vswp->mac_lock);
767 
768 	/*
769 	 * Destroy any free pools that may still exist.
770 	 */
771 	poolp = vswp->rxh;
772 	while (poolp != NULL) {
773 		npoolp = vswp->rxh = poolp->nextp;
774 		if (vio_destroy_mblks(poolp) != 0) {
775 			vswp->rxh = poolp;
776 			return (DDI_FAILURE);
777 		}
778 		poolp = npoolp;
779 	}
780 
781 	/*
782 	 * Remove this instance from any entries it may be on in
783 	 * the hash table by using the list of addresses maintained
784 	 * in the vsw_t structure.
785 	 */
786 	vsw_del_mcst_vsw(vswp);
787 
788 	vswp->mcap = NULL;
789 	mutex_destroy(&vswp->mca_lock);
790 
791 	/*
792 	 * By now any pending tasks have finished and the underlying
793 	 * ldc's have been destroyed, so its safe to delete the control
794 	 * message taskq.
795 	 */
796 	if (vswp->taskq_p != NULL)
797 		ddi_taskq_destroy(vswp->taskq_p);
798 
799 	/*
800 	 * At this stage all the data pointers in the hash table
801 	 * should be NULL, as all the ports have been removed and will
802 	 * have deleted themselves from the port lists which the data
803 	 * pointers point to. Hence we can destroy the table using the
804 	 * default destructors.
805 	 */
806 	D2(vswp, "vsw_detach: destroying hash tables..");
807 	mod_hash_destroy_hash(vswp->fdb);
808 	vswp->fdb = NULL;
809 
810 	WRITE_ENTER(&vswp->mfdbrw);
811 	mod_hash_destroy_hash(vswp->mfdb);
812 	vswp->mfdb = NULL;
813 	RW_EXIT(&vswp->mfdbrw);
814 	rw_destroy(&vswp->mfdbrw);
815 
816 	ddi_remove_minor_node(dip, NULL);
817 
818 	rw_destroy(&vswp->plist.lockrw);
819 	WRITE_ENTER(&vsw_rw);
820 	for (vswpp = &vsw_head; *vswpp; vswpp = &(*vswpp)->next) {
821 		if (*vswpp == vswp) {
822 			*vswpp = vswp->next;
823 			break;
824 		}
825 	}
826 	RW_EXIT(&vsw_rw);
827 	ddi_soft_state_free(vsw_state, instance);
828 
829 	return (DDI_SUCCESS);
830 }
831 
832 static int
833 vsw_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
834 {
835 	_NOTE(ARGUNUSED(dip))
836 
837 	vsw_t	*vswp = NULL;
838 	dev_t	dev = (dev_t)arg;
839 	int	instance;
840 
841 	instance = getminor(dev);
842 
843 	switch (infocmd) {
844 	case DDI_INFO_DEVT2DEVINFO:
845 		if ((vswp = ddi_get_soft_state(vsw_state, instance)) == NULL) {
846 			*result = NULL;
847 			return (DDI_FAILURE);
848 		}
849 		*result = vswp->dip;
850 		return (DDI_SUCCESS);
851 
852 	case DDI_INFO_DEVT2INSTANCE:
853 		*result = (void *)(uintptr_t)instance;
854 		return (DDI_SUCCESS);
855 
856 	default:
857 		*result = NULL;
858 		return (DDI_FAILURE);
859 	}
860 }
861 
862 /*
863  * Get the value of the "vsw-phys-dev" property in the specified
864  * node. This property is the name of the physical device that
865  * the virtual switch will use to talk to the outside world.
866  *
867  * Note it is valid for this property to be NULL (but the property
868  * itself must exist). Callers of this routine should verify that
869  * the value returned is what they expected (i.e. either NULL or non NULL).
870  *
871  * On success returns value of the property in region pointed to by
872  * the 'name' argument, and with return value of 0. Otherwise returns 1.
873  */
874 static int
875 vsw_get_md_physname(vsw_t *vswp, md_t *mdp, mde_cookie_t node, char *name)
876 {
877 	int	len = 0;
878 	char	*physname = NULL;
879 	char	*dev;
880 
881 	if (md_get_prop_data(mdp, node, physdev_propname,
882 				(uint8_t **)(&physname), &len) != 0) {
883 		cmn_err(CE_WARN, "!vsw%d: Unable to get name(s) of physical "
884 				"device(s) from MD", vswp->instance);
885 		return (1);
886 	} else if ((strlen(physname) + 1) > LIFNAMSIZ) {
887 		cmn_err(CE_WARN, "!vsw%d: %s is too long a device name",
888 			vswp->instance, physname);
889 		return (1);
890 	} else {
891 		(void) strncpy(name, physname, strlen(physname) + 1);
892 		D2(vswp, "%s: using first device specified (%s)",
893 			__func__, physname);
894 	}
895 
896 #ifdef DEBUG
897 	/*
898 	 * As a temporary measure to aid testing we check to see if there
899 	 * is a vsw.conf file present. If there is we use the value of the
900 	 * vsw_physname property in the file as the name of the physical
901 	 * device, overriding the value from the MD.
902 	 *
903 	 * There may be multiple devices listed, but for the moment
904 	 * we just use the first one.
905 	 */
906 	if (ddi_prop_lookup_string(DDI_DEV_T_ANY, vswp->dip, 0,
907 		"vsw_physname", &dev) == DDI_PROP_SUCCESS) {
908 		if ((strlen(dev) + 1) > LIFNAMSIZ) {
909 			cmn_err(CE_WARN, "vsw%d: %s is too long a device name",
910 				vswp->instance, dev);
911 			ddi_prop_free(dev);
912 			return (1);
913 		} else {
914 			cmn_err(CE_NOTE, "vsw%d: Using device name (%s) from "
915 				"config file", vswp->instance, dev);
916 
917 			(void) strncpy(name, dev, strlen(dev) + 1);
918 		}
919 
920 		ddi_prop_free(dev);
921 	}
922 #endif
923 
924 	return (0);
925 }
926 
927 /*
928  * Read the 'vsw-switch-mode' property from the specified MD node.
929  *
930  * Returns 0 on success and the number of modes found in 'found',
931  * otherwise returns 1.
932  */
933 static int
934 vsw_get_md_smodes(vsw_t *vswp, md_t *mdp, mde_cookie_t node,
935 						uint8_t *modes, int *found)
936 {
937 	int		len = 0;
938 	int		smode_num = 0;
939 	char		*smode = NULL;
940 	char		*curr_mode = NULL;
941 
942 	D1(vswp, "%s: enter", __func__);
943 
944 	/*
945 	 * Get the switch-mode property. The modes are listed in
946 	 * decreasing order of preference, i.e. prefered mode is
947 	 * first item in list.
948 	 */
949 	len = 0;
950 	smode_num = 0;
951 	if (md_get_prop_data(mdp, node, smode_propname,
952 				(uint8_t **)(&smode), &len) != 0) {
953 		/*
954 		 * Unable to get switch-mode property from MD, nothing
955 		 * more we can do.
956 		 */
957 		cmn_err(CE_WARN, "!vsw%d: Unable to get switch mode property"
958 			" from the MD", vswp->instance);
959 		*found = 0;
960 		return (1);
961 	}
962 
963 	curr_mode = smode;
964 	/*
965 	 * Modes of operation:
966 	 * 'switched'	 - layer 2 switching, underlying HW in
967 	 *			programmed mode.
968 	 * 'promiscuous' - layer 2 switching, underlying HW in
969 	 *			promiscuous mode.
970 	 * 'routed'	 - layer 3 (i.e. IP) routing, underlying HW
971 	 *			in non-promiscuous mode.
972 	 */
973 	while ((curr_mode < (smode + len)) && (smode_num < NUM_SMODES)) {
974 		D2(vswp, "%s: curr_mode = [%s]", __func__, curr_mode);
975 		if (strcmp(curr_mode, "switched") == 0) {
976 			modes[smode_num++] = VSW_LAYER2;
977 		} else if (strcmp(curr_mode, "promiscuous") == 0) {
978 			modes[smode_num++] = VSW_LAYER2_PROMISC;
979 		} else if (strcmp(curr_mode, "routed") == 0) {
980 			modes[smode_num++] = VSW_LAYER3;
981 		} else {
982 			cmn_err(CE_WARN, "!vsw%d: Unknown switch mode %s, "
983 				"setting to default switched mode",
984 				vswp->instance, curr_mode);
985 			modes[smode_num++] = VSW_LAYER2;
986 		}
987 		curr_mode += strlen(curr_mode) + 1;
988 	}
989 	*found = smode_num;
990 
991 	D2(vswp, "%s: %d modes found", __func__, smode_num);
992 
993 	D1(vswp, "%s: exit", __func__);
994 
995 	return (0);
996 }
997 
998 /*
999  * Get the mac address of the physical device.
1000  *
1001  * Returns 0 on success, 1 on failure.
1002  */
1003 static int
1004 vsw_get_physaddr(vsw_t *vswp)
1005 {
1006 	mac_handle_t	mh;
1007 	char		drv[LIFNAMSIZ];
1008 	uint_t		ddi_instance;
1009 
1010 	D1(vswp, "%s: enter", __func__);
1011 
1012 	if (ddi_parse(vswp->physname, drv, &ddi_instance) != DDI_SUCCESS)
1013 		return (1);
1014 
1015 	if (mac_open(vswp->physname, ddi_instance, &mh) != 0) {
1016 		cmn_err(CE_WARN, "!vsw%d: mac_open %s failed",
1017 				vswp->instance, vswp->physname);
1018 		return (1);
1019 	}
1020 
1021 	READ_ENTER(&vswp->if_lockrw);
1022 	mac_unicst_get(mh, vswp->if_addr.ether_addr_octet);
1023 	RW_EXIT(&vswp->if_lockrw);
1024 
1025 	mac_close(mh);
1026 
1027 	vswp->mdprops |= VSW_DEV_MACADDR;
1028 
1029 	D1(vswp, "%s: exit", __func__);
1030 
1031 	return (0);
1032 }
1033 
1034 /*
1035  * Check to see if the card supports the setting of multiple unicst
1036  * addresses.
1037  *
1038  * Returns 0 if card supports the programming of multiple unicast addresses,
1039  * otherwise returns 1.
1040  */
1041 static int
1042 vsw_get_hw_maddr(vsw_t *vswp)
1043 {
1044 	D1(vswp, "%s: enter", __func__);
1045 
1046 	mutex_enter(&vswp->mac_lock);
1047 	if (vswp->mh == NULL) {
1048 		mutex_exit(&vswp->mac_lock);
1049 		return (1);
1050 	}
1051 
1052 	if (!mac_capab_get(vswp->mh, MAC_CAPAB_MULTIADDRESS, &vswp->maddr)) {
1053 		cmn_err(CE_WARN, "!vsw%d: device (%s) does not support "
1054 			"setting multiple unicast addresses", vswp->instance,
1055 			vswp->physname);
1056 		mutex_exit(&vswp->mac_lock);
1057 		return (1);
1058 	}
1059 	mutex_exit(&vswp->mac_lock);
1060 
1061 	D2(vswp, "%s: %d addrs : %d free", __func__,
1062 		vswp->maddr.maddr_naddr, vswp->maddr.maddr_naddrfree);
1063 
1064 	D1(vswp, "%s: exit", __func__);
1065 
1066 	return (0);
1067 }
1068 
1069 /*
1070  * Setup the required switching mode.
1071  *
1072  * Returns 0 on success, 1 on failure.
1073  */
1074 static int
1075 vsw_setup_switching(vsw_t *vswp)
1076 {
1077 	int	i, rv = 1;
1078 
1079 	D1(vswp, "%s: enter", __func__);
1080 
1081 	/* select best switching mode */
1082 	for (i = 0; i < vswp->smode_num; i++) {
1083 		vswp->smode_idx = i;
1084 		switch (vswp->smode[i]) {
1085 		case VSW_LAYER2:
1086 		case VSW_LAYER2_PROMISC:
1087 			rv = vsw_setup_layer2(vswp);
1088 			break;
1089 
1090 		case VSW_LAYER3:
1091 			rv = vsw_setup_layer3(vswp);
1092 			break;
1093 
1094 		default:
1095 			DERR(vswp, "unknown switch mode");
1096 			rv = 1;
1097 			break;
1098 		}
1099 
1100 		if (rv == 0)
1101 			break;
1102 	}
1103 
1104 	if (rv == 1) {
1105 		cmn_err(CE_WARN, "!vsw%d: Unable to setup specified "
1106 			"switching mode", vswp->instance);
1107 		return (rv);
1108 	}
1109 
1110 	D2(vswp, "%s: Operating in mode %d", __func__,
1111 					vswp->smode[vswp->smode_idx]);
1112 
1113 	D1(vswp, "%s: exit", __func__);
1114 
1115 	return (0);
1116 }
1117 
1118 /*
1119  * Setup for layer 2 switching.
1120  *
1121  * Returns 0 on success, 1 on failure.
1122  */
1123 static int
1124 vsw_setup_layer2(vsw_t *vswp)
1125 {
1126 	D1(vswp, "%s: enter", __func__);
1127 
1128 	vswp->vsw_switch_frame = vsw_switch_l2_frame;
1129 
1130 	/*
1131 	 * Attempt to link into the MAC layer so we can get
1132 	 * and send packets out over the physical adapter.
1133 	 */
1134 	if (vswp->mdprops & VSW_MD_PHYSNAME) {
1135 		if (vsw_mac_attach(vswp) != 0) {
1136 			/*
1137 			 * Registration with the MAC layer has failed,
1138 			 * so return 1 so that can fall back to next
1139 			 * prefered switching method.
1140 			 */
1141 			cmn_err(CE_WARN, "!vsw%d: Unable to join as MAC layer "
1142 				"client", vswp->instance);
1143 			return (1);
1144 		}
1145 
1146 		if (vswp->smode[vswp->smode_idx] == VSW_LAYER2) {
1147 			/*
1148 			 * Verify that underlying device can support multiple
1149 			 * unicast mac addresses.
1150 			 */
1151 			if (vsw_get_hw_maddr(vswp) != 0) {
1152 				cmn_err(CE_WARN, "!vsw%d: Unable to setup "
1153 					"layer2 switching", vswp->instance);
1154 				vsw_mac_detach(vswp);
1155 				return (1);
1156 			}
1157 		}
1158 
1159 	} else {
1160 		/*
1161 		 * No physical device name found in MD which is
1162 		 * required for layer 2.
1163 		 */
1164 		cmn_err(CE_WARN, "!vsw%d: no physical device name specified",
1165 			vswp->instance);
1166 		return (1);
1167 	}
1168 
1169 	D1(vswp, "%s: exit", __func__);
1170 
1171 	return (0);
1172 }
1173 
1174 static int
1175 vsw_setup_layer3(vsw_t *vswp)
1176 {
1177 	D1(vswp, "%s: enter", __func__);
1178 
1179 	D2(vswp, "%s: operating in layer 3 mode", __func__);
1180 	vswp->vsw_switch_frame = vsw_switch_l3_frame;
1181 
1182 	D1(vswp, "%s: exit", __func__);
1183 
1184 	return (0);
1185 }
1186 
1187 /*
1188  * Link into the MAC layer to gain access to the services provided by
1189  * the underlying physical device driver (which should also have
1190  * registered with the MAC layer).
1191  *
1192  * Only when in layer 2 mode.
1193  */
1194 static int
1195 vsw_mac_attach(vsw_t *vswp)
1196 {
1197 	char	drv[LIFNAMSIZ];
1198 	uint_t	ddi_instance;
1199 
1200 	D1(vswp, "%s: enter", __func__);
1201 
1202 	ASSERT(vswp->mh == NULL);
1203 	ASSERT(vswp->mrh == NULL);
1204 	ASSERT(vswp->mstarted == B_FALSE);
1205 	ASSERT(vswp->mresources == B_FALSE);
1206 
1207 	ASSERT(vswp->mdprops & VSW_MD_PHYSNAME);
1208 
1209 	mutex_enter(&vswp->mac_lock);
1210 	if (ddi_parse(vswp->physname, drv, &ddi_instance) != DDI_SUCCESS) {
1211 		cmn_err(CE_WARN, "!vsw%d: invalid device name: %s",
1212 			vswp->instance, vswp->physname);
1213 		goto mac_fail_exit;
1214 	}
1215 
1216 	if ((mac_open(vswp->physname, ddi_instance, &vswp->mh)) != 0) {
1217 		cmn_err(CE_WARN, "!vsw%d: mac_open %s failed",
1218 			vswp->instance, vswp->physname);
1219 		goto mac_fail_exit;
1220 	}
1221 
1222 	ASSERT(vswp->mh != NULL);
1223 
1224 	D2(vswp, "vsw_mac_attach: using device %s", vswp->physname);
1225 
1226 	if (vsw_multi_ring_enable) {
1227 		/*
1228 		 * Initialize the ring table.
1229 		 */
1230 		vsw_mac_ring_tbl_init(vswp);
1231 
1232 		/*
1233 		 * Register our rx callback function.
1234 		 */
1235 		vswp->mrh = mac_rx_add(vswp->mh,
1236 			vsw_rx_queue_cb, (void *)vswp);
1237 		ASSERT(vswp->mrh != NULL);
1238 
1239 		/*
1240 		 * Register our mac resource callback.
1241 		 */
1242 		mac_resource_set(vswp->mh, vsw_mac_ring_add_cb, (void *)vswp);
1243 		vswp->mresources = B_TRUE;
1244 
1245 		/*
1246 		 * Get the ring resources available to us from
1247 		 * the mac below us.
1248 		 */
1249 		mac_resources(vswp->mh);
1250 	} else {
1251 		/*
1252 		 * Just register our rx callback function
1253 		 */
1254 		vswp->mrh = mac_rx_add(vswp->mh, vsw_rx_cb, (void *)vswp);
1255 		ASSERT(vswp->mrh != NULL);
1256 	}
1257 
1258 	/* Get the MAC tx fn */
1259 	vswp->txinfo = mac_tx_get(vswp->mh);
1260 
1261 	/* start the interface */
1262 	if (mac_start(vswp->mh) != 0) {
1263 		cmn_err(CE_WARN, "!vsw%d: Could not start mac interface",
1264 			vswp->instance);
1265 		goto mac_fail_exit;
1266 	}
1267 
1268 	mutex_exit(&vswp->mac_lock);
1269 
1270 	vswp->mstarted = B_TRUE;
1271 
1272 	D1(vswp, "%s: exit", __func__);
1273 	return (0);
1274 
1275 mac_fail_exit:
1276 	mutex_exit(&vswp->mac_lock);
1277 	vsw_mac_detach(vswp);
1278 
1279 	D1(vswp, "%s: exit", __func__);
1280 	return (1);
1281 }
1282 
1283 static void
1284 vsw_mac_detach(vsw_t *vswp)
1285 {
1286 	D1(vswp, "vsw_mac_detach: enter");
1287 
1288 	ASSERT(vswp != NULL);
1289 
1290 	if (vsw_multi_ring_enable) {
1291 		vsw_mac_ring_tbl_destroy(vswp);
1292 	}
1293 
1294 	mutex_enter(&vswp->mac_lock);
1295 
1296 	if (vswp->mh != NULL) {
1297 		if (vswp->mstarted)
1298 			mac_stop(vswp->mh);
1299 		if (vswp->mrh != NULL)
1300 			mac_rx_remove(vswp->mh, vswp->mrh);
1301 		if (vswp->mresources)
1302 			mac_resource_set(vswp->mh, NULL, NULL);
1303 		mac_close(vswp->mh);
1304 	}
1305 
1306 	vswp->mrh = NULL;
1307 	vswp->mh = NULL;
1308 	vswp->txinfo = NULL;
1309 	vswp->mstarted = B_FALSE;
1310 
1311 	mutex_exit(&vswp->mac_lock);
1312 
1313 	D1(vswp, "vsw_mac_detach: exit");
1314 }
1315 
1316 /*
1317  * Depending on the mode specified, the capabilites and capacity
1318  * of the underlying device setup the physical device.
1319  *
1320  * If in layer 3 mode, then do nothing.
1321  *
1322  * If in layer 2 programmed mode attempt to program the unicast address
1323  * associated with the port into the physical device. If this is not
1324  * possible due to resource exhaustion or simply because the device does
1325  * not support multiple unicast addresses then if required fallback onto
1326  * putting the card into promisc mode.
1327  *
1328  * If in promisc mode then simply set the card into promisc mode.
1329  *
1330  * Returns 0 success, 1 on failure.
1331  */
1332 static int
1333 vsw_set_hw(vsw_t *vswp, vsw_port_t *port, int type)
1334 {
1335 	mac_multi_addr_t	mac_addr;
1336 	int			err;
1337 
1338 	D1(vswp, "%s: enter", __func__);
1339 
1340 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
1341 	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
1342 
1343 	if (vswp->smode[vswp->smode_idx] == VSW_LAYER3)
1344 		return (0);
1345 
1346 	if (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC) {
1347 		return (vsw_set_hw_promisc(vswp, port, type));
1348 	}
1349 
1350 	/*
1351 	 * Attempt to program the unicast address into the HW.
1352 	 */
1353 	mac_addr.mma_addrlen = ETHERADDRL;
1354 	if (type == VSW_VNETPORT) {
1355 		ASSERT(port != NULL);
1356 		ether_copy(&port->p_macaddr, &mac_addr.mma_addr);
1357 	} else {
1358 		READ_ENTER(&vswp->if_lockrw);
1359 		/*
1360 		 * Don't program if the interface is not UP. This
1361 		 * is possible if the address has just been changed
1362 		 * in the MD node, but the interface has not yet been
1363 		 * plumbed.
1364 		 */
1365 		if (!(vswp->if_state & VSW_IF_UP)) {
1366 			RW_EXIT(&vswp->if_lockrw);
1367 			return (0);
1368 		}
1369 		ether_copy(&vswp->if_addr, &mac_addr.mma_addr);
1370 		RW_EXIT(&vswp->if_lockrw);
1371 	}
1372 
1373 	err = vsw_set_hw_addr(vswp, &mac_addr);
1374 	if (err != 0) {
1375 		/*
1376 		 * Mark that attempt should be made to re-config sometime
1377 		 * in future if a port is deleted.
1378 		 */
1379 		vswp->recfg_reqd = B_TRUE;
1380 
1381 		/*
1382 		 * Only 1 mode specified, nothing more to do.
1383 		 */
1384 		if (vswp->smode_num == 1)
1385 			return (err);
1386 
1387 		/*
1388 		 * If promiscuous was next mode specified try to
1389 		 * set the card into that mode.
1390 		 */
1391 		if ((vswp->smode_idx <= (vswp->smode_num - 2)) &&
1392 			(vswp->smode[vswp->smode_idx + 1]
1393 					== VSW_LAYER2_PROMISC)) {
1394 			vswp->smode_idx += 1;
1395 			return (vsw_set_hw_promisc(vswp, port, type));
1396 		}
1397 		return (err);
1398 	}
1399 
1400 	if (type == VSW_VNETPORT) {
1401 		port->addr_slot = mac_addr.mma_slot;
1402 		port->addr_set = VSW_ADDR_HW;
1403 	} else {
1404 		vswp->addr_slot = mac_addr.mma_slot;
1405 		vswp->addr_set = VSW_ADDR_HW;
1406 	}
1407 
1408 	D2(vswp, "programmed addr %x:%x:%x:%x:%x:%x into slot %d "
1409 		"of device %s",
1410 		mac_addr.mma_addr[0], mac_addr.mma_addr[1],
1411 		mac_addr.mma_addr[2], mac_addr.mma_addr[3],
1412 		mac_addr.mma_addr[4], mac_addr.mma_addr[5],
1413 		mac_addr.mma_slot, vswp->physname);
1414 
1415 	D1(vswp, "%s: exit", __func__);
1416 
1417 	return (0);
1418 }
1419 
1420 /*
1421  * If in layer 3 mode do nothing.
1422  *
1423  * If in layer 2 switched mode remove the address from the physical
1424  * device.
1425  *
1426  * If in layer 2 promiscuous mode disable promisc mode.
1427  *
1428  * Returns 0 on success.
1429  */
1430 static int
1431 vsw_unset_hw(vsw_t *vswp, vsw_port_t *port, int type)
1432 {
1433 	mac_addr_slot_t	slot;
1434 	int		rv;
1435 
1436 	D1(vswp, "%s: enter", __func__);
1437 
1438 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
1439 
1440 	if (vswp->smode[vswp->smode_idx] == VSW_LAYER3)
1441 		return (0);
1442 
1443 	switch (type) {
1444 	case VSW_VNETPORT:
1445 		ASSERT(port != NULL);
1446 
1447 		if (port->addr_set == VSW_ADDR_PROMISC) {
1448 			return (vsw_unset_hw_promisc(vswp, port, type));
1449 
1450 		} else if (port->addr_set == VSW_ADDR_HW) {
1451 			slot = port->addr_slot;
1452 			if ((rv = vsw_unset_hw_addr(vswp, slot)) == 0)
1453 				port->addr_set = VSW_ADDR_UNSET;
1454 		}
1455 
1456 		break;
1457 
1458 	case VSW_LOCALDEV:
1459 		if (vswp->addr_set == VSW_ADDR_PROMISC) {
1460 			return (vsw_unset_hw_promisc(vswp, NULL, type));
1461 
1462 		} else if (vswp->addr_set == VSW_ADDR_HW) {
1463 			slot = vswp->addr_slot;
1464 			if ((rv = vsw_unset_hw_addr(vswp, slot)) == 0)
1465 				vswp->addr_set = VSW_ADDR_UNSET;
1466 		}
1467 
1468 		break;
1469 
1470 	default:
1471 		/* should never happen */
1472 		DERR(vswp, "%s: unknown type %d", __func__, type);
1473 		ASSERT(0);
1474 		return (1);
1475 	}
1476 
1477 	D1(vswp, "%s: exit", __func__);
1478 	return (rv);
1479 }
1480 
1481 /*
1482  * Attempt to program a unicast address into HW.
1483  *
1484  * Returns 0 on sucess, 1 on failure.
1485  */
1486 static int
1487 vsw_set_hw_addr(vsw_t *vswp, mac_multi_addr_t *mac)
1488 {
1489 	void	*mah;
1490 	int	rv;
1491 
1492 	D1(vswp, "%s: enter", __func__);
1493 
1494 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
1495 
1496 	if (vswp->maddr.maddr_handle == NULL)
1497 		return (1);
1498 
1499 	mah = vswp->maddr.maddr_handle;
1500 
1501 	rv = vswp->maddr.maddr_add(mah, mac);
1502 
1503 	if (rv == 0)
1504 		return (0);
1505 
1506 	/*
1507 	 * Its okay for the add to fail because we have exhausted
1508 	 * all the resouces in the hardware device. Any other error
1509 	 * we want to flag.
1510 	 */
1511 	if (rv != ENOSPC) {
1512 		cmn_err(CE_WARN, "!vsw%d: error programming "
1513 			"address %x:%x:%x:%x:%x:%x into HW "
1514 			"err (%d)", vswp->instance,
1515 			mac->mma_addr[0], mac->mma_addr[1],
1516 			mac->mma_addr[2], mac->mma_addr[3],
1517 			mac->mma_addr[4], mac->mma_addr[5], rv);
1518 	}
1519 	D1(vswp, "%s: exit", __func__);
1520 	return (1);
1521 }
1522 
1523 /*
1524  * Remove a unicast mac address which has previously been programmed
1525  * into HW.
1526  *
1527  * Returns 0 on sucess, 1 on failure.
1528  */
1529 static int
1530 vsw_unset_hw_addr(vsw_t *vswp, int slot)
1531 {
1532 	void	*mah;
1533 	int	rv;
1534 
1535 	D1(vswp, "%s: enter", __func__);
1536 
1537 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
1538 	ASSERT(slot >= 0);
1539 
1540 	if (vswp->maddr.maddr_handle == NULL)
1541 		return (1);
1542 
1543 	mah = vswp->maddr.maddr_handle;
1544 
1545 	rv = vswp->maddr.maddr_remove(mah, slot);
1546 	if (rv != 0) {
1547 		cmn_err(CE_WARN, "!vsw%d: unable to remove address "
1548 			"from slot %d in device %s (err %d)",
1549 			vswp->instance, slot, vswp->physname, rv);
1550 		return (1);
1551 	}
1552 
1553 	D2(vswp, "removed addr from slot %d in device %s",
1554 		slot, vswp->physname);
1555 
1556 	D1(vswp, "%s: exit", __func__);
1557 	return (0);
1558 }
1559 
1560 /*
1561  * Set network card into promisc mode.
1562  *
1563  * Returns 0 on success, 1 on failure.
1564  */
1565 static int
1566 vsw_set_hw_promisc(vsw_t *vswp, vsw_port_t *port, int type)
1567 {
1568 	D1(vswp, "%s: enter", __func__);
1569 
1570 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
1571 	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
1572 
1573 	mutex_enter(&vswp->mac_lock);
1574 	if (vswp->mh == NULL) {
1575 		mutex_exit(&vswp->mac_lock);
1576 		return (1);
1577 	}
1578 
1579 	if (vswp->promisc_cnt++ == 0) {
1580 		if (mac_promisc_set(vswp->mh, B_TRUE, MAC_DEVPROMISC) != 0) {
1581 			vswp->promisc_cnt--;
1582 			mutex_exit(&vswp->mac_lock);
1583 			return (1);
1584 		}
1585 		cmn_err(CE_NOTE, "!vsw%d: switching device %s into "
1586 			"promiscuous mode", vswp->instance, vswp->physname);
1587 	}
1588 	mutex_exit(&vswp->mac_lock);
1589 
1590 	if (type == VSW_VNETPORT) {
1591 		ASSERT(port != NULL);
1592 		port->addr_set = VSW_ADDR_PROMISC;
1593 	} else {
1594 		vswp->addr_set = VSW_ADDR_PROMISC;
1595 	}
1596 
1597 	D1(vswp, "%s: exit", __func__);
1598 
1599 	return (0);
1600 }
1601 
1602 /*
1603  * Turn off promiscuous mode on network card.
1604  *
1605  * Returns 0 on success, 1 on failure.
1606  */
1607 static int
1608 vsw_unset_hw_promisc(vsw_t *vswp, vsw_port_t *port, int type)
1609 {
1610 	vsw_port_list_t 	*plist = &vswp->plist;
1611 
1612 	D2(vswp, "%s: enter", __func__);
1613 
1614 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
1615 	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
1616 
1617 	mutex_enter(&vswp->mac_lock);
1618 	if (vswp->mh == NULL) {
1619 		mutex_exit(&vswp->mac_lock);
1620 		return (1);
1621 	}
1622 
1623 	if (--vswp->promisc_cnt == 0) {
1624 		if (mac_promisc_set(vswp->mh, B_FALSE, MAC_DEVPROMISC) != 0) {
1625 			vswp->promisc_cnt++;
1626 			mutex_exit(&vswp->mac_lock);
1627 			return (1);
1628 		}
1629 
1630 		/*
1631 		 * We are exiting promisc mode either because we were
1632 		 * only in promisc mode because we had failed over from
1633 		 * switched mode due to HW resource issues, or the user
1634 		 * wanted the card in promisc mode for all the ports and
1635 		 * the last port is now being deleted. Tweak the message
1636 		 * accordingly.
1637 		 */
1638 		if (plist->num_ports != 0) {
1639 			cmn_err(CE_NOTE, "!vsw%d: switching device %s back to "
1640 				"programmed mode", vswp->instance,
1641 				vswp->physname);
1642 		} else {
1643 			cmn_err(CE_NOTE, "!vsw%d: switching device %s out of "
1644 				"promiscuous mode", vswp->instance,
1645 				vswp->physname);
1646 		}
1647 	}
1648 	mutex_exit(&vswp->mac_lock);
1649 
1650 	if (type == VSW_VNETPORT) {
1651 		ASSERT(port != NULL);
1652 		ASSERT(port->addr_set == VSW_ADDR_PROMISC);
1653 		port->addr_set = VSW_ADDR_UNSET;
1654 	} else {
1655 		ASSERT(vswp->addr_set == VSW_ADDR_PROMISC);
1656 		vswp->addr_set = VSW_ADDR_UNSET;
1657 	}
1658 
1659 	D1(vswp, "%s: exit", __func__);
1660 	return (0);
1661 }
1662 
1663 /*
1664  * Determine whether or not we are operating in our prefered
1665  * mode and if not whether the physical resources now allow us
1666  * to operate in it.
1667  *
1668  * If a port is being removed should only be invoked after port has been
1669  * removed from the port list.
1670  */
1671 static void
1672 vsw_reconfig_hw(vsw_t *vswp)
1673 {
1674 	int			s_idx;
1675 
1676 	D1(vswp, "%s: enter", __func__);
1677 
1678 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
1679 
1680 	if (vswp->maddr.maddr_handle == NULL) {
1681 		return;
1682 	}
1683 
1684 	/*
1685 	 * If we are in layer 2 (i.e. switched) or would like to be
1686 	 * in layer 2 then check if any ports or the vswitch itself
1687 	 * need to be programmed into the HW.
1688 	 *
1689 	 * This can happen in two cases - switched was specified as
1690 	 * the prefered mode of operation but we exhausted the HW
1691 	 * resources and so failed over to the next specifed mode,
1692 	 * or switched was the only mode specified so after HW
1693 	 * resources were exhausted there was nothing more we
1694 	 * could do.
1695 	 */
1696 	if (vswp->smode_idx > 0)
1697 		s_idx = vswp->smode_idx - 1;
1698 	else
1699 		s_idx = vswp->smode_idx;
1700 
1701 	if (vswp->smode[s_idx] != VSW_LAYER2) {
1702 		return;
1703 	}
1704 
1705 	D2(vswp, "%s: attempting reconfig..", __func__);
1706 
1707 	/*
1708 	 * First, attempt to set the vswitch mac address into HW,
1709 	 * if required.
1710 	 */
1711 	if (vsw_prog_if(vswp)) {
1712 		return;
1713 	}
1714 
1715 	/*
1716 	 * Next, attempt to set any ports which have not yet been
1717 	 * programmed into HW.
1718 	 */
1719 	if (vsw_prog_ports(vswp)) {
1720 		return;
1721 	}
1722 
1723 	/*
1724 	 * By now we know that have programmed all desired ports etc
1725 	 * into HW, so safe to mark reconfiguration as complete.
1726 	 */
1727 	vswp->recfg_reqd = B_FALSE;
1728 
1729 	vswp->smode_idx = s_idx;
1730 
1731 	D1(vswp, "%s: exit", __func__);
1732 }
1733 
1734 /*
1735  * Check to see if vsw itself is plumbed, and if so whether or not
1736  * its mac address should be written into HW.
1737  *
1738  * Returns 0 if could set address, or didn't have to set it.
1739  * Returns 1 if failed to set address.
1740  */
1741 static int
1742 vsw_prog_if(vsw_t *vswp)
1743 {
1744 	mac_multi_addr_t	addr;
1745 
1746 	D1(vswp, "%s: enter", __func__);
1747 
1748 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
1749 
1750 	READ_ENTER(&vswp->if_lockrw);
1751 	if ((vswp->if_state & VSW_IF_UP) &&
1752 		(vswp->addr_set != VSW_ADDR_HW)) {
1753 
1754 		addr.mma_addrlen = ETHERADDRL;
1755 		ether_copy(&vswp->if_addr, &addr.mma_addr);
1756 
1757 		if (vsw_set_hw_addr(vswp, &addr) != 0) {
1758 			RW_EXIT(&vswp->if_lockrw);
1759 			return (1);
1760 		}
1761 
1762 		vswp->addr_slot = addr.mma_slot;
1763 
1764 		/*
1765 		 * If previously when plumbed had had to place
1766 		 * interface into promisc mode, now reverse that.
1767 		 *
1768 		 * Note that interface will only actually be set into
1769 		 * non-promisc mode when last port/interface has been
1770 		 * programmed into HW.
1771 		 */
1772 		if (vswp->addr_set == VSW_ADDR_PROMISC)
1773 			(void) vsw_unset_hw_promisc(vswp, NULL, VSW_LOCALDEV);
1774 
1775 		vswp->addr_set = VSW_ADDR_HW;
1776 	}
1777 	RW_EXIT(&vswp->if_lockrw);
1778 
1779 	D1(vswp, "%s: exit", __func__);
1780 	return (0);
1781 }
1782 
1783 /*
1784  * Scan the port list for any ports which have not yet been set
1785  * into HW. For those found attempt to program their mac addresses
1786  * into the physical device.
1787  *
1788  * Returns 0 if able to program all required ports (can be 0) into HW.
1789  * Returns 1 if failed to set at least one mac address.
1790  */
1791 static int
1792 vsw_prog_ports(vsw_t *vswp)
1793 {
1794 	mac_multi_addr_t	addr;
1795 	vsw_port_list_t		*plist = &vswp->plist;
1796 	vsw_port_t		*tp;
1797 	int			rv = 0;
1798 
1799 	D1(vswp, "%s: enter", __func__);
1800 
1801 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
1802 
1803 	READ_ENTER(&plist->lockrw);
1804 	for (tp = plist->head; tp != NULL; tp = tp->p_next) {
1805 		if (tp->addr_set != VSW_ADDR_HW) {
1806 			addr.mma_addrlen = ETHERADDRL;
1807 			ether_copy(&tp->p_macaddr, &addr.mma_addr);
1808 
1809 			if (vsw_set_hw_addr(vswp, &addr) != 0) {
1810 				rv = 1;
1811 				break;
1812 			}
1813 
1814 			tp->addr_slot = addr.mma_slot;
1815 
1816 			/*
1817 			 * If when this port had first attached we had
1818 			 * had to place the interface into promisc mode,
1819 			 * then now reverse that.
1820 			 *
1821 			 * Note that the interface will not actually
1822 			 * change to non-promisc mode until all ports
1823 			 * have been programmed.
1824 			 */
1825 			if (tp->addr_set == VSW_ADDR_PROMISC)
1826 				(void) vsw_unset_hw_promisc(vswp,
1827 						tp, VSW_VNETPORT);
1828 
1829 			tp->addr_set = VSW_ADDR_HW;
1830 		}
1831 	}
1832 	RW_EXIT(&plist->lockrw);
1833 
1834 	D1(vswp, "%s: exit", __func__);
1835 	return (rv);
1836 }
1837 
1838 static void
1839 vsw_mac_ring_tbl_entry_init(vsw_t *vswp, vsw_mac_ring_t *ringp)
1840 {
1841 	ringp->ring_state = VSW_MAC_RING_FREE;
1842 	ringp->ring_arg = NULL;
1843 	ringp->ring_blank = NULL;
1844 	ringp->ring_vqp = NULL;
1845 	ringp->ring_vswp = vswp;
1846 }
1847 
1848 static void
1849 vsw_mac_ring_tbl_init(vsw_t *vswp)
1850 {
1851 	int		i;
1852 
1853 	mutex_init(&vswp->mac_ring_lock, NULL, MUTEX_DRIVER, NULL);
1854 
1855 	vswp->mac_ring_tbl_sz = vsw_mac_rx_rings;
1856 	vswp->mac_ring_tbl  =
1857 		kmem_alloc(vsw_mac_rx_rings * sizeof (vsw_mac_ring_t),
1858 		KM_SLEEP);
1859 
1860 	for (i = 0; i < vswp->mac_ring_tbl_sz; i++)
1861 		vsw_mac_ring_tbl_entry_init(vswp, &vswp->mac_ring_tbl[i]);
1862 }
1863 
1864 static void
1865 vsw_mac_ring_tbl_destroy(vsw_t *vswp)
1866 {
1867 	int		i;
1868 	vsw_mac_ring_t	*ringp;
1869 
1870 	mutex_enter(&vswp->mac_ring_lock);
1871 	for (i = 0; i < vswp->mac_ring_tbl_sz; i++) {
1872 		ringp = &vswp->mac_ring_tbl[i];
1873 
1874 		if (ringp->ring_state != VSW_MAC_RING_FREE) {
1875 			/*
1876 			 * Destroy the queue.
1877 			 */
1878 			vsw_queue_stop(ringp->ring_vqp);
1879 			vsw_queue_destroy(ringp->ring_vqp);
1880 
1881 			/*
1882 			 * Re-initialize the structure.
1883 			 */
1884 			vsw_mac_ring_tbl_entry_init(vswp, ringp);
1885 		}
1886 	}
1887 	mutex_exit(&vswp->mac_ring_lock);
1888 
1889 	mutex_destroy(&vswp->mac_ring_lock);
1890 	kmem_free(vswp->mac_ring_tbl,
1891 		vswp->mac_ring_tbl_sz * sizeof (vsw_mac_ring_t));
1892 	vswp->mac_ring_tbl_sz = 0;
1893 }
1894 
1895 /*
1896  * Handle resource add callbacks from the driver below.
1897  */
1898 static mac_resource_handle_t
1899 vsw_mac_ring_add_cb(void *arg, mac_resource_t *mrp)
1900 {
1901 	vsw_t		*vswp = (vsw_t *)arg;
1902 	mac_rx_fifo_t	*mrfp = (mac_rx_fifo_t *)mrp;
1903 	vsw_mac_ring_t	*ringp;
1904 	vsw_queue_t	*vqp;
1905 	int		i;
1906 
1907 	ASSERT(vswp != NULL);
1908 	ASSERT(mrp != NULL);
1909 	ASSERT(vswp->mac_ring_tbl != NULL);
1910 
1911 	D1(vswp, "%s: enter", __func__);
1912 
1913 	/*
1914 	 * Check to make sure we have the correct resource type.
1915 	 */
1916 	if (mrp->mr_type != MAC_RX_FIFO)
1917 		return (NULL);
1918 
1919 	/*
1920 	 * Find a open entry in the ring table.
1921 	 */
1922 	mutex_enter(&vswp->mac_ring_lock);
1923 	for (i = 0; i < vswp->mac_ring_tbl_sz; i++) {
1924 		ringp = &vswp->mac_ring_tbl[i];
1925 
1926 		/*
1927 		 * Check for an empty slot, if found, then setup queue
1928 		 * and thread.
1929 		 */
1930 		if (ringp->ring_state == VSW_MAC_RING_FREE) {
1931 			/*
1932 			 * Create the queue for this ring.
1933 			 */
1934 			vqp = vsw_queue_create();
1935 
1936 			/*
1937 			 * Initialize the ring data structure.
1938 			 */
1939 			ringp->ring_vqp = vqp;
1940 			ringp->ring_arg = mrfp->mrf_arg;
1941 			ringp->ring_blank = mrfp->mrf_blank;
1942 			ringp->ring_state = VSW_MAC_RING_INUSE;
1943 
1944 			/*
1945 			 * Create the worker thread.
1946 			 */
1947 			vqp->vq_worker = thread_create(NULL, 0,
1948 				vsw_queue_worker, ringp, 0, &p0,
1949 				TS_RUN, minclsyspri);
1950 			if (vqp->vq_worker == NULL) {
1951 				vsw_queue_destroy(vqp);
1952 				vsw_mac_ring_tbl_entry_init(vswp, ringp);
1953 				ringp = NULL;
1954 			}
1955 
1956 			if (ringp != NULL) {
1957 				/*
1958 				 * Make sure thread get's running state for
1959 				 * this ring.
1960 				 */
1961 				mutex_enter(&vqp->vq_lock);
1962 				while ((vqp->vq_state != VSW_QUEUE_RUNNING) &&
1963 					(vqp->vq_state != VSW_QUEUE_DRAINED)) {
1964 					cv_wait(&vqp->vq_cv, &vqp->vq_lock);
1965 				}
1966 
1967 				/*
1968 				 * If the thread is not running, cleanup.
1969 				 */
1970 				if (vqp->vq_state == VSW_QUEUE_DRAINED) {
1971 					vsw_queue_destroy(vqp);
1972 					vsw_mac_ring_tbl_entry_init(vswp,
1973 						ringp);
1974 					ringp = NULL;
1975 				}
1976 				mutex_exit(&vqp->vq_lock);
1977 			}
1978 
1979 			mutex_exit(&vswp->mac_ring_lock);
1980 			D1(vswp, "%s: exit", __func__);
1981 			return ((mac_resource_handle_t)ringp);
1982 		}
1983 	}
1984 	mutex_exit(&vswp->mac_ring_lock);
1985 
1986 	/*
1987 	 * No slots in the ring table available.
1988 	 */
1989 	D1(vswp, "%s: exit", __func__);
1990 	return (NULL);
1991 }
1992 
1993 static void
1994 vsw_queue_stop(vsw_queue_t *vqp)
1995 {
1996 	mutex_enter(&vqp->vq_lock);
1997 
1998 	if (vqp->vq_state == VSW_QUEUE_RUNNING) {
1999 		vqp->vq_state = VSW_QUEUE_STOP;
2000 		cv_signal(&vqp->vq_cv);
2001 
2002 		while (vqp->vq_state != VSW_QUEUE_DRAINED)
2003 			cv_wait(&vqp->vq_cv, &vqp->vq_lock);
2004 	}
2005 
2006 	vqp->vq_state = VSW_QUEUE_STOPPED;
2007 
2008 	mutex_exit(&vqp->vq_lock);
2009 }
2010 
2011 static vsw_queue_t *
2012 vsw_queue_create()
2013 {
2014 	vsw_queue_t *vqp;
2015 
2016 	vqp = kmem_zalloc(sizeof (vsw_queue_t), KM_SLEEP);
2017 
2018 	mutex_init(&vqp->vq_lock, NULL, MUTEX_DRIVER, NULL);
2019 	cv_init(&vqp->vq_cv, NULL, CV_DRIVER, NULL);
2020 	vqp->vq_first = NULL;
2021 	vqp->vq_last = NULL;
2022 	vqp->vq_state = VSW_QUEUE_STOPPED;
2023 
2024 	return (vqp);
2025 }
2026 
2027 static void
2028 vsw_queue_destroy(vsw_queue_t *vqp)
2029 {
2030 	cv_destroy(&vqp->vq_cv);
2031 	mutex_destroy(&vqp->vq_lock);
2032 	kmem_free(vqp, sizeof (vsw_queue_t));
2033 }
2034 
2035 static void
2036 vsw_queue_worker(vsw_mac_ring_t *rrp)
2037 {
2038 	mblk_t		*mp;
2039 	vsw_queue_t	*vqp = rrp->ring_vqp;
2040 	vsw_t		*vswp = rrp->ring_vswp;
2041 
2042 	mutex_enter(&vqp->vq_lock);
2043 
2044 	ASSERT(vqp->vq_state == VSW_QUEUE_STOPPED);
2045 
2046 	/*
2047 	 * Set the state to running, since the thread is now active.
2048 	 */
2049 	vqp->vq_state = VSW_QUEUE_RUNNING;
2050 	cv_signal(&vqp->vq_cv);
2051 
2052 	while (vqp->vq_state == VSW_QUEUE_RUNNING) {
2053 		/*
2054 		 * Wait for work to do or the state has changed
2055 		 * to not running.
2056 		 */
2057 		while ((vqp->vq_state == VSW_QUEUE_RUNNING) &&
2058 				(vqp->vq_first == NULL)) {
2059 			cv_wait(&vqp->vq_cv, &vqp->vq_lock);
2060 		}
2061 
2062 		/*
2063 		 * Process packets that we received from the interface.
2064 		 */
2065 		if (vqp->vq_first != NULL) {
2066 			mp = vqp->vq_first;
2067 
2068 			vqp->vq_first = NULL;
2069 			vqp->vq_last = NULL;
2070 
2071 			mutex_exit(&vqp->vq_lock);
2072 
2073 			/* switch the chain of packets received */
2074 			vswp->vsw_switch_frame(vswp, mp,
2075 						VSW_PHYSDEV, NULL, NULL);
2076 
2077 			mutex_enter(&vqp->vq_lock);
2078 		}
2079 	}
2080 
2081 	/*
2082 	 * We are drained and signal we are done.
2083 	 */
2084 	vqp->vq_state = VSW_QUEUE_DRAINED;
2085 	cv_signal(&vqp->vq_cv);
2086 
2087 	/*
2088 	 * Exit lock and drain the remaining packets.
2089 	 */
2090 	mutex_exit(&vqp->vq_lock);
2091 
2092 	/*
2093 	 * Exit the thread
2094 	 */
2095 	thread_exit();
2096 }
2097 
2098 /*
2099  * static void
2100  * vsw_rx_queue_cb() - Receive callback routine when
2101  *	vsw_multi_ring_enable is non-zero.  Queue the packets
2102  *	to a packet queue for a worker thread to process.
2103  */
2104 static void
2105 vsw_rx_queue_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
2106 {
2107 	vsw_mac_ring_t	*ringp = (vsw_mac_ring_t *)mrh;
2108 	vsw_t		*vswp = (vsw_t *)arg;
2109 	vsw_queue_t	*vqp;
2110 	mblk_t		*bp, *last;
2111 
2112 	ASSERT(mrh != NULL);
2113 	ASSERT(vswp != NULL);
2114 	ASSERT(mp != NULL);
2115 
2116 	D1(vswp, "%s: enter", __func__);
2117 
2118 	/*
2119 	 * Find the last element in the mblk chain.
2120 	 */
2121 	bp = mp;
2122 	do {
2123 		last = bp;
2124 		bp = bp->b_next;
2125 	} while (bp != NULL);
2126 
2127 	/* Get the queue for the packets */
2128 	vqp = ringp->ring_vqp;
2129 
2130 	/*
2131 	 * Grab the lock such we can queue the packets.
2132 	 */
2133 	mutex_enter(&vqp->vq_lock);
2134 
2135 	if (vqp->vq_state != VSW_QUEUE_RUNNING) {
2136 		freemsg(mp);
2137 		mutex_exit(&vqp->vq_lock);
2138 		goto vsw_rx_queue_cb_exit;
2139 	}
2140 
2141 	/*
2142 	 * Add the mblk chain to the queue.  If there
2143 	 * is some mblks in the queue, then add the new
2144 	 * chain to the end.
2145 	 */
2146 	if (vqp->vq_first == NULL)
2147 		vqp->vq_first = mp;
2148 	else
2149 		vqp->vq_last->b_next = mp;
2150 
2151 	vqp->vq_last = last;
2152 
2153 	/*
2154 	 * Signal the worker thread that there is work to
2155 	 * do.
2156 	 */
2157 	cv_signal(&vqp->vq_cv);
2158 
2159 	/*
2160 	 * Let go of the lock and exit.
2161 	 */
2162 	mutex_exit(&vqp->vq_lock);
2163 
2164 vsw_rx_queue_cb_exit:
2165 	D1(vswp, "%s: exit", __func__);
2166 }
2167 
2168 /*
2169  * receive callback routine. Invoked by MAC layer when there
2170  * are pkts being passed up from physical device.
2171  *
2172  * PERF: It may be more efficient when the card is in promisc
2173  * mode to check the dest address of the pkts here (against
2174  * the FDB) rather than checking later. Needs to be investigated.
2175  */
2176 static void
2177 vsw_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
2178 {
2179 	_NOTE(ARGUNUSED(mrh))
2180 
2181 	vsw_t		*vswp = (vsw_t *)arg;
2182 
2183 	ASSERT(vswp != NULL);
2184 
2185 	D1(vswp, "vsw_rx_cb: enter");
2186 
2187 	/* switch the chain of packets received */
2188 	vswp->vsw_switch_frame(vswp, mp, VSW_PHYSDEV, NULL, NULL);
2189 
2190 	D1(vswp, "vsw_rx_cb: exit");
2191 }
2192 
2193 /*
2194  * Send a message out over the physical device via the MAC layer.
2195  *
2196  * Returns any mblks that it was unable to transmit.
2197  */
2198 static mblk_t *
2199 vsw_tx_msg(vsw_t *vswp, mblk_t *mp)
2200 {
2201 	const mac_txinfo_t	*mtp;
2202 	mblk_t			*nextp;
2203 
2204 	mutex_enter(&vswp->mac_lock);
2205 	if (vswp->mh == NULL) {
2206 		DERR(vswp, "vsw_tx_msg: dropping pkts: no tx routine avail");
2207 		mutex_exit(&vswp->mac_lock);
2208 		return (mp);
2209 	} else {
2210 		for (;;) {
2211 			nextp = mp->b_next;
2212 			mp->b_next = NULL;
2213 
2214 			mtp = vswp->txinfo;
2215 
2216 			if ((mp = mtp->mt_fn(mtp->mt_arg, mp)) != NULL) {
2217 				mp->b_next = nextp;
2218 				break;
2219 			}
2220 
2221 			if ((mp = nextp) == NULL)
2222 				break;
2223 		}
2224 	}
2225 	mutex_exit(&vswp->mac_lock);
2226 
2227 	return (mp);
2228 }
2229 
2230 /*
2231  * Register with the MAC layer as a network device, so we
2232  * can be plumbed if necessary.
2233  */
2234 static int
2235 vsw_mac_register(vsw_t *vswp)
2236 {
2237 	mac_register_t	*macp;
2238 	int		rv;
2239 
2240 	D1(vswp, "%s: enter", __func__);
2241 
2242 	if ((macp = mac_alloc(MAC_VERSION)) == NULL)
2243 		return (EINVAL);
2244 	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
2245 	macp->m_driver = vswp;
2246 	macp->m_dip = vswp->dip;
2247 	macp->m_src_addr = (uint8_t *)&vswp->if_addr;
2248 	macp->m_callbacks = &vsw_m_callbacks;
2249 	macp->m_min_sdu = 0;
2250 	macp->m_max_sdu = ETHERMTU;
2251 	rv = mac_register(macp, &vswp->if_mh);
2252 	mac_free(macp);
2253 	if (rv == 0)
2254 		vswp->if_state |= VSW_IF_REG;
2255 
2256 	D1(vswp, "%s: exit", __func__);
2257 
2258 	return (rv);
2259 }
2260 
2261 static int
2262 vsw_mac_unregister(vsw_t *vswp)
2263 {
2264 	int		rv = 0;
2265 
2266 	D1(vswp, "%s: enter", __func__);
2267 
2268 	WRITE_ENTER(&vswp->if_lockrw);
2269 
2270 	if (vswp->if_state & VSW_IF_REG) {
2271 		rv = mac_unregister(vswp->if_mh);
2272 		if (rv != 0) {
2273 			DWARN(vswp, "%s: unable to unregister from MAC "
2274 				"framework", __func__);
2275 
2276 			RW_EXIT(&vswp->if_lockrw);
2277 			D1(vswp, "%s: fail exit", __func__);
2278 			return (rv);
2279 		}
2280 
2281 		/* mark i/f as down and unregistered */
2282 		vswp->if_state &= ~(VSW_IF_UP | VSW_IF_REG);
2283 	}
2284 	RW_EXIT(&vswp->if_lockrw);
2285 
2286 	D1(vswp, "%s: exit", __func__);
2287 
2288 	return (rv);
2289 }
2290 
2291 static int
2292 vsw_m_stat(void *arg, uint_t stat, uint64_t *val)
2293 {
2294 	vsw_t			*vswp = (vsw_t *)arg;
2295 
2296 	D1(vswp, "%s: enter", __func__);
2297 
2298 	mutex_enter(&vswp->mac_lock);
2299 	if (vswp->mh == NULL) {
2300 		mutex_exit(&vswp->mac_lock);
2301 		return (EINVAL);
2302 	}
2303 
2304 	/* return stats from underlying device */
2305 	*val = mac_stat_get(vswp->mh, stat);
2306 
2307 	mutex_exit(&vswp->mac_lock);
2308 
2309 	return (0);
2310 }
2311 
2312 static void
2313 vsw_m_stop(void *arg)
2314 {
2315 	vsw_t		*vswp = (vsw_t *)arg;
2316 
2317 	D1(vswp, "%s: enter", __func__);
2318 
2319 	WRITE_ENTER(&vswp->if_lockrw);
2320 	vswp->if_state &= ~VSW_IF_UP;
2321 	RW_EXIT(&vswp->if_lockrw);
2322 
2323 	mutex_enter(&vswp->hw_lock);
2324 
2325 	(void) vsw_unset_hw(vswp, NULL, VSW_LOCALDEV);
2326 
2327 	if (vswp->recfg_reqd)
2328 		vsw_reconfig_hw(vswp);
2329 
2330 	mutex_exit(&vswp->hw_lock);
2331 
2332 	D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state);
2333 }
2334 
2335 static int
2336 vsw_m_start(void *arg)
2337 {
2338 	vsw_t		*vswp = (vsw_t *)arg;
2339 
2340 	D1(vswp, "%s: enter", __func__);
2341 
2342 	WRITE_ENTER(&vswp->if_lockrw);
2343 	vswp->if_state |= VSW_IF_UP;
2344 	RW_EXIT(&vswp->if_lockrw);
2345 
2346 	mutex_enter(&vswp->hw_lock);
2347 	(void) vsw_set_hw(vswp, NULL, VSW_LOCALDEV);
2348 	mutex_exit(&vswp->hw_lock);
2349 
2350 	D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state);
2351 	return (0);
2352 }
2353 
2354 /*
2355  * Change the local interface address.
2356  *
2357  * Note: we don't support this entry point. The local
2358  * mac address of the switch can only be changed via its
2359  * MD node properties.
2360  */
2361 static int
2362 vsw_m_unicst(void *arg, const uint8_t *macaddr)
2363 {
2364 	_NOTE(ARGUNUSED(arg, macaddr))
2365 
2366 	return (DDI_FAILURE);
2367 }
2368 
2369 static int
2370 vsw_m_multicst(void *arg, boolean_t add, const uint8_t *mca)
2371 {
2372 	vsw_t		*vswp = (vsw_t *)arg;
2373 	mcst_addr_t	*mcst_p = NULL;
2374 	uint64_t	addr = 0x0;
2375 	int		i, ret = 0;
2376 
2377 	D1(vswp, "%s: enter", __func__);
2378 
2379 	/*
2380 	 * Convert address into form that can be used
2381 	 * as hash table key.
2382 	 */
2383 	for (i = 0; i < ETHERADDRL; i++) {
2384 		addr = (addr << 8) | mca[i];
2385 	}
2386 
2387 	D2(vswp, "%s: addr = 0x%llx", __func__, addr);
2388 
2389 	if (add) {
2390 		D2(vswp, "%s: adding multicast", __func__);
2391 		if (vsw_add_mcst(vswp, VSW_LOCALDEV, addr, NULL) == 0) {
2392 			/*
2393 			 * Update the list of multicast addresses
2394 			 * contained within the vsw_t structure to
2395 			 * include this new one.
2396 			 */
2397 			mcst_p = kmem_zalloc(sizeof (mcst_addr_t), KM_NOSLEEP);
2398 			if (mcst_p == NULL) {
2399 				DERR(vswp, "%s unable to alloc mem", __func__);
2400 				return (1);
2401 			}
2402 			mcst_p->addr = addr;
2403 
2404 			mutex_enter(&vswp->mca_lock);
2405 			mcst_p->nextp = vswp->mcap;
2406 			vswp->mcap = mcst_p;
2407 			mutex_exit(&vswp->mca_lock);
2408 
2409 			/*
2410 			 * Call into the underlying driver to program the
2411 			 * address into HW.
2412 			 */
2413 			mutex_enter(&vswp->mac_lock);
2414 			if (vswp->mh != NULL) {
2415 				ret = mac_multicst_add(vswp->mh, mca);
2416 				if (ret != 0) {
2417 					cmn_err(CE_WARN, "!vsw%d: unable to "
2418 						"add multicast address",
2419 						vswp->instance);
2420 					mutex_exit(&vswp->mac_lock);
2421 					goto vsw_remove_addr;
2422 				}
2423 			}
2424 			mutex_exit(&vswp->mac_lock);
2425 		} else {
2426 			cmn_err(CE_WARN, "!vsw%d: unable to add multicast "
2427 				"address", vswp->instance);
2428 		}
2429 		return (ret);
2430 	}
2431 
2432 vsw_remove_addr:
2433 
2434 	D2(vswp, "%s: removing multicast", __func__);
2435 	/*
2436 	 * Remove the address from the hash table..
2437 	 */
2438 	if (vsw_del_mcst(vswp, VSW_LOCALDEV, addr, NULL) == 0) {
2439 
2440 		/*
2441 		 * ..and then from the list maintained in the
2442 		 * vsw_t structure.
2443 		 */
2444 		vsw_del_addr(VSW_LOCALDEV, vswp, addr);
2445 
2446 		mutex_enter(&vswp->mac_lock);
2447 		if (vswp->mh != NULL)
2448 			(void) mac_multicst_remove(vswp->mh, mca);
2449 		mutex_exit(&vswp->mac_lock);
2450 	}
2451 
2452 	D1(vswp, "%s: exit", __func__);
2453 
2454 	return (0);
2455 }
2456 
2457 static int
2458 vsw_m_promisc(void *arg, boolean_t on)
2459 {
2460 	vsw_t		*vswp = (vsw_t *)arg;
2461 
2462 	D1(vswp, "%s: enter", __func__);
2463 
2464 	WRITE_ENTER(&vswp->if_lockrw);
2465 	if (on)
2466 		vswp->if_state |= VSW_IF_PROMISC;
2467 	else
2468 		vswp->if_state &= ~VSW_IF_PROMISC;
2469 	RW_EXIT(&vswp->if_lockrw);
2470 
2471 	D1(vswp, "%s: exit", __func__);
2472 
2473 	return (0);
2474 }
2475 
2476 static mblk_t *
2477 vsw_m_tx(void *arg, mblk_t *mp)
2478 {
2479 	vsw_t		*vswp = (vsw_t *)arg;
2480 
2481 	D1(vswp, "%s: enter", __func__);
2482 
2483 	vswp->vsw_switch_frame(vswp, mp, VSW_LOCALDEV, NULL, NULL);
2484 
2485 	D1(vswp, "%s: exit", __func__);
2486 
2487 	return (NULL);
2488 }
2489 
2490 /*
2491  * Register for machine description (MD) updates.
2492  *
2493  * Returns 0 on success, 1 on failure.
2494  */
2495 static int
2496 vsw_mdeg_register(vsw_t *vswp)
2497 {
2498 	mdeg_prop_spec_t	*pspecp;
2499 	mdeg_node_spec_t	*inst_specp;
2500 	mdeg_handle_t		mdeg_hdl, mdeg_port_hdl;
2501 	size_t			templatesz;
2502 	int			inst, rv;
2503 
2504 	D1(vswp, "%s: enter", __func__);
2505 
2506 	/*
2507 	 * In each 'virtual-device' node in the MD there is a
2508 	 * 'cfg-handle' property which is the MD's concept of
2509 	 * an instance number (this may be completely different from
2510 	 * the device drivers instance #). OBP reads that value and
2511 	 * stores it in the 'reg' property of the appropriate node in
2512 	 * the device tree. So we use the 'reg' value when registering
2513 	 * with the mdeg framework, to ensure we get events for the
2514 	 * correct nodes.
2515 	 */
2516 	inst = ddi_prop_get_int(DDI_DEV_T_ANY, vswp->dip,
2517 		DDI_PROP_DONTPASS, reg_propname, -1);
2518 	if (inst == -1) {
2519 		cmn_err(CE_WARN, "!vsw%d: Unable to read %s property from "
2520 			"OBP device tree", vswp->instance, reg_propname);
2521 		return (1);
2522 	}
2523 
2524 	D2(vswp, "%s: instance %d registering with mdeg", __func__, inst);
2525 
2526 	/*
2527 	 * Allocate and initialize a per-instance copy
2528 	 * of the global property spec array that will
2529 	 * uniquely identify this vsw instance.
2530 	 */
2531 	templatesz = sizeof (vsw_prop_template);
2532 	pspecp = kmem_zalloc(templatesz, KM_SLEEP);
2533 
2534 	bcopy(vsw_prop_template, pspecp, templatesz);
2535 
2536 	VSW_SET_MDEG_PROP_INST(pspecp, inst);
2537 
2538 	/* initialize the complete prop spec structure */
2539 	inst_specp = kmem_zalloc(sizeof (mdeg_node_spec_t), KM_SLEEP);
2540 	inst_specp->namep = "virtual-device";
2541 	inst_specp->specp = pspecp;
2542 
2543 	/*
2544 	 * Register an interest in 'virtual-device' nodes with a
2545 	 * 'name' property of 'virtual-network-switch'
2546 	 */
2547 	rv = mdeg_register(inst_specp, &vdev_match, vsw_mdeg_cb,
2548 	    (void *)vswp, &mdeg_hdl);
2549 	if (rv != MDEG_SUCCESS) {
2550 		DERR(vswp, "%s: mdeg_register failed (%d) for vsw node",
2551 			__func__, rv);
2552 		goto mdeg_reg_fail;
2553 	}
2554 
2555 	/*
2556 	 * Register an interest in 'vsw-port' nodes.
2557 	 */
2558 	rv = mdeg_register(inst_specp, &vport_match, vsw_port_mdeg_cb,
2559 	    (void *)vswp, &mdeg_port_hdl);
2560 	if (rv != MDEG_SUCCESS) {
2561 		DERR(vswp, "%s: mdeg_register failed (%d)\n", __func__, rv);
2562 		(void) mdeg_unregister(mdeg_hdl);
2563 		goto mdeg_reg_fail;
2564 	}
2565 
2566 	/* save off data that will be needed later */
2567 	vswp->inst_spec = inst_specp;
2568 	vswp->mdeg_hdl = mdeg_hdl;
2569 	vswp->mdeg_port_hdl = mdeg_port_hdl;
2570 
2571 	D1(vswp, "%s: exit", __func__);
2572 	return (0);
2573 
2574 mdeg_reg_fail:
2575 	cmn_err(CE_WARN, "!vsw%d: Unable to register MDEG callbacks",
2576 				vswp->instance);
2577 	kmem_free(pspecp, templatesz);
2578 	kmem_free(inst_specp, sizeof (mdeg_node_spec_t));
2579 
2580 	vswp->mdeg_hdl = NULL;
2581 	vswp->mdeg_port_hdl = NULL;
2582 
2583 	return (1);
2584 }
2585 
2586 static void
2587 vsw_mdeg_unregister(vsw_t *vswp)
2588 {
2589 	D1(vswp, "vsw_mdeg_unregister: enter");
2590 
2591 	if (vswp->mdeg_hdl != NULL)
2592 		(void) mdeg_unregister(vswp->mdeg_hdl);
2593 
2594 	if (vswp->mdeg_port_hdl != NULL)
2595 		(void) mdeg_unregister(vswp->mdeg_port_hdl);
2596 
2597 	if (vswp->inst_spec != NULL) {
2598 		if (vswp->inst_spec->specp != NULL) {
2599 			(void) kmem_free(vswp->inst_spec->specp,
2600 				sizeof (vsw_prop_template));
2601 			vswp->inst_spec->specp = NULL;
2602 		}
2603 
2604 		(void) kmem_free(vswp->inst_spec,
2605 			sizeof (mdeg_node_spec_t));
2606 		vswp->inst_spec = NULL;
2607 	}
2608 
2609 	D1(vswp, "vsw_mdeg_unregister: exit");
2610 }
2611 
2612 /*
2613  * Mdeg callback invoked for the vsw node itself.
2614  */
2615 static int
2616 vsw_mdeg_cb(void *cb_argp, mdeg_result_t *resp)
2617 {
2618 	vsw_t		*vswp;
2619 	int		idx;
2620 	md_t		*mdp;
2621 	mde_cookie_t	node;
2622 	uint64_t	inst;
2623 	char		*node_name = NULL;
2624 
2625 	if (resp == NULL)
2626 		return (MDEG_FAILURE);
2627 
2628 	vswp = (vsw_t *)cb_argp;
2629 
2630 	D1(vswp, "%s: added %d : removed %d : curr matched %d"
2631 		" : prev matched %d", __func__, resp->added.nelem,
2632 		resp->removed.nelem, resp->match_curr.nelem,
2633 		resp->match_prev.nelem);
2634 
2635 	/*
2636 	 * Expect 'added' to be non-zero if virtual-network-switch
2637 	 * nodes exist in the MD when the driver attaches.
2638 	 */
2639 	for (idx = 0; idx < resp->added.nelem; idx++) {
2640 		mdp = resp->added.mdp;
2641 		node = resp->added.mdep[idx];
2642 
2643 		if (md_get_prop_str(mdp, node, "name", &node_name) != 0) {
2644 			DERR(vswp, "%s: unable to get node name for "
2645 				"node(%d) 0x%lx", __func__, idx, node);
2646 			continue;
2647 		}
2648 
2649 		if (md_get_prop_val(mdp, node, "cfg-handle", &inst)) {
2650 			DERR(vswp, "%s: prop(cfg-handle) not found port(%d)",
2651 				__func__, idx);
2652 			continue;
2653 		}
2654 
2655 		D2(vswp, "%s: added node(%d) 0x%lx with name %s "
2656 			"and inst %d", __func__, idx, node, node_name, inst);
2657 
2658 		vsw_get_initial_md_properties(vswp, mdp, node);
2659 	}
2660 
2661 	/*
2662 	 * A non-zero 'match' value indicates that the MD has been
2663 	 * updated and that a virtual-network-switch node is present
2664 	 * which may or may not have been updated. It is up to the clients
2665 	 * to examine their own nodes and determine if they have changed.
2666 	 */
2667 	for (idx = 0; idx < resp->match_curr.nelem; idx++) {
2668 		mdp = resp->match_curr.mdp;
2669 		node = resp->match_curr.mdep[idx];
2670 
2671 		if (md_get_prop_str(mdp, node, "name", &node_name) != 0) {
2672 			DERR(vswp, "%s: unable to get node name for "
2673 				"node(%d) 0x%lx", __func__, idx, node);
2674 			continue;
2675 		}
2676 
2677 		if (md_get_prop_val(mdp, node, "cfg-handle", &inst)) {
2678 			DERR(vswp, "%s: prop(cfg-handle) not found port(%d)",
2679 				__func__, idx);
2680 			continue;
2681 		}
2682 
2683 		D2(vswp, "%s: changed node(%d) 0x%lx with name %s "
2684 			"and inst %d", __func__, idx, node, node_name, inst);
2685 
2686 		vsw_update_md_prop(vswp, mdp, node);
2687 	}
2688 
2689 	return (MDEG_SUCCESS);
2690 }
2691 
2692 /*
2693  * Mdeg callback invoked for changes to the vsw-port nodes
2694  * under the vsw node.
2695  */
2696 static int
2697 vsw_port_mdeg_cb(void *cb_argp, mdeg_result_t *resp)
2698 {
2699 	vsw_t		*vswp;
2700 	int		idx;
2701 	md_t		*mdp;
2702 	mde_cookie_t	node;
2703 	uint64_t	inst;
2704 
2705 	if ((resp == NULL) || (cb_argp == NULL))
2706 		return (MDEG_FAILURE);
2707 
2708 	vswp = (vsw_t *)cb_argp;
2709 
2710 	D2(vswp, "%s: added %d : removed %d : curr matched %d"
2711 		" : prev matched %d", __func__, resp->added.nelem,
2712 		resp->removed.nelem, resp->match_curr.nelem,
2713 		resp->match_prev.nelem);
2714 
2715 	/* process added ports */
2716 	for (idx = 0; idx < resp->added.nelem; idx++) {
2717 		mdp = resp->added.mdp;
2718 		node = resp->added.mdep[idx];
2719 
2720 		D2(vswp, "%s: adding node(%d) 0x%lx", __func__, idx, node);
2721 
2722 		if (vsw_port_add(vswp, mdp, &node) != 0) {
2723 			cmn_err(CE_WARN, "!vsw%d: Unable to add new port "
2724 				"(0x%lx)", vswp->instance, node);
2725 		}
2726 	}
2727 
2728 	/* process removed ports */
2729 	for (idx = 0; idx < resp->removed.nelem; idx++) {
2730 		mdp = resp->removed.mdp;
2731 		node = resp->removed.mdep[idx];
2732 
2733 		if (md_get_prop_val(mdp, node, id_propname, &inst)) {
2734 			DERR(vswp, "%s: prop(%s) not found in port(%d)",
2735 				__func__, id_propname, idx);
2736 			continue;
2737 		}
2738 
2739 		D2(vswp, "%s: removing node(%d) 0x%lx", __func__, idx, node);
2740 
2741 		if (vsw_port_detach(vswp, inst) != 0) {
2742 			cmn_err(CE_WARN, "!vsw%d: Unable to remove port %ld",
2743 				vswp->instance, inst);
2744 		}
2745 	}
2746 
2747 	/*
2748 	 * Currently no support for updating already active ports.
2749 	 * So, ignore the match_curr and match_priv arrays for now.
2750 	 */
2751 
2752 	D1(vswp, "%s: exit", __func__);
2753 
2754 	return (MDEG_SUCCESS);
2755 }
2756 
2757 /*
2758  * Read the initial start-of-day values from the specified MD node.
2759  */
2760 static void
2761 vsw_get_initial_md_properties(vsw_t *vswp, md_t *mdp, mde_cookie_t node)
2762 {
2763 	int		i;
2764 	uint64_t 	macaddr = 0;
2765 
2766 	D1(vswp, "%s: enter", __func__);
2767 
2768 	if (vsw_get_md_physname(vswp, mdp, node, vswp->physname) == 0) {
2769 		/*
2770 		 * Note it is valid for the physname property to
2771 		 * be NULL so check actual name length to determine
2772 		 * if we have a actual device name.
2773 		 */
2774 		if (strlen(vswp->physname) > 0)
2775 			vswp->mdprops |= VSW_MD_PHYSNAME;
2776 	} else {
2777 		cmn_err(CE_WARN, "!vsw%d: Unable to read name of physical "
2778 			"device from MD", vswp->instance);
2779 		return;
2780 	}
2781 
2782 	/* mac address for vswitch device itself */
2783 	if (md_get_prop_val(mdp, node, macaddr_propname, &macaddr) != 0) {
2784 		cmn_err(CE_WARN, "!vsw%d: Unable to get MAC address from MD",
2785 			vswp->instance);
2786 
2787 		/*
2788 		 * Fallback to using the mac address of the physical
2789 		 * device.
2790 		 */
2791 		if (vsw_get_physaddr(vswp) == 0) {
2792 			cmn_err(CE_NOTE, "!vsw%d: Using MAC address from "
2793 				"physical device (%s)", vswp->instance,
2794 				vswp->physname);
2795 		} else {
2796 			cmn_err(CE_WARN, "!vsw%d: Unable to get MAC address"
2797 				"from device %s", vswp->instance,
2798 				vswp->physname);
2799 		}
2800 	} else {
2801 		WRITE_ENTER(&vswp->if_lockrw);
2802 		for (i = ETHERADDRL - 1; i >= 0; i--) {
2803 			vswp->if_addr.ether_addr_octet[i] = macaddr & 0xFF;
2804 			macaddr >>= 8;
2805 		}
2806 		RW_EXIT(&vswp->if_lockrw);
2807 		vswp->mdprops |= VSW_MD_MACADDR;
2808 	}
2809 
2810 	if (vsw_get_md_smodes(vswp, mdp, node,
2811 				vswp->smode, &vswp->smode_num)) {
2812 		cmn_err(CE_WARN, "vsw%d: Unable to read %s property from "
2813 			"MD, defaulting to programmed mode", vswp->instance,
2814 			smode_propname);
2815 
2816 		for (i = 0; i < NUM_SMODES; i++)
2817 			vswp->smode[i] = VSW_LAYER2;
2818 
2819 		vswp->smode_num = NUM_SMODES;
2820 	} else {
2821 		ASSERT(vswp->smode_num != 0);
2822 		vswp->mdprops |= VSW_MD_SMODE;
2823 	}
2824 
2825 	/*
2826 	 * Unable to setup any switching mode, nothing more
2827 	 * we can do.
2828 	 */
2829 	if (vsw_setup_switching(vswp))
2830 		return;
2831 
2832 	WRITE_ENTER(&vswp->if_lockrw);
2833 	vswp->if_state &= ~VSW_IF_UP;
2834 	RW_EXIT(&vswp->if_lockrw);
2835 	if (vswp->mdprops & (VSW_MD_MACADDR | VSW_DEV_MACADDR)) {
2836 		if (vsw_mac_register(vswp) != 0) {
2837 			/*
2838 			 * Treat this as a non-fatal error as we may be
2839 			 * able to operate in some other mode.
2840 			 */
2841 			cmn_err(CE_WARN, "vsw%d: Unable to register as "
2842 				"provider with MAC layer", vswp->instance);
2843 		}
2844 	}
2845 
2846 	D1(vswp, "%s: exit", __func__);
2847 }
2848 
2849 /*
2850  * Check to see if the relevant properties in the specified node have
2851  * changed, and if so take the appropriate action.
2852  *
2853  * If any of the properties are missing or invalid we don't take
2854  * any action, as this function should only be invoked when modifications
2855  * have been made to what we assume is a working configuration, which
2856  * we leave active.
2857  *
2858  * Note it is legal for this routine to be invoked even if none of the
2859  * properties in the port node within the MD have actually changed.
2860  */
2861 static void
2862 vsw_update_md_prop(vsw_t *vswp, md_t *mdp, mde_cookie_t node)
2863 {
2864 	char		physname[LIFNAMSIZ];
2865 	char		drv[LIFNAMSIZ];
2866 	uint_t		ddi_instance;
2867 	uint8_t		new_smode[NUM_SMODES];
2868 	int		i, smode_num = 0;
2869 	uint64_t 	macaddr = 0;
2870 	vsw_port_list_t *plist = &vswp->plist;
2871 	vsw_port_t	*port = NULL;
2872 	enum		{MD_init = 0x1,
2873 				MD_physname = 0x2,
2874 				MD_macaddr = 0x4,
2875 				MD_smode = 0x8} updated;
2876 
2877 	updated = MD_init;
2878 
2879 	D1(vswp, "%s: enter", __func__);
2880 
2881 	/*
2882 	 * Check if name of physical device in MD has changed.
2883 	 */
2884 	if (vsw_get_md_physname(vswp, mdp, node, (char *)&physname) == 0) {
2885 		/*
2886 		 * Do basic sanity check on new device name/instance,
2887 		 * if its non NULL. It is valid for the device name to
2888 		 * have changed from a non NULL to a NULL value, i.e.
2889 		 * the vsw is being changed to 'routed' mode.
2890 		 */
2891 		if ((strlen(physname) != 0) &&
2892 			(ddi_parse(physname, drv,
2893 				&ddi_instance) != DDI_SUCCESS)) {
2894 			cmn_err(CE_WARN, "!vsw%d: new device name %s is not"
2895 				" a valid device name/instance",
2896 				vswp->instance, physname);
2897 			goto fail_reconf;
2898 		}
2899 
2900 		if (strcmp(physname, vswp->physname)) {
2901 			D2(vswp, "%s: device name changed from %s to %s",
2902 					__func__, vswp->physname, physname);
2903 
2904 			updated |= MD_physname;
2905 		} else {
2906 			D2(vswp, "%s: device name unchanged at %s",
2907 					__func__, vswp->physname);
2908 		}
2909 	} else {
2910 		cmn_err(CE_WARN, "!vsw%d: Unable to read name of physical "
2911 			"device from updated MD.", vswp->instance);
2912 		goto fail_reconf;
2913 	}
2914 
2915 	/*
2916 	 * Check if MAC address has changed.
2917 	 */
2918 	if (md_get_prop_val(mdp, node, macaddr_propname, &macaddr) != 0) {
2919 		cmn_err(CE_WARN, "!vsw%d: Unable to get MAC address from MD",
2920 			vswp->instance);
2921 		goto fail_reconf;
2922 	} else {
2923 		READ_ENTER(&vswp->if_lockrw);
2924 		for (i = ETHERADDRL - 1; i >= 0; i--) {
2925 			if (vswp->if_addr.ether_addr_octet[i]
2926 							!= (macaddr & 0xFF)) {
2927 				D2(vswp, "%s: octet[%d] 0x%x != 0x%x",
2928 					__func__, i,
2929 					vswp->if_addr.ether_addr_octet[i],
2930 					(macaddr & 0xFF));
2931 				updated |= MD_macaddr;
2932 				break;
2933 			}
2934 			macaddr >>= 8;
2935 		}
2936 		RW_EXIT(&vswp->if_lockrw);
2937 	}
2938 
2939 	/*
2940 	 * Check if switching modes have changed.
2941 	 */
2942 	if (vsw_get_md_smodes(vswp, mdp, node,
2943 				new_smode, &smode_num)) {
2944 		cmn_err(CE_WARN, "!vsw%d: Unable to read %s property from MD",
2945 					vswp->instance, smode_propname);
2946 		goto fail_reconf;
2947 	} else {
2948 		ASSERT(smode_num != 0);
2949 		if (smode_num != vswp->smode_num) {
2950 			D2(vswp, "%s: number of modes changed from %d to %d",
2951 				__func__, vswp->smode_num, smode_num);
2952 		}
2953 
2954 		for (i = 0; i < smode_num; i++) {
2955 			if (new_smode[i] != vswp->smode[i]) {
2956 				D2(vswp, "%s: mode changed from %d to %d",
2957 					__func__, vswp->smode[i], new_smode[i]);
2958 				updated |= MD_smode;
2959 				break;
2960 			}
2961 		}
2962 	}
2963 
2964 	/*
2965 	 * Now make any changes which are needed...
2966 	 */
2967 
2968 	if (updated & (MD_physname | MD_smode)) {
2969 		/*
2970 		 * Disconnect all ports from the current card
2971 		 */
2972 		WRITE_ENTER(&plist->lockrw);
2973 		for (port = plist->head; port != NULL; port = port->p_next) {
2974 			/* Remove address if was programmed into HW. */
2975 			mutex_enter(&vswp->hw_lock);
2976 			if (vsw_unset_hw(vswp, port, VSW_VNETPORT)) {
2977 				mutex_exit(&vswp->hw_lock);
2978 				RW_EXIT(&plist->lockrw);
2979 				goto fail_update;
2980 			}
2981 			mutex_exit(&vswp->hw_lock);
2982 		}
2983 		RW_EXIT(&plist->lockrw);
2984 
2985 		/*
2986 		 * Stop, detach the old device..
2987 		 */
2988 		vsw_mac_detach(vswp);
2989 
2990 		/*
2991 		 * Update phys name.
2992 		 */
2993 		if (updated & MD_physname) {
2994 			cmn_err(CE_NOTE, "!vsw%d: changing from %s to %s",
2995 				vswp->instance, vswp->physname, physname);
2996 			(void) strncpy(vswp->physname,
2997 					physname, strlen(physname) + 1);
2998 
2999 			if (strlen(vswp->physname) > 0)
3000 				vswp->mdprops |= VSW_MD_PHYSNAME;
3001 		}
3002 
3003 		/*
3004 		 * Update array with the new switch mode values.
3005 		 */
3006 		if (updated & MD_smode) {
3007 			for (i = 0; i < smode_num; i++)
3008 				vswp->smode[i] = new_smode[i];
3009 
3010 			vswp->smode_num = smode_num;
3011 			vswp->smode_idx = 0;
3012 		}
3013 
3014 		/*
3015 		 * ..and attach, start the new device.
3016 		 */
3017 		if (vsw_setup_switching(vswp))
3018 			goto fail_update;
3019 
3020 		/*
3021 		 * Connect ports to new card.
3022 		 */
3023 		WRITE_ENTER(&plist->lockrw);
3024 		for (port = plist->head; port != NULL; port = port->p_next) {
3025 			mutex_enter(&vswp->hw_lock);
3026 			if (vsw_set_hw(vswp, port, VSW_VNETPORT)) {
3027 				mutex_exit(&vswp->hw_lock);
3028 				RW_EXIT(&plist->lockrw);
3029 				goto fail_update;
3030 			}
3031 			mutex_exit(&vswp->hw_lock);
3032 		}
3033 		RW_EXIT(&plist->lockrw);
3034 	}
3035 
3036 	if (updated & MD_macaddr) {
3037 		cmn_err(CE_NOTE, "!vsw%d: changing mac address to 0x%lx",
3038 				vswp->instance, macaddr);
3039 
3040 		WRITE_ENTER(&vswp->if_lockrw);
3041 		for (i = ETHERADDRL - 1; i >= 0; i--) {
3042 			vswp->if_addr.ether_addr_octet[i] = macaddr & 0xFF;
3043 			macaddr >>= 8;
3044 		}
3045 		RW_EXIT(&vswp->if_lockrw);
3046 
3047 		/*
3048 		 * Remove old address from HW (if programmed) and set
3049 		 * new address.
3050 		 */
3051 		mutex_enter(&vswp->hw_lock);
3052 		(void) vsw_unset_hw(vswp, NULL, VSW_LOCALDEV);
3053 		(void) vsw_set_hw(vswp, NULL, VSW_LOCALDEV);
3054 		mutex_exit(&vswp->hw_lock);
3055 
3056 		/*
3057 		 * Notify the MAC layer of the changed address.
3058 		 */
3059 		mac_unicst_update(vswp->if_mh, (uint8_t *)&vswp->if_addr);
3060 	}
3061 
3062 	return;
3063 
3064 fail_reconf:
3065 	cmn_err(CE_WARN, "!vsw%d: configuration unchanged", vswp->instance);
3066 	return;
3067 
3068 fail_update:
3069 	cmn_err(CE_WARN, "!vsw%d: update of configuration failed",
3070 			vswp->instance);
3071 }
3072 
3073 /*
3074  * Add a new port to the system.
3075  *
3076  * Returns 0 on success, 1 on failure.
3077  */
3078 int
3079 vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node)
3080 {
3081 	uint64_t		ldc_id;
3082 	uint8_t			*addrp;
3083 	int			i, addrsz;
3084 	int			num_nodes = 0, nchan = 0;
3085 	int			listsz = 0;
3086 	mde_cookie_t		*listp = NULL;
3087 	struct ether_addr	ea;
3088 	uint64_t		macaddr;
3089 	uint64_t		inst = 0;
3090 	vsw_port_t		*port;
3091 
3092 	if (md_get_prop_val(mdp, *node, id_propname, &inst)) {
3093 		DWARN(vswp, "%s: prop(%s) not found", __func__,
3094 			id_propname);
3095 		return (1);
3096 	}
3097 
3098 	/*
3099 	 * Find the channel endpoint node(s) (which should be under this
3100 	 * port node) which contain the channel id(s).
3101 	 */
3102 	if ((num_nodes = md_node_count(mdp)) <= 0) {
3103 		DERR(vswp, "%s: invalid number of nodes found (%d)",
3104 			__func__, num_nodes);
3105 		return (1);
3106 	}
3107 
3108 	D2(vswp, "%s: %d nodes found", __func__, num_nodes);
3109 
3110 	/* allocate enough space for node list */
3111 	listsz = num_nodes * sizeof (mde_cookie_t);
3112 	listp = kmem_zalloc(listsz, KM_SLEEP);
3113 
3114 	nchan = md_scan_dag(mdp, *node,
3115 		md_find_name(mdp, chan_propname),
3116 		md_find_name(mdp, "fwd"), listp);
3117 
3118 	if (nchan <= 0) {
3119 		DWARN(vswp, "%s: no %s nodes found", __func__, chan_propname);
3120 		kmem_free(listp, listsz);
3121 		return (1);
3122 	}
3123 
3124 	D2(vswp, "%s: %d %s nodes found", __func__, nchan, chan_propname);
3125 
3126 	/* use property from first node found */
3127 	if (md_get_prop_val(mdp, listp[0], id_propname, &ldc_id)) {
3128 		DWARN(vswp, "%s: prop(%s) not found\n", __func__,
3129 			id_propname);
3130 		kmem_free(listp, listsz);
3131 		return (1);
3132 	}
3133 
3134 	/* don't need list any more */
3135 	kmem_free(listp, listsz);
3136 
3137 	D2(vswp, "%s: ldc_id 0x%llx", __func__, ldc_id);
3138 
3139 	/* read mac-address property */
3140 	if (md_get_prop_data(mdp, *node, remaddr_propname,
3141 					&addrp, &addrsz)) {
3142 		DWARN(vswp, "%s: prop(%s) not found",
3143 				__func__, remaddr_propname);
3144 		return (1);
3145 	}
3146 
3147 	if (addrsz < ETHERADDRL) {
3148 		DWARN(vswp, "%s: invalid address size", __func__);
3149 		return (1);
3150 	}
3151 
3152 	macaddr = *((uint64_t *)addrp);
3153 	D2(vswp, "%s: remote mac address 0x%llx", __func__, macaddr);
3154 
3155 	for (i = ETHERADDRL - 1; i >= 0; i--) {
3156 		ea.ether_addr_octet[i] = macaddr & 0xFF;
3157 		macaddr >>= 8;
3158 	}
3159 
3160 	if (vsw_port_attach(vswp, (int)inst, &ldc_id, 1, &ea) != 0) {
3161 		DERR(vswp, "%s: failed to attach port", __func__);
3162 		return (1);
3163 	}
3164 
3165 	port = vsw_lookup_port(vswp, (int)inst);
3166 
3167 	/* just successfuly created the port, so it should exist */
3168 	ASSERT(port != NULL);
3169 
3170 	return (0);
3171 }
3172 
3173 /*
3174  * Attach the specified port.
3175  *
3176  * Returns 0 on success, 1 on failure.
3177  */
3178 static int
3179 vsw_port_attach(vsw_t *vswp, int p_instance, uint64_t *ldcids, int nids,
3180 struct ether_addr *macaddr)
3181 {
3182 	vsw_port_list_t		*plist = &vswp->plist;
3183 	vsw_port_t		*port, **prev_port;
3184 	int			i;
3185 
3186 	D1(vswp, "%s: enter : port %d", __func__, p_instance);
3187 
3188 	/* port already exists? */
3189 	READ_ENTER(&plist->lockrw);
3190 	for (port = plist->head; port != NULL; port = port->p_next) {
3191 		if (port->p_instance == p_instance) {
3192 			DWARN(vswp, "%s: port instance %d already attached",
3193 				__func__, p_instance);
3194 			RW_EXIT(&plist->lockrw);
3195 			return (1);
3196 		}
3197 	}
3198 	RW_EXIT(&plist->lockrw);
3199 
3200 	port = kmem_zalloc(sizeof (vsw_port_t), KM_SLEEP);
3201 	port->p_vswp = vswp;
3202 	port->p_instance = p_instance;
3203 	port->p_ldclist.num_ldcs = 0;
3204 	port->p_ldclist.head = NULL;
3205 	port->addr_set = VSW_ADDR_UNSET;
3206 
3207 	rw_init(&port->p_ldclist.lockrw, NULL, RW_DRIVER, NULL);
3208 
3209 	mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL);
3210 	mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL);
3211 
3212 	mutex_init(&port->ref_lock, NULL, MUTEX_DRIVER, NULL);
3213 	cv_init(&port->ref_cv, NULL, CV_DRIVER, NULL);
3214 
3215 	mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL);
3216 	cv_init(&port->state_cv, NULL, CV_DRIVER, NULL);
3217 	port->state = VSW_PORT_INIT;
3218 
3219 	if (nids > VSW_PORT_MAX_LDCS) {
3220 		D2(vswp, "%s: using first of %d ldc ids",
3221 			__func__, nids);
3222 		nids = VSW_PORT_MAX_LDCS;
3223 	}
3224 
3225 	D2(vswp, "%s: %d nids", __func__, nids);
3226 	for (i = 0; i < nids; i++) {
3227 		D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[i]);
3228 		if (vsw_ldc_attach(port, (uint64_t)ldcids[i]) != 0) {
3229 			DERR(vswp, "%s: ldc_attach failed", __func__);
3230 
3231 			rw_destroy(&port->p_ldclist.lockrw);
3232 
3233 			cv_destroy(&port->ref_cv);
3234 			mutex_destroy(&port->ref_lock);
3235 
3236 			cv_destroy(&port->state_cv);
3237 			mutex_destroy(&port->state_lock);
3238 
3239 			mutex_destroy(&port->tx_lock);
3240 			mutex_destroy(&port->mca_lock);
3241 			kmem_free(port, sizeof (vsw_port_t));
3242 			return (1);
3243 		}
3244 	}
3245 
3246 	ether_copy(macaddr, &port->p_macaddr);
3247 
3248 	WRITE_ENTER(&plist->lockrw);
3249 
3250 	/* create the fdb entry for this port/mac address */
3251 	(void) vsw_add_fdb(vswp, port);
3252 
3253 	mutex_enter(&vswp->hw_lock);
3254 	(void) vsw_set_hw(vswp, port, VSW_VNETPORT);
3255 	mutex_exit(&vswp->hw_lock);
3256 
3257 	/* link it into the list of ports for this vsw instance */
3258 	prev_port = (vsw_port_t **)(&plist->head);
3259 	port->p_next = *prev_port;
3260 	*prev_port = port;
3261 	plist->num_ports++;
3262 	RW_EXIT(&plist->lockrw);
3263 
3264 	/*
3265 	 * Initialise the port and any ldc's under it.
3266 	 */
3267 	(void) vsw_init_ldcs(port);
3268 
3269 	D1(vswp, "%s: exit", __func__);
3270 	return (0);
3271 }
3272 
3273 /*
3274  * Detach the specified port.
3275  *
3276  * Returns 0 on success, 1 on failure.
3277  */
3278 static int
3279 vsw_port_detach(vsw_t *vswp, int p_instance)
3280 {
3281 	vsw_port_t	*port = NULL;
3282 	vsw_port_list_t	*plist = &vswp->plist;
3283 
3284 	D1(vswp, "%s: enter: port id %d", __func__, p_instance);
3285 
3286 	WRITE_ENTER(&plist->lockrw);
3287 
3288 	if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) {
3289 		RW_EXIT(&plist->lockrw);
3290 		return (1);
3291 	}
3292 
3293 	if (vsw_plist_del_node(vswp, port)) {
3294 		RW_EXIT(&plist->lockrw);
3295 		return (1);
3296 	}
3297 
3298 	/* Remove the fdb entry for this port/mac address */
3299 	(void) vsw_del_fdb(vswp, port);
3300 
3301 	/* Remove any multicast addresses.. */
3302 	vsw_del_mcst_port(port);
3303 
3304 	/*
3305 	 * No longer need to hold writer lock on port list now
3306 	 * that we have unlinked the target port from the list.
3307 	 */
3308 	RW_EXIT(&plist->lockrw);
3309 
3310 	/* Remove address if was programmed into HW. */
3311 	mutex_enter(&vswp->hw_lock);
3312 	(void) vsw_unset_hw(vswp, port, VSW_VNETPORT);
3313 	if (vswp->recfg_reqd)
3314 		vsw_reconfig_hw(vswp);
3315 	mutex_exit(&vswp->hw_lock);
3316 
3317 	if (vsw_port_delete(port)) {
3318 		return (1);
3319 	}
3320 
3321 	D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance);
3322 	return (0);
3323 }
3324 
3325 /*
3326  * Detach all active ports.
3327  *
3328  * Returns 0 on success, 1 on failure.
3329  */
3330 static int
3331 vsw_detach_ports(vsw_t *vswp)
3332 {
3333 	vsw_port_list_t 	*plist = &vswp->plist;
3334 	vsw_port_t		*port = NULL;
3335 
3336 	D1(vswp, "%s: enter", __func__);
3337 
3338 	WRITE_ENTER(&plist->lockrw);
3339 
3340 	while ((port = plist->head) != NULL) {
3341 		if (vsw_plist_del_node(vswp, port)) {
3342 			DERR(vswp, "%s: Error deleting port %d"
3343 				" from port list", __func__,
3344 				port->p_instance);
3345 			RW_EXIT(&plist->lockrw);
3346 			return (1);
3347 		}
3348 
3349 		/* Remove address if was programmed into HW. */
3350 		mutex_enter(&vswp->hw_lock);
3351 		(void) vsw_unset_hw(vswp, port, VSW_VNETPORT);
3352 		mutex_exit(&vswp->hw_lock);
3353 
3354 		/* Remove the fdb entry for this port/mac address */
3355 		(void) vsw_del_fdb(vswp, port);
3356 
3357 		/* Remove any multicast addresses.. */
3358 		vsw_del_mcst_port(port);
3359 
3360 		/*
3361 		 * No longer need to hold the lock on the port list
3362 		 * now that we have unlinked the target port from the
3363 		 * list.
3364 		 */
3365 		RW_EXIT(&plist->lockrw);
3366 		if (vsw_port_delete(port)) {
3367 			DERR(vswp, "%s: Error deleting port %d",
3368 				__func__, port->p_instance);
3369 			return (1);
3370 		}
3371 		WRITE_ENTER(&plist->lockrw);
3372 	}
3373 	RW_EXIT(&plist->lockrw);
3374 
3375 	D1(vswp, "%s: exit", __func__);
3376 
3377 	return (0);
3378 }
3379 
3380 /*
3381  * Delete the specified port.
3382  *
3383  * Returns 0 on success, 1 on failure.
3384  */
3385 static int
3386 vsw_port_delete(vsw_port_t *port)
3387 {
3388 	vsw_ldc_list_t 		*ldcl;
3389 	vsw_t			*vswp = port->p_vswp;
3390 
3391 	D1(vswp, "%s: enter : port id %d", __func__, port->p_instance);
3392 
3393 	(void) vsw_uninit_ldcs(port);
3394 
3395 	/*
3396 	 * Wait for any pending ctrl msg tasks which reference this
3397 	 * port to finish.
3398 	 */
3399 	if (vsw_drain_port_taskq(port))
3400 		return (1);
3401 
3402 	/*
3403 	 * Wait for port reference count to hit zero.
3404 	 */
3405 	mutex_enter(&port->ref_lock);
3406 	while (port->ref_cnt != 0)
3407 		cv_wait(&port->ref_cv, &port->ref_lock);
3408 	mutex_exit(&port->ref_lock);
3409 
3410 	/*
3411 	 * Wait for any active callbacks to finish
3412 	 */
3413 	if (vsw_drain_ldcs(port))
3414 		return (1);
3415 
3416 	ldcl = &port->p_ldclist;
3417 	WRITE_ENTER(&ldcl->lockrw);
3418 	while (ldcl->num_ldcs > 0) {
3419 		if (vsw_ldc_detach(port, ldcl->head->ldc_id) != 0) {;
3420 			cmn_err(CE_WARN, "!vsw%d: unable to detach ldc %ld",
3421 					vswp->instance, ldcl->head->ldc_id);
3422 			RW_EXIT(&ldcl->lockrw);
3423 			return (1);
3424 		}
3425 	}
3426 	RW_EXIT(&ldcl->lockrw);
3427 
3428 	rw_destroy(&port->p_ldclist.lockrw);
3429 
3430 	mutex_destroy(&port->mca_lock);
3431 	mutex_destroy(&port->tx_lock);
3432 	cv_destroy(&port->ref_cv);
3433 	mutex_destroy(&port->ref_lock);
3434 
3435 	cv_destroy(&port->state_cv);
3436 	mutex_destroy(&port->state_lock);
3437 
3438 	kmem_free(port, sizeof (vsw_port_t));
3439 
3440 	D1(vswp, "%s: exit", __func__);
3441 
3442 	return (0);
3443 }
3444 
3445 /*
3446  * Attach a logical domain channel (ldc) under a specified port.
3447  *
3448  * Returns 0 on success, 1 on failure.
3449  */
3450 static int
3451 vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id)
3452 {
3453 	vsw_t 		*vswp = port->p_vswp;
3454 	vsw_ldc_list_t *ldcl = &port->p_ldclist;
3455 	vsw_ldc_t 	*ldcp = NULL;
3456 	ldc_attr_t 	attr;
3457 	ldc_status_t	istatus;
3458 	int 		status = DDI_FAILURE;
3459 	int		rv;
3460 	enum		{ PROG_init = 0x0, PROG_mblks = 0x1,
3461 				PROG_callback = 0x2}
3462 			progress;
3463 
3464 	progress = PROG_init;
3465 
3466 	D1(vswp, "%s: enter", __func__);
3467 
3468 	ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP);
3469 	if (ldcp == NULL) {
3470 		DERR(vswp, "%s: kmem_zalloc failed", __func__);
3471 		return (1);
3472 	}
3473 	ldcp->ldc_id = ldc_id;
3474 
3475 	/* allocate pool of receive mblks */
3476 	rv = vio_create_mblks(vsw_num_mblks, vsw_mblk_size, &(ldcp->rxh));
3477 	if (rv) {
3478 		DWARN(vswp, "%s: unable to create free mblk pool for"
3479 			" channel %ld (rv %d)", __func__, ldc_id, rv);
3480 		kmem_free(ldcp, sizeof (vsw_ldc_t));
3481 		return (1);
3482 	}
3483 
3484 	progress |= PROG_mblks;
3485 
3486 	mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL);
3487 	mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL);
3488 	mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL);
3489 	cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL);
3490 	rw_init(&ldcp->lane_in.dlistrw, NULL, RW_DRIVER, NULL);
3491 	rw_init(&ldcp->lane_out.dlistrw, NULL, RW_DRIVER, NULL);
3492 
3493 	/* required for handshake with peer */
3494 	ldcp->local_session = (uint64_t)ddi_get_lbolt();
3495 	ldcp->peer_session = 0;
3496 	ldcp->session_status = 0;
3497 
3498 	mutex_init(&ldcp->hss_lock, NULL, MUTEX_DRIVER, NULL);
3499 	ldcp->hss_id = 1;	/* Initial handshake session id */
3500 
3501 	/* only set for outbound lane, inbound set by peer */
3502 	mutex_init(&ldcp->lane_in.seq_lock, NULL, MUTEX_DRIVER, NULL);
3503 	mutex_init(&ldcp->lane_out.seq_lock, NULL, MUTEX_DRIVER, NULL);
3504 	vsw_set_lane_attr(vswp, &ldcp->lane_out);
3505 
3506 	attr.devclass = LDC_DEV_NT_SVC;
3507 	attr.instance = ddi_get_instance(vswp->dip);
3508 	attr.mode = LDC_MODE_UNRELIABLE;
3509 	attr.mtu = VSW_LDC_MTU;
3510 	status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle);
3511 	if (status != 0) {
3512 		DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)",
3513 		    __func__, ldc_id, status);
3514 		goto ldc_attach_fail;
3515 	}
3516 
3517 	status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp);
3518 	if (status != 0) {
3519 		DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)",
3520 		    __func__, ldc_id, status);
3521 		(void) ldc_fini(ldcp->ldc_handle);
3522 		goto ldc_attach_fail;
3523 	}
3524 
3525 	progress |= PROG_callback;
3526 
3527 	mutex_init(&ldcp->status_lock, NULL, MUTEX_DRIVER, NULL);
3528 
3529 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
3530 		DERR(vswp, "%s: ldc_status failed", __func__);
3531 		mutex_destroy(&ldcp->status_lock);
3532 		goto ldc_attach_fail;
3533 	}
3534 
3535 	ldcp->ldc_status = istatus;
3536 	ldcp->ldc_port = port;
3537 	ldcp->ldc_vswp = vswp;
3538 
3539 	/* link it into the list of channels for this port */
3540 	WRITE_ENTER(&ldcl->lockrw);
3541 	ldcp->ldc_next = ldcl->head;
3542 	ldcl->head = ldcp;
3543 	ldcl->num_ldcs++;
3544 	RW_EXIT(&ldcl->lockrw);
3545 
3546 	D1(vswp, "%s: exit", __func__);
3547 	return (0);
3548 
3549 ldc_attach_fail:
3550 	mutex_destroy(&ldcp->ldc_txlock);
3551 	mutex_destroy(&ldcp->ldc_cblock);
3552 
3553 	cv_destroy(&ldcp->drain_cv);
3554 
3555 	rw_destroy(&ldcp->lane_in.dlistrw);
3556 	rw_destroy(&ldcp->lane_out.dlistrw);
3557 
3558 	if (progress & PROG_callback) {
3559 		(void) ldc_unreg_callback(ldcp->ldc_handle);
3560 	}
3561 
3562 	if ((progress & PROG_mblks) && (ldcp->rxh != NULL)) {
3563 		if (vio_destroy_mblks(ldcp->rxh) != 0) {
3564 			/*
3565 			 * Something odd has happened, as the destroy
3566 			 * will only fail if some mblks have been allocated
3567 			 * from the pool already (which shouldn't happen)
3568 			 * and have not been returned.
3569 			 *
3570 			 * Add the pool pointer to a list maintained in
3571 			 * the device instance. Another attempt will be made
3572 			 * to free the pool when the device itself detaches.
3573 			 */
3574 			cmn_err(CE_WARN, "!vsw%d: Creation of ldc channel %ld "
3575 				"failed and cannot destroy associated mblk "
3576 				"pool", vswp->instance, ldc_id);
3577 			ldcp->rxh->nextp =  vswp->rxh;
3578 			vswp->rxh = ldcp->rxh;
3579 		}
3580 	}
3581 	mutex_destroy(&ldcp->drain_cv_lock);
3582 	mutex_destroy(&ldcp->hss_lock);
3583 
3584 	mutex_destroy(&ldcp->lane_in.seq_lock);
3585 	mutex_destroy(&ldcp->lane_out.seq_lock);
3586 	kmem_free(ldcp, sizeof (vsw_ldc_t));
3587 
3588 	return (1);
3589 }
3590 
3591 /*
3592  * Detach a logical domain channel (ldc) belonging to a
3593  * particular port.
3594  *
3595  * Returns 0 on success, 1 on failure.
3596  */
3597 static int
3598 vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id)
3599 {
3600 	vsw_t 		*vswp = port->p_vswp;
3601 	vsw_ldc_t 	*ldcp, *prev_ldcp;
3602 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
3603 	int 		rv;
3604 
3605 	prev_ldcp = ldcl->head;
3606 	for (; (ldcp = prev_ldcp) != NULL; prev_ldcp = ldcp->ldc_next) {
3607 		if (ldcp->ldc_id == ldc_id) {
3608 			break;
3609 		}
3610 	}
3611 
3612 	/* specified ldc id not found */
3613 	if (ldcp == NULL) {
3614 		DERR(vswp, "%s: ldcp = NULL", __func__);
3615 		return (1);
3616 	}
3617 
3618 	D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id);
3619 
3620 	/*
3621 	 * Before we can close the channel we must release any mapped
3622 	 * resources (e.g. drings).
3623 	 */
3624 	vsw_free_lane_resources(ldcp, INBOUND);
3625 	vsw_free_lane_resources(ldcp, OUTBOUND);
3626 
3627 	/*
3628 	 * If the close fails we are in serious trouble, as won't
3629 	 * be able to delete the parent port.
3630 	 */
3631 	if ((rv = ldc_close(ldcp->ldc_handle)) != 0) {
3632 		DERR(vswp, "%s: error %d closing channel %lld",
3633 			__func__, rv, ldcp->ldc_id);
3634 		return (1);
3635 	}
3636 
3637 	(void) ldc_fini(ldcp->ldc_handle);
3638 
3639 	ldcp->ldc_status = LDC_INIT;
3640 	ldcp->ldc_handle = NULL;
3641 	ldcp->ldc_vswp = NULL;
3642 
3643 	if (ldcp->rxh != NULL) {
3644 		if (vio_destroy_mblks(ldcp->rxh)) {
3645 			/*
3646 			 * Mostly likely some mblks are still in use and
3647 			 * have not been returned to the pool. Add the pool
3648 			 * to the list maintained in the device instance.
3649 			 * Another attempt will be made to destroy the pool
3650 			 * when the device detaches.
3651 			 */
3652 			ldcp->rxh->nextp =  vswp->rxh;
3653 			vswp->rxh = ldcp->rxh;
3654 		}
3655 	}
3656 
3657 	/* unlink it from the list */
3658 	prev_ldcp = ldcp->ldc_next;
3659 	ldcl->num_ldcs--;
3660 
3661 	mutex_destroy(&ldcp->ldc_txlock);
3662 	mutex_destroy(&ldcp->ldc_cblock);
3663 	cv_destroy(&ldcp->drain_cv);
3664 	mutex_destroy(&ldcp->drain_cv_lock);
3665 	mutex_destroy(&ldcp->hss_lock);
3666 	mutex_destroy(&ldcp->lane_in.seq_lock);
3667 	mutex_destroy(&ldcp->lane_out.seq_lock);
3668 	mutex_destroy(&ldcp->status_lock);
3669 	rw_destroy(&ldcp->lane_in.dlistrw);
3670 	rw_destroy(&ldcp->lane_out.dlistrw);
3671 
3672 	kmem_free(ldcp, sizeof (vsw_ldc_t));
3673 
3674 	return (0);
3675 }
3676 
3677 /*
3678  * Open and attempt to bring up the channel. Note that channel
3679  * can only be brought up if peer has also opened channel.
3680  *
3681  * Returns 0 if can open and bring up channel, otherwise
3682  * returns 1.
3683  */
3684 static int
3685 vsw_ldc_init(vsw_ldc_t *ldcp)
3686 {
3687 	vsw_t 		*vswp = ldcp->ldc_vswp;
3688 	ldc_status_t	istatus = 0;
3689 	int		rv;
3690 
3691 	D1(vswp, "%s: enter", __func__);
3692 
3693 	LDC_ENTER_LOCK(ldcp);
3694 
3695 	/* don't start at 0 in case clients don't like that */
3696 	ldcp->next_ident = 1;
3697 
3698 	rv = ldc_open(ldcp->ldc_handle);
3699 	if (rv != 0) {
3700 		DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)",
3701 		    __func__, ldcp->ldc_id, rv);
3702 		LDC_EXIT_LOCK(ldcp);
3703 		return (1);
3704 	}
3705 
3706 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
3707 		DERR(vswp, "%s: unable to get status", __func__);
3708 		LDC_EXIT_LOCK(ldcp);
3709 		return (1);
3710 
3711 	} else if (istatus != LDC_OPEN && istatus != LDC_READY) {
3712 		DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY",
3713 		    __func__, ldcp->ldc_id, istatus);
3714 		LDC_EXIT_LOCK(ldcp);
3715 		return (1);
3716 	}
3717 
3718 	mutex_enter(&ldcp->status_lock);
3719 	ldcp->ldc_status = istatus;
3720 	mutex_exit(&ldcp->status_lock);
3721 
3722 	rv = ldc_up(ldcp->ldc_handle);
3723 	if (rv != 0) {
3724 		/*
3725 		 * Not a fatal error for ldc_up() to fail, as peer
3726 		 * end point may simply not be ready yet.
3727 		 */
3728 		D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__,
3729 			ldcp->ldc_id, rv);
3730 		LDC_EXIT_LOCK(ldcp);
3731 		return (1);
3732 	}
3733 
3734 	/*
3735 	 * ldc_up() call is non-blocking so need to explicitly
3736 	 * check channel status to see if in fact the channel
3737 	 * is UP.
3738 	 */
3739 	mutex_enter(&ldcp->status_lock);
3740 	if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) {
3741 		DERR(vswp, "%s: unable to get status", __func__);
3742 		mutex_exit(&ldcp->status_lock);
3743 		LDC_EXIT_LOCK(ldcp);
3744 		return (1);
3745 
3746 	}
3747 
3748 	if (ldcp->ldc_status == LDC_UP) {
3749 		D2(vswp, "%s: channel %ld now UP (%ld)", __func__,
3750 			ldcp->ldc_id, istatus);
3751 		mutex_exit(&ldcp->status_lock);
3752 		LDC_EXIT_LOCK(ldcp);
3753 
3754 		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
3755 		return (0);
3756 	}
3757 
3758 	mutex_exit(&ldcp->status_lock);
3759 	LDC_EXIT_LOCK(ldcp);
3760 
3761 	D1(vswp, "%s: exit", __func__);
3762 	return (0);
3763 }
3764 
3765 /* disable callbacks on the channel */
3766 static int
3767 vsw_ldc_uninit(vsw_ldc_t *ldcp)
3768 {
3769 	vsw_t	*vswp = ldcp->ldc_vswp;
3770 	int	rv;
3771 
3772 	D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id);
3773 
3774 	LDC_ENTER_LOCK(ldcp);
3775 
3776 	rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE);
3777 	if (rv != 0) {
3778 		DERR(vswp, "vsw_ldc_uninit(%lld): error disabling "
3779 			"interrupts (rv = %d)\n", ldcp->ldc_id, rv);
3780 		LDC_EXIT_LOCK(ldcp);
3781 		return (1);
3782 	}
3783 
3784 	mutex_enter(&ldcp->status_lock);
3785 	ldcp->ldc_status = LDC_INIT;
3786 	mutex_exit(&ldcp->status_lock);
3787 
3788 	LDC_EXIT_LOCK(ldcp);
3789 
3790 	D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id);
3791 
3792 	return (0);
3793 }
3794 
3795 static int
3796 vsw_init_ldcs(vsw_port_t *port)
3797 {
3798 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
3799 	vsw_ldc_t	*ldcp;
3800 
3801 	READ_ENTER(&ldcl->lockrw);
3802 	ldcp =  ldcl->head;
3803 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
3804 		(void) vsw_ldc_init(ldcp);
3805 	}
3806 	RW_EXIT(&ldcl->lockrw);
3807 
3808 	return (0);
3809 }
3810 
3811 static int
3812 vsw_uninit_ldcs(vsw_port_t *port)
3813 {
3814 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
3815 	vsw_ldc_t	*ldcp;
3816 
3817 	D1(NULL, "vsw_uninit_ldcs: enter\n");
3818 
3819 	READ_ENTER(&ldcl->lockrw);
3820 	ldcp =  ldcl->head;
3821 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
3822 		(void) vsw_ldc_uninit(ldcp);
3823 	}
3824 	RW_EXIT(&ldcl->lockrw);
3825 
3826 	D1(NULL, "vsw_uninit_ldcs: exit\n");
3827 
3828 	return (0);
3829 }
3830 
3831 /*
3832  * Wait until the callback(s) associated with the ldcs under the specified
3833  * port have completed.
3834  *
3835  * Prior to this function being invoked each channel under this port
3836  * should have been quiesced via ldc_set_cb_mode(DISABLE).
3837  *
3838  * A short explaination of what we are doing below..
3839  *
3840  * The simplest approach would be to have a reference counter in
3841  * the ldc structure which is increment/decremented by the callbacks as
3842  * they use the channel. The drain function could then simply disable any
3843  * further callbacks and do a cv_wait for the ref to hit zero. Unfortunately
3844  * there is a tiny window here - before the callback is able to get the lock
3845  * on the channel it is interrupted and this function gets to execute. It
3846  * sees that the ref count is zero and believes its free to delete the
3847  * associated data structures.
3848  *
3849  * We get around this by taking advantage of the fact that before the ldc
3850  * framework invokes a callback it sets a flag to indicate that there is a
3851  * callback active (or about to become active). If when we attempt to
3852  * unregister a callback when this active flag is set then the unregister
3853  * will fail with EWOULDBLOCK.
3854  *
3855  * If the unregister fails we do a cv_timedwait. We will either be signaled
3856  * by the callback as it is exiting (note we have to wait a short period to
3857  * allow the callback to return fully to the ldc framework and it to clear
3858  * the active flag), or by the timer expiring. In either case we again attempt
3859  * the unregister. We repeat this until we can succesfully unregister the
3860  * callback.
3861  *
3862  * The reason we use a cv_timedwait rather than a simple cv_wait is to catch
3863  * the case where the callback has finished but the ldc framework has not yet
3864  * cleared the active flag. In this case we would never get a cv_signal.
3865  */
3866 static int
3867 vsw_drain_ldcs(vsw_port_t *port)
3868 {
3869 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
3870 	vsw_ldc_t	*ldcp;
3871 	vsw_t		*vswp = port->p_vswp;
3872 
3873 	D1(vswp, "%s: enter", __func__);
3874 
3875 	READ_ENTER(&ldcl->lockrw);
3876 
3877 	ldcp = ldcl->head;
3878 
3879 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
3880 		/*
3881 		 * If we can unregister the channel callback then we
3882 		 * know that there is no callback either running or
3883 		 * scheduled to run for this channel so move on to next
3884 		 * channel in the list.
3885 		 */
3886 		mutex_enter(&ldcp->drain_cv_lock);
3887 
3888 		/* prompt active callbacks to quit */
3889 		ldcp->drain_state = VSW_LDC_DRAINING;
3890 
3891 		if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) {
3892 			D2(vswp, "%s: unreg callback for chan %ld", __func__,
3893 				ldcp->ldc_id);
3894 			mutex_exit(&ldcp->drain_cv_lock);
3895 			continue;
3896 		} else {
3897 			/*
3898 			 * If we end up here we know that either 1) a callback
3899 			 * is currently executing, 2) is about to start (i.e.
3900 			 * the ldc framework has set the active flag but
3901 			 * has not actually invoked the callback yet, or 3)
3902 			 * has finished and has returned to the ldc framework
3903 			 * but the ldc framework has not yet cleared the
3904 			 * active bit.
3905 			 *
3906 			 * Wait for it to finish.
3907 			 */
3908 			while (ldc_unreg_callback(ldcp->ldc_handle)
3909 								== EWOULDBLOCK)
3910 				(void) cv_timedwait(&ldcp->drain_cv,
3911 					&ldcp->drain_cv_lock, lbolt + hz);
3912 
3913 			mutex_exit(&ldcp->drain_cv_lock);
3914 			D2(vswp, "%s: unreg callback for chan %ld after "
3915 				"timeout", __func__, ldcp->ldc_id);
3916 		}
3917 	}
3918 	RW_EXIT(&ldcl->lockrw);
3919 
3920 	D1(vswp, "%s: exit", __func__);
3921 	return (0);
3922 }
3923 
3924 /*
3925  * Wait until all tasks which reference this port have completed.
3926  *
3927  * Prior to this function being invoked each channel under this port
3928  * should have been quiesced via ldc_set_cb_mode(DISABLE).
3929  */
3930 static int
3931 vsw_drain_port_taskq(vsw_port_t *port)
3932 {
3933 	vsw_t		*vswp = port->p_vswp;
3934 
3935 	D1(vswp, "%s: enter", __func__);
3936 
3937 	/*
3938 	 * Mark the port as in the process of being detached, and
3939 	 * dispatch a marker task to the queue so we know when all
3940 	 * relevant tasks have completed.
3941 	 */
3942 	mutex_enter(&port->state_lock);
3943 	port->state = VSW_PORT_DETACHING;
3944 
3945 	if ((vswp->taskq_p == NULL) ||
3946 		(ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task,
3947 			port, DDI_NOSLEEP) != DDI_SUCCESS)) {
3948 		DERR(vswp, "%s: unable to dispatch marker task",
3949 			__func__);
3950 		mutex_exit(&port->state_lock);
3951 		return (1);
3952 	}
3953 
3954 	/*
3955 	 * Wait for the marker task to finish.
3956 	 */
3957 	while (port->state != VSW_PORT_DETACHABLE)
3958 		cv_wait(&port->state_cv, &port->state_lock);
3959 
3960 	mutex_exit(&port->state_lock);
3961 
3962 	D1(vswp, "%s: exit", __func__);
3963 
3964 	return (0);
3965 }
3966 
3967 static void
3968 vsw_marker_task(void *arg)
3969 {
3970 	vsw_port_t	*port = arg;
3971 	vsw_t		*vswp = port->p_vswp;
3972 
3973 	D1(vswp, "%s: enter", __func__);
3974 
3975 	mutex_enter(&port->state_lock);
3976 
3977 	/*
3978 	 * No further tasks should be dispatched which reference
3979 	 * this port so ok to mark it as safe to detach.
3980 	 */
3981 	port->state = VSW_PORT_DETACHABLE;
3982 
3983 	cv_signal(&port->state_cv);
3984 
3985 	mutex_exit(&port->state_lock);
3986 
3987 	D1(vswp, "%s: exit", __func__);
3988 }
3989 
3990 static vsw_port_t *
3991 vsw_lookup_port(vsw_t *vswp, int p_instance)
3992 {
3993 	vsw_port_list_t *plist = &vswp->plist;
3994 	vsw_port_t	*port;
3995 
3996 	for (port = plist->head; port != NULL; port = port->p_next) {
3997 		if (port->p_instance == p_instance) {
3998 			D2(vswp, "vsw_lookup_port: found p_instance\n");
3999 			return (port);
4000 		}
4001 	}
4002 
4003 	return (NULL);
4004 }
4005 
4006 /*
4007  * Search for and remove the specified port from the port
4008  * list. Returns 0 if able to locate and remove port, otherwise
4009  * returns 1.
4010  */
4011 static int
4012 vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port)
4013 {
4014 	vsw_port_list_t *plist = &vswp->plist;
4015 	vsw_port_t	*curr_p, *prev_p;
4016 
4017 	if (plist->head == NULL)
4018 		return (1);
4019 
4020 	curr_p = prev_p = plist->head;
4021 
4022 	while (curr_p != NULL) {
4023 		if (curr_p == port) {
4024 			if (prev_p == curr_p) {
4025 				plist->head = curr_p->p_next;
4026 			} else {
4027 				prev_p->p_next = curr_p->p_next;
4028 			}
4029 			plist->num_ports--;
4030 			break;
4031 		} else {
4032 			prev_p = curr_p;
4033 			curr_p = curr_p->p_next;
4034 		}
4035 	}
4036 	return (0);
4037 }
4038 
4039 /*
4040  * Interrupt handler for ldc messages.
4041  */
4042 static uint_t
4043 vsw_ldc_cb(uint64_t event, caddr_t arg)
4044 {
4045 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
4046 	vsw_t 		*vswp = ldcp->ldc_vswp;
4047 
4048 	D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
4049 
4050 	mutex_enter(&ldcp->ldc_cblock);
4051 
4052 	mutex_enter(&ldcp->status_lock);
4053 	if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) {
4054 		mutex_exit(&ldcp->status_lock);
4055 		mutex_exit(&ldcp->ldc_cblock);
4056 		return (LDC_SUCCESS);
4057 	}
4058 	mutex_exit(&ldcp->status_lock);
4059 
4060 	if (event & LDC_EVT_UP) {
4061 		/*
4062 		 * Channel has come up.
4063 		 */
4064 		D2(vswp, "%s: id(%ld) event(%llx) UP: status(%ld)",
4065 			__func__, ldcp->ldc_id, event, ldcp->ldc_status);
4066 
4067 		vsw_process_conn_evt(ldcp, VSW_CONN_UP);
4068 
4069 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
4070 	}
4071 
4072 	if (event & LDC_EVT_READ) {
4073 		/*
4074 		 * Data available for reading.
4075 		 */
4076 		D2(vswp, "%s: id(ld) event(%llx) data READ",
4077 				__func__, ldcp->ldc_id, event);
4078 
4079 		vsw_process_pkt(ldcp);
4080 
4081 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
4082 
4083 		goto vsw_cb_exit;
4084 	}
4085 
4086 	if (event & (LDC_EVT_DOWN | LDC_EVT_RESET)) {
4087 		D2(vswp, "%s: id(%ld) event (%lx) DOWN/RESET: status(%ld)",
4088 			__func__, ldcp->ldc_id, event, ldcp->ldc_status);
4089 
4090 		vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
4091 	}
4092 
4093 	/*
4094 	 * Catch either LDC_EVT_WRITE which we don't support or any
4095 	 * unknown event.
4096 	 */
4097 	if (event & ~(LDC_EVT_UP | LDC_EVT_RESET
4098 					| LDC_EVT_DOWN | LDC_EVT_READ)) {
4099 
4100 		DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)",
4101 			__func__, ldcp->ldc_id, event, ldcp->ldc_status);
4102 	}
4103 
4104 vsw_cb_exit:
4105 	mutex_exit(&ldcp->ldc_cblock);
4106 
4107 	/*
4108 	 * Let the drain function know we are finishing if it
4109 	 * is waiting.
4110 	 */
4111 	mutex_enter(&ldcp->drain_cv_lock);
4112 	if (ldcp->drain_state == VSW_LDC_DRAINING)
4113 		cv_signal(&ldcp->drain_cv);
4114 	mutex_exit(&ldcp->drain_cv_lock);
4115 
4116 	return (LDC_SUCCESS);
4117 }
4118 
4119 /*
4120  * Reinitialise data structures associated with the channel.
4121  */
4122 static void
4123 vsw_ldc_reinit(vsw_ldc_t *ldcp)
4124 {
4125 	vsw_t		*vswp = ldcp->ldc_vswp;
4126 	vsw_port_t	*port;
4127 	vsw_ldc_list_t	*ldcl;
4128 
4129 	D1(vswp, "%s: enter", __func__);
4130 
4131 	port = ldcp->ldc_port;
4132 	ldcl = &port->p_ldclist;
4133 
4134 	READ_ENTER(&ldcl->lockrw);
4135 
4136 	D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__,
4137 		ldcp->lane_in.lstate, ldcp->lane_out.lstate);
4138 
4139 	vsw_free_lane_resources(ldcp, INBOUND);
4140 	vsw_free_lane_resources(ldcp, OUTBOUND);
4141 	RW_EXIT(&ldcl->lockrw);
4142 
4143 	ldcp->lane_in.lstate = 0;
4144 	ldcp->lane_out.lstate = 0;
4145 
4146 	/*
4147 	 * Remove parent port from any multicast groups
4148 	 * it may have registered with. Client must resend
4149 	 * multicast add command after handshake completes.
4150 	 */
4151 	(void) vsw_del_fdb(vswp, port);
4152 
4153 	vsw_del_mcst_port(port);
4154 
4155 	ldcp->peer_session = 0;
4156 	ldcp->session_status = 0;
4157 	ldcp->hcnt = 0;
4158 	ldcp->hphase = VSW_MILESTONE0;
4159 
4160 	D1(vswp, "%s: exit", __func__);
4161 }
4162 
4163 /*
4164  * Process a connection event.
4165  *
4166  * Note - care must be taken to ensure that this function is
4167  * not called with the dlistrw lock held.
4168  */
4169 static void
4170 vsw_process_conn_evt(vsw_ldc_t *ldcp, uint16_t evt)
4171 {
4172 	vsw_t		*vswp = ldcp->ldc_vswp;
4173 	vsw_conn_evt_t	*conn = NULL;
4174 
4175 	D1(vswp, "%s: enter", __func__);
4176 
4177 	/*
4178 	 * Check if either a reset or restart event is pending
4179 	 * or in progress. If so just return.
4180 	 *
4181 	 * A VSW_CONN_RESET event originates either with a LDC_RESET_EVT
4182 	 * being received by the callback handler, or a ECONNRESET error
4183 	 * code being returned from a ldc_read() or ldc_write() call.
4184 	 *
4185 	 * A VSW_CONN_RESTART event occurs when some error checking code
4186 	 * decides that there is a problem with data from the channel,
4187 	 * and that the handshake should be restarted.
4188 	 */
4189 	if (((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART)) &&
4190 			(ldstub((uint8_t *)&ldcp->reset_active)))
4191 		return;
4192 
4193 	/*
4194 	 * If it is an LDC_UP event we first check the recorded
4195 	 * state of the channel. If this is UP then we know that
4196 	 * the channel moving to the UP state has already been dealt
4197 	 * with and don't need to dispatch a  new task.
4198 	 *
4199 	 * The reason for this check is that when we do a ldc_up(),
4200 	 * depending on the state of the peer, we may or may not get
4201 	 * a LDC_UP event. As we can't depend on getting a LDC_UP evt
4202 	 * every time we do ldc_up() we explicitly check the channel
4203 	 * status to see has it come up (ldc_up() is asynch and will
4204 	 * complete at some undefined time), and take the appropriate
4205 	 * action.
4206 	 *
4207 	 * The flip side of this is that we may get a LDC_UP event
4208 	 * when we have already seen that the channel is up and have
4209 	 * dealt with that.
4210 	 */
4211 	mutex_enter(&ldcp->status_lock);
4212 	if (evt == VSW_CONN_UP) {
4213 		if ((ldcp->ldc_status == LDC_UP) ||
4214 					(ldcp->reset_active != 0)) {
4215 			mutex_exit(&ldcp->status_lock);
4216 			return;
4217 		}
4218 	}
4219 	mutex_exit(&ldcp->status_lock);
4220 
4221 	/*
4222 	 * The transaction group id allows us to identify and discard
4223 	 * any tasks which are still pending on the taskq and refer
4224 	 * to the handshake session we are about to restart or reset.
4225 	 * These stale messages no longer have any real meaning.
4226 	 */
4227 	mutex_enter(&ldcp->hss_lock);
4228 	ldcp->hss_id++;
4229 	mutex_exit(&ldcp->hss_lock);
4230 
4231 	ASSERT(vswp->taskq_p != NULL);
4232 
4233 	if ((conn = kmem_zalloc(sizeof (vsw_conn_evt_t), KM_NOSLEEP)) == NULL) {
4234 		cmn_err(CE_WARN, "!vsw%d: unable to allocate memory for"
4235 			" connection event", vswp->instance);
4236 		goto err_exit;
4237 	}
4238 
4239 	conn->evt = evt;
4240 	conn->ldcp = ldcp;
4241 
4242 	if (ddi_taskq_dispatch(vswp->taskq_p, vsw_conn_task, conn,
4243 		DDI_NOSLEEP) != DDI_SUCCESS) {
4244 		cmn_err(CE_WARN, "!vsw%d: Can't dispatch connection task",
4245 			vswp->instance);
4246 
4247 		kmem_free(conn, sizeof (vsw_conn_evt_t));
4248 		goto err_exit;
4249 	}
4250 
4251 	D1(vswp, "%s: exit", __func__);
4252 	return;
4253 
4254 err_exit:
4255 	/*
4256 	 * Have mostly likely failed due to memory shortage. Clear the flag so
4257 	 * that future requests will at least be attempted and will hopefully
4258 	 * succeed.
4259 	 */
4260 	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
4261 		ldcp->reset_active = 0;
4262 }
4263 
4264 /*
4265  * Deal with events relating to a connection. Invoked from a taskq.
4266  */
4267 static void
4268 vsw_conn_task(void *arg)
4269 {
4270 	vsw_conn_evt_t	*conn = (vsw_conn_evt_t *)arg;
4271 	vsw_ldc_t	*ldcp = NULL;
4272 	vsw_t		*vswp = NULL;
4273 	uint16_t	evt;
4274 	ldc_status_t	curr_status;
4275 
4276 	ldcp = conn->ldcp;
4277 	evt = conn->evt;
4278 	vswp = ldcp->ldc_vswp;
4279 
4280 	D1(vswp, "%s: enter", __func__);
4281 
4282 	/* can safely free now have copied out data */
4283 	kmem_free(conn, sizeof (vsw_conn_evt_t));
4284 
4285 	mutex_enter(&ldcp->status_lock);
4286 	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
4287 		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
4288 			"channel %ld", vswp->instance, ldcp->ldc_id);
4289 		mutex_exit(&ldcp->status_lock);
4290 		return;
4291 	}
4292 
4293 	/*
4294 	 * If we wish to restart the handshake on this channel, then if
4295 	 * the channel is UP we bring it DOWN to flush the underlying
4296 	 * ldc queue.
4297 	 */
4298 	if ((evt == VSW_CONN_RESTART) && (curr_status == LDC_UP))
4299 		(void) ldc_down(ldcp->ldc_handle);
4300 
4301 	/*
4302 	 * re-init all the associated data structures.
4303 	 */
4304 	vsw_ldc_reinit(ldcp);
4305 
4306 	/*
4307 	 * Bring the channel back up (note it does no harm to
4308 	 * do this even if the channel is already UP, Just
4309 	 * becomes effectively a no-op).
4310 	 */
4311 	(void) ldc_up(ldcp->ldc_handle);
4312 
4313 	/*
4314 	 * Check if channel is now UP. This will only happen if
4315 	 * peer has also done a ldc_up().
4316 	 */
4317 	if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
4318 		cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
4319 			"channel %ld", vswp->instance, ldcp->ldc_id);
4320 		mutex_exit(&ldcp->status_lock);
4321 		return;
4322 	}
4323 
4324 	ldcp->ldc_status = curr_status;
4325 
4326 	/* channel UP so restart handshake by sending version info */
4327 	if (curr_status == LDC_UP) {
4328 		if (ldcp->hcnt++ > vsw_num_handshakes) {
4329 			cmn_err(CE_WARN, "!vsw%d: exceeded number of permitted"
4330 				" handshake attempts (%d) on channel %ld",
4331 				vswp->instance, ldcp->hcnt, ldcp->ldc_id);
4332 			mutex_exit(&ldcp->status_lock);
4333 			return;
4334 		}
4335 
4336 		if (ddi_taskq_dispatch(vswp->taskq_p, vsw_send_ver, ldcp,
4337 			DDI_NOSLEEP) != DDI_SUCCESS) {
4338 			cmn_err(CE_WARN, "!vsw%d: Can't dispatch version task",
4339 				vswp->instance);
4340 
4341 			/*
4342 			 * Don't count as valid restart attempt if couldn't
4343 			 * send version msg.
4344 			 */
4345 			if (ldcp->hcnt > 0)
4346 				ldcp->hcnt--;
4347 		}
4348 	}
4349 
4350 	/*
4351 	 * Mark that the process is complete by clearing the flag.
4352 	 *
4353 	 * Note is it possible that the taskq dispatch above may have failed,
4354 	 * most likely due to memory shortage. We still clear the flag so
4355 	 * future attempts will at least be attempted and will hopefully
4356 	 * succeed.
4357 	 */
4358 	if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
4359 		ldcp->reset_active = 0;
4360 
4361 	mutex_exit(&ldcp->status_lock);
4362 
4363 	D1(vswp, "%s: exit", __func__);
4364 }
4365 
4366 /*
4367  * returns 0 if legal for event signified by flag to have
4368  * occured at the time it did. Otherwise returns 1.
4369  */
4370 int
4371 vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag)
4372 {
4373 	vsw_t		*vswp = ldcp->ldc_vswp;
4374 	uint64_t	state;
4375 	uint64_t	phase;
4376 
4377 	if (dir == INBOUND)
4378 		state = ldcp->lane_in.lstate;
4379 	else
4380 		state = ldcp->lane_out.lstate;
4381 
4382 	phase = ldcp->hphase;
4383 
4384 	switch (flag) {
4385 	case VSW_VER_INFO_RECV:
4386 		if (phase > VSW_MILESTONE0) {
4387 			DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV"
4388 				" when in state %d\n", ldcp->ldc_id, phase);
4389 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4390 			return (1);
4391 		}
4392 		break;
4393 
4394 	case VSW_VER_ACK_RECV:
4395 	case VSW_VER_NACK_RECV:
4396 		if (!(state & VSW_VER_INFO_SENT)) {
4397 			DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK"
4398 				" or VER_NACK when in state %d\n",
4399 				ldcp->ldc_id, phase);
4400 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4401 			return (1);
4402 		} else
4403 			state &= ~VSW_VER_INFO_SENT;
4404 		break;
4405 
4406 	case VSW_ATTR_INFO_RECV:
4407 		if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) {
4408 			DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV"
4409 				" when in state %d\n", ldcp->ldc_id, phase);
4410 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4411 			return (1);
4412 		}
4413 		break;
4414 
4415 	case VSW_ATTR_ACK_RECV:
4416 	case VSW_ATTR_NACK_RECV:
4417 		if (!(state & VSW_ATTR_INFO_SENT)) {
4418 			DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK"
4419 				" or ATTR_NACK when in state %d\n",
4420 				ldcp->ldc_id, phase);
4421 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4422 			return (1);
4423 		} else
4424 			state &= ~VSW_ATTR_INFO_SENT;
4425 		break;
4426 
4427 	case VSW_DRING_INFO_RECV:
4428 		if (phase < VSW_MILESTONE1) {
4429 			DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV"
4430 				" when in state %d\n", ldcp->ldc_id, phase);
4431 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4432 			return (1);
4433 		}
4434 		break;
4435 
4436 	case VSW_DRING_ACK_RECV:
4437 	case VSW_DRING_NACK_RECV:
4438 		if (!(state & VSW_DRING_INFO_SENT)) {
4439 			DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK"
4440 				" or DRING_NACK when in state %d\n",
4441 				ldcp->ldc_id, phase);
4442 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4443 			return (1);
4444 		} else
4445 			state &= ~VSW_DRING_INFO_SENT;
4446 		break;
4447 
4448 	case VSW_RDX_INFO_RECV:
4449 		if (phase < VSW_MILESTONE3) {
4450 			DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV"
4451 				" when in state %d\n", ldcp->ldc_id, phase);
4452 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4453 			return (1);
4454 		}
4455 		break;
4456 
4457 	case VSW_RDX_ACK_RECV:
4458 	case VSW_RDX_NACK_RECV:
4459 		if (!(state & VSW_RDX_INFO_SENT)) {
4460 			DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK"
4461 				" or RDX_NACK when in state %d\n",
4462 				ldcp->ldc_id, phase);
4463 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4464 			return (1);
4465 		} else
4466 			state &= ~VSW_RDX_INFO_SENT;
4467 		break;
4468 
4469 	case VSW_MCST_INFO_RECV:
4470 		if (phase < VSW_MILESTONE3) {
4471 			DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV"
4472 				" when in state %d\n", ldcp->ldc_id, phase);
4473 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4474 			return (1);
4475 		}
4476 		break;
4477 
4478 	default:
4479 		DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)",
4480 				ldcp->ldc_id, flag);
4481 		return (1);
4482 	}
4483 
4484 	if (dir == INBOUND)
4485 		ldcp->lane_in.lstate = state;
4486 	else
4487 		ldcp->lane_out.lstate = state;
4488 
4489 	D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id);
4490 
4491 	return (0);
4492 }
4493 
4494 void
4495 vsw_next_milestone(vsw_ldc_t *ldcp)
4496 {
4497 	vsw_t		*vswp = ldcp->ldc_vswp;
4498 
4499 	D1(vswp, "%s (chan %lld): enter (phase %ld)", __func__,
4500 		ldcp->ldc_id, ldcp->hphase);
4501 
4502 	DUMP_FLAGS(ldcp->lane_in.lstate);
4503 	DUMP_FLAGS(ldcp->lane_out.lstate);
4504 
4505 	switch (ldcp->hphase) {
4506 
4507 	case VSW_MILESTONE0:
4508 		/*
4509 		 * If we haven't started to handshake with our peer,
4510 		 * start to do so now.
4511 		 */
4512 		if (ldcp->lane_out.lstate == 0) {
4513 			D2(vswp, "%s: (chan %lld) starting handshake "
4514 				"with peer", __func__, ldcp->ldc_id);
4515 			vsw_process_conn_evt(ldcp, VSW_CONN_UP);
4516 		}
4517 
4518 		/*
4519 		 * Only way to pass this milestone is to have successfully
4520 		 * negotiated version info.
4521 		 */
4522 		if ((ldcp->lane_in.lstate & VSW_VER_ACK_SENT) &&
4523 			(ldcp->lane_out.lstate & VSW_VER_ACK_RECV)) {
4524 
4525 			D2(vswp, "%s: (chan %lld) leaving milestone 0",
4526 				__func__, ldcp->ldc_id);
4527 
4528 			/*
4529 			 * Next milestone is passed when attribute
4530 			 * information has been successfully exchanged.
4531 			 */
4532 			ldcp->hphase = VSW_MILESTONE1;
4533 			vsw_send_attr(ldcp);
4534 
4535 		}
4536 		break;
4537 
4538 	case VSW_MILESTONE1:
4539 		/*
4540 		 * Only way to pass this milestone is to have successfully
4541 		 * negotiated attribute information.
4542 		 */
4543 		if (ldcp->lane_in.lstate & VSW_ATTR_ACK_SENT) {
4544 
4545 			ldcp->hphase = VSW_MILESTONE2;
4546 
4547 			/*
4548 			 * If the peer device has said it wishes to
4549 			 * use descriptor rings then we send it our ring
4550 			 * info, otherwise we just set up a private ring
4551 			 * which we use an internal buffer
4552 			 */
4553 			if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE)
4554 				vsw_send_dring_info(ldcp);
4555 		}
4556 		break;
4557 
4558 	case VSW_MILESTONE2:
4559 		/*
4560 		 * If peer has indicated in its attribute message that
4561 		 * it wishes to use descriptor rings then the only way
4562 		 * to pass this milestone is for us to have received
4563 		 * valid dring info.
4564 		 *
4565 		 * If peer is not using descriptor rings then just fall
4566 		 * through.
4567 		 */
4568 		if ((ldcp->lane_in.xfer_mode == VIO_DRING_MODE) &&
4569 			(!(ldcp->lane_in.lstate & VSW_DRING_ACK_SENT)))
4570 			break;
4571 
4572 		D2(vswp, "%s: (chan %lld) leaving milestone 2",
4573 				__func__, ldcp->ldc_id);
4574 
4575 		ldcp->hphase = VSW_MILESTONE3;
4576 		vsw_send_rdx(ldcp);
4577 		break;
4578 
4579 	case VSW_MILESTONE3:
4580 		/*
4581 		 * Pass this milestone when all paramaters have been
4582 		 * successfully exchanged and RDX sent in both directions.
4583 		 *
4584 		 * Mark outbound lane as available to transmit data.
4585 		 */
4586 		if ((ldcp->lane_out.lstate & VSW_RDX_ACK_SENT) &&
4587 			(ldcp->lane_in.lstate & VSW_RDX_ACK_RECV)) {
4588 
4589 			D2(vswp, "%s: (chan %lld) leaving milestone 3",
4590 				__func__, ldcp->ldc_id);
4591 			D2(vswp, "%s: ** handshake complete (0x%llx : "
4592 				"0x%llx) **", __func__, ldcp->lane_in.lstate,
4593 				ldcp->lane_out.lstate);
4594 			ldcp->lane_out.lstate |= VSW_LANE_ACTIVE;
4595 			ldcp->hphase = VSW_MILESTONE4;
4596 			ldcp->hcnt = 0;
4597 			DISPLAY_STATE();
4598 		} else {
4599 			D2(vswp, "%s: still in milestone 3 (0x%llx :"
4600 				" 0x%llx", __func__, ldcp->lane_in.lstate,
4601 				ldcp->lane_out.lstate);
4602 		}
4603 		break;
4604 
4605 	case VSW_MILESTONE4:
4606 		D2(vswp, "%s: (chan %lld) in milestone 4", __func__,
4607 							ldcp->ldc_id);
4608 		break;
4609 
4610 	default:
4611 		DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__,
4612 			ldcp->ldc_id, ldcp->hphase);
4613 	}
4614 
4615 	D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id,
4616 		ldcp->hphase);
4617 }
4618 
4619 /*
4620  * Check if major version is supported.
4621  *
4622  * Returns 0 if finds supported major number, and if necessary
4623  * adjusts the minor field.
4624  *
4625  * Returns 1 if can't match major number exactly. Sets mjor/minor
4626  * to next lowest support values, or to zero if no other values possible.
4627  */
4628 static int
4629 vsw_supported_version(vio_ver_msg_t *vp)
4630 {
4631 	int	i;
4632 
4633 	D1(NULL, "vsw_supported_version: enter");
4634 
4635 	for (i = 0; i < VSW_NUM_VER; i++) {
4636 		if (vsw_versions[i].ver_major == vp->ver_major) {
4637 			/*
4638 			 * Matching or lower major version found. Update
4639 			 * minor number if necessary.
4640 			 */
4641 			if (vp->ver_minor > vsw_versions[i].ver_minor) {
4642 				D2(NULL, "%s: adjusting minor value"
4643 					" from %d to %d", __func__,
4644 					vp->ver_minor,
4645 					vsw_versions[i].ver_minor);
4646 				vp->ver_minor = vsw_versions[i].ver_minor;
4647 			}
4648 
4649 			return (0);
4650 		}
4651 
4652 		if (vsw_versions[i].ver_major < vp->ver_major) {
4653 			if (vp->ver_minor > vsw_versions[i].ver_minor) {
4654 				D2(NULL, "%s: adjusting minor value"
4655 					" from %d to %d", __func__,
4656 					vp->ver_minor,
4657 					vsw_versions[i].ver_minor);
4658 				vp->ver_minor = vsw_versions[i].ver_minor;
4659 			}
4660 			return (1);
4661 		}
4662 	}
4663 
4664 	/* No match was possible, zero out fields */
4665 	vp->ver_major = 0;
4666 	vp->ver_minor = 0;
4667 
4668 	D1(NULL, "vsw_supported_version: exit");
4669 
4670 	return (1);
4671 }
4672 
4673 /*
4674  * Main routine for processing messages received over LDC.
4675  */
4676 static void
4677 vsw_process_pkt(void *arg)
4678 {
4679 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
4680 	vsw_t 		*vswp = ldcp->ldc_vswp;
4681 	size_t		msglen;
4682 	vio_msg_tag_t	tag;
4683 	def_msg_t	dmsg;
4684 	int 		rv = 0;
4685 
4686 
4687 	D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
4688 
4689 	/*
4690 	 * If channel is up read messages until channel is empty.
4691 	 */
4692 	do {
4693 		msglen = sizeof (dmsg);
4694 		rv = ldc_read(ldcp->ldc_handle, (caddr_t)&dmsg, &msglen);
4695 
4696 		if (rv != 0) {
4697 			DERR(vswp, "%s :ldc_read err id(%lld) rv(%d) "
4698 				"len(%d)\n", __func__, ldcp->ldc_id,
4699 							rv, msglen);
4700 		}
4701 
4702 		/* channel has been reset */
4703 		if (rv == ECONNRESET) {
4704 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
4705 			break;
4706 		}
4707 
4708 		if (msglen == 0) {
4709 			D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__,
4710 			ldcp->ldc_id);
4711 			break;
4712 		}
4713 
4714 		D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__,
4715 		    ldcp->ldc_id, msglen);
4716 
4717 		/*
4718 		 * Figure out what sort of packet we have gotten by
4719 		 * examining the msg tag, and then switch it appropriately.
4720 		 */
4721 		bcopy(&dmsg, &tag, sizeof (vio_msg_tag_t));
4722 
4723 		switch (tag.vio_msgtype) {
4724 		case VIO_TYPE_CTRL:
4725 			vsw_dispatch_ctrl_task(ldcp, &dmsg, tag);
4726 			break;
4727 		case VIO_TYPE_DATA:
4728 			vsw_process_data_pkt(ldcp, &dmsg, tag);
4729 			break;
4730 		case VIO_TYPE_ERR:
4731 			vsw_process_err_pkt(ldcp, &dmsg, tag);
4732 			break;
4733 		default:
4734 			DERR(vswp, "%s: Unknown tag(%lx) ", __func__,
4735 				"id(%lx)\n", tag.vio_msgtype, ldcp->ldc_id);
4736 			break;
4737 		}
4738 	} while (msglen);
4739 
4740 	D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id);
4741 }
4742 
4743 /*
4744  * Dispatch a task to process a VIO control message.
4745  */
4746 static void
4747 vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t tag)
4748 {
4749 	vsw_ctrl_task_t		*ctaskp = NULL;
4750 	vsw_port_t		*port = ldcp->ldc_port;
4751 	vsw_t			*vswp = port->p_vswp;
4752 
4753 	D1(vswp, "%s: enter", __func__);
4754 
4755 	/*
4756 	 * We need to handle RDX ACK messages in-band as once they
4757 	 * are exchanged it is possible that we will get an
4758 	 * immediate (legitimate) data packet.
4759 	 */
4760 	if ((tag.vio_subtype_env == VIO_RDX) &&
4761 		(tag.vio_subtype == VIO_SUBTYPE_ACK)) {
4762 
4763 		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_ACK_RECV))
4764 			return;
4765 
4766 		ldcp->lane_in.lstate |= VSW_RDX_ACK_RECV;
4767 		D2(vswp, "%s (%ld) handling RDX_ACK in place "
4768 			"(ostate 0x%llx : hphase %d)", __func__,
4769 			ldcp->ldc_id, ldcp->lane_in.lstate, ldcp->hphase);
4770 		vsw_next_milestone(ldcp);
4771 		return;
4772 	}
4773 
4774 	ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP);
4775 
4776 	if (ctaskp == NULL) {
4777 		DERR(vswp, "%s: unable to alloc space for ctrl"
4778 			" msg", __func__);
4779 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4780 		return;
4781 	}
4782 
4783 	ctaskp->ldcp = ldcp;
4784 	bcopy((def_msg_t *)cpkt, &ctaskp->pktp, sizeof (def_msg_t));
4785 	mutex_enter(&ldcp->hss_lock);
4786 	ctaskp->hss_id = ldcp->hss_id;
4787 	mutex_exit(&ldcp->hss_lock);
4788 
4789 	/*
4790 	 * Dispatch task to processing taskq if port is not in
4791 	 * the process of being detached.
4792 	 */
4793 	mutex_enter(&port->state_lock);
4794 	if (port->state == VSW_PORT_INIT) {
4795 		if ((vswp->taskq_p == NULL) ||
4796 			(ddi_taskq_dispatch(vswp->taskq_p,
4797 			vsw_process_ctrl_pkt, ctaskp, DDI_NOSLEEP)
4798 							!= DDI_SUCCESS)) {
4799 			DERR(vswp, "%s: unable to dispatch task to taskq",
4800 				__func__);
4801 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
4802 			mutex_exit(&port->state_lock);
4803 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4804 			return;
4805 		}
4806 	} else {
4807 		DWARN(vswp, "%s: port %d detaching, not dispatching "
4808 			"task", __func__, port->p_instance);
4809 	}
4810 
4811 	mutex_exit(&port->state_lock);
4812 
4813 	D2(vswp, "%s: dispatched task to taskq for chan %d", __func__,
4814 			ldcp->ldc_id);
4815 	D1(vswp, "%s: exit", __func__);
4816 }
4817 
4818 /*
4819  * Process a VIO ctrl message. Invoked from taskq.
4820  */
4821 static void
4822 vsw_process_ctrl_pkt(void *arg)
4823 {
4824 	vsw_ctrl_task_t	*ctaskp = (vsw_ctrl_task_t *)arg;
4825 	vsw_ldc_t	*ldcp = ctaskp->ldcp;
4826 	vsw_t 		*vswp = ldcp->ldc_vswp;
4827 	vio_msg_tag_t	tag;
4828 	uint16_t	env;
4829 
4830 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4831 
4832 	bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t));
4833 	env = tag.vio_subtype_env;
4834 
4835 	/* stale pkt check */
4836 	mutex_enter(&ldcp->hss_lock);
4837 	if (ctaskp->hss_id < ldcp->hss_id) {
4838 		DWARN(vswp, "%s: discarding stale packet belonging to"
4839 			" earlier (%ld) handshake session", __func__,
4840 			ctaskp->hss_id);
4841 		mutex_exit(&ldcp->hss_lock);
4842 		return;
4843 	}
4844 	mutex_exit(&ldcp->hss_lock);
4845 
4846 	/* session id check */
4847 	if (ldcp->session_status & VSW_PEER_SESSION) {
4848 		if (ldcp->peer_session != tag.vio_sid) {
4849 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
4850 				__func__, ldcp->ldc_id, tag.vio_sid);
4851 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
4852 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
4853 			return;
4854 		}
4855 	}
4856 
4857 	/*
4858 	 * Switch on vio_subtype envelope, then let lower routines
4859 	 * decide if its an INFO, ACK or NACK packet.
4860 	 */
4861 	switch (env) {
4862 	case VIO_VER_INFO:
4863 		vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp);
4864 		break;
4865 	case VIO_DRING_REG:
4866 		vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp);
4867 		break;
4868 	case VIO_DRING_UNREG:
4869 		vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp);
4870 		break;
4871 	case VIO_ATTR_INFO:
4872 		vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp);
4873 		break;
4874 	case VNET_MCAST_INFO:
4875 		vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp);
4876 		break;
4877 	case VIO_RDX:
4878 		vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp);
4879 		break;
4880 	default:
4881 		DERR(vswp, "%s : unknown vio_subtype_env (%x)\n",
4882 							__func__, env);
4883 	}
4884 
4885 	kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
4886 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4887 }
4888 
4889 /*
4890  * Version negotiation. We can end up here either because our peer
4891  * has responded to a handshake message we have sent it, or our peer
4892  * has initiated a handshake with us. If its the former then can only
4893  * be ACK or NACK, if its the later can only be INFO.
4894  *
4895  * If its an ACK we move to the next stage of the handshake, namely
4896  * attribute exchange. If its a NACK we see if we can specify another
4897  * version, if we can't we stop.
4898  *
4899  * If it is an INFO we reset all params associated with communication
4900  * in that direction over this channel (remember connection is
4901  * essentially 2 independent simplex channels).
4902  */
4903 void
4904 vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt)
4905 {
4906 	vio_ver_msg_t	*ver_pkt;
4907 	vsw_t 		*vswp = ldcp->ldc_vswp;
4908 
4909 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4910 
4911 	/*
4912 	 * We know this is a ctrl/version packet so
4913 	 * cast it into the correct structure.
4914 	 */
4915 	ver_pkt = (vio_ver_msg_t *)pkt;
4916 
4917 	switch (ver_pkt->tag.vio_subtype) {
4918 	case VIO_SUBTYPE_INFO:
4919 		D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n");
4920 
4921 		/*
4922 		 * Record the session id, which we will use from now
4923 		 * until we see another VER_INFO msg. Even then the
4924 		 * session id in most cases will be unchanged, execpt
4925 		 * if channel was reset.
4926 		 */
4927 		if ((ldcp->session_status & VSW_PEER_SESSION) &&
4928 			(ldcp->peer_session != ver_pkt->tag.vio_sid)) {
4929 			DERR(vswp, "%s: updating session id for chan %lld "
4930 				"from %llx to %llx", __func__, ldcp->ldc_id,
4931 				ldcp->peer_session, ver_pkt->tag.vio_sid);
4932 		}
4933 
4934 		ldcp->peer_session = ver_pkt->tag.vio_sid;
4935 		ldcp->session_status |= VSW_PEER_SESSION;
4936 
4937 		/* Legal message at this time ? */
4938 		if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV))
4939 			return;
4940 
4941 		/*
4942 		 * First check the device class. Currently only expect
4943 		 * to be talking to a network device. In the future may
4944 		 * also talk to another switch.
4945 		 */
4946 		if (ver_pkt->dev_class != VDEV_NETWORK) {
4947 			DERR(vswp, "%s: illegal device class %d", __func__,
4948 				ver_pkt->dev_class);
4949 
4950 			ver_pkt->tag.vio_sid = ldcp->local_session;
4951 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
4952 
4953 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
4954 
4955 			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
4956 					sizeof (vio_ver_msg_t), B_TRUE);
4957 
4958 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
4959 			vsw_next_milestone(ldcp);
4960 			return;
4961 		} else {
4962 			ldcp->dev_class = ver_pkt->dev_class;
4963 		}
4964 
4965 		/*
4966 		 * Now check the version.
4967 		 */
4968 		if (vsw_supported_version(ver_pkt) == 0) {
4969 			/*
4970 			 * Support this major version and possibly
4971 			 * adjusted minor version.
4972 			 */
4973 
4974 			D2(vswp, "%s: accepted ver %d:%d", __func__,
4975 				ver_pkt->ver_major, ver_pkt->ver_minor);
4976 
4977 			/* Store accepted values */
4978 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
4979 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
4980 
4981 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
4982 
4983 			ldcp->lane_in.lstate |= VSW_VER_ACK_SENT;
4984 		} else {
4985 			/*
4986 			 * NACK back with the next lower major/minor
4987 			 * pairing we support (if don't suuport any more
4988 			 * versions then they will be set to zero.
4989 			 */
4990 
4991 			D2(vswp, "%s: replying with ver %d:%d", __func__,
4992 				ver_pkt->ver_major, ver_pkt->ver_minor);
4993 
4994 			/* Store updated values */
4995 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
4996 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
4997 
4998 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
4999 
5000 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
5001 		}
5002 
5003 		DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
5004 		ver_pkt->tag.vio_sid = ldcp->local_session;
5005 		(void) vsw_send_msg(ldcp, (void *)ver_pkt,
5006 			sizeof (vio_ver_msg_t), B_TRUE);
5007 
5008 		vsw_next_milestone(ldcp);
5009 		break;
5010 
5011 	case VIO_SUBTYPE_ACK:
5012 		D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__);
5013 
5014 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV))
5015 			return;
5016 
5017 		/* Store updated values */
5018 		ldcp->lane_in.ver_major = ver_pkt->ver_major;
5019 		ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
5020 
5021 
5022 		ldcp->lane_out.lstate |= VSW_VER_ACK_RECV;
5023 		vsw_next_milestone(ldcp);
5024 
5025 		break;
5026 
5027 	case VIO_SUBTYPE_NACK:
5028 		D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__);
5029 
5030 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV))
5031 			return;
5032 
5033 		/*
5034 		 * If our peer sent us a NACK with the ver fields set to
5035 		 * zero then there is nothing more we can do. Otherwise see
5036 		 * if we support either the version suggested, or a lesser
5037 		 * one.
5038 		 */
5039 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
5040 			DERR(vswp, "%s: peer unable to negotiate any "
5041 				"further.", __func__);
5042 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
5043 			vsw_next_milestone(ldcp);
5044 			return;
5045 		}
5046 
5047 		/*
5048 		 * Check to see if we support this major version or
5049 		 * a lower one. If we don't then maj/min will be set
5050 		 * to zero.
5051 		 */
5052 		(void) vsw_supported_version(ver_pkt);
5053 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
5054 			/* Nothing more we can do */
5055 			DERR(vswp, "%s: version negotiation failed.\n",
5056 								__func__);
5057 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
5058 			vsw_next_milestone(ldcp);
5059 		} else {
5060 			/* found a supported major version */
5061 			ldcp->lane_out.ver_major = ver_pkt->ver_major;
5062 			ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
5063 
5064 			D2(vswp, "%s: resending with updated values (%x, %x)",
5065 				__func__, ver_pkt->ver_major,
5066 				ver_pkt->ver_minor);
5067 
5068 			ldcp->lane_out.lstate |= VSW_VER_INFO_SENT;
5069 			ver_pkt->tag.vio_sid = ldcp->local_session;
5070 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
5071 
5072 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
5073 
5074 			(void) vsw_send_msg(ldcp, (void *)ver_pkt,
5075 				sizeof (vio_ver_msg_t), B_TRUE);
5076 
5077 			vsw_next_milestone(ldcp);
5078 
5079 		}
5080 		break;
5081 
5082 	default:
5083 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
5084 			ver_pkt->tag.vio_subtype);
5085 	}
5086 
5087 	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
5088 }
5089 
5090 /*
5091  * Process an attribute packet. We can end up here either because our peer
5092  * has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or our
5093  * peer has sent us an attribute INFO message
5094  *
5095  * If its an ACK we then move to the next stage of the handshake which
5096  * is to send our descriptor ring info to our peer. If its a NACK then
5097  * there is nothing more we can (currently) do.
5098  *
5099  * If we get a valid/acceptable INFO packet (and we have already negotiated
5100  * a version) we ACK back and set channel state to ATTR_RECV, otherwise we
5101  * NACK back and reset channel state to INACTIV.
5102  *
5103  * FUTURE: in time we will probably negotiate over attributes, but for
5104  * the moment unacceptable attributes are regarded as a fatal error.
5105  *
5106  */
5107 void
5108 vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt)
5109 {
5110 	vnet_attr_msg_t		*attr_pkt;
5111 	vsw_t			*vswp = ldcp->ldc_vswp;
5112 	vsw_port_t		*port = ldcp->ldc_port;
5113 	uint64_t		macaddr = 0;
5114 	int			i;
5115 
5116 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
5117 
5118 	/*
5119 	 * We know this is a ctrl/attr packet so
5120 	 * cast it into the correct structure.
5121 	 */
5122 	attr_pkt = (vnet_attr_msg_t *)pkt;
5123 
5124 	switch (attr_pkt->tag.vio_subtype) {
5125 	case VIO_SUBTYPE_INFO:
5126 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
5127 
5128 		if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV))
5129 			return;
5130 
5131 		/*
5132 		 * If the attributes are unacceptable then we NACK back.
5133 		 */
5134 		if (vsw_check_attr(attr_pkt, ldcp->ldc_port)) {
5135 
5136 			DERR(vswp, "%s (chan %d): invalid attributes",
5137 				__func__, ldcp->ldc_id);
5138 
5139 			vsw_free_lane_resources(ldcp, INBOUND);
5140 
5141 			attr_pkt->tag.vio_sid = ldcp->local_session;
5142 			attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
5143 
5144 			DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
5145 			ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT;
5146 			(void) vsw_send_msg(ldcp, (void *)attr_pkt,
5147 				sizeof (vnet_attr_msg_t), B_TRUE);
5148 
5149 			vsw_next_milestone(ldcp);
5150 			return;
5151 		}
5152 
5153 		/*
5154 		 * Otherwise store attributes for this lane and update
5155 		 * lane state.
5156 		 */
5157 		ldcp->lane_in.mtu = attr_pkt->mtu;
5158 		ldcp->lane_in.addr = attr_pkt->addr;
5159 		ldcp->lane_in.addr_type = attr_pkt->addr_type;
5160 		ldcp->lane_in.xfer_mode = attr_pkt->xfer_mode;
5161 		ldcp->lane_in.ack_freq = attr_pkt->ack_freq;
5162 
5163 		macaddr = ldcp->lane_in.addr;
5164 		for (i = ETHERADDRL - 1; i >= 0; i--) {
5165 			port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF;
5166 			macaddr >>= 8;
5167 		}
5168 
5169 		/* create the fdb entry for this port/mac address */
5170 		(void) vsw_add_fdb(vswp, port);
5171 
5172 		/* setup device specifc xmit routines */
5173 		mutex_enter(&port->tx_lock);
5174 		if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE) {
5175 			D2(vswp, "%s: mode = VIO_DRING_MODE", __func__);
5176 			port->transmit = vsw_dringsend;
5177 		} else if (ldcp->lane_in.xfer_mode == VIO_DESC_MODE) {
5178 			D2(vswp, "%s: mode = VIO_DESC_MODE", __func__);
5179 			vsw_create_privring(ldcp);
5180 			port->transmit = vsw_descrsend;
5181 		}
5182 		mutex_exit(&port->tx_lock);
5183 
5184 		attr_pkt->tag.vio_sid = ldcp->local_session;
5185 		attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
5186 
5187 		DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
5188 
5189 		ldcp->lane_in.lstate |= VSW_ATTR_ACK_SENT;
5190 
5191 		(void) vsw_send_msg(ldcp, (void *)attr_pkt,
5192 				sizeof (vnet_attr_msg_t), B_TRUE);
5193 
5194 		vsw_next_milestone(ldcp);
5195 		break;
5196 
5197 	case VIO_SUBTYPE_ACK:
5198 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
5199 
5200 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV))
5201 			return;
5202 
5203 		ldcp->lane_out.lstate |= VSW_ATTR_ACK_RECV;
5204 		vsw_next_milestone(ldcp);
5205 		break;
5206 
5207 	case VIO_SUBTYPE_NACK:
5208 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
5209 
5210 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV))
5211 			return;
5212 
5213 		ldcp->lane_out.lstate |= VSW_ATTR_NACK_RECV;
5214 		vsw_next_milestone(ldcp);
5215 		break;
5216 
5217 	default:
5218 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
5219 			attr_pkt->tag.vio_subtype);
5220 	}
5221 
5222 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
5223 }
5224 
5225 /*
5226  * Process a dring info packet. We can end up here either because our peer
5227  * has ACK/NACK'ed back to an earlier DRING msg we had sent it, or our
5228  * peer has sent us a dring INFO message.
5229  *
5230  * If we get a valid/acceptable INFO packet (and we have already negotiated
5231  * a version) we ACK back and update the lane state, otherwise we NACK back.
5232  *
5233  * FUTURE: nothing to stop client from sending us info on multiple dring's
5234  * but for the moment we will just use the first one we are given.
5235  *
5236  */
5237 void
5238 vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt)
5239 {
5240 	vio_dring_reg_msg_t	*dring_pkt;
5241 	vsw_t			*vswp = ldcp->ldc_vswp;
5242 	ldc_mem_info_t		minfo;
5243 	dring_info_t		*dp, *dbp;
5244 	int			dring_found = 0;
5245 
5246 	/*
5247 	 * We know this is a ctrl/dring packet so
5248 	 * cast it into the correct structure.
5249 	 */
5250 	dring_pkt = (vio_dring_reg_msg_t *)pkt;
5251 
5252 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
5253 
5254 	switch (dring_pkt->tag.vio_subtype) {
5255 	case VIO_SUBTYPE_INFO:
5256 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
5257 
5258 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
5259 			return;
5260 
5261 		/*
5262 		 * If the dring params are unacceptable then we NACK back.
5263 		 */
5264 		if (vsw_check_dring_info(dring_pkt)) {
5265 
5266 			DERR(vswp, "%s (%lld): invalid dring info",
5267 				__func__, ldcp->ldc_id);
5268 
5269 			vsw_free_lane_resources(ldcp, INBOUND);
5270 
5271 			dring_pkt->tag.vio_sid = ldcp->local_session;
5272 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
5273 
5274 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
5275 
5276 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
5277 
5278 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
5279 				sizeof (vio_dring_reg_msg_t), B_TRUE);
5280 
5281 			vsw_next_milestone(ldcp);
5282 			return;
5283 		}
5284 
5285 		/*
5286 		 * Otherwise, attempt to map in the dring using the
5287 		 * cookie. If that succeeds we send back a unique dring
5288 		 * identifier that the sending side will use in future
5289 		 * to refer to this descriptor ring.
5290 		 */
5291 		dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
5292 
5293 		dp->num_descriptors = dring_pkt->num_descriptors;
5294 		dp->descriptor_size = dring_pkt->descriptor_size;
5295 		dp->options = dring_pkt->options;
5296 		dp->ncookies = dring_pkt->ncookies;
5297 
5298 		/*
5299 		 * Note: should only get one cookie. Enforced in
5300 		 * the ldc layer.
5301 		 */
5302 		bcopy(&dring_pkt->cookie[0], &dp->cookie[0],
5303 			sizeof (ldc_mem_cookie_t));
5304 
5305 		D2(vswp, "%s: num_desc %ld : desc_size %ld", __func__,
5306 			dp->num_descriptors, dp->descriptor_size);
5307 		D2(vswp, "%s: options 0x%lx: ncookies %ld", __func__,
5308 			dp->options, dp->ncookies);
5309 
5310 		if ((ldc_mem_dring_map(ldcp->ldc_handle, &dp->cookie[0],
5311 			dp->ncookies, dp->num_descriptors,
5312 			dp->descriptor_size, LDC_SHADOW_MAP,
5313 			&(dp->handle))) != 0) {
5314 
5315 			DERR(vswp, "%s: dring_map failed\n", __func__);
5316 
5317 			kmem_free(dp, sizeof (dring_info_t));
5318 			vsw_free_lane_resources(ldcp, INBOUND);
5319 
5320 			dring_pkt->tag.vio_sid = ldcp->local_session;
5321 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
5322 
5323 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
5324 
5325 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
5326 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
5327 				sizeof (vio_dring_reg_msg_t), B_TRUE);
5328 
5329 			vsw_next_milestone(ldcp);
5330 			return;
5331 		}
5332 
5333 		if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
5334 
5335 			DERR(vswp, "%s: dring_addr failed\n", __func__);
5336 
5337 			kmem_free(dp, sizeof (dring_info_t));
5338 			vsw_free_lane_resources(ldcp, INBOUND);
5339 
5340 			dring_pkt->tag.vio_sid = ldcp->local_session;
5341 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
5342 
5343 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
5344 
5345 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
5346 			(void) vsw_send_msg(ldcp, (void *)dring_pkt,
5347 				sizeof (vio_dring_reg_msg_t), B_TRUE);
5348 
5349 			vsw_next_milestone(ldcp);
5350 			return;
5351 		} else {
5352 			/* store the address of the pub part of ring */
5353 			dp->pub_addr = minfo.vaddr;
5354 		}
5355 
5356 		/* no private section as we are importing */
5357 		dp->priv_addr = NULL;
5358 
5359 		/*
5360 		 * Using simple mono increasing int for ident at
5361 		 * the moment.
5362 		 */
5363 		dp->ident = ldcp->next_ident;
5364 		ldcp->next_ident++;
5365 
5366 		dp->end_idx = 0;
5367 		dp->next = NULL;
5368 
5369 		/*
5370 		 * Link it onto the end of the list of drings
5371 		 * for this lane.
5372 		 */
5373 		if (ldcp->lane_in.dringp == NULL) {
5374 			D2(vswp, "%s: adding first INBOUND dring", __func__);
5375 			ldcp->lane_in.dringp = dp;
5376 		} else {
5377 			dbp = ldcp->lane_in.dringp;
5378 
5379 			while (dbp->next != NULL)
5380 				dbp = dbp->next;
5381 
5382 			dbp->next = dp;
5383 		}
5384 
5385 		/* acknowledge it */
5386 		dring_pkt->tag.vio_sid = ldcp->local_session;
5387 		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
5388 		dring_pkt->dring_ident = dp->ident;
5389 
5390 		(void) vsw_send_msg(ldcp, (void *)dring_pkt,
5391 			sizeof (vio_dring_reg_msg_t), B_TRUE);
5392 
5393 		ldcp->lane_in.lstate |= VSW_DRING_ACK_SENT;
5394 		vsw_next_milestone(ldcp);
5395 		break;
5396 
5397 	case VIO_SUBTYPE_ACK:
5398 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
5399 
5400 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV))
5401 			return;
5402 
5403 		/*
5404 		 * Peer is acknowledging our dring info and will have
5405 		 * sent us a dring identifier which we will use to
5406 		 * refer to this ring w.r.t. our peer.
5407 		 */
5408 		dp = ldcp->lane_out.dringp;
5409 		if (dp != NULL) {
5410 			/*
5411 			 * Find the ring this ident should be associated
5412 			 * with.
5413 			 */
5414 			if (vsw_dring_match(dp, dring_pkt)) {
5415 				dring_found = 1;
5416 
5417 			} else while (dp != NULL) {
5418 				if (vsw_dring_match(dp, dring_pkt)) {
5419 					dring_found = 1;
5420 					break;
5421 				}
5422 				dp = dp->next;
5423 			}
5424 
5425 			if (dring_found == 0) {
5426 				DERR(NULL, "%s: unrecognised ring cookie",
5427 					__func__);
5428 				vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
5429 				return;
5430 			}
5431 
5432 		} else {
5433 			DERR(vswp, "%s: DRING ACK received but no drings "
5434 				"allocated", __func__);
5435 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
5436 			return;
5437 		}
5438 
5439 		/* store ident */
5440 		dp->ident = dring_pkt->dring_ident;
5441 		ldcp->lane_out.lstate |= VSW_DRING_ACK_RECV;
5442 		vsw_next_milestone(ldcp);
5443 		break;
5444 
5445 	case VIO_SUBTYPE_NACK:
5446 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
5447 
5448 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV))
5449 			return;
5450 
5451 		ldcp->lane_out.lstate |= VSW_DRING_NACK_RECV;
5452 		vsw_next_milestone(ldcp);
5453 		break;
5454 
5455 	default:
5456 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
5457 			dring_pkt->tag.vio_subtype);
5458 	}
5459 
5460 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
5461 }
5462 
5463 /*
5464  * Process a request from peer to unregister a dring.
5465  *
5466  * For the moment we just restart the handshake if our
5467  * peer endpoint attempts to unregister a dring.
5468  */
5469 void
5470 vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt)
5471 {
5472 	vsw_t			*vswp = ldcp->ldc_vswp;
5473 	vio_dring_unreg_msg_t	*dring_pkt;
5474 
5475 	/*
5476 	 * We know this is a ctrl/dring packet so
5477 	 * cast it into the correct structure.
5478 	 */
5479 	dring_pkt = (vio_dring_unreg_msg_t *)pkt;
5480 
5481 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
5482 
5483 	switch (dring_pkt->tag.vio_subtype) {
5484 	case VIO_SUBTYPE_INFO:
5485 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
5486 
5487 		DWARN(vswp, "%s: restarting handshake..", __func__);
5488 		break;
5489 
5490 	case VIO_SUBTYPE_ACK:
5491 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
5492 
5493 		DWARN(vswp, "%s: restarting handshake..", __func__);
5494 		break;
5495 
5496 	case VIO_SUBTYPE_NACK:
5497 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
5498 
5499 		DWARN(vswp, "%s: restarting handshake..", __func__);
5500 		break;
5501 
5502 	default:
5503 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
5504 			dring_pkt->tag.vio_subtype);
5505 	}
5506 
5507 	vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
5508 
5509 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
5510 }
5511 
/*
 * Turn the multicast message 'pkt' into a NACK (subtype set to
 * VIO_SUBTYPE_NACK, session id set to our local session) and hand it
 * to vsw_send_msg() for transmission on 'ldcp'.
 *
 * Wrapped in do { } while (0) so that the macro expands to exactly one
 * statement and is safe to use in unbraced if/else bodies. Callers
 * supply the terminating semicolon. Note that 'pkt' and 'ldcp' are each
 * evaluated more than once, so they must be free of side effects.
 */
#define	SND_MCST_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		(void) vsw_send_msg((ldcp), (void *)(pkt), \
		    sizeof (vnet_mcast_msg_t), B_TRUE); \
	_NOTE(CONSTCOND) \
	} while (0)
5517 
5518 /*
5519  * Process a multicast request from a vnet.
5520  *
5521  * Vnet's specify a multicast address that they are interested in. This
5522  * address is used as a key into the hash table which forms the multicast
5523  * forwarding database (mFDB).
5524  *
5525  * The table keys are the multicast addresses, while the table entries
5526  * are pointers to lists of ports which wish to receive packets for the
5527  * specified multicast address.
5528  *
5529  * When a multicast packet is being switched we use the address as a key
5530  * into the hash table, and then walk the appropriate port list forwarding
5531  * the pkt to each port in turn.
5532  *
5533  * If a vnet is no longer interested in a particular multicast grouping
5534  * we simply find the correct location in the hash table and then delete
5535  * the relevant port from the port list.
5536  *
5537  * To deal with the case whereby a port is being deleted without first
5538  * removing itself from the lists in the hash table, we maintain a list
5539  * of multicast addresses the port has registered an interest in, within
5540  * the port structure itself. We then simply walk that list of addresses
5541  * using them as keys into the hash table and remove the port from the
5542  * appropriate lists.
5543  */
5544 static void
5545 vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt)
5546 {
5547 	vnet_mcast_msg_t	*mcst_pkt;
5548 	vsw_port_t		*port = ldcp->ldc_port;
5549 	vsw_t			*vswp = ldcp->ldc_vswp;
5550 	int			i;
5551 
5552 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
5553 
5554 	/*
5555 	 * We know this is a ctrl/mcast packet so
5556 	 * cast it into the correct structure.
5557 	 */
5558 	mcst_pkt = (vnet_mcast_msg_t *)pkt;
5559 
5560 	switch (mcst_pkt->tag.vio_subtype) {
5561 	case VIO_SUBTYPE_INFO:
5562 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
5563 
5564 		/*
5565 		 * Check if in correct state to receive a multicast
5566 		 * message (i.e. handshake complete). If not reset
5567 		 * the handshake.
5568 		 */
5569 		if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV))
5570 			return;
5571 
5572 		/*
5573 		 * Before attempting to add or remove address check
5574 		 * that they are valid multicast addresses.
5575 		 * If not, then NACK back.
5576 		 */
5577 		for (i = 0; i < mcst_pkt->count; i++) {
5578 			if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) {
5579 				DERR(vswp, "%s: invalid multicast address",
5580 								__func__);
5581 				SND_MCST_NACK(ldcp, mcst_pkt);
5582 				return;
5583 			}
5584 		}
5585 
5586 		/*
5587 		 * Now add/remove the addresses. If this fails we
5588 		 * NACK back.
5589 		 */
5590 		if (vsw_add_rem_mcst(mcst_pkt, port) != 0) {
5591 			SND_MCST_NACK(ldcp, mcst_pkt);
5592 			return;
5593 		}
5594 
5595 		mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
5596 		mcst_pkt->tag.vio_sid = ldcp->local_session;
5597 
5598 		DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt);
5599 
5600 		(void) vsw_send_msg(ldcp, (void *)mcst_pkt,
5601 				sizeof (vnet_mcast_msg_t), B_TRUE);
5602 		break;
5603 
5604 	case VIO_SUBTYPE_ACK:
5605 		DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
5606 
5607 		/*
5608 		 * We shouldn't ever get a multicast ACK message as
5609 		 * at the moment we never request multicast addresses
5610 		 * to be set on some other device. This may change in
5611 		 * the future if we have cascading switches.
5612 		 */
5613 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV))
5614 			return;
5615 
5616 				/* Do nothing */
5617 		break;
5618 
5619 	case VIO_SUBTYPE_NACK:
5620 		DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
5621 
5622 		/*
5623 		 * We shouldn't get a multicast NACK packet for the
5624 		 * same reasons as we shouldn't get a ACK packet.
5625 		 */
5626 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV))
5627 			return;
5628 
5629 				/* Do nothing */
5630 		break;
5631 
5632 	default:
5633 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
5634 			mcst_pkt->tag.vio_subtype);
5635 	}
5636 
5637 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
5638 }
5639 
5640 static void
5641 vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt)
5642 {
5643 	vio_rdx_msg_t	*rdx_pkt;
5644 	vsw_t		*vswp = ldcp->ldc_vswp;
5645 
5646 	/*
5647 	 * We know this is a ctrl/rdx packet so
5648 	 * cast it into the correct structure.
5649 	 */
5650 	rdx_pkt = (vio_rdx_msg_t *)pkt;
5651 
5652 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
5653 
5654 	switch (rdx_pkt->tag.vio_subtype) {
5655 	case VIO_SUBTYPE_INFO:
5656 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
5657 
5658 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_INFO_RECV))
5659 			return;
5660 
5661 		rdx_pkt->tag.vio_sid = ldcp->local_session;
5662 		rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
5663 
5664 		DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt);
5665 
5666 		ldcp->lane_out.lstate |= VSW_RDX_ACK_SENT;
5667 
5668 		(void) vsw_send_msg(ldcp, (void *)rdx_pkt,
5669 			sizeof (vio_rdx_msg_t), B_TRUE);
5670 
5671 		vsw_next_milestone(ldcp);
5672 		break;
5673 
5674 	case VIO_SUBTYPE_ACK:
5675 		/*
5676 		 * Should be handled in-band by callback handler.
5677 		 */
5678 		DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__);
5679 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
5680 		break;
5681 
5682 	case VIO_SUBTYPE_NACK:
5683 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
5684 
5685 		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_NACK_RECV))
5686 			return;
5687 
5688 		ldcp->lane_in.lstate |= VSW_RDX_NACK_RECV;
5689 		vsw_next_milestone(ldcp);
5690 		break;
5691 
5692 	default:
5693 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
5694 			rdx_pkt->tag.vio_subtype);
5695 	}
5696 
5697 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
5698 }
5699 
5700 static void
5701 vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t tag)
5702 {
5703 	uint16_t	env = tag.vio_subtype_env;
5704 	vsw_t		*vswp = ldcp->ldc_vswp;
5705 
5706 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
5707 
5708 	/* session id check */
5709 	if (ldcp->session_status & VSW_PEER_SESSION) {
5710 		if (ldcp->peer_session != tag.vio_sid) {
5711 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
5712 				__func__, ldcp->ldc_id, tag.vio_sid);
5713 			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
5714 			return;
5715 		}
5716 	}
5717 
5718 	/*
5719 	 * It is an error for us to be getting data packets
5720 	 * before the handshake has completed.
5721 	 */
5722 	if (ldcp->hphase != VSW_MILESTONE4) {
5723 		DERR(vswp, "%s: got data packet before handshake complete "
5724 			"hphase %d (%x: %x)", __func__, ldcp->hphase,
5725 			ldcp->lane_in.lstate, ldcp->lane_out.lstate);
5726 		DUMP_FLAGS(ldcp->lane_in.lstate);
5727 		DUMP_FLAGS(ldcp->lane_out.lstate);
5728 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
5729 		return;
5730 	}
5731 
5732 	/*
5733 	 * Switch on vio_subtype envelope, then let lower routines
5734 	 * decide if its an INFO, ACK or NACK packet.
5735 	 */
5736 	if (env == VIO_DRING_DATA) {
5737 		vsw_process_data_dring_pkt(ldcp, dpkt);
5738 	} else if (env == VIO_PKT_DATA) {
5739 		vsw_process_data_raw_pkt(ldcp, dpkt);
5740 	} else if (env == VIO_DESC_DATA) {
5741 		vsw_process_data_ibnd_pkt(ldcp, dpkt);
5742 	} else {
5743 		DERR(vswp, "%s : unknown vio_subtype_env (%x)\n",
5744 							__func__, env);
5745 	}
5746 
5747 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
5748 }
5749 
/*
 * Turn the dring data message 'pkt' into a NACK (subtype set to
 * VIO_SUBTYPE_NACK, session id set to our local session) and hand it
 * to vsw_send_msg() for transmission on 'ldcp'.
 *
 * Wrapped in do { } while (0) so that the macro expands to exactly one
 * statement and is safe to use in unbraced if/else bodies. Callers
 * supply the terminating semicolon. Note that 'pkt' and 'ldcp' are each
 * evaluated more than once, so they must be free of side effects.
 */
#define	SND_DRING_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		(void) vsw_send_msg((ldcp), (void *)(pkt), \
		    sizeof (vio_dring_msg_t), B_TRUE); \
	_NOTE(CONSTCOND) \
	} while (0)
5755 
5756 static void
5757 vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt)
5758 {
5759 	vio_dring_msg_t		*dring_pkt;
5760 	vnet_public_desc_t	*pub_addr = NULL;
5761 	vsw_private_desc_t	*priv_addr = NULL;
5762 	dring_info_t		*dp = NULL;
5763 	vsw_t			*vswp = ldcp->ldc_vswp;
5764 	mblk_t			*mp = NULL;
5765 	mblk_t			*bp = NULL;
5766 	mblk_t			*bpt = NULL;
5767 	size_t			nbytes = 0;
5768 	size_t			off = 0;
5769 	uint64_t		ncookies = 0;
5770 	uint64_t		chain = 0;
5771 	uint64_t		j, len;
5772 	uint32_t		pos, start, datalen;
5773 	uint32_t		range_start, range_end;
5774 	int32_t			end, num, cnt = 0;
5775 	int			i, rv, msg_rv = 0;
5776 	boolean_t		ack_needed = B_FALSE;
5777 	boolean_t		prev_desc_ack = B_FALSE;
5778 	int			read_attempts = 0;
5779 
5780 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
5781 
5782 	/*
5783 	 * We know this is a data/dring packet so
5784 	 * cast it into the correct structure.
5785 	 */
5786 	dring_pkt = (vio_dring_msg_t *)dpkt;
5787 
5788 	/*
5789 	 * Switch on the vio_subtype. If its INFO then we need to
5790 	 * process the data. If its an ACK we need to make sure
5791 	 * it makes sense (i.e did we send an earlier data/info),
5792 	 * and if its a NACK then we maybe attempt a retry.
5793 	 */
5794 	switch (dring_pkt->tag.vio_subtype) {
5795 	case VIO_SUBTYPE_INFO:
5796 		D2(vswp, "%s(%lld): VIO_SUBTYPE_INFO", __func__, ldcp->ldc_id);
5797 
5798 		READ_ENTER(&ldcp->lane_in.dlistrw);
5799 		if ((dp = vsw_ident2dring(&ldcp->lane_in,
5800 				dring_pkt->dring_ident)) == NULL) {
5801 			RW_EXIT(&ldcp->lane_in.dlistrw);
5802 
5803 			DERR(vswp, "%s(%lld): unable to find dring from "
5804 				"ident 0x%llx", __func__, ldcp->ldc_id,
5805 				dring_pkt->dring_ident);
5806 
5807 			SND_DRING_NACK(ldcp, dring_pkt);
5808 			return;
5809 		}
5810 
5811 		start = pos = dring_pkt->start_idx;
5812 		end = dring_pkt->end_idx;
5813 		len = dp->num_descriptors;
5814 
5815 		range_start = range_end = pos;
5816 
5817 		D2(vswp, "%s(%lld): start index %ld : end %ld\n",
5818 			__func__, ldcp->ldc_id, start, end);
5819 
5820 		if (end == -1) {
5821 			num = -1;
5822 		} else if (end >= 0) {
5823 			num = end >= pos ?
5824 				end - pos + 1: (len - pos + 1) + end;
5825 
5826 			/* basic sanity check */
5827 			if (end > len) {
5828 				RW_EXIT(&ldcp->lane_in.dlistrw);
5829 				DERR(vswp, "%s(%lld): endpoint %lld outside "
5830 					"ring length %lld", __func__,
5831 					ldcp->ldc_id, end, len);
5832 
5833 				SND_DRING_NACK(ldcp, dring_pkt);
5834 				return;
5835 			}
5836 		} else {
5837 			RW_EXIT(&ldcp->lane_in.dlistrw);
5838 			DERR(vswp, "%s(%lld): invalid endpoint %lld",
5839 				__func__, ldcp->ldc_id, end);
5840 			SND_DRING_NACK(ldcp, dring_pkt);
5841 			return;
5842 		}
5843 
5844 		while (cnt != num) {
5845 vsw_recheck_desc:
5846 			if ((rv = ldc_mem_dring_acquire(dp->handle,
5847 							pos, pos)) != 0) {
5848 				RW_EXIT(&ldcp->lane_in.dlistrw);
5849 				DERR(vswp, "%s(%lld): unable to acquire "
5850 					"descriptor at pos %d: err %d",
5851 					__func__, pos, ldcp->ldc_id, rv);
5852 				SND_DRING_NACK(ldcp, dring_pkt);
5853 				return;
5854 			}
5855 
5856 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + pos;
5857 
5858 			/*
5859 			 * When given a bounded range of descriptors
5860 			 * to process, its an error to hit a descriptor
5861 			 * which is not ready. In the non-bounded case
5862 			 * (end_idx == -1) this simply indicates we have
5863 			 * reached the end of the current active range.
5864 			 */
5865 			if (pub_addr->hdr.dstate != VIO_DESC_READY) {
5866 				/* unbound - no error */
5867 				if (end == -1) {
5868 					if (read_attempts == vsw_read_attempts)
5869 						break;
5870 
5871 					delay(drv_usectohz(vsw_desc_delay));
5872 					read_attempts++;
5873 					goto vsw_recheck_desc;
5874 				}
5875 
5876 				/* bounded - error - so NACK back */
5877 				RW_EXIT(&ldcp->lane_in.dlistrw);
5878 				DERR(vswp, "%s(%lld): descriptor not READY "
5879 					"(%d)", __func__, ldcp->ldc_id,
5880 					pub_addr->hdr.dstate);
5881 				SND_DRING_NACK(ldcp, dring_pkt);
5882 				return;
5883 			}
5884 
5885 			DTRACE_PROBE1(read_attempts, int, read_attempts);
5886 
5887 			range_end = pos;
5888 
5889 			/*
5890 			 * If we ACK'd the previous descriptor then now
5891 			 * record the new range start position for later
5892 			 * ACK's.
5893 			 */
5894 			if (prev_desc_ack) {
5895 				range_start = pos;
5896 
5897 				D2(vswp, "%s(%lld): updating range start "
5898 					"to be %d", __func__, ldcp->ldc_id,
5899 					range_start);
5900 
5901 				prev_desc_ack = B_FALSE;
5902 			}
5903 
5904 			/*
5905 			 * Data is padded to align on 8 byte boundary,
5906 			 * datalen is actual data length, i.e. minus that
5907 			 * padding.
5908 			 */
5909 			datalen = pub_addr->nbytes;
5910 
5911 			/*
5912 			 * Does peer wish us to ACK when we have finished
5913 			 * with this descriptor ?
5914 			 */
5915 			if (pub_addr->hdr.ack)
5916 				ack_needed = B_TRUE;
5917 
5918 			D2(vswp, "%s(%lld): processing desc %lld at pos"
5919 				" 0x%llx : dstate 0x%lx : datalen 0x%lx",
5920 				__func__, ldcp->ldc_id, pos, pub_addr,
5921 				pub_addr->hdr.dstate, datalen);
5922 
5923 			/*
5924 			 * Mark that we are starting to process descriptor.
5925 			 */
5926 			pub_addr->hdr.dstate = VIO_DESC_ACCEPTED;
5927 
5928 			mp = vio_allocb(ldcp->rxh);
5929 			if (mp == NULL) {
5930 				/*
5931 				 * No free receive buffers available, so
5932 				 * fallback onto allocb(9F). Make sure that
5933 				 * we get a data buffer which is a multiple
5934 				 * of 8 as this is required by ldc_mem_copy.
5935 				 */
5936 				DTRACE_PROBE(allocb);
5937 				mp = allocb(datalen + VNET_IPALIGN + 8,
5938 								BPRI_MED);
5939 			}
5940 
5941 			/*
5942 			 * Ensure that we ask ldc for an aligned
5943 			 * number of bytes.
5944 			 */
5945 			nbytes = datalen + VNET_IPALIGN;
5946 			if (nbytes & 0x7) {
5947 				off = 8 - (nbytes & 0x7);
5948 				nbytes += off;
5949 			}
5950 
5951 			ncookies = pub_addr->ncookies;
5952 			rv = ldc_mem_copy(ldcp->ldc_handle,
5953 				(caddr_t)mp->b_rptr, 0, &nbytes,
5954 				pub_addr->memcookie, ncookies,
5955 				LDC_COPY_IN);
5956 
5957 			if (rv != 0) {
5958 				DERR(vswp, "%s(%d): unable to copy in "
5959 					"data from %d cookies in desc %d"
5960 					" (rv %d)", __func__, ldcp->ldc_id,
5961 					ncookies, pos, rv);
5962 				freemsg(mp);
5963 
5964 				pub_addr->hdr.dstate = VIO_DESC_DONE;
5965 				(void) ldc_mem_dring_release(dp->handle,
5966 								pos, pos);
5967 				break;
5968 			} else {
5969 				D2(vswp, "%s(%d): copied in %ld bytes"
5970 					" using %d cookies", __func__,
5971 					ldcp->ldc_id, nbytes, ncookies);
5972 			}
5973 
5974 			/* adjust the read pointer to skip over the padding */
5975 			mp->b_rptr += VNET_IPALIGN;
5976 
5977 			/* point to the actual end of data */
5978 			mp->b_wptr = mp->b_rptr + datalen;
5979 
5980 			/* build a chain of received packets */
5981 			if (bp == NULL) {
5982 				/* first pkt */
5983 				bp = mp;
5984 				bp->b_next = bp->b_prev = NULL;
5985 				bpt = bp;
5986 				chain = 1;
5987 			} else {
5988 				mp->b_next = NULL;
5989 				mp->b_prev = bpt;
5990 				bpt->b_next = mp;
5991 				bpt = mp;
5992 				chain++;
5993 			}
5994 
5995 			/* mark we are finished with this descriptor */
5996 			pub_addr->hdr.dstate = VIO_DESC_DONE;
5997 
5998 			(void) ldc_mem_dring_release(dp->handle, pos, pos);
5999 
6000 			/*
6001 			 * Send an ACK back to peer if requested.
6002 			 */
6003 			if (ack_needed) {
6004 				ack_needed = B_FALSE;
6005 
6006 				dring_pkt->start_idx = range_start;
6007 				dring_pkt->end_idx = range_end;
6008 
6009 				DERR(vswp, "%s(%lld): processed %d %d, ACK"
6010 					" requested", __func__, ldcp->ldc_id,
6011 					dring_pkt->start_idx,
6012 					dring_pkt->end_idx);
6013 
6014 				dring_pkt->dring_process_state = VIO_DP_ACTIVE;
6015 				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
6016 				dring_pkt->tag.vio_sid = ldcp->local_session;
6017 				msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
6018 						sizeof (vio_dring_msg_t),
6019 						B_FALSE);
6020 
6021 				/*
6022 				 * Check if ACK was successfully sent. If not
6023 				 * we break and deal with that below.
6024 				 */
6025 				if (msg_rv != 0)
6026 					break;
6027 
6028 				prev_desc_ack = B_TRUE;
6029 				range_start = pos;
6030 			}
6031 
6032 			/* next descriptor */
6033 			pos = (pos + 1) % len;
6034 			cnt++;
6035 
6036 			/*
6037 			 * Break out of loop here and stop processing to
6038 			 * allow some other network device (or disk) to
6039 			 * get access to the cpu.
6040 			 */
6041 			if (chain > vsw_chain_len) {
6042 				D3(vswp, "%s(%lld): switching chain of %d "
6043 					"msgs", __func__, ldcp->ldc_id, chain);
6044 				break;
6045 			}
6046 		}
6047 		RW_EXIT(&ldcp->lane_in.dlistrw);
6048 
6049 		/*
6050 		 * If when we attempted to send the ACK we found that the
6051 		 * channel had been reset then now handle this. We deal with
6052 		 * it here as we cannot reset the channel while holding the
6053 		 * dlistrw lock, and we don't want to acquire/release it
6054 		 * continuously in the above loop, as a channel reset should
6055 		 * be a rare event.
6056 		 */
6057 		if (msg_rv == ECONNRESET) {
6058 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
6059 			break;
6060 		}
6061 
6062 		/* send the chain of packets to be switched */
6063 		if (bp != NULL) {
6064 			D3(vswp, "%s(%lld): switching chain of %d msgs",
6065 					__func__, ldcp->ldc_id, chain);
6066 			vswp->vsw_switch_frame(vswp, bp, VSW_VNETPORT,
6067 							ldcp->ldc_port, NULL);
6068 		}
6069 
6070 		DTRACE_PROBE1(msg_cnt, int, cnt);
6071 
6072 		/*
6073 		 * We are now finished so ACK back with the state
6074 		 * set to STOPPING so our peer knows we are finished
6075 		 */
6076 		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
6077 		dring_pkt->tag.vio_sid = ldcp->local_session;
6078 
6079 		dring_pkt->dring_process_state = VIO_DP_STOPPED;
6080 
6081 		DTRACE_PROBE(stop_process_sent);
6082 
6083 		/*
6084 		 * We have not processed any more descriptors beyond
6085 		 * the last one we ACK'd.
6086 		 */
6087 		if (prev_desc_ack)
6088 			range_start = range_end;
6089 
6090 		dring_pkt->start_idx = range_start;
6091 		dring_pkt->end_idx = range_end;
6092 
6093 		D2(vswp, "%s(%lld) processed : %d : %d, now stopping",
6094 			__func__, ldcp->ldc_id, dring_pkt->start_idx,
6095 			dring_pkt->end_idx);
6096 
6097 		(void) vsw_send_msg(ldcp, (void *)dring_pkt,
6098 				sizeof (vio_dring_msg_t), B_TRUE);
6099 		break;
6100 
6101 	case VIO_SUBTYPE_ACK:
6102 		D2(vswp, "%s(%lld): VIO_SUBTYPE_ACK", __func__, ldcp->ldc_id);
6103 		/*
6104 		 * Verify that the relevant descriptors are all
6105 		 * marked as DONE
6106 		 */
6107 		READ_ENTER(&ldcp->lane_out.dlistrw);
6108 		if ((dp = vsw_ident2dring(&ldcp->lane_out,
6109 			dring_pkt->dring_ident)) == NULL) {
6110 			RW_EXIT(&ldcp->lane_out.dlistrw);
6111 			DERR(vswp, "%s: unknown ident in ACK", __func__);
6112 			return;
6113 		}
6114 
6115 		pub_addr = (vnet_public_desc_t *)dp->pub_addr;
6116 		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
6117 
6118 		start = end = 0;
6119 		start = dring_pkt->start_idx;
6120 		end = dring_pkt->end_idx;
6121 		len = dp->num_descriptors;
6122 
6123 		j = num = 0;
6124 		/* calculate # descriptors taking into a/c wrap around */
6125 		num = end >= start ? end - start + 1: (len - start + 1) + end;
6126 
6127 		D2(vswp, "%s(%lld): start index %ld : end %ld : num %ld\n",
6128 			__func__, ldcp->ldc_id, start, end, num);
6129 
6130 		mutex_enter(&dp->dlock);
6131 		dp->last_ack_recv = end;
6132 		mutex_exit(&dp->dlock);
6133 
6134 		for (i = start; j < num; i = (i + 1) % len, j++) {
6135 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
6136 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
6137 
6138 			/*
6139 			 * If the last descriptor in a range has the ACK
6140 			 * bit set then we will get two messages from our
6141 			 * peer relating to it. The normal ACK msg and then
6142 			 * a subsequent STOP msg. The first message will have
6143 			 * resulted in the descriptor being reclaimed and
6144 			 * its state set to FREE so when we encounter a non
6145 			 * DONE descriptor we need to check to see if its
6146 			 * because we have just reclaimed it.
6147 			 */
6148 			mutex_enter(&priv_addr->dstate_lock);
6149 			if (pub_addr->hdr.dstate == VIO_DESC_DONE) {
6150 				/* clear all the fields */
6151 				bzero(priv_addr->datap, priv_addr->datalen);
6152 				priv_addr->datalen = 0;
6153 
6154 				pub_addr->hdr.dstate = VIO_DESC_FREE;
6155 				pub_addr->hdr.ack = 0;
6156 
6157 				priv_addr->dstate = VIO_DESC_FREE;
6158 				mutex_exit(&priv_addr->dstate_lock);
6159 
6160 				D3(vswp, "clearing descp %d : pub state "
6161 					"0x%llx : priv state 0x%llx", i,
6162 					pub_addr->hdr.dstate,
6163 					priv_addr->dstate);
6164 
6165 			} else {
6166 				mutex_exit(&priv_addr->dstate_lock);
6167 
6168 				if (dring_pkt->dring_process_state !=
6169 							VIO_DP_STOPPED) {
6170 					DERR(vswp, "%s: descriptor %lld at pos "
6171 						" 0x%llx not DONE (0x%lx)\n",
6172 						__func__, i, pub_addr,
6173 						pub_addr->hdr.dstate);
6174 					RW_EXIT(&ldcp->lane_out.dlistrw);
6175 					return;
6176 				}
6177 			}
6178 		}
6179 
6180 		/*
6181 		 * If our peer is stopping processing descriptors then
6182 		 * we check to make sure it has processed all the descriptors
6183 		 * we have updated. If not then we send it a new message
6184 		 * to prompt it to restart.
6185 		 */
6186 		if (dring_pkt->dring_process_state == VIO_DP_STOPPED) {
6187 			DTRACE_PROBE(stop_process_recv);
6188 			D2(vswp, "%s(%lld): got stopping msg : %d : %d",
6189 				__func__, ldcp->ldc_id, dring_pkt->start_idx,
6190 				dring_pkt->end_idx);
6191 
6192 			/*
6193 			 * Check next descriptor in public section of ring.
6194 			 * If its marked as READY then we need to prompt our
6195 			 * peer to start processing the ring again.
6196 			 */
6197 			i = (end + 1) % len;
6198 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
6199 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
6200 
6201 			/*
6202 			 * Hold the restart lock across all of this to
6203 			 * make sure that its not possible for us to
6204 			 * decide that a msg needs to be sent in the future
6205 			 * but the sending code having already checked is
6206 			 * about to exit.
6207 			 */
6208 			mutex_enter(&dp->restart_lock);
6209 			mutex_enter(&priv_addr->dstate_lock);
6210 			if (pub_addr->hdr.dstate == VIO_DESC_READY) {
6211 
6212 				mutex_exit(&priv_addr->dstate_lock);
6213 
6214 				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
6215 				dring_pkt->tag.vio_sid = ldcp->local_session;
6216 
6217 				mutex_enter(&ldcp->lane_out.seq_lock);
6218 				dring_pkt->seq_num = ldcp->lane_out.seq_num++;
6219 				mutex_exit(&ldcp->lane_out.seq_lock);
6220 
6221 				dring_pkt->start_idx = (end + 1) % len;
6222 				dring_pkt->end_idx = -1;
6223 
6224 				D2(vswp, "%s(%lld) : sending restart msg:"
6225 					" %d : %d", __func__, ldcp->ldc_id,
6226 					dring_pkt->start_idx,
6227 					dring_pkt->end_idx);
6228 
6229 				msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
6230 					sizeof (vio_dring_msg_t), B_FALSE);
6231 
6232 			} else {
6233 				mutex_exit(&priv_addr->dstate_lock);
6234 				dp->restart_reqd = B_TRUE;
6235 			}
6236 			mutex_exit(&dp->restart_lock);
6237 		}
6238 		RW_EXIT(&ldcp->lane_out.dlistrw);
6239 
6240 		/* only do channel reset after dropping dlistrw lock */
6241 		if (msg_rv == ECONNRESET)
6242 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
6243 
6244 		break;
6245 
6246 	case VIO_SUBTYPE_NACK:
6247 		DWARN(vswp, "%s(%lld): VIO_SUBTYPE_NACK",
6248 						__func__, ldcp->ldc_id);
6249 		/*
6250 		 * Something is badly wrong if we are getting NACK's
6251 		 * for our data pkts. So reset the channel.
6252 		 */
6253 		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
6254 
6255 		break;
6256 
6257 	default:
6258 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
6259 			ldcp->ldc_id, dring_pkt->tag.vio_subtype);
6260 	}
6261 
6262 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
6263 }
6264 
6265 /*
6266  * VIO_PKT_DATA (a.k.a raw data mode )
6267  *
6268  * Note - currently not supported. Do nothing.
6269  */
6270 static void
6271 vsw_process_data_raw_pkt(vsw_ldc_t *ldcp, void *dpkt)
6272 {
6273 	_NOTE(ARGUNUSED(dpkt))
6274 
6275 	D1(NULL, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
6276 
6277 	DERR(NULL, "%s (%lld): currently  not supported",
6278 						__func__, ldcp->ldc_id);
6279 
6280 	D1(NULL, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
6281 }
6282 
6283 /*
6284  * Process an in-band descriptor message (most likely from
6285  * OBP).
6286  */
6287 static void
6288 vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt)
6289 {
6290 	vnet_ibnd_desc_t	*ibnd_desc;
6291 	dring_info_t		*dp = NULL;
6292 	vsw_private_desc_t	*priv_addr = NULL;
6293 	vsw_t			*vswp = ldcp->ldc_vswp;
6294 	mblk_t			*mp = NULL;
6295 	mblk_t			*nmp;
6296 	size_t			nbytes = 0;
6297 	size_t			off = 0;
6298 	uint64_t		idx = 0;
6299 	uint32_t		num = 1, len, datalen = 0;
6300 	uint64_t		ncookies = 0;
6301 	int			i, rv;
6302 	int			j = 0;
6303 
6304 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
6305 
6306 	ibnd_desc = (vnet_ibnd_desc_t *)pkt;
6307 
6308 	switch (ibnd_desc->hdr.tag.vio_subtype) {
6309 	case VIO_SUBTYPE_INFO:
6310 		D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
6311 
6312 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
6313 			return;
6314 
6315 		/*
6316 		 * Data is padded to align on a 8 byte boundary,
6317 		 * nbytes is actual data length, i.e. minus that
6318 		 * padding.
6319 		 */
6320 		datalen = ibnd_desc->nbytes;
6321 
6322 		D2(vswp, "%s(%lld): processing inband desc : "
6323 			": datalen 0x%lx", __func__, ldcp->ldc_id, datalen);
6324 
6325 		ncookies = ibnd_desc->ncookies;
6326 
6327 		/*
6328 		 * allocb(9F) returns an aligned data block. We
6329 		 * need to ensure that we ask ldc for an aligned
6330 		 * number of bytes also.
6331 		 */
6332 		nbytes = datalen;
6333 		if (nbytes & 0x7) {
6334 			off = 8 - (nbytes & 0x7);
6335 			nbytes += off;
6336 		}
6337 
6338 		mp = allocb(datalen, BPRI_MED);
6339 		if (mp == NULL) {
6340 			DERR(vswp, "%s(%lld): allocb failed",
6341 					__func__, ldcp->ldc_id);
6342 			return;
6343 		}
6344 
6345 		rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr,
6346 			0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies,
6347 			LDC_COPY_IN);
6348 
6349 		if (rv != 0) {
6350 			DERR(vswp, "%s(%d): unable to copy in data from "
6351 				"%d cookie(s)", __func__,
6352 				ldcp->ldc_id, ncookies);
6353 			freemsg(mp);
6354 			return;
6355 		}
6356 
6357 		D2(vswp, "%s(%d): copied in %ld bytes using %d "
6358 			"cookies", __func__, ldcp->ldc_id, nbytes,
6359 			ncookies);
6360 
6361 		/*
6362 		 * Upper layer is expecting the IP header in the packet to
6363 		 * be 4-bytes aligned, but the OBP is sending packets that
6364 		 * are not aligned.  So, copy the data to another message
6365 		 * such that the alignment requirement is met.
6366 		 */
6367 		nmp = allocb(datalen + VNET_IPALIGN, BPRI_MED);
6368 		if (nmp == NULL) {
6369 			DERR(vswp, "%s(%lld): allocb failed",
6370 				__func__, ldcp->ldc_id);
6371 			freemsg(mp);
6372 			return;
6373 		}
6374 		nmp->b_rptr += VNET_IPALIGN;
6375 		bcopy(mp->b_rptr, nmp->b_rptr, datalen);
6376 		freemsg(mp);
6377 
6378 		/* point to the actual end of data */
6379 		nmp->b_wptr = nmp->b_rptr + datalen;
6380 
6381 		/*
6382 		 * We ACK back every in-band descriptor message we process
6383 		 */
6384 		ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK;
6385 		ibnd_desc->hdr.tag.vio_sid = ldcp->local_session;
6386 		(void) vsw_send_msg(ldcp, (void *)ibnd_desc,
6387 				sizeof (vnet_ibnd_desc_t), B_TRUE);
6388 
6389 		/* send the packet to be switched */
6390 		vswp->vsw_switch_frame(vswp, nmp, VSW_VNETPORT,
6391 					ldcp->ldc_port, NULL);
6392 
6393 		break;
6394 
6395 	case VIO_SUBTYPE_ACK:
6396 		D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
6397 
6398 		/* Verify the ACK is valid */
6399 		idx = ibnd_desc->hdr.desc_handle;
6400 
6401 		if (idx >= VSW_RING_NUM_EL) {
6402 			cmn_err(CE_WARN, "!vsw%d: corrupted ACK received "
6403 				"(idx %ld)", vswp->instance, idx);
6404 			return;
6405 		}
6406 
6407 		if ((dp = ldcp->lane_out.dringp) == NULL) {
6408 			DERR(vswp, "%s: no dring found", __func__);
6409 			return;
6410 		}
6411 
6412 		len = dp->num_descriptors;
6413 		/*
6414 		 * If the descriptor we are being ACK'ed for is not the
6415 		 * one we expected, then pkts were lost somwhere, either
6416 		 * when we tried to send a msg, or a previous ACK msg from
6417 		 * our peer. In either case we now reclaim the descriptors
6418 		 * in the range from the last ACK we received up to the
6419 		 * current ACK.
6420 		 */
6421 		if (idx != dp->last_ack_recv) {
6422 			DWARN(vswp, "%s: dropped pkts detected, (%ld, %ld)",
6423 				__func__, dp->last_ack_recv, idx);
6424 			num = idx >= dp->last_ack_recv ?
6425 				idx - dp->last_ack_recv + 1:
6426 				(len - dp->last_ack_recv + 1) + idx;
6427 		}
6428 
6429 		/*
6430 		 * When we sent the in-band message to our peer we
6431 		 * marked the copy in our private ring as READY. We now
6432 		 * check that the descriptor we are being ACK'ed for is in
6433 		 * fact READY, i.e. it is one we have shared with our peer.
6434 		 *
6435 		 * If its not we flag an error, but still reset the descr
6436 		 * back to FREE.
6437 		 */
6438 		for (i = dp->last_ack_recv; j < num; i = (i + 1) % len, j++) {
6439 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
6440 			mutex_enter(&priv_addr->dstate_lock);
6441 			if (priv_addr->dstate != VIO_DESC_READY) {
6442 				DERR(vswp, "%s: (%ld) desc at index %ld not "
6443 					"READY (0x%lx)", __func__,
6444 					ldcp->ldc_id, idx, priv_addr->dstate);
6445 				DERR(vswp, "%s: bound %d: ncookies %ld : "
6446 					"datalen %ld", __func__,
6447 					priv_addr->bound, priv_addr->ncookies,
6448 					priv_addr->datalen);
6449 			}
6450 			D2(vswp, "%s: (%lld) freeing descp at %lld", __func__,
6451 				ldcp->ldc_id, idx);
6452 			/* release resources associated with sent msg */
6453 			bzero(priv_addr->datap, priv_addr->datalen);
6454 			priv_addr->datalen = 0;
6455 			priv_addr->dstate = VIO_DESC_FREE;
6456 			mutex_exit(&priv_addr->dstate_lock);
6457 		}
6458 		/* update to next expected value */
6459 		dp->last_ack_recv = (idx + 1) % dp->num_descriptors;
6460 
6461 		break;
6462 
6463 	case VIO_SUBTYPE_NACK:
6464 		DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
6465 
6466 		/*
6467 		 * We should only get a NACK if our peer doesn't like
6468 		 * something about a message we have sent it. If this
6469 		 * happens we just release the resources associated with
6470 		 * the message. (We are relying on higher layers to decide
6471 		 * whether or not to resend.
6472 		 */
6473 
6474 		/* limit check */
6475 		idx = ibnd_desc->hdr.desc_handle;
6476 
6477 		if (idx >= VSW_RING_NUM_EL) {
6478 			DERR(vswp, "%s: corrupted NACK received (idx %lld)",
6479 				__func__, idx);
6480 			return;
6481 		}
6482 
6483 		if ((dp = ldcp->lane_out.dringp) == NULL) {
6484 			DERR(vswp, "%s: no dring found", __func__);
6485 			return;
6486 		}
6487 
6488 		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
6489 
6490 		/* move to correct location in ring */
6491 		priv_addr += idx;
6492 
6493 		/* release resources associated with sent msg */
6494 		mutex_enter(&priv_addr->dstate_lock);
6495 		bzero(priv_addr->datap, priv_addr->datalen);
6496 		priv_addr->datalen = 0;
6497 		priv_addr->dstate = VIO_DESC_FREE;
6498 		mutex_exit(&priv_addr->dstate_lock);
6499 
6500 		break;
6501 
6502 	default:
6503 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
6504 			ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype);
6505 	}
6506 
6507 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
6508 }
6509 
6510 static void
6511 vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t tag)
6512 {
6513 	_NOTE(ARGUNUSED(epkt))
6514 
6515 	vsw_t		*vswp = ldcp->ldc_vswp;
6516 	uint16_t	env = tag.vio_subtype_env;
6517 
6518 	D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
6519 
6520 	/*
6521 	 * Error vio_subtypes have yet to be defined. So for
6522 	 * the moment we can't do anything.
6523 	 */
6524 	D2(vswp, "%s: (%x) vio_subtype env", __func__, env);
6525 
6526 	D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
6527 }
6528 
6529 /*
6530  * Switch the given ethernet frame when operating in layer 2 mode.
6531  *
6532  * vswp: pointer to the vsw instance
6533  * mp: pointer to chain of ethernet frame(s) to be switched
6534  * caller: identifies the source of this frame as:
6535  * 		1. VSW_VNETPORT - a vsw port (connected to a vnet).
6536  *		2. VSW_PHYSDEV - the physical ethernet device
6537  *		3. VSW_LOCALDEV - vsw configured as a virtual interface
6538  * arg: argument provided by the caller.
6539  *		1. for VNETPORT - pointer to the corresponding vsw_port_t.
6540  *		2. for PHYSDEV - NULL
6541  *		3. for LOCALDEV - pointer to to this vsw_t(self)
6542  */
6543 void
6544 vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
6545 			vsw_port_t *arg, mac_resource_handle_t mrh)
6546 {
6547 	struct ether_header	*ehp;
6548 	vsw_port_t		*port = NULL;
6549 	mblk_t			*bp, *ret_m;
6550 	mblk_t			*nmp = NULL;
6551 	vsw_port_list_t		*plist = &vswp->plist;
6552 
6553 	D1(vswp, "%s: enter (caller %d)", __func__, caller);
6554 
6555 	/*
6556 	 * PERF: rather than breaking up the chain here, scan it
6557 	 * to find all mblks heading to same destination and then
6558 	 * pass that sub-chain to the lower transmit functions.
6559 	 */
6560 
6561 	/* process the chain of packets */
6562 	bp = mp;
6563 	while (bp) {
6564 		mp = bp;
6565 		bp = bp->b_next;
6566 		mp->b_next = mp->b_prev = NULL;
6567 		ehp = (struct ether_header *)mp->b_rptr;
6568 
6569 		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
6570 			__func__, MBLKSIZE(mp), MBLKL(mp));
6571 
6572 		READ_ENTER(&vswp->if_lockrw);
6573 		if (ether_cmp(&ehp->ether_dhost, &vswp->if_addr) == 0) {
6574 			/*
6575 			 * If destination is VSW_LOCALDEV (vsw as an eth
6576 			 * interface) and if the device is up & running,
6577 			 * send the packet up the stack on this host.
6578 			 * If the virtual interface is down, drop the packet.
6579 			 */
6580 			if (caller != VSW_LOCALDEV) {
6581 				if (vswp->if_state & VSW_IF_UP) {
6582 					RW_EXIT(&vswp->if_lockrw);
6583 					mac_rx(vswp->if_mh, mrh, mp);
6584 				} else {
6585 					RW_EXIT(&vswp->if_lockrw);
6586 					/* Interface down, drop pkt */
6587 					freemsg(mp);
6588 				}
6589 			} else {
6590 				RW_EXIT(&vswp->if_lockrw);
6591 				freemsg(mp);
6592 			}
6593 			continue;
6594 		}
6595 		RW_EXIT(&vswp->if_lockrw);
6596 
6597 		READ_ENTER(&plist->lockrw);
6598 		port = vsw_lookup_fdb(vswp, ehp);
6599 		if (port) {
6600 			/*
6601 			 * Mark the port as in-use.
6602 			 */
6603 			mutex_enter(&port->ref_lock);
6604 			port->ref_cnt++;
6605 			mutex_exit(&port->ref_lock);
6606 			RW_EXIT(&plist->lockrw);
6607 
6608 			/*
6609 			 * If plumbed and in promisc mode then copy msg
6610 			 * and send up the stack.
6611 			 */
6612 			READ_ENTER(&vswp->if_lockrw);
6613 			if (VSW_U_P(vswp->if_state)) {
6614 				RW_EXIT(&vswp->if_lockrw);
6615 				nmp = copymsg(mp);
6616 				if (nmp)
6617 					mac_rx(vswp->if_mh, mrh, nmp);
6618 			} else {
6619 				RW_EXIT(&vswp->if_lockrw);
6620 			}
6621 
6622 			/*
6623 			 * If the destination is in FDB, the packet
6624 			 * should be forwarded to the correponding
6625 			 * vsw_port (connected to a vnet device -
6626 			 * VSW_VNETPORT)
6627 			 */
6628 			(void) vsw_portsend(port, mp);
6629 
6630 			/*
6631 			 * Decrement use count in port and check if
6632 			 * should wake delete thread.
6633 			 */
6634 			mutex_enter(&port->ref_lock);
6635 			port->ref_cnt--;
6636 			if (port->ref_cnt == 0)
6637 				cv_signal(&port->ref_cv);
6638 			mutex_exit(&port->ref_lock);
6639 		} else {
6640 			RW_EXIT(&plist->lockrw);
6641 			/*
6642 			 * Destination not in FDB.
6643 			 *
6644 			 * If the destination is broadcast or
6645 			 * multicast forward the packet to all
6646 			 * (VNETPORTs, PHYSDEV, LOCALDEV),
6647 			 * except the caller.
6648 			 */
6649 			if (IS_BROADCAST(ehp)) {
6650 				D3(vswp, "%s: BROADCAST pkt", __func__);
6651 				(void) vsw_forward_all(vswp, mp,
6652 								caller, arg);
6653 			} else if (IS_MULTICAST(ehp)) {
6654 				D3(vswp, "%s: MULTICAST pkt", __func__);
6655 				(void) vsw_forward_grp(vswp, mp,
6656 							caller, arg);
6657 			} else {
6658 				/*
6659 				 * If the destination is unicast, and came
6660 				 * from either a logical network device or
6661 				 * the switch itself when it is plumbed, then
6662 				 * send it out on the physical device and also
6663 				 * up the stack if the logical interface is
6664 				 * in promiscious mode.
6665 				 *
6666 				 * NOTE:  The assumption here is that if we
6667 				 * cannot find the destination in our fdb, its
6668 				 * a unicast address, and came from either a
6669 				 * vnet or down the stack (when plumbed) it
6670 				 * must be destinded for an ethernet device
6671 				 * outside our ldoms.
6672 				 */
6673 				if (caller == VSW_VNETPORT) {
6674 					READ_ENTER(&vswp->if_lockrw);
6675 					if (VSW_U_P(vswp->if_state)) {
6676 						RW_EXIT(&vswp->if_lockrw);
6677 						nmp = copymsg(mp);
6678 						if (nmp)
6679 							mac_rx(vswp->if_mh,
6680 								mrh, nmp);
6681 					} else {
6682 						RW_EXIT(&vswp->if_lockrw);
6683 					}
6684 					if ((ret_m = vsw_tx_msg(vswp, mp))
6685 								!= NULL) {
6686 						DERR(vswp, "%s: drop mblks to "
6687 							"phys dev", __func__);
6688 						freemsg(ret_m);
6689 					}
6690 
6691 				} else if (caller == VSW_PHYSDEV) {
6692 					/*
6693 					 * Pkt seen because card in promisc
6694 					 * mode. Send up stack if plumbed in
6695 					 * promisc mode, else drop it.
6696 					 */
6697 					READ_ENTER(&vswp->if_lockrw);
6698 					if (VSW_U_P(vswp->if_state)) {
6699 						RW_EXIT(&vswp->if_lockrw);
6700 						mac_rx(vswp->if_mh, mrh, mp);
6701 					} else {
6702 						RW_EXIT(&vswp->if_lockrw);
6703 						freemsg(mp);
6704 					}
6705 
6706 				} else if (caller == VSW_LOCALDEV) {
6707 					/*
6708 					 * Pkt came down the stack, send out
6709 					 * over physical device.
6710 					 */
6711 					if ((ret_m = vsw_tx_msg(vswp, mp))
6712 								!= NULL) {
6713 						DERR(vswp, "%s: drop mblks to "
6714 							"phys dev", __func__);
6715 						freemsg(ret_m);
6716 					}
6717 				}
6718 			}
6719 		}
6720 	}
6721 	D1(vswp, "%s: exit\n", __func__);
6722 }
6723 
6724 /*
6725  * Switch ethernet frame when in layer 3 mode (i.e. using IP
6726  * layer to do the routing).
6727  *
6728  * There is a large amount of overlap between this function and
6729  * vsw_switch_l2_frame. At some stage we need to revisit and refactor
6730  * both these functions.
6731  */
6732 void
6733 vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
6734 			vsw_port_t *arg, mac_resource_handle_t mrh)
6735 {
6736 	struct ether_header	*ehp;
6737 	vsw_port_t		*port = NULL;
6738 	mblk_t			*bp = NULL;
6739 	vsw_port_list_t		*plist = &vswp->plist;
6740 
6741 	D1(vswp, "%s: enter (caller %d)", __func__, caller);
6742 
6743 	/*
6744 	 * In layer 3 mode should only ever be switching packets
6745 	 * between IP layer and vnet devices. So make sure thats
6746 	 * who is invoking us.
6747 	 */
6748 	if ((caller != VSW_LOCALDEV) && (caller != VSW_VNETPORT)) {
6749 		DERR(vswp, "%s: unexpected caller (%d)", __func__, caller);
6750 		freemsgchain(mp);
6751 		return;
6752 	}
6753 
6754 	/* process the chain of packets */
6755 	bp = mp;
6756 	while (bp) {
6757 		mp = bp;
6758 		bp = bp->b_next;
6759 		mp->b_next = mp->b_prev = NULL;
6760 		ehp = (struct ether_header *)mp->b_rptr;
6761 
6762 		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
6763 			__func__, MBLKSIZE(mp), MBLKL(mp));
6764 
6765 		READ_ENTER(&plist->lockrw);
6766 		port = vsw_lookup_fdb(vswp, ehp);
6767 		if (port) {
6768 			/*
6769 			 * Mark port as in-use.
6770 			 */
6771 			mutex_enter(&port->ref_lock);
6772 			port->ref_cnt++;
6773 			mutex_exit(&port->ref_lock);
6774 			RW_EXIT(&plist->lockrw);
6775 
6776 			D2(vswp, "%s: sending to target port", __func__);
6777 			(void) vsw_portsend(port, mp);
6778 
6779 			/*
6780 			 * Finished with port so decrement ref count and
6781 			 * check if should wake delete thread.
6782 			 */
6783 			mutex_enter(&port->ref_lock);
6784 			port->ref_cnt--;
6785 			if (port->ref_cnt == 0)
6786 				cv_signal(&port->ref_cv);
6787 			mutex_exit(&port->ref_lock);
6788 		} else {
6789 			RW_EXIT(&plist->lockrw);
6790 			/*
6791 			 * Destination not in FDB
6792 			 *
6793 			 * If the destination is broadcast or
6794 			 * multicast forward the packet to all
6795 			 * (VNETPORTs, PHYSDEV, LOCALDEV),
6796 			 * except the caller.
6797 			 */
6798 			if (IS_BROADCAST(ehp)) {
6799 				D2(vswp, "%s: BROADCAST pkt", __func__);
6800 				(void) vsw_forward_all(vswp, mp,
6801 								caller, arg);
6802 			} else if (IS_MULTICAST(ehp)) {
6803 				D2(vswp, "%s: MULTICAST pkt", __func__);
6804 				(void) vsw_forward_grp(vswp, mp,
6805 							caller, arg);
6806 			} else {
6807 				/*
6808 				 * Unicast pkt from vnet that we don't have
6809 				 * an FDB entry for, so must be destinded for
6810 				 * the outside world. Attempt to send up to the
6811 				 * IP layer to allow it to deal with it.
6812 				 */
6813 				if (caller == VSW_VNETPORT) {
6814 					READ_ENTER(&vswp->if_lockrw);
6815 					if (vswp->if_state & VSW_IF_UP) {
6816 						RW_EXIT(&vswp->if_lockrw);
6817 						D2(vswp, "%s: sending up",
6818 							__func__);
6819 						mac_rx(vswp->if_mh, mrh, mp);
6820 					} else {
6821 						RW_EXIT(&vswp->if_lockrw);
6822 						/* Interface down, drop pkt */
6823 						D2(vswp, "%s I/F down",
6824 								__func__);
6825 						freemsg(mp);
6826 					}
6827 				}
6828 			}
6829 		}
6830 	}
6831 
6832 	D1(vswp, "%s: exit", __func__);
6833 }
6834 
6835 /*
6836  * Forward the ethernet frame to all ports (VNETPORTs, PHYSDEV, LOCALDEV),
6837  * except the caller (port on which frame arrived).
6838  */
6839 static int
6840 vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
6841 {
6842 	vsw_port_list_t	*plist = &vswp->plist;
6843 	vsw_port_t	*portp;
6844 	mblk_t		*nmp = NULL;
6845 	mblk_t		*ret_m = NULL;
6846 	int		skip_port = 0;
6847 
6848 	D1(vswp, "vsw_forward_all: enter\n");
6849 
6850 	/*
6851 	 * Broadcast message from inside ldoms so send to outside
6852 	 * world if in either of layer 2 modes.
6853 	 */
6854 	if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) ||
6855 		(vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) &&
6856 		((caller == VSW_LOCALDEV) || (caller == VSW_VNETPORT))) {
6857 
6858 		nmp = dupmsg(mp);
6859 		if (nmp) {
6860 			if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) {
6861 				DERR(vswp, "%s: dropping pkt(s) "
6862 				"consisting of %ld bytes of data for"
6863 				" physical device", __func__, MBLKL(ret_m));
6864 			freemsg(ret_m);
6865 			}
6866 		}
6867 	}
6868 
6869 	if (caller == VSW_VNETPORT)
6870 		skip_port = 1;
6871 
6872 	/*
6873 	 * Broadcast message from other vnet (layer 2 or 3) or outside
6874 	 * world (layer 2 only), send up stack if plumbed.
6875 	 */
6876 	if ((caller == VSW_PHYSDEV) || (caller == VSW_VNETPORT)) {
6877 		READ_ENTER(&vswp->if_lockrw);
6878 		if (vswp->if_state & VSW_IF_UP) {
6879 			RW_EXIT(&vswp->if_lockrw);
6880 			nmp = copymsg(mp);
6881 			if (nmp)
6882 				mac_rx(vswp->if_mh, NULL, nmp);
6883 		} else {
6884 			RW_EXIT(&vswp->if_lockrw);
6885 		}
6886 	}
6887 
6888 	/* send it to all VNETPORTs */
6889 	READ_ENTER(&plist->lockrw);
6890 	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
6891 		D2(vswp, "vsw_forward_all: port %d", portp->p_instance);
6892 		/*
6893 		 * Caution ! - don't reorder these two checks as arg
6894 		 * will be NULL if the caller is PHYSDEV. skip_port is
6895 		 * only set if caller is VNETPORT.
6896 		 */
6897 		if ((skip_port) && (portp == arg))
6898 			continue;
6899 		else {
6900 			nmp = dupmsg(mp);
6901 			if (nmp) {
6902 				(void) vsw_portsend(portp, nmp);
6903 			} else {
6904 				DERR(vswp, "vsw_forward_all: nmp NULL");
6905 			}
6906 		}
6907 	}
6908 	RW_EXIT(&plist->lockrw);
6909 
6910 	freemsg(mp);
6911 
6912 	D1(vswp, "vsw_forward_all: exit\n");
6913 	return (0);
6914 }
6915 
6916 /*
6917  * Forward pkts to any devices or interfaces which have registered
6918  * an interest in them (i.e. multicast groups).
6919  */
6920 static int
6921 vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
6922 {
6923 	struct ether_header	*ehp = (struct ether_header *)mp->b_rptr;
6924 	mfdb_ent_t		*entp = NULL;
6925 	mfdb_ent_t		*tpp = NULL;
6926 	vsw_port_t 		*port;
6927 	uint64_t		key = 0;
6928 	mblk_t			*nmp = NULL;
6929 	mblk_t			*ret_m = NULL;
6930 	boolean_t		check_if = B_TRUE;
6931 
6932 	/*
6933 	 * Convert address to hash table key
6934 	 */
6935 	KEY_HASH(key, ehp->ether_dhost);
6936 
6937 	D1(vswp, "%s: key 0x%llx", __func__, key);
6938 
6939 	/*
6940 	 * If pkt came from either a vnet or down the stack (if we are
6941 	 * plumbed) and we are in layer 2 mode, then we send the pkt out
6942 	 * over the physical adapter, and then check to see if any other
6943 	 * vnets are interested in it.
6944 	 */
6945 	if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) ||
6946 		(vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) &&
6947 		((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV))) {
6948 		nmp = dupmsg(mp);
6949 		if (nmp) {
6950 			if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) {
6951 				DERR(vswp, "%s: dropping pkt(s) "
6952 					"consisting of %ld bytes of "
6953 					"data for physical device",
6954 					__func__, MBLKL(ret_m));
6955 				freemsg(ret_m);
6956 			}
6957 		}
6958 	}
6959 
6960 	READ_ENTER(&vswp->mfdbrw);
6961 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)key,
6962 				(mod_hash_val_t *)&entp) != 0) {
6963 		D3(vswp, "%s: no table entry found for addr 0x%llx",
6964 								__func__, key);
6965 	} else {
6966 		/*
6967 		 * Send to list of devices associated with this address...
6968 		 */
6969 		for (tpp = entp; tpp != NULL; tpp = tpp->nextp) {
6970 
6971 			/* dont send to ourselves */
6972 			if ((caller == VSW_VNETPORT) &&
6973 				(tpp->d_addr == (void *)arg)) {
6974 				port = (vsw_port_t *)tpp->d_addr;
6975 				D3(vswp, "%s: not sending to ourselves"
6976 					" : port %d", __func__,
6977 					port->p_instance);
6978 				continue;
6979 
6980 			} else if ((caller == VSW_LOCALDEV) &&
6981 				(tpp->d_type == VSW_LOCALDEV)) {
6982 				D3(vswp, "%s: not sending back up stack",
6983 					__func__);
6984 				continue;
6985 			}
6986 
6987 			if (tpp->d_type == VSW_VNETPORT) {
6988 				port = (vsw_port_t *)tpp->d_addr;
6989 				D3(vswp, "%s: sending to port %ld for "
6990 					" addr 0x%llx", __func__,
6991 					port->p_instance, key);
6992 
6993 				nmp = dupmsg(mp);
6994 				if (nmp)
6995 					(void) vsw_portsend(port, nmp);
6996 			} else {
6997 				if (vswp->if_state & VSW_IF_UP) {
6998 					nmp = copymsg(mp);
6999 					if (nmp)
7000 						mac_rx(vswp->if_mh, NULL, nmp);
7001 					check_if = B_FALSE;
7002 					D3(vswp, "%s: sending up stack"
7003 						" for addr 0x%llx", __func__,
7004 						key);
7005 				}
7006 			}
7007 		}
7008 	}
7009 
7010 	RW_EXIT(&vswp->mfdbrw);
7011 
7012 	/*
7013 	 * If the pkt came from either a vnet or from physical device,
7014 	 * and if we havent already sent the pkt up the stack then we
7015 	 * check now if we can/should (i.e. the interface is plumbed
7016 	 * and in promisc mode).
7017 	 */
7018 	if ((check_if) &&
7019 		((caller == VSW_VNETPORT) || (caller == VSW_PHYSDEV))) {
7020 		READ_ENTER(&vswp->if_lockrw);
7021 		if (VSW_U_P(vswp->if_state)) {
7022 			RW_EXIT(&vswp->if_lockrw);
7023 			D3(vswp, "%s: (caller %d) finally sending up stack"
7024 				" for addr 0x%llx", __func__, caller, key);
7025 			nmp = copymsg(mp);
7026 			if (nmp)
7027 				mac_rx(vswp->if_mh, NULL, nmp);
7028 		} else {
7029 			RW_EXIT(&vswp->if_lockrw);
7030 		}
7031 	}
7032 
7033 	freemsg(mp);
7034 
7035 	D1(vswp, "%s: exit", __func__);
7036 
7037 	return (0);
7038 }
7039 
7040 /* transmit the packet over the given port */
7041 static int
7042 vsw_portsend(vsw_port_t *port, mblk_t *mp)
7043 {
7044 	vsw_ldc_list_t 	*ldcl = &port->p_ldclist;
7045 	vsw_ldc_t 	*ldcp;
7046 	int		status = 0;
7047 
7048 
7049 	READ_ENTER(&ldcl->lockrw);
7050 	/*
7051 	 * Note for now, we have a single channel.
7052 	 */
7053 	ldcp = ldcl->head;
7054 	if (ldcp == NULL) {
7055 		DERR(port->p_vswp, "vsw_portsend: no ldc: dropping packet\n");
7056 		freemsg(mp);
7057 		RW_EXIT(&ldcl->lockrw);
7058 		return (1);
7059 	}
7060 
7061 	/*
7062 	 * Send the message out using the appropriate
7063 	 * transmit function which will free mblock when it
7064 	 * is finished with it.
7065 	 */
7066 	mutex_enter(&port->tx_lock);
7067 	if (port->transmit != NULL)
7068 		status = (*port->transmit)(ldcp, mp);
7069 	else {
7070 		freemsg(mp);
7071 	}
7072 	mutex_exit(&port->tx_lock);
7073 
7074 	RW_EXIT(&ldcl->lockrw);
7075 
7076 	return (status);
7077 }
7078 
7079 /*
7080  * Send packet out via descriptor ring to a logical device.
7081  */
7082 static int
7083 vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp)
7084 {
7085 	vio_dring_msg_t		dring_pkt;
7086 	dring_info_t		*dp = NULL;
7087 	vsw_private_desc_t	*priv_desc = NULL;
7088 	vnet_public_desc_t	*pub = NULL;
7089 	vsw_t			*vswp = ldcp->ldc_vswp;
7090 	mblk_t			*bp;
7091 	size_t			n, size;
7092 	caddr_t			bufp;
7093 	int			idx;
7094 	int			status = LDC_TX_SUCCESS;
7095 
7096 	D1(vswp, "%s(%lld): enter\n", __func__, ldcp->ldc_id);
7097 
7098 	/* TODO: make test a macro */
7099 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
7100 		(ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
7101 		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
7102 			"packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
7103 			ldcp->lane_out.lstate);
7104 		freemsg(mp);
7105 		return (LDC_TX_FAILURE);
7106 	}
7107 
7108 	/*
7109 	 * Note - using first ring only, this may change
7110 	 * in the future.
7111 	 */
7112 	READ_ENTER(&ldcp->lane_out.dlistrw);
7113 	if ((dp = ldcp->lane_out.dringp) == NULL) {
7114 		RW_EXIT(&ldcp->lane_out.dlistrw);
7115 		DERR(vswp, "%s(%lld): no dring for outbound lane on"
7116 			" channel %d", __func__, ldcp->ldc_id, ldcp->ldc_id);
7117 		freemsg(mp);
7118 		return (LDC_TX_FAILURE);
7119 	}
7120 
7121 	size = msgsize(mp);
7122 	if (size > (size_t)ETHERMAX) {
7123 		RW_EXIT(&ldcp->lane_out.dlistrw);
7124 		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
7125 		    ldcp->ldc_id, size);
7126 		freemsg(mp);
7127 		return (LDC_TX_FAILURE);
7128 	}
7129 
7130 	/*
7131 	 * Find a free descriptor
7132 	 *
7133 	 * Note: for the moment we are assuming that we will only
7134 	 * have one dring going from the switch to each of its
7135 	 * peers. This may change in the future.
7136 	 */
7137 	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
7138 		D2(vswp, "%s(%lld): no descriptor available for ring "
7139 			"at 0x%llx", __func__, ldcp->ldc_id, dp);
7140 
7141 		/* nothing more we can do */
7142 		status = LDC_TX_NORESOURCES;
7143 		goto vsw_dringsend_free_exit;
7144 	} else {
7145 		D2(vswp, "%s(%lld): free private descriptor found at pos "
7146 			"%ld addr 0x%llx\n", __func__, ldcp->ldc_id, idx,
7147 			priv_desc);
7148 	}
7149 
7150 	/* copy data into the descriptor */
7151 	bufp = priv_desc->datap;
7152 	bufp += VNET_IPALIGN;
7153 	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
7154 		n = MBLKL(bp);
7155 		bcopy(bp->b_rptr, bufp, n);
7156 		bufp += n;
7157 	}
7158 
7159 	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
7160 
7161 	pub = priv_desc->descp;
7162 	pub->nbytes = priv_desc->datalen;
7163 
7164 	mutex_enter(&priv_desc->dstate_lock);
7165 	pub->hdr.dstate = VIO_DESC_READY;
7166 	mutex_exit(&priv_desc->dstate_lock);
7167 
7168 	/*
7169 	 * Determine whether or not we need to send a message to our
7170 	 * peer prompting them to read our newly updated descriptor(s).
7171 	 */
7172 	mutex_enter(&dp->restart_lock);
7173 	if (dp->restart_reqd) {
7174 		dp->restart_reqd = B_FALSE;
7175 		mutex_exit(&dp->restart_lock);
7176 
7177 		/*
7178 		 * Send a vio_dring_msg to peer to prompt them to read
7179 		 * the updated descriptor ring.
7180 		 */
7181 		dring_pkt.tag.vio_msgtype = VIO_TYPE_DATA;
7182 		dring_pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
7183 		dring_pkt.tag.vio_subtype_env = VIO_DRING_DATA;
7184 		dring_pkt.tag.vio_sid = ldcp->local_session;
7185 
7186 		/* Note - for now using first ring */
7187 		dring_pkt.dring_ident = dp->ident;
7188 
7189 		mutex_enter(&ldcp->lane_out.seq_lock);
7190 		dring_pkt.seq_num = ldcp->lane_out.seq_num++;
7191 		mutex_exit(&ldcp->lane_out.seq_lock);
7192 
7193 		/*
7194 		 * If last_ack_recv is -1 then we know we've not
7195 		 * received any ack's yet, so this must be the first
7196 		 * msg sent, so set the start to the begining of the ring.
7197 		 */
7198 		mutex_enter(&dp->dlock);
7199 		if (dp->last_ack_recv == -1) {
7200 			dring_pkt.start_idx = 0;
7201 		} else {
7202 			dring_pkt.start_idx = (dp->last_ack_recv + 1) %
7203 						dp->num_descriptors;
7204 		}
7205 		dring_pkt.end_idx = -1;
7206 		mutex_exit(&dp->dlock);
7207 
7208 		D3(vswp, "%s(%lld): dring 0x%llx : ident 0x%llx\n", __func__,
7209 			ldcp->ldc_id, dp, dring_pkt.dring_ident);
7210 		D3(vswp, "%s(%lld): start %lld : end %lld : seq %lld\n",
7211 			__func__, ldcp->ldc_id, dring_pkt.start_idx,
7212 			dring_pkt.end_idx, dring_pkt.seq_num);
7213 
7214 		RW_EXIT(&ldcp->lane_out.dlistrw);
7215 
7216 		(void) vsw_send_msg(ldcp, (void *)&dring_pkt,
7217 					sizeof (vio_dring_msg_t), B_TRUE);
7218 
7219 		/* free the message block */
7220 		freemsg(mp);
7221 		return (status);
7222 
7223 	} else {
7224 		mutex_exit(&dp->restart_lock);
7225 		D2(vswp, "%s(%lld): updating descp %d", __func__,
7226 			ldcp->ldc_id, idx);
7227 	}
7228 
7229 vsw_dringsend_free_exit:
7230 
7231 	RW_EXIT(&ldcp->lane_out.dlistrw);
7232 
7233 	/* free the message block */
7234 	freemsg(mp);
7235 
7236 	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
7237 	return (status);
7238 }
7239 
7240 /*
7241  * Send an in-band descriptor message over ldc.
7242  */
7243 static int
7244 vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp)
7245 {
7246 	vsw_t			*vswp = ldcp->ldc_vswp;
7247 	vnet_ibnd_desc_t	ibnd_msg;
7248 	vsw_private_desc_t	*priv_desc = NULL;
7249 	dring_info_t		*dp = NULL;
7250 	size_t			n, size = 0;
7251 	caddr_t			bufp;
7252 	mblk_t			*bp;
7253 	int			idx, i;
7254 	int			status = LDC_TX_SUCCESS;
7255 	static int		warn_msg = 1;
7256 
7257 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
7258 
7259 	ASSERT(mp != NULL);
7260 
7261 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
7262 		(ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
7263 		DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt",
7264 			__func__, ldcp->ldc_id, ldcp->ldc_status,
7265 			ldcp->lane_out.lstate);
7266 		freemsg(mp);
7267 		return (LDC_TX_FAILURE);
7268 	}
7269 
7270 	/*
7271 	 * only expect single dring to exist, which we use
7272 	 * as an internal buffer, rather than a transfer channel.
7273 	 */
7274 	READ_ENTER(&ldcp->lane_out.dlistrw);
7275 	if ((dp = ldcp->lane_out.dringp) == NULL) {
7276 		DERR(vswp, "%s(%lld): no dring for outbound lane",
7277 			__func__, ldcp->ldc_id);
7278 		DERR(vswp, "%s(%lld) status(%d) state (0x%llx)",
7279 			__func__, ldcp->ldc_id, ldcp->ldc_status,
7280 			ldcp->lane_out.lstate);
7281 		RW_EXIT(&ldcp->lane_out.dlistrw);
7282 		freemsg(mp);
7283 		return (LDC_TX_FAILURE);
7284 	}
7285 
7286 	size = msgsize(mp);
7287 	if (size > (size_t)ETHERMAX) {
7288 		RW_EXIT(&ldcp->lane_out.dlistrw);
7289 		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
7290 		    ldcp->ldc_id, size);
7291 		freemsg(mp);
7292 		return (LDC_TX_FAILURE);
7293 	}
7294 
7295 	/*
7296 	 * Find a free descriptor in our buffer ring
7297 	 */
7298 	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
7299 		RW_EXIT(&ldcp->lane_out.dlistrw);
7300 		if (warn_msg) {
7301 			DERR(vswp, "%s(%lld): no descriptor available for ring "
7302 			"at 0x%llx", __func__, ldcp->ldc_id, dp);
7303 			warn_msg = 0;
7304 		}
7305 
7306 		/* nothing more we can do */
7307 		status = LDC_TX_NORESOURCES;
7308 		goto vsw_descrsend_free_exit;
7309 	} else {
7310 		D2(vswp, "%s(%lld): free private descriptor found at pos "
7311 			"%ld addr 0x%x\n", __func__, ldcp->ldc_id, idx,
7312 			priv_desc);
7313 		warn_msg = 1;
7314 	}
7315 
7316 	/* copy data into the descriptor */
7317 	bufp = priv_desc->datap;
7318 	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
7319 		n = MBLKL(bp);
7320 		bcopy(bp->b_rptr, bufp, n);
7321 		bufp += n;
7322 	}
7323 
7324 	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
7325 
7326 	/* create and send the in-band descp msg */
7327 	ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA;
7328 	ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO;
7329 	ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA;
7330 	ibnd_msg.hdr.tag.vio_sid = ldcp->local_session;
7331 
7332 	mutex_enter(&ldcp->lane_out.seq_lock);
7333 	ibnd_msg.hdr.seq_num = ldcp->lane_out.seq_num++;
7334 	mutex_exit(&ldcp->lane_out.seq_lock);
7335 
7336 	/*
7337 	 * Copy the mem cookies describing the data from the
7338 	 * private region of the descriptor ring into the inband
7339 	 * descriptor.
7340 	 */
7341 	for (i = 0; i < priv_desc->ncookies; i++) {
7342 		bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i],
7343 			sizeof (ldc_mem_cookie_t));
7344 	}
7345 
7346 	ibnd_msg.hdr.desc_handle = idx;
7347 	ibnd_msg.ncookies = priv_desc->ncookies;
7348 	ibnd_msg.nbytes = size;
7349 
7350 	RW_EXIT(&ldcp->lane_out.dlistrw);
7351 
7352 	(void) vsw_send_msg(ldcp, (void *)&ibnd_msg,
7353 			sizeof (vnet_ibnd_desc_t), B_TRUE);
7354 
7355 vsw_descrsend_free_exit:
7356 
7357 	/* free the allocated message blocks */
7358 	freemsg(mp);
7359 
7360 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
7361 	return (status);
7362 }
7363 
7364 static void
7365 vsw_send_ver(void *arg)
7366 {
7367 	vsw_ldc_t	*ldcp = (vsw_ldc_t *)arg;
7368 	vsw_t		*vswp = ldcp->ldc_vswp;
7369 	lane_t		*lp = &ldcp->lane_out;
7370 	vio_ver_msg_t	ver_msg;
7371 
7372 	D1(vswp, "%s enter", __func__);
7373 
7374 	ver_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
7375 	ver_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
7376 	ver_msg.tag.vio_subtype_env = VIO_VER_INFO;
7377 	ver_msg.tag.vio_sid = ldcp->local_session;
7378 
7379 	ver_msg.ver_major = vsw_versions[0].ver_major;
7380 	ver_msg.ver_minor = vsw_versions[0].ver_minor;
7381 	ver_msg.dev_class = VDEV_NETWORK_SWITCH;
7382 
7383 	lp->lstate |= VSW_VER_INFO_SENT;
7384 	lp->ver_major = ver_msg.ver_major;
7385 	lp->ver_minor = ver_msg.ver_minor;
7386 
7387 	DUMP_TAG(ver_msg.tag);
7388 
7389 	(void) vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t), B_TRUE);
7390 
7391 	D1(vswp, "%s (%d): exit", __func__, ldcp->ldc_id);
7392 }
7393 
7394 static void
7395 vsw_send_attr(vsw_ldc_t *ldcp)
7396 {
7397 	vsw_t			*vswp = ldcp->ldc_vswp;
7398 	lane_t			*lp = &ldcp->lane_out;
7399 	vnet_attr_msg_t		attr_msg;
7400 
7401 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
7402 
7403 	/*
7404 	 * Subtype is set to INFO by default
7405 	 */
7406 	attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
7407 	attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
7408 	attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO;
7409 	attr_msg.tag.vio_sid = ldcp->local_session;
7410 
7411 	/* payload copied from default settings for lane */
7412 	attr_msg.mtu = lp->mtu;
7413 	attr_msg.addr_type = lp->addr_type;
7414 	attr_msg.xfer_mode = lp->xfer_mode;
7415 	attr_msg.ack_freq = lp->xfer_mode;
7416 
7417 	READ_ENTER(&vswp->if_lockrw);
7418 	bcopy(&(vswp->if_addr), &(attr_msg.addr), ETHERADDRL);
7419 	RW_EXIT(&vswp->if_lockrw);
7420 
7421 	ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT;
7422 
7423 	DUMP_TAG(attr_msg.tag);
7424 
7425 	(void) vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t), B_TRUE);
7426 
7427 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
7428 }
7429 
7430 /*
7431  * Create dring info msg (which also results in the creation of
7432  * a dring).
7433  */
7434 static vio_dring_reg_msg_t *
7435 vsw_create_dring_info_pkt(vsw_ldc_t *ldcp)
7436 {
7437 	vio_dring_reg_msg_t	*mp;
7438 	dring_info_t		*dp;
7439 	vsw_t			*vswp = ldcp->ldc_vswp;
7440 
7441 	D1(vswp, "vsw_create_dring_info_pkt enter\n");
7442 
7443 	/*
7444 	 * If we can't create a dring, obviously no point sending
7445 	 * a message.
7446 	 */
7447 	if ((dp = vsw_create_dring(ldcp)) == NULL)
7448 		return (NULL);
7449 
7450 	mp = kmem_zalloc(sizeof (vio_dring_reg_msg_t), KM_SLEEP);
7451 
7452 	mp->tag.vio_msgtype = VIO_TYPE_CTRL;
7453 	mp->tag.vio_subtype = VIO_SUBTYPE_INFO;
7454 	mp->tag.vio_subtype_env = VIO_DRING_REG;
7455 	mp->tag.vio_sid = ldcp->local_session;
7456 
7457 	/* payload */
7458 	mp->num_descriptors = dp->num_descriptors;
7459 	mp->descriptor_size = dp->descriptor_size;
7460 	mp->options = dp->options;
7461 	mp->ncookies = dp->ncookies;
7462 	bcopy(&dp->cookie[0], &mp->cookie[0], sizeof (ldc_mem_cookie_t));
7463 
7464 	mp->dring_ident = 0;
7465 
7466 	D1(vswp, "vsw_create_dring_info_pkt exit\n");
7467 
7468 	return (mp);
7469 }
7470 
7471 static void
7472 vsw_send_dring_info(vsw_ldc_t *ldcp)
7473 {
7474 	vio_dring_reg_msg_t	*dring_msg;
7475 	vsw_t			*vswp = ldcp->ldc_vswp;
7476 
7477 	D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id);
7478 
7479 	dring_msg = vsw_create_dring_info_pkt(ldcp);
7480 	if (dring_msg == NULL) {
7481 		cmn_err(CE_WARN, "!vsw%d: %s: error creating msg",
7482 			vswp->instance, __func__);
7483 		return;
7484 	}
7485 
7486 	ldcp->lane_out.lstate |= VSW_DRING_INFO_SENT;
7487 
7488 	DUMP_TAG_PTR((vio_msg_tag_t *)dring_msg);
7489 
7490 	(void) vsw_send_msg(ldcp, dring_msg,
7491 		sizeof (vio_dring_reg_msg_t), B_TRUE);
7492 
7493 	kmem_free(dring_msg, sizeof (vio_dring_reg_msg_t));
7494 
7495 	D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id);
7496 }
7497 
7498 static void
7499 vsw_send_rdx(vsw_ldc_t *ldcp)
7500 {
7501 	vsw_t		*vswp = ldcp->ldc_vswp;
7502 	vio_rdx_msg_t	rdx_msg;
7503 
7504 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
7505 
7506 	rdx_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
7507 	rdx_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
7508 	rdx_msg.tag.vio_subtype_env = VIO_RDX;
7509 	rdx_msg.tag.vio_sid = ldcp->local_session;
7510 
7511 	ldcp->lane_in.lstate |= VSW_RDX_INFO_SENT;
7512 
7513 	DUMP_TAG(rdx_msg.tag);
7514 
7515 	(void) vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t), B_TRUE);
7516 
7517 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
7518 }
7519 
7520 /*
7521  * Generic routine to send message out over ldc channel.
7522  *
7523  * It is possible that when we attempt to write over the ldc channel
7524  * that we get notified that it has been reset. Depending on the value
7525  * of the handle_reset flag we either handle that event here or simply
7526  * notify the caller that the channel was reset.
7527  */
7528 static int
7529 vsw_send_msg(vsw_ldc_t *ldcp, void *msgp, int size, boolean_t handle_reset)
7530 {
7531 	int		rv;
7532 	size_t		msglen = size;
7533 	vio_msg_tag_t	*tag = (vio_msg_tag_t *)msgp;
7534 	vsw_t		*vswp = ldcp->ldc_vswp;
7535 
7536 	D1(vswp, "vsw_send_msg (%lld) enter : sending %d bytes",
7537 			ldcp->ldc_id, size);
7538 
7539 	D2(vswp, "send_msg: type 0x%llx", tag->vio_msgtype);
7540 	D2(vswp, "send_msg: stype 0x%llx", tag->vio_subtype);
7541 	D2(vswp, "send_msg: senv 0x%llx", tag->vio_subtype_env);
7542 
7543 	mutex_enter(&ldcp->ldc_txlock);
7544 	do {
7545 		msglen = size;
7546 		rv = ldc_write(ldcp->ldc_handle, (caddr_t)msgp, &msglen);
7547 	} while (rv == EWOULDBLOCK && --vsw_wretries > 0);
7548 
7549 	if ((rv != 0) || (msglen != size)) {
7550 		DERR(vswp, "vsw_send_msg:ldc_write failed: chan(%lld) "
7551 			"rv(%d) size (%d) msglen(%d)\n", ldcp->ldc_id,
7552 			rv, size, msglen);
7553 	}
7554 	mutex_exit(&ldcp->ldc_txlock);
7555 
7556 	/*
7557 	 * If channel has been reset we either handle it here or
7558 	 * simply report back that it has been reset and let caller
7559 	 * decide what to do.
7560 	 */
7561 	if (rv == ECONNRESET) {
7562 		DWARN(vswp, "%s (%lld) channel reset",
7563 					__func__, ldcp->ldc_id);
7564 
7565 		/*
7566 		 * N.B - must never be holding the dlistrw lock when
7567 		 * we do a reset of the channel.
7568 		 */
7569 		if (handle_reset) {
7570 			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
7571 		}
7572 	}
7573 
7574 	return (rv);
7575 }
7576 
7577 /*
7578  * Add an entry into FDB, for the given mac address and port_id.
7579  * Returns 0 on success, 1 on failure.
7580  *
7581  * Lock protecting FDB must be held by calling process.
7582  */
7583 static int
7584 vsw_add_fdb(vsw_t *vswp, vsw_port_t *port)
7585 {
7586 	uint64_t	addr = 0;
7587 
7588 	D1(vswp, "%s: enter", __func__);
7589 
7590 	KEY_HASH(addr, port->p_macaddr);
7591 
7592 	D2(vswp, "%s: key = 0x%llx", __func__, addr);
7593 
7594 	/*
7595 	 * Note: duplicate keys will be rejected by mod_hash.
7596 	 */
7597 	if (mod_hash_insert(vswp->fdb, (mod_hash_key_t)addr,
7598 				(mod_hash_val_t)port) != 0) {
7599 		DERR(vswp, "%s: unable to add entry into fdb.", __func__);
7600 		return (1);
7601 	}
7602 
7603 	D1(vswp, "%s: exit", __func__);
7604 	return (0);
7605 }
7606 
7607 /*
7608  * Remove an entry from FDB.
7609  * Returns 0 on success, 1 on failure.
7610  */
7611 static int
7612 vsw_del_fdb(vsw_t *vswp, vsw_port_t *port)
7613 {
7614 	uint64_t	addr = 0;
7615 
7616 	D1(vswp, "%s: enter", __func__);
7617 
7618 	KEY_HASH(addr, port->p_macaddr);
7619 
7620 	D2(vswp, "%s: key = 0x%llx", __func__, addr);
7621 
7622 	(void) mod_hash_destroy(vswp->fdb, (mod_hash_val_t)addr);
7623 
7624 	D1(vswp, "%s: enter", __func__);
7625 
7626 	return (0);
7627 }
7628 
7629 /*
7630  * Search fdb for a given mac address.
7631  * Returns pointer to the entry if found, else returns NULL.
7632  */
7633 static vsw_port_t *
7634 vsw_lookup_fdb(vsw_t *vswp, struct ether_header *ehp)
7635 {
7636 	uint64_t	key = 0;
7637 	vsw_port_t	*port = NULL;
7638 
7639 	D1(vswp, "%s: enter", __func__);
7640 
7641 	KEY_HASH(key, ehp->ether_dhost);
7642 
7643 	D2(vswp, "%s: key = 0x%llx", __func__, key);
7644 
7645 	if (mod_hash_find(vswp->fdb, (mod_hash_key_t)key,
7646 				(mod_hash_val_t *)&port) != 0) {
7647 		D2(vswp, "%s: no port found", __func__);
7648 		return (NULL);
7649 	}
7650 
7651 	D1(vswp, "%s: exit", __func__);
7652 
7653 	return (port);
7654 }
7655 
7656 /*
7657  * Add or remove multicast address(es).
7658  *
7659  * Returns 0 on success, 1 on failure.
7660  */
7661 static int
7662 vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port)
7663 {
7664 	mcst_addr_t		*mcst_p = NULL;
7665 	vsw_t			*vswp = port->p_vswp;
7666 	uint64_t		addr = 0x0;
7667 	int			i;
7668 
7669 	D1(vswp, "%s: enter", __func__);
7670 
7671 	D2(vswp, "%s: %d addresses", __func__, mcst_pkt->count);
7672 
7673 	mutex_enter(&vswp->mac_lock);
7674 	if (vswp->mh == NULL) {
7675 		mutex_exit(&vswp->mac_lock);
7676 		return (1);
7677 	}
7678 	mutex_exit(&vswp->mac_lock);
7679 
7680 	for (i = 0; i < mcst_pkt->count; i++) {
7681 		/*
7682 		 * Convert address into form that can be used
7683 		 * as hash table key.
7684 		 */
7685 		KEY_HASH(addr, mcst_pkt->mca[i]);
7686 
7687 		/*
7688 		 * Add or delete the specified address/port combination.
7689 		 */
7690 		if (mcst_pkt->set == 0x1) {
7691 			D3(vswp, "%s: adding multicast address 0x%llx for "
7692 				"port %ld", __func__, addr, port->p_instance);
7693 			if (vsw_add_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
7694 				/*
7695 				 * Update the list of multicast
7696 				 * addresses contained within the
7697 				 * port structure to include this new
7698 				 * one.
7699 				 */
7700 				mcst_p = kmem_alloc(sizeof (mcst_addr_t),
7701 								KM_NOSLEEP);
7702 				if (mcst_p == NULL) {
7703 					DERR(vswp, "%s: unable to alloc mem",
7704 						__func__);
7705 					return (1);
7706 				}
7707 
7708 				mcst_p->nextp = NULL;
7709 				mcst_p->addr = addr;
7710 
7711 				mutex_enter(&port->mca_lock);
7712 				mcst_p->nextp = port->mcap;
7713 				port->mcap = mcst_p;
7714 				mutex_exit(&port->mca_lock);
7715 
7716 				/*
7717 				 * Program the address into HW. If the addr
7718 				 * has already been programmed then the MAC
7719 				 * just increments a ref counter (which is
7720 				 * used when the address is being deleted)
7721 				 */
7722 				mutex_enter(&vswp->mac_lock);
7723 				if ((vswp->mh == NULL) ||
7724 					mac_multicst_add(vswp->mh,
7725 						(uchar_t *)&mcst_pkt->mca[i])) {
7726 					mutex_exit(&vswp->mac_lock);
7727 					cmn_err(CE_WARN, "!vsw%d: unable to "
7728 						"add multicast address",
7729 						vswp->instance);
7730 					(void) vsw_del_mcst(vswp, VSW_VNETPORT,
7731 						addr, port);
7732 					vsw_del_addr(VSW_VNETPORT, port, addr);
7733 					return (1);
7734 				}
7735 				mutex_exit(&vswp->mac_lock);
7736 
7737 			} else {
7738 				DERR(vswp, "%s: error adding multicast "
7739 					"address 0x%llx for port %ld",
7740 					__func__, addr, port->p_instance);
7741 				return (1);
7742 			}
7743 		} else {
7744 			/*
7745 			 * Delete an entry from the multicast hash
7746 			 * table and update the address list
7747 			 * appropriately.
7748 			 */
7749 			if (vsw_del_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
7750 				D3(vswp, "%s: deleting multicast address "
7751 					"0x%llx for port %ld", __func__, addr,
7752 					port->p_instance);
7753 
7754 				vsw_del_addr(VSW_VNETPORT, port, addr);
7755 
7756 				/*
7757 				 * Remove the address from HW. The address
7758 				 * will actually only be removed once the ref
7759 				 * count within the MAC layer has dropped to
7760 				 * zero. I.e. we can safely call this fn even
7761 				 * if other ports are interested in this
7762 				 * address.
7763 				 */
7764 				mutex_enter(&vswp->mac_lock);
7765 				if ((vswp->mh == NULL) ||
7766 					mac_multicst_remove(vswp->mh,
7767 						(uchar_t *)&mcst_pkt->mca[i])) {
7768 					mutex_exit(&vswp->mac_lock);
7769 					cmn_err(CE_WARN, "!vsw%d: unable to "
7770 						"remove multicast address",
7771 						vswp->instance);
7772 					return (1);
7773 				}
7774 				mutex_exit(&vswp->mac_lock);
7775 
7776 			} else {
7777 				DERR(vswp, "%s: error deleting multicast "
7778 					"addr 0x%llx for port %ld",
7779 					__func__, addr, port->p_instance);
7780 				return (1);
7781 			}
7782 		}
7783 	}
7784 	D1(vswp, "%s: exit", __func__);
7785 	return (0);
7786 }
7787 
7788 /*
7789  * Add a new multicast entry.
7790  *
7791  * Search hash table based on address. If match found then
7792  * update associated val (which is chain of ports), otherwise
7793  * create new key/val (addr/port) pair and insert into table.
7794  */
7795 static int
7796 vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
7797 {
7798 	int		dup = 0;
7799 	int		rv = 0;
7800 	mfdb_ent_t	*ment = NULL;
7801 	mfdb_ent_t	*tmp_ent = NULL;
7802 	mfdb_ent_t	*new_ent = NULL;
7803 	void		*tgt = NULL;
7804 
7805 	if (devtype == VSW_VNETPORT) {
7806 		/*
7807 		 * Being invoked from a vnet.
7808 		 */
7809 		ASSERT(arg != NULL);
7810 		tgt = arg;
7811 		D2(NULL, "%s: port %d : address 0x%llx", __func__,
7812 			((vsw_port_t *)arg)->p_instance, addr);
7813 	} else {
7814 		/*
7815 		 * We are being invoked via the m_multicst mac entry
7816 		 * point.
7817 		 */
7818 		D2(NULL, "%s: address 0x%llx", __func__, addr);
7819 		tgt = (void *)vswp;
7820 	}
7821 
7822 	WRITE_ENTER(&vswp->mfdbrw);
7823 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
7824 				(mod_hash_val_t *)&ment) != 0) {
7825 
7826 		/* address not currently in table */
7827 		ment = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
7828 		ment->d_addr = (void *)tgt;
7829 		ment->d_type = devtype;
7830 		ment->nextp = NULL;
7831 
7832 		if (mod_hash_insert(vswp->mfdb, (mod_hash_key_t)addr,
7833 			(mod_hash_val_t)ment) != 0) {
7834 			DERR(vswp, "%s: hash table insertion failed", __func__);
7835 			kmem_free(ment, sizeof (mfdb_ent_t));
7836 			rv = 1;
7837 		} else {
7838 			D2(vswp, "%s: added initial entry for 0x%llx to "
7839 				"table", __func__, addr);
7840 		}
7841 	} else {
7842 		/*
7843 		 * Address in table. Check to see if specified port
7844 		 * is already associated with the address. If not add
7845 		 * it now.
7846 		 */
7847 		tmp_ent = ment;
7848 		while (tmp_ent != NULL) {
7849 			if (tmp_ent->d_addr == (void *)tgt) {
7850 				if (devtype == VSW_VNETPORT) {
7851 					DERR(vswp, "%s: duplicate port entry "
7852 						"found for portid %ld and key "
7853 						"0x%llx", __func__,
7854 						((vsw_port_t *)arg)->p_instance,
7855 						addr);
7856 				} else {
7857 					DERR(vswp, "%s: duplicate entry found"
7858 						"for key 0x%llx",
7859 						__func__, addr);
7860 				}
7861 				rv = 1;
7862 				dup = 1;
7863 				break;
7864 			}
7865 			tmp_ent = tmp_ent->nextp;
7866 		}
7867 
7868 		/*
7869 		 * Port not on list so add it to end now.
7870 		 */
7871 		if (0 == dup) {
7872 			D2(vswp, "%s: added entry for 0x%llx to table",
7873 				__func__, addr);
7874 			new_ent = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
7875 			new_ent->d_addr = (void *)tgt;
7876 			new_ent->d_type = devtype;
7877 			new_ent->nextp = NULL;
7878 
7879 			tmp_ent = ment;
7880 			while (tmp_ent->nextp != NULL)
7881 				tmp_ent = tmp_ent->nextp;
7882 
7883 			tmp_ent->nextp = new_ent;
7884 		}
7885 	}
7886 
7887 	RW_EXIT(&vswp->mfdbrw);
7888 	return (rv);
7889 }
7890 
7891 /*
7892  * Remove a multicast entry from the hashtable.
7893  *
7894  * Search hash table based on address. If match found, scan
7895  * list of ports associated with address. If specified port
7896  * found remove it from list.
7897  */
7898 static int
7899 vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
7900 {
7901 	mfdb_ent_t	*ment = NULL;
7902 	mfdb_ent_t	*curr_p, *prev_p;
7903 	void		*tgt = NULL;
7904 
7905 	D1(vswp, "%s: enter", __func__);
7906 
7907 	if (devtype == VSW_VNETPORT) {
7908 		tgt = (vsw_port_t *)arg;
7909 		D2(vswp, "%s: removing port %d from mFDB for address"
7910 			" 0x%llx", __func__, ((vsw_port_t *)tgt)->p_instance,
7911 			addr);
7912 	} else {
7913 		D2(vswp, "%s: removing entry", __func__);
7914 		tgt = (void *)vswp;
7915 	}
7916 
7917 	WRITE_ENTER(&vswp->mfdbrw);
7918 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
7919 				(mod_hash_val_t *)&ment) != 0) {
7920 		D2(vswp, "%s: address 0x%llx not in table", __func__, addr);
7921 		RW_EXIT(&vswp->mfdbrw);
7922 		return (1);
7923 	}
7924 
7925 	prev_p = curr_p = ment;
7926 
7927 	while (curr_p != NULL) {
7928 		if (curr_p->d_addr == (void *)tgt) {
7929 			if (devtype == VSW_VNETPORT) {
7930 				D2(vswp, "%s: port %d found", __func__,
7931 					((vsw_port_t *)tgt)->p_instance);
7932 			} else {
7933 				D2(vswp, "%s: instance found", __func__);
7934 			}
7935 
7936 			if (prev_p == curr_p) {
7937 				/*
7938 				 * head of list, if no other element is in
7939 				 * list then destroy this entry, otherwise
7940 				 * just replace it with updated value.
7941 				 */
7942 				ment = curr_p->nextp;
7943 				kmem_free(curr_p, sizeof (mfdb_ent_t));
7944 				if (ment == NULL) {
7945 					(void) mod_hash_destroy(vswp->mfdb,
7946 							(mod_hash_val_t)addr);
7947 				} else {
7948 					(void) mod_hash_replace(vswp->mfdb,
7949 							(mod_hash_key_t)addr,
7950 							(mod_hash_val_t)ment);
7951 				}
7952 			} else {
7953 				/*
7954 				 * Not head of list, no need to do
7955 				 * replacement, just adjust list pointers.
7956 				 */
7957 				prev_p->nextp = curr_p->nextp;
7958 				kmem_free(curr_p, sizeof (mfdb_ent_t));
7959 			}
7960 			break;
7961 		}
7962 
7963 		prev_p = curr_p;
7964 		curr_p = curr_p->nextp;
7965 	}
7966 
7967 	RW_EXIT(&vswp->mfdbrw);
7968 
7969 	D1(vswp, "%s: exit", __func__);
7970 
7971 	return (0);
7972 }
7973 
7974 /*
7975  * Port is being deleted, but has registered an interest in one
7976  * or more multicast groups. Using the list of addresses maintained
7977  * within the port structure find the appropriate entry in the hash
7978  * table and remove this port from the list of interested ports.
7979  */
7980 static void
7981 vsw_del_mcst_port(vsw_port_t *port)
7982 {
7983 	mcst_addr_t	*mcst_p = NULL;
7984 	vsw_t		*vswp = port->p_vswp;
7985 
7986 	D1(vswp, "%s: enter", __func__);
7987 
7988 	mutex_enter(&port->mca_lock);
7989 	while (port->mcap != NULL) {
7990 		(void) vsw_del_mcst(vswp, VSW_VNETPORT,
7991 					port->mcap->addr, port);
7992 
7993 		mcst_p = port->mcap->nextp;
7994 		kmem_free(port->mcap, sizeof (mcst_addr_t));
7995 		port->mcap = mcst_p;
7996 	}
7997 	mutex_exit(&port->mca_lock);
7998 
7999 	D1(vswp, "%s: exit", __func__);
8000 }
8001 
8002 /*
8003  * This vsw instance is detaching, but has registered an interest in one
8004  * or more multicast groups. Using the list of addresses maintained
8005  * within the vsw structure find the appropriate entry in the hash
8006  * table and remove this instance from the list of interested ports.
8007  */
8008 static void
8009 vsw_del_mcst_vsw(vsw_t *vswp)
8010 {
8011 	mcst_addr_t	*next_p = NULL;
8012 
8013 	D1(vswp, "%s: enter", __func__);
8014 
8015 	mutex_enter(&vswp->mca_lock);
8016 
8017 	while (vswp->mcap != NULL) {
8018 		DERR(vswp, "%s: deleting addr 0x%llx",
8019 			__func__, vswp->mcap->addr);
8020 		(void) vsw_del_mcst(vswp, VSW_LOCALDEV,
8021 				vswp->mcap->addr, NULL);
8022 
8023 		next_p = vswp->mcap->nextp;
8024 		kmem_free(vswp->mcap, sizeof (mcst_addr_t));
8025 		vswp->mcap = next_p;
8026 	}
8027 
8028 	vswp->mcap = NULL;
8029 	mutex_exit(&vswp->mca_lock);
8030 
8031 	D1(vswp, "%s: exit", __func__);
8032 }
8033 
8034 
8035 /*
8036  * Remove the specified address from the list of address maintained
8037  * in this port node.
8038  */
8039 static void
8040 vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr)
8041 {
8042 	vsw_t		*vswp = NULL;
8043 	vsw_port_t	*port = NULL;
8044 	mcst_addr_t	*prev_p = NULL;
8045 	mcst_addr_t	*curr_p = NULL;
8046 
8047 	D1(NULL, "%s: enter : devtype %d : addr 0x%llx",
8048 		__func__, devtype, addr);
8049 
8050 	if (devtype == VSW_VNETPORT) {
8051 		port = (vsw_port_t *)arg;
8052 		mutex_enter(&port->mca_lock);
8053 		prev_p = curr_p = port->mcap;
8054 	} else {
8055 		vswp = (vsw_t *)arg;
8056 		mutex_enter(&vswp->mca_lock);
8057 		prev_p = curr_p = vswp->mcap;
8058 	}
8059 
8060 	while (curr_p != NULL) {
8061 		if (curr_p->addr == addr) {
8062 			D2(NULL, "%s: address found", __func__);
8063 			/* match found */
8064 			if (prev_p == curr_p) {
8065 				/* list head */
8066 				if (devtype == VSW_VNETPORT)
8067 					port->mcap = curr_p->nextp;
8068 				else
8069 					vswp->mcap = curr_p->nextp;
8070 			} else {
8071 				prev_p->nextp = curr_p->nextp;
8072 			}
8073 			kmem_free(curr_p, sizeof (mcst_addr_t));
8074 			break;
8075 		} else {
8076 			prev_p = curr_p;
8077 			curr_p = curr_p->nextp;
8078 		}
8079 	}
8080 
8081 	if (devtype == VSW_VNETPORT)
8082 		mutex_exit(&port->mca_lock);
8083 	else
8084 		mutex_exit(&vswp->mca_lock);
8085 
8086 	D1(NULL, "%s: exit", __func__);
8087 }
8088 
8089 /*
8090  * Creates a descriptor ring (dring) and links it into the
8091  * link of outbound drings for this channel.
8092  *
8093  * Returns NULL if creation failed.
8094  */
8095 static dring_info_t *
8096 vsw_create_dring(vsw_ldc_t *ldcp)
8097 {
8098 	vsw_private_desc_t	*priv_addr = NULL;
8099 	vsw_t			*vswp = ldcp->ldc_vswp;
8100 	ldc_mem_info_t		minfo;
8101 	dring_info_t		*dp, *tp;
8102 	int			i;
8103 
8104 	dp = (dring_info_t *)kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
8105 
8106 	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
8107 
8108 	/* create public section of ring */
8109 	if ((ldc_mem_dring_create(VSW_RING_NUM_EL,
8110 			VSW_PUB_SIZE, &dp->handle)) != 0) {
8111 
8112 		DERR(vswp, "vsw_create_dring(%lld): ldc dring create "
8113 			"failed", ldcp->ldc_id);
8114 		goto create_fail_exit;
8115 	}
8116 
8117 	ASSERT(dp->handle != NULL);
8118 
8119 	/*
8120 	 * Get the base address of the public section of the ring.
8121 	 */
8122 	if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
8123 		DERR(vswp, "vsw_create_dring(%lld): dring info failed\n",
8124 			ldcp->ldc_id);
8125 		goto dring_fail_exit;
8126 	} else {
8127 		ASSERT(minfo.vaddr != 0);
8128 		dp->pub_addr = minfo.vaddr;
8129 	}
8130 
8131 	dp->num_descriptors = VSW_RING_NUM_EL;
8132 	dp->descriptor_size = VSW_PUB_SIZE;
8133 	dp->options = VIO_TX_DRING;
8134 	dp->ncookies = 1;	/* guaranteed by ldc */
8135 
8136 	/*
8137 	 * create private portion of ring
8138 	 */
8139 	dp->priv_addr = (vsw_private_desc_t *)kmem_zalloc(
8140 		(sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL), KM_SLEEP);
8141 
8142 	if (vsw_setup_ring(ldcp, dp)) {
8143 		DERR(vswp, "%s: unable to setup ring", __func__);
8144 		goto dring_fail_exit;
8145 	}
8146 
8147 	/* haven't used any descriptors yet */
8148 	dp->end_idx = 0;
8149 	dp->last_ack_recv = -1;
8150 
8151 	/* bind dring to the channel */
8152 	if ((ldc_mem_dring_bind(ldcp->ldc_handle, dp->handle,
8153 		LDC_SHADOW_MAP, LDC_MEM_RW,
8154 		&dp->cookie[0], &dp->ncookies)) != 0) {
8155 		DERR(vswp, "vsw_create_dring: unable to bind to channel "
8156 			"%lld", ldcp->ldc_id);
8157 		goto dring_fail_exit;
8158 	}
8159 
8160 	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
8161 	dp->restart_reqd = B_TRUE;
8162 
8163 	/*
8164 	 * Only ever create rings for outgoing lane. Link it onto
8165 	 * end of list.
8166 	 */
8167 	WRITE_ENTER(&ldcp->lane_out.dlistrw);
8168 	if (ldcp->lane_out.dringp == NULL) {
8169 		D2(vswp, "vsw_create_dring: adding first outbound ring");
8170 		ldcp->lane_out.dringp = dp;
8171 	} else {
8172 		tp = ldcp->lane_out.dringp;
8173 		while (tp->next != NULL)
8174 			tp = tp->next;
8175 
8176 		tp->next = dp;
8177 	}
8178 	RW_EXIT(&ldcp->lane_out.dlistrw);
8179 
8180 	return (dp);
8181 
8182 dring_fail_exit:
8183 	(void) ldc_mem_dring_destroy(dp->handle);
8184 
8185 create_fail_exit:
8186 	if (dp->priv_addr != NULL) {
8187 		priv_addr = dp->priv_addr;
8188 		for (i = 0; i < VSW_RING_NUM_EL; i++) {
8189 			if (priv_addr->memhandle != NULL)
8190 				(void) ldc_mem_free_handle(
8191 						priv_addr->memhandle);
8192 			priv_addr++;
8193 		}
8194 		kmem_free(dp->priv_addr,
8195 			(sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL));
8196 	}
8197 	mutex_destroy(&dp->dlock);
8198 
8199 	kmem_free(dp, sizeof (dring_info_t));
8200 	return (NULL);
8201 }
8202 
8203 /*
8204  * Create a ring consisting of just a private portion and link
8205  * it into the list of rings for the outbound lane.
8206  *
8207  * These type of rings are used primarily for temporary data
8208  * storage (i.e. as data buffers).
8209  */
8210 void
8211 vsw_create_privring(vsw_ldc_t *ldcp)
8212 {
8213 	dring_info_t		*dp, *tp;
8214 	vsw_t			*vswp = ldcp->ldc_vswp;
8215 
8216 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
8217 
8218 	dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
8219 
8220 	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
8221 
8222 	/* no public section */
8223 	dp->pub_addr = NULL;
8224 
8225 	dp->priv_addr = kmem_zalloc((sizeof (vsw_private_desc_t) *
8226 					VSW_RING_NUM_EL), KM_SLEEP);
8227 
8228 	dp->num_descriptors = VSW_RING_NUM_EL;
8229 
8230 	if (vsw_setup_ring(ldcp, dp)) {
8231 		DERR(vswp, "%s: setup of ring failed", __func__);
8232 		kmem_free(dp->priv_addr,
8233 			(sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL));
8234 		mutex_destroy(&dp->dlock);
8235 		kmem_free(dp, sizeof (dring_info_t));
8236 		return;
8237 	}
8238 
8239 	/* haven't used any descriptors yet */
8240 	dp->end_idx = 0;
8241 
8242 	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
8243 	dp->restart_reqd = B_TRUE;
8244 
8245 	/*
8246 	 * Only ever create rings for outgoing lane. Link it onto
8247 	 * end of list.
8248 	 */
8249 	WRITE_ENTER(&ldcp->lane_out.dlistrw);
8250 	if (ldcp->lane_out.dringp == NULL) {
8251 		D2(vswp, "%s: adding first outbound privring", __func__);
8252 		ldcp->lane_out.dringp = dp;
8253 	} else {
8254 		tp = ldcp->lane_out.dringp;
8255 		while (tp->next != NULL)
8256 			tp = tp->next;
8257 
8258 		tp->next = dp;
8259 	}
8260 	RW_EXIT(&ldcp->lane_out.dlistrw);
8261 
8262 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
8263 }
8264 
8265 /*
8266  * Setup the descriptors in the dring. Returns 0 on success, 1 on
8267  * failure.
8268  */
8269 int
8270 vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp)
8271 {
8272 	vnet_public_desc_t	*pub_addr = NULL;
8273 	vsw_private_desc_t	*priv_addr = NULL;
8274 	vsw_t			*vswp = ldcp->ldc_vswp;
8275 	uint64_t		*tmpp;
8276 	uint64_t		offset = 0;
8277 	uint32_t		ncookies = 0;
8278 	static char		*name = "vsw_setup_ring";
8279 	int			i, j, nc, rv;
8280 
8281 	priv_addr = dp->priv_addr;
8282 	pub_addr = dp->pub_addr;
8283 
8284 	/* public section may be null but private should never be */
8285 	ASSERT(priv_addr != NULL);
8286 
8287 	/*
8288 	 * Allocate the region of memory which will be used to hold
8289 	 * the data the descriptors will refer to.
8290 	 */
8291 	dp->data_sz = (VSW_RING_NUM_EL * VSW_RING_EL_DATA_SZ);
8292 	dp->data_addr = kmem_alloc(dp->data_sz, KM_SLEEP);
8293 
8294 	D2(vswp, "%s: allocated %lld bytes at 0x%llx\n", name,
8295 		dp->data_sz, dp->data_addr);
8296 
8297 	tmpp = (uint64_t *)dp->data_addr;
8298 	offset = VSW_RING_EL_DATA_SZ / sizeof (tmpp);
8299 
8300 	/*
8301 	 * Initialise some of the private and public (if they exist)
8302 	 * descriptor fields.
8303 	 */
8304 	for (i = 0; i < VSW_RING_NUM_EL; i++) {
8305 		mutex_init(&priv_addr->dstate_lock, NULL, MUTEX_DRIVER, NULL);
8306 
8307 		if ((ldc_mem_alloc_handle(ldcp->ldc_handle,
8308 			&priv_addr->memhandle)) != 0) {
8309 			DERR(vswp, "%s: alloc mem handle failed", name);
8310 			goto setup_ring_cleanup;
8311 		}
8312 
8313 		priv_addr->datap = (void *)tmpp;
8314 
8315 		rv = ldc_mem_bind_handle(priv_addr->memhandle,
8316 			(caddr_t)priv_addr->datap, VSW_RING_EL_DATA_SZ,
8317 			LDC_SHADOW_MAP, LDC_MEM_R|LDC_MEM_W,
8318 			&(priv_addr->memcookie[0]), &ncookies);
8319 		if (rv != 0) {
8320 			DERR(vswp, "%s(%lld): ldc_mem_bind_handle failed "
8321 				"(rv %d)", name, ldcp->ldc_id, rv);
8322 			goto setup_ring_cleanup;
8323 		}
8324 		priv_addr->bound = 1;
8325 
8326 		D2(vswp, "%s: %d: memcookie 0 : addr 0x%llx : size 0x%llx",
8327 			name, i, priv_addr->memcookie[0].addr,
8328 			priv_addr->memcookie[0].size);
8329 
8330 		if (ncookies >= (uint32_t)(VSW_MAX_COOKIES + 1)) {
8331 			DERR(vswp, "%s(%lld) ldc_mem_bind_handle returned "
8332 				"invalid num of cookies (%d) for size 0x%llx",
8333 				name, ldcp->ldc_id, ncookies,
8334 				VSW_RING_EL_DATA_SZ);
8335 
8336 			goto setup_ring_cleanup;
8337 		} else {
8338 			for (j = 1; j < ncookies; j++) {
8339 				rv = ldc_mem_nextcookie(priv_addr->memhandle,
8340 					&(priv_addr->memcookie[j]));
8341 				if (rv != 0) {
8342 					DERR(vswp, "%s: ldc_mem_nextcookie "
8343 						"failed rv (%d)", name, rv);
8344 					goto setup_ring_cleanup;
8345 				}
8346 				D3(vswp, "%s: memcookie %d : addr 0x%llx : "
8347 					"size 0x%llx", name, j,
8348 					priv_addr->memcookie[j].addr,
8349 					priv_addr->memcookie[j].size);
8350 			}
8351 
8352 		}
8353 		priv_addr->ncookies = ncookies;
8354 		priv_addr->dstate = VIO_DESC_FREE;
8355 
8356 		if (pub_addr != NULL) {
8357 
8358 			/* link pub and private sides */
8359 			priv_addr->descp = pub_addr;
8360 
8361 			pub_addr->ncookies = priv_addr->ncookies;
8362 
8363 			for (nc = 0; nc < pub_addr->ncookies; nc++) {
8364 				bcopy(&priv_addr->memcookie[nc],
8365 					&pub_addr->memcookie[nc],
8366 					sizeof (ldc_mem_cookie_t));
8367 			}
8368 
8369 			pub_addr->hdr.dstate = VIO_DESC_FREE;
8370 			pub_addr++;
8371 		}
8372 
8373 		/*
8374 		 * move to next element in the dring and the next
8375 		 * position in the data buffer.
8376 		 */
8377 		priv_addr++;
8378 		tmpp += offset;
8379 	}
8380 
8381 	return (0);
8382 
8383 setup_ring_cleanup:
8384 	priv_addr = dp->priv_addr;
8385 
8386 	for (j = 0; j < i; j++) {
8387 		(void) ldc_mem_unbind_handle(priv_addr->memhandle);
8388 		(void) ldc_mem_free_handle(priv_addr->memhandle);
8389 
8390 		mutex_destroy(&priv_addr->dstate_lock);
8391 
8392 		priv_addr++;
8393 	}
8394 	kmem_free(dp->data_addr, dp->data_sz);
8395 
8396 	return (1);
8397 }
8398 
8399 /*
8400  * Searches the private section of a ring for a free descriptor,
8401  * starting at the location of the last free descriptor found
8402  * previously.
8403  *
8404  * Returns 0 if free descriptor is available, and updates state
8405  * of private descriptor to VIO_DESC_READY,  otherwise returns 1.
8406  *
8407  * FUTURE: might need to return contiguous range of descriptors
8408  * as dring info msg assumes all will be contiguous.
8409  */
8410 static int
8411 vsw_dring_find_free_desc(dring_info_t *dringp,
8412 		vsw_private_desc_t **priv_p, int *idx)
8413 {
8414 	vsw_private_desc_t	*addr = NULL;
8415 	int			num = VSW_RING_NUM_EL;
8416 	int			ret = 1;
8417 
8418 	D1(NULL, "%s enter\n", __func__);
8419 
8420 	ASSERT(dringp->priv_addr != NULL);
8421 
8422 	D2(NULL, "%s: searching ring, dringp 0x%llx : start pos %lld",
8423 			__func__, dringp, dringp->end_idx);
8424 
8425 	addr = (vsw_private_desc_t *)dringp->priv_addr + dringp->end_idx;
8426 
8427 	mutex_enter(&addr->dstate_lock);
8428 	if (addr->dstate == VIO_DESC_FREE) {
8429 		addr->dstate = VIO_DESC_READY;
8430 		*priv_p = addr;
8431 		*idx = dringp->end_idx;
8432 		dringp->end_idx = (dringp->end_idx + 1) % num;
8433 		ret = 0;
8434 
8435 	}
8436 	mutex_exit(&addr->dstate_lock);
8437 
8438 	/* ring full */
8439 	if (ret == 1) {
8440 		D2(NULL, "%s: no desp free: started at %d", __func__,
8441 			dringp->end_idx);
8442 	}
8443 
8444 	D1(NULL, "%s: exit\n", __func__);
8445 
8446 	return (ret);
8447 }
8448 
8449 /*
8450  * Map from a dring identifier to the ring itself. Returns
8451  * pointer to ring or NULL if no match found.
8452  *
8453  * Should be called with dlistrw rwlock held as reader.
8454  */
8455 static dring_info_t *
8456 vsw_ident2dring(lane_t *lane, uint64_t ident)
8457 {
8458 	dring_info_t	*dp = NULL;
8459 
8460 	if ((dp = lane->dringp) == NULL) {
8461 		return (NULL);
8462 	} else {
8463 		if (dp->ident == ident)
8464 			return (dp);
8465 
8466 		while (dp != NULL) {
8467 			if (dp->ident == ident)
8468 				break;
8469 			dp = dp->next;
8470 		}
8471 	}
8472 
8473 	return (dp);
8474 }
8475 
8476 /*
8477  * Set the default lane attributes. These are copied into
8478  * the attr msg we send to our peer. If they are not acceptable
8479  * then (currently) the handshake ends.
8480  */
8481 static void
8482 vsw_set_lane_attr(vsw_t *vswp, lane_t *lp)
8483 {
8484 	bzero(lp, sizeof (lane_t));
8485 
8486 	READ_ENTER(&vswp->if_lockrw);
8487 	ether_copy(&(vswp->if_addr), &(lp->addr));
8488 	RW_EXIT(&vswp->if_lockrw);
8489 
8490 	lp->mtu = VSW_MTU;
8491 	lp->addr_type = ADDR_TYPE_MAC;
8492 	lp->xfer_mode = VIO_DRING_MODE;
8493 	lp->ack_freq = 0;	/* for shared mode */
8494 
8495 	mutex_enter(&lp->seq_lock);
8496 	lp->seq_num = VNET_ISS;
8497 	mutex_exit(&lp->seq_lock);
8498 }
8499 
8500 /*
8501  * Verify that the attributes are acceptable.
8502  *
8503  * FUTURE: If some attributes are not acceptable, change them
8504  * our desired values.
8505  */
8506 static int
8507 vsw_check_attr(vnet_attr_msg_t *pkt, vsw_port_t *port)
8508 {
8509 	int	ret = 0;
8510 
8511 	D1(NULL, "vsw_check_attr enter\n");
8512 
8513 	/*
8514 	 * Note we currently only support in-band descriptors
8515 	 * and descriptor rings, not packet based transfer (VIO_PKT_MODE)
8516 	 */
8517 	if ((pkt->xfer_mode != VIO_DESC_MODE) &&
8518 			(pkt->xfer_mode != VIO_DRING_MODE)) {
8519 		D2(NULL, "vsw_check_attr: unknown mode %x\n",
8520 			pkt->xfer_mode);
8521 		ret = 1;
8522 	}
8523 
8524 	/* Only support MAC addresses at moment. */
8525 	if ((pkt->addr_type != ADDR_TYPE_MAC) || (pkt->addr == 0)) {
8526 		D2(NULL, "vsw_check_attr: invalid addr_type %x, "
8527 			"or address 0x%llx\n", pkt->addr_type,
8528 			pkt->addr);
8529 		ret = 1;
8530 	}
8531 
8532 	/*
8533 	 * MAC address supplied by device should match that stored
8534 	 * in the vsw-port OBP node. Need to decide what to do if they
8535 	 * don't match, for the moment just warn but don't fail.
8536 	 */
8537 	if (bcmp(&pkt->addr, &port->p_macaddr, ETHERADDRL) != 0) {
8538 		DERR(NULL, "vsw_check_attr: device supplied address "
8539 			"0x%llx doesn't match node address 0x%llx\n",
8540 			pkt->addr, port->p_macaddr);
8541 	}
8542 
8543 	/*
8544 	 * Ack freq only makes sense in pkt mode, in shared
8545 	 * mode the ring descriptors say whether or not to
8546 	 * send back an ACK.
8547 	 */
8548 	if ((pkt->xfer_mode == VIO_DRING_MODE) &&
8549 				(pkt->ack_freq > 0)) {
8550 		D2(NULL, "vsw_check_attr: non zero ack freq "
8551 			" in SHM mode\n");
8552 		ret = 1;
8553 	}
8554 
8555 	/*
8556 	 * Note: for the moment we only support ETHER
8557 	 * frames. This may change in the future.
8558 	 */
8559 	if ((pkt->mtu > VSW_MTU) || (pkt->mtu <= 0)) {
8560 		D2(NULL, "vsw_check_attr: invalid MTU (0x%llx)\n",
8561 			pkt->mtu);
8562 		ret = 1;
8563 	}
8564 
8565 	D1(NULL, "vsw_check_attr exit\n");
8566 
8567 	return (ret);
8568 }
8569 
8570 /*
8571  * Returns 1 if there is a problem, 0 otherwise.
8572  */
8573 static int
8574 vsw_check_dring_info(vio_dring_reg_msg_t *pkt)
8575 {
8576 	_NOTE(ARGUNUSED(pkt))
8577 
8578 	int	ret = 0;
8579 
8580 	D1(NULL, "vsw_check_dring_info enter\n");
8581 
8582 	if ((pkt->num_descriptors == 0) ||
8583 		(pkt->descriptor_size == 0) ||
8584 		(pkt->ncookies != 1)) {
8585 		DERR(NULL, "vsw_check_dring_info: invalid dring msg");
8586 		ret = 1;
8587 	}
8588 
8589 	D1(NULL, "vsw_check_dring_info exit\n");
8590 
8591 	return (ret);
8592 }
8593 
8594 /*
8595  * Returns 1 if two memory cookies match. Otherwise returns 0.
8596  */
8597 static int
8598 vsw_mem_cookie_match(ldc_mem_cookie_t *m1, ldc_mem_cookie_t *m2)
8599 {
8600 	if ((m1->addr != m2->addr) ||
8601 		(m2->size != m2->size)) {
8602 		return (0);
8603 	} else {
8604 		return (1);
8605 	}
8606 }
8607 
8608 /*
8609  * Returns 1 if ring described in reg message matches that
8610  * described by dring_info structure. Otherwise returns 0.
8611  */
8612 static int
8613 vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg)
8614 {
8615 	if ((msg->descriptor_size != dp->descriptor_size) ||
8616 		(msg->num_descriptors != dp->num_descriptors) ||
8617 		(msg->ncookies != dp->ncookies) ||
8618 		!(vsw_mem_cookie_match(&msg->cookie[0], &dp->cookie[0]))) {
8619 		return (0);
8620 	} else {
8621 		return (1);
8622 	}
8623 
8624 }
8625 
/*
 * Format the six octets at 'a' as colon separated hex (no zero padding)
 * into 'ebuf' and return 'ebuf'.
 *
 * No length is passed in, so the caller must supply a buffer large
 * enough for the result: at most 18 bytes (six two-digit octets, five
 * colons and the terminating NUL).
 */
static caddr_t
vsw_print_ethaddr(uint8_t *a, char *ebuf)
{
	const uint8_t	*octet = a;

	(void) sprintf(ebuf, "%x:%x:%x:%x:%x:%x",
	    octet[0], octet[1], octet[2],
	    octet[3], octet[4], octet[5]);

	return (ebuf);
}
8633 
8634 /*
8635  * Reset and free all the resources associated with
8636  * the channel.
8637  */
8638 static void
8639 vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir)
8640 {
8641 	dring_info_t		*dp, *dpp;
8642 	lane_t			*lp = NULL;
8643 	int			rv = 0;
8644 
8645 	ASSERT(ldcp != NULL);
8646 
8647 	D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id);
8648 
8649 	if (dir == INBOUND) {
8650 		D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane"
8651 			" of channel %lld", __func__, ldcp->ldc_id);
8652 		lp = &ldcp->lane_in;
8653 	} else {
8654 		D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane"
8655 			" of channel %lld", __func__, ldcp->ldc_id);
8656 		lp = &ldcp->lane_out;
8657 	}
8658 
8659 	lp->lstate = VSW_LANE_INACTIV;
8660 	mutex_enter(&lp->seq_lock);
8661 	lp->seq_num = VNET_ISS;
8662 	mutex_exit(&lp->seq_lock);
8663 	if (lp->dringp) {
8664 		if (dir == INBOUND) {
8665 			WRITE_ENTER(&lp->dlistrw);
8666 			dp = lp->dringp;
8667 			while (dp != NULL) {
8668 				dpp = dp->next;
8669 				if (dp->handle != NULL)
8670 					(void) ldc_mem_dring_unmap(dp->handle);
8671 				kmem_free(dp, sizeof (dring_info_t));
8672 				dp = dpp;
8673 			}
8674 			RW_EXIT(&lp->dlistrw);
8675 		} else {
8676 			/*
8677 			 * unbind, destroy exported dring, free dring struct
8678 			 */
8679 			WRITE_ENTER(&lp->dlistrw);
8680 			dp = lp->dringp;
8681 			rv = vsw_free_ring(dp);
8682 			RW_EXIT(&lp->dlistrw);
8683 		}
8684 		if (rv == 0) {
8685 			lp->dringp = NULL;
8686 		}
8687 	}
8688 
8689 	D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id);
8690 }
8691 
8692 /*
8693  * Free ring and all associated resources.
8694  *
8695  * Should be called with dlistrw rwlock held as writer.
8696  */
8697 static int
8698 vsw_free_ring(dring_info_t *dp)
8699 {
8700 	vsw_private_desc_t	*paddr = NULL;
8701 	dring_info_t		*dpp;
8702 	int			i, rv = 1;
8703 
8704 	while (dp != NULL) {
8705 		mutex_enter(&dp->dlock);
8706 		dpp = dp->next;
8707 		if (dp->priv_addr != NULL) {
8708 			/*
8709 			 * First unbind and free the memory handles
8710 			 * stored in each descriptor within the ring.
8711 			 */
8712 			for (i = 0; i < VSW_RING_NUM_EL; i++) {
8713 				paddr = (vsw_private_desc_t *)
8714 						dp->priv_addr + i;
8715 				if (paddr->memhandle != NULL) {
8716 					if (paddr->bound == 1) {
8717 						rv = ldc_mem_unbind_handle(
8718 							paddr->memhandle);
8719 
8720 						if (rv != 0) {
8721 							DERR(NULL, "error "
8722 							"unbinding handle for "
8723 							"ring 0x%llx at pos %d",
8724 							dp, i);
8725 							mutex_exit(&dp->dlock);
8726 							return (rv);
8727 						}
8728 						paddr->bound = 0;
8729 					}
8730 
8731 					rv = ldc_mem_free_handle(
8732 							paddr->memhandle);
8733 					if (rv != 0) {
8734 						DERR(NULL, "error freeing "
8735 							"handle for ring "
8736 							"0x%llx at pos %d",
8737 							dp, i);
8738 						mutex_exit(&dp->dlock);
8739 						return (rv);
8740 					}
8741 					paddr->memhandle = NULL;
8742 				}
8743 				mutex_destroy(&paddr->dstate_lock);
8744 			}
8745 			kmem_free(dp->priv_addr, (sizeof (vsw_private_desc_t)
8746 					* VSW_RING_NUM_EL));
8747 		}
8748 
8749 		/*
8750 		 * Now unbind and destroy the ring itself.
8751 		 */
8752 		if (dp->handle != NULL) {
8753 			(void) ldc_mem_dring_unbind(dp->handle);
8754 			(void) ldc_mem_dring_destroy(dp->handle);
8755 		}
8756 
8757 		if (dp->data_addr != NULL) {
8758 			kmem_free(dp->data_addr, dp->data_sz);
8759 		}
8760 
8761 		mutex_exit(&dp->dlock);
8762 		mutex_destroy(&dp->dlock);
8763 		mutex_destroy(&dp->restart_lock);
8764 		kmem_free(dp, sizeof (dring_info_t));
8765 
8766 		dp = dpp;
8767 	}
8768 	return (0);
8769 }
8770 
8771 /*
8772  * Debugging routines
8773  */
8774 static void
8775 display_state(void)
8776 {
8777 	vsw_t		*vswp;
8778 	vsw_port_list_t	*plist;
8779 	vsw_port_t 	*port;
8780 	vsw_ldc_list_t	*ldcl;
8781 	vsw_ldc_t 	*ldcp;
8782 
8783 	cmn_err(CE_NOTE, "***** system state *****");
8784 
8785 	for (vswp = vsw_head; vswp; vswp = vswp->next) {
8786 		plist = &vswp->plist;
8787 		READ_ENTER(&plist->lockrw);
8788 		cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n",
8789 			vswp->instance, plist->num_ports);
8790 
8791 		for (port = plist->head; port != NULL; port = port->p_next) {
8792 			ldcl = &port->p_ldclist;
8793 			cmn_err(CE_CONT, "port %d : %d ldcs attached\n",
8794 				port->p_instance, ldcl->num_ldcs);
8795 			READ_ENTER(&ldcl->lockrw);
8796 			ldcp = ldcl->head;
8797 			for (; ldcp != NULL; ldcp = ldcp->ldc_next) {
8798 				cmn_err(CE_CONT, "chan %lu : dev %d : "
8799 					"status %d : phase %u\n",
8800 					ldcp->ldc_id, ldcp->dev_class,
8801 					ldcp->ldc_status, ldcp->hphase);
8802 				cmn_err(CE_CONT, "chan %lu : lsession %lu : "
8803 					"psession %lu\n",
8804 					ldcp->ldc_id,
8805 					ldcp->local_session,
8806 					ldcp->peer_session);
8807 
8808 				cmn_err(CE_CONT, "Inbound lane:\n");
8809 				display_lane(&ldcp->lane_in);
8810 				cmn_err(CE_CONT, "Outbound lane:\n");
8811 				display_lane(&ldcp->lane_out);
8812 			}
8813 			RW_EXIT(&ldcl->lockrw);
8814 		}
8815 		RW_EXIT(&plist->lockrw);
8816 	}
8817 	cmn_err(CE_NOTE, "***** system state *****");
8818 }
8819 
8820 static void
8821 display_lane(lane_t *lp)
8822 {
8823 	dring_info_t	*drp;
8824 
8825 	cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n",
8826 		lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu);
8827 	cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n",
8828 		lp->addr_type, lp->addr, lp->xfer_mode);
8829 	cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp);
8830 
8831 	cmn_err(CE_CONT, "Dring info:\n");
8832 	for (drp = lp->dringp; drp != NULL; drp = drp->next) {
8833 		cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n",
8834 			drp->num_descriptors, drp->descriptor_size);
8835 		cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->handle);
8836 		cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n",
8837 			(uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr);
8838 		cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n",
8839 			drp->ident, drp->end_idx);
8840 		display_ring(drp);
8841 	}
8842 }
8843 
8844 static void
8845 display_ring(dring_info_t *dringp)
8846 {
8847 	uint64_t		i;
8848 	uint64_t		priv_count = 0;
8849 	uint64_t		pub_count = 0;
8850 	vnet_public_desc_t	*pub_addr = NULL;
8851 	vsw_private_desc_t	*priv_addr = NULL;
8852 
8853 	for (i = 0; i < VSW_RING_NUM_EL; i++) {
8854 		if (dringp->pub_addr != NULL) {
8855 			pub_addr = (vnet_public_desc_t *)dringp->pub_addr + i;
8856 
8857 			if (pub_addr->hdr.dstate == VIO_DESC_FREE)
8858 				pub_count++;
8859 		}
8860 
8861 		if (dringp->priv_addr != NULL) {
8862 			priv_addr =
8863 				(vsw_private_desc_t *)dringp->priv_addr + i;
8864 
8865 			if (priv_addr->dstate == VIO_DESC_FREE)
8866 				priv_count++;
8867 		}
8868 	}
8869 	cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n",
8870 			i, priv_count, pub_count);
8871 }
8872 
8873 static void
8874 dump_flags(uint64_t state)
8875 {
8876 	int	i;
8877 
8878 	typedef struct flag_name {
8879 		int	flag_val;
8880 		char	*flag_name;
8881 	} flag_name_t;
8882 
8883 	flag_name_t	flags[] = {
8884 		VSW_VER_INFO_SENT, "VSW_VER_INFO_SENT",
8885 		VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV",
8886 		VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV",
8887 		VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT",
8888 		VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV",
8889 		VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT",
8890 		VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT",
8891 		VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV",
8892 		VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT",
8893 		VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV",
8894 		VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT",
8895 		VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV",
8896 		VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT",
8897 		VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV",
8898 		VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT",
8899 		VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV",
8900 		VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT",
8901 		VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV",
8902 		VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT",
8903 		VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV",
8904 		VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT",
8905 		VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV",
8906 		VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT",
8907 		VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV",
8908 		VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT",
8909 		VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV",
8910 		VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT",
8911 		VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV",
8912 		VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT",
8913 		VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV",
8914 		VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE"};
8915 
8916 	DERR(NULL, "DUMP_FLAGS: %llx\n", state);
8917 	for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) {
8918 		if (state & flags[i].flag_val)
8919 			DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name);
8920 	}
8921 }
8922